polyphony 0.85 → 0.86

Sign up to get free protection for your applications and to get access to all the features.
Files changed (230) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/Gemfile.lock +1 -1
  4. data/ext/polyphony/io_extensions.c +2 -3
  5. data/lib/polyphony/version.rb +1 -1
  6. data/polyphony.gemspec +1 -1
  7. data/test/test_backend.rb +1 -1
  8. data/test/test_signal.rb +3 -3
  9. data/vendor/liburing/.github/pull_request_template.md +86 -0
  10. data/vendor/liburing/.github/workflows/build.yml +85 -0
  11. data/vendor/liburing/.github/workflows/shellcheck.yml +20 -0
  12. data/vendor/liburing/.gitignore +149 -0
  13. data/vendor/liburing/COPYING +502 -0
  14. data/vendor/liburing/COPYING.GPL +339 -0
  15. data/vendor/liburing/LICENSE +7 -0
  16. data/vendor/liburing/Makefile +82 -0
  17. data/vendor/liburing/Makefile.common +5 -0
  18. data/vendor/liburing/Makefile.quiet +11 -0
  19. data/vendor/liburing/README +46 -0
  20. data/vendor/liburing/configure +486 -0
  21. data/vendor/liburing/debian/README.Debian +7 -0
  22. data/vendor/liburing/debian/changelog +27 -0
  23. data/vendor/liburing/debian/compat +1 -0
  24. data/vendor/liburing/debian/control +48 -0
  25. data/vendor/liburing/debian/copyright +49 -0
  26. data/vendor/liburing/debian/liburing-dev.install +4 -0
  27. data/vendor/liburing/debian/liburing-dev.manpages +6 -0
  28. data/vendor/liburing/debian/liburing1-udeb.install +1 -0
  29. data/vendor/liburing/debian/liburing1.install +1 -0
  30. data/vendor/liburing/debian/liburing1.symbols +32 -0
  31. data/vendor/liburing/debian/patches/series +1 -0
  32. data/vendor/liburing/debian/rules +81 -0
  33. data/vendor/liburing/debian/source/format +1 -0
  34. data/vendor/liburing/debian/source/local-options +2 -0
  35. data/vendor/liburing/debian/source/options +1 -0
  36. data/vendor/liburing/debian/watch +3 -0
  37. data/vendor/liburing/examples/Makefile +38 -0
  38. data/vendor/liburing/examples/io_uring-cp.c +282 -0
  39. data/vendor/liburing/examples/io_uring-test.c +112 -0
  40. data/vendor/liburing/examples/link-cp.c +193 -0
  41. data/vendor/liburing/examples/ucontext-cp.c +273 -0
  42. data/vendor/liburing/liburing.pc.in +12 -0
  43. data/vendor/liburing/liburing.spec +66 -0
  44. data/vendor/liburing/make-debs.sh +53 -0
  45. data/vendor/liburing/man/io_uring.7 +754 -0
  46. data/vendor/liburing/man/io_uring_cq_advance.3 +35 -0
  47. data/vendor/liburing/man/io_uring_cq_ready.3 +25 -0
  48. data/vendor/liburing/man/io_uring_cqe_get_data.3 +34 -0
  49. data/vendor/liburing/man/io_uring_cqe_seen.3 +32 -0
  50. data/vendor/liburing/man/io_uring_enter.2 +1483 -0
  51. data/vendor/liburing/man/io_uring_free_probe.3 +24 -0
  52. data/vendor/liburing/man/io_uring_get_probe.3 +29 -0
  53. data/vendor/liburing/man/io_uring_get_sqe.3 +38 -0
  54. data/vendor/liburing/man/io_uring_opcode_supported.3 +29 -0
  55. data/vendor/liburing/man/io_uring_prep_msg_ring.3 +58 -0
  56. data/vendor/liburing/man/io_uring_prep_read.3 +50 -0
  57. data/vendor/liburing/man/io_uring_prep_read_fixed.3 +54 -0
  58. data/vendor/liburing/man/io_uring_prep_readv.3 +51 -0
  59. data/vendor/liburing/man/io_uring_prep_readv2.3 +79 -0
  60. data/vendor/liburing/man/io_uring_prep_write.3 +50 -0
  61. data/vendor/liburing/man/io_uring_prep_write_fixed.3 +54 -0
  62. data/vendor/liburing/man/io_uring_prep_writev.3 +51 -0
  63. data/vendor/liburing/man/io_uring_prep_writev2.3 +78 -0
  64. data/vendor/liburing/man/io_uring_queue_exit.3 +27 -0
  65. data/vendor/liburing/man/io_uring_queue_init.3 +44 -0
  66. data/vendor/liburing/man/io_uring_register.2 +688 -0
  67. data/vendor/liburing/man/io_uring_register_buffers.3 +41 -0
  68. data/vendor/liburing/man/io_uring_register_files.3 +35 -0
  69. data/vendor/liburing/man/io_uring_setup.2 +534 -0
  70. data/vendor/liburing/man/io_uring_sq_ready.3 +25 -0
  71. data/vendor/liburing/man/io_uring_sq_space_left.3 +25 -0
  72. data/vendor/liburing/man/io_uring_sqe_set_data.3 +30 -0
  73. data/vendor/liburing/man/io_uring_sqe_set_flags.3 +60 -0
  74. data/vendor/liburing/man/io_uring_sqring_wait.3 +30 -0
  75. data/vendor/liburing/man/io_uring_submit.3 +29 -0
  76. data/vendor/liburing/man/io_uring_submit_and_wait.3 +34 -0
  77. data/vendor/liburing/man/io_uring_submit_and_wait_timeout.3 +49 -0
  78. data/vendor/liburing/man/io_uring_unregister_buffers.3 +26 -0
  79. data/vendor/liburing/man/io_uring_unregister_files.3 +26 -0
  80. data/vendor/liburing/man/io_uring_wait_cqe.3 +33 -0
  81. data/vendor/liburing/man/io_uring_wait_cqe_nr.3 +36 -0
  82. data/vendor/liburing/man/io_uring_wait_cqe_timeout.3 +39 -0
  83. data/vendor/liburing/man/io_uring_wait_cqes.3 +46 -0
  84. data/vendor/liburing/src/Makefile +89 -0
  85. data/vendor/liburing/src/arch/aarch64/syscall.h +95 -0
  86. data/vendor/liburing/src/arch/generic/lib.h +21 -0
  87. data/vendor/liburing/src/arch/generic/syscall.h +87 -0
  88. data/vendor/liburing/src/arch/syscall-defs.h +67 -0
  89. data/vendor/liburing/src/arch/x86/lib.h +32 -0
  90. data/vendor/liburing/src/arch/x86/syscall.h +160 -0
  91. data/vendor/liburing/src/include/liburing/barrier.h +81 -0
  92. data/vendor/liburing/src/include/liburing/io_uring.h +442 -0
  93. data/vendor/liburing/src/include/liburing.h +921 -0
  94. data/vendor/liburing/src/int_flags.h +8 -0
  95. data/vendor/liburing/src/lib.h +57 -0
  96. data/vendor/liburing/src/liburing.map +53 -0
  97. data/vendor/liburing/src/nolibc.c +48 -0
  98. data/vendor/liburing/src/queue.c +403 -0
  99. data/vendor/liburing/src/register.c +293 -0
  100. data/vendor/liburing/src/setup.c +332 -0
  101. data/vendor/liburing/src/syscall.c +47 -0
  102. data/vendor/liburing/src/syscall.h +103 -0
  103. data/vendor/liburing/test/232c93d07b74-test.c +306 -0
  104. data/vendor/liburing/test/35fa71a030ca-test.c +329 -0
  105. data/vendor/liburing/test/500f9fbadef8-test.c +89 -0
  106. data/vendor/liburing/test/7ad0e4b2f83c-test.c +93 -0
  107. data/vendor/liburing/test/8a9973408177-test.c +106 -0
  108. data/vendor/liburing/test/917257daa0fe-test.c +53 -0
  109. data/vendor/liburing/test/Makefile +244 -0
  110. data/vendor/liburing/test/a0908ae19763-test.c +58 -0
  111. data/vendor/liburing/test/a4c0b3decb33-test.c +180 -0
  112. data/vendor/liburing/test/accept-link.c +254 -0
  113. data/vendor/liburing/test/accept-reuse.c +164 -0
  114. data/vendor/liburing/test/accept-test.c +79 -0
  115. data/vendor/liburing/test/accept.c +477 -0
  116. data/vendor/liburing/test/across-fork.c +283 -0
  117. data/vendor/liburing/test/b19062a56726-test.c +53 -0
  118. data/vendor/liburing/test/b5837bd5311d-test.c +77 -0
  119. data/vendor/liburing/test/ce593a6c480a-test.c +136 -0
  120. data/vendor/liburing/test/close-opath.c +122 -0
  121. data/vendor/liburing/test/config +10 -0
  122. data/vendor/liburing/test/connect.c +398 -0
  123. data/vendor/liburing/test/cq-full.c +96 -0
  124. data/vendor/liburing/test/cq-overflow.c +294 -0
  125. data/vendor/liburing/test/cq-peek-batch.c +102 -0
  126. data/vendor/liburing/test/cq-ready.c +94 -0
  127. data/vendor/liburing/test/cq-size.c +64 -0
  128. data/vendor/liburing/test/d4ae271dfaae-test.c +96 -0
  129. data/vendor/liburing/test/d77a67ed5f27-test.c +65 -0
  130. data/vendor/liburing/test/defer.c +307 -0
  131. data/vendor/liburing/test/double-poll-crash.c +185 -0
  132. data/vendor/liburing/test/drop-submit.c +92 -0
  133. data/vendor/liburing/test/eeed8b54e0df-test.c +114 -0
  134. data/vendor/liburing/test/empty-eownerdead.c +45 -0
  135. data/vendor/liburing/test/eventfd-disable.c +151 -0
  136. data/vendor/liburing/test/eventfd-reg.c +76 -0
  137. data/vendor/liburing/test/eventfd-ring.c +97 -0
  138. data/vendor/liburing/test/eventfd.c +112 -0
  139. data/vendor/liburing/test/exec-target.c +6 -0
  140. data/vendor/liburing/test/exit-no-cleanup.c +117 -0
  141. data/vendor/liburing/test/fadvise.c +202 -0
  142. data/vendor/liburing/test/fallocate.c +249 -0
  143. data/vendor/liburing/test/fc2a85cb02ef-test.c +131 -0
  144. data/vendor/liburing/test/file-register.c +858 -0
  145. data/vendor/liburing/test/file-update.c +173 -0
  146. data/vendor/liburing/test/file-verify.c +629 -0
  147. data/vendor/liburing/test/files-exit-hang-poll.c +128 -0
  148. data/vendor/liburing/test/files-exit-hang-timeout.c +134 -0
  149. data/vendor/liburing/test/fixed-link.c +90 -0
  150. data/vendor/liburing/test/fpos.c +252 -0
  151. data/vendor/liburing/test/fsync.c +224 -0
  152. data/vendor/liburing/test/hardlink.c +136 -0
  153. data/vendor/liburing/test/helpers.c +135 -0
  154. data/vendor/liburing/test/helpers.h +67 -0
  155. data/vendor/liburing/test/io-cancel.c +550 -0
  156. data/vendor/liburing/test/io_uring_enter.c +296 -0
  157. data/vendor/liburing/test/io_uring_register.c +676 -0
  158. data/vendor/liburing/test/io_uring_setup.c +192 -0
  159. data/vendor/liburing/test/iopoll.c +372 -0
  160. data/vendor/liburing/test/lfs-openat-write.c +119 -0
  161. data/vendor/liburing/test/lfs-openat.c +275 -0
  162. data/vendor/liburing/test/link-timeout.c +1107 -0
  163. data/vendor/liburing/test/link.c +496 -0
  164. data/vendor/liburing/test/link_drain.c +229 -0
  165. data/vendor/liburing/test/madvise.c +195 -0
  166. data/vendor/liburing/test/mkdir.c +108 -0
  167. data/vendor/liburing/test/msg-ring.c +234 -0
  168. data/vendor/liburing/test/multicqes_drain.c +387 -0
  169. data/vendor/liburing/test/nop-all-sizes.c +99 -0
  170. data/vendor/liburing/test/nop.c +115 -0
  171. data/vendor/liburing/test/open-close.c +261 -0
  172. data/vendor/liburing/test/openat2.c +308 -0
  173. data/vendor/liburing/test/personality.c +204 -0
  174. data/vendor/liburing/test/pipe-eof.c +83 -0
  175. data/vendor/liburing/test/pipe-reuse.c +105 -0
  176. data/vendor/liburing/test/poll-cancel-ton.c +135 -0
  177. data/vendor/liburing/test/poll-cancel.c +228 -0
  178. data/vendor/liburing/test/poll-link.c +230 -0
  179. data/vendor/liburing/test/poll-many.c +208 -0
  180. data/vendor/liburing/test/poll-mshot-update.c +273 -0
  181. data/vendor/liburing/test/poll-ring.c +48 -0
  182. data/vendor/liburing/test/poll-v-poll.c +353 -0
  183. data/vendor/liburing/test/poll.c +109 -0
  184. data/vendor/liburing/test/pollfree.c +426 -0
  185. data/vendor/liburing/test/probe.c +135 -0
  186. data/vendor/liburing/test/read-write.c +876 -0
  187. data/vendor/liburing/test/register-restrictions.c +633 -0
  188. data/vendor/liburing/test/rename.c +135 -0
  189. data/vendor/liburing/test/ring-leak.c +173 -0
  190. data/vendor/liburing/test/ring-leak2.c +249 -0
  191. data/vendor/liburing/test/rsrc_tags.c +449 -0
  192. data/vendor/liburing/test/runtests-loop.sh +16 -0
  193. data/vendor/liburing/test/runtests.sh +170 -0
  194. data/vendor/liburing/test/rw_merge_test.c +97 -0
  195. data/vendor/liburing/test/self.c +91 -0
  196. data/vendor/liburing/test/send_recv.c +286 -0
  197. data/vendor/liburing/test/send_recvmsg.c +345 -0
  198. data/vendor/liburing/test/sendmsg_fs_cve.c +200 -0
  199. data/vendor/liburing/test/shared-wq.c +84 -0
  200. data/vendor/liburing/test/short-read.c +75 -0
  201. data/vendor/liburing/test/shutdown.c +165 -0
  202. data/vendor/liburing/test/sigfd-deadlock.c +74 -0
  203. data/vendor/liburing/test/skip-cqe.c +429 -0
  204. data/vendor/liburing/test/socket-rw-eagain.c +158 -0
  205. data/vendor/liburing/test/socket-rw-offset.c +157 -0
  206. data/vendor/liburing/test/socket-rw.c +145 -0
  207. data/vendor/liburing/test/splice.c +512 -0
  208. data/vendor/liburing/test/sq-full-cpp.cc +45 -0
  209. data/vendor/liburing/test/sq-full.c +45 -0
  210. data/vendor/liburing/test/sq-poll-dup.c +204 -0
  211. data/vendor/liburing/test/sq-poll-kthread.c +169 -0
  212. data/vendor/liburing/test/sq-poll-share.c +137 -0
  213. data/vendor/liburing/test/sq-space_left.c +159 -0
  214. data/vendor/liburing/test/sqpoll-cancel-hang.c +157 -0
  215. data/vendor/liburing/test/sqpoll-disable-exit.c +196 -0
  216. data/vendor/liburing/test/sqpoll-exit-hang.c +78 -0
  217. data/vendor/liburing/test/sqpoll-sleep.c +69 -0
  218. data/vendor/liburing/test/statx.c +172 -0
  219. data/vendor/liburing/test/stdout.c +232 -0
  220. data/vendor/liburing/test/submit-link-fail.c +154 -0
  221. data/vendor/liburing/test/submit-reuse.c +239 -0
  222. data/vendor/liburing/test/symlink.c +116 -0
  223. data/vendor/liburing/test/teardowns.c +58 -0
  224. data/vendor/liburing/test/thread-exit.c +143 -0
  225. data/vendor/liburing/test/timeout-new.c +252 -0
  226. data/vendor/liburing/test/timeout-overflow.c +204 -0
  227. data/vendor/liburing/test/timeout.c +1523 -0
  228. data/vendor/liburing/test/unlink.c +112 -0
  229. data/vendor/liburing/test/wakeup-hang.c +162 -0
  230. metadata +223 -2
@@ -0,0 +1,754 @@
1
+ .\" Copyright (C) 2020 Shuveb Hussain <shuveb@gmail.com>
2
+ .\" SPDX-License-Identifier: LGPL-2.0-or-later
3
+ .\"
4
+
5
+ .TH IO_URING 7 2020-07-26 "Linux" "Linux Programmer's Manual"
6
+ .SH NAME
7
+ io_uring \- Asynchronous I/O facility
8
+ .SH SYNOPSIS
9
+ .nf
10
+ .B "#include <linux/io_uring.h>"
11
+ .fi
12
+ .PP
13
+ .SH DESCRIPTION
14
+ .PP
15
+ .B io_uring
16
+ is a Linux-specific API for asynchronous I/O.
17
+ It allows the user to submit one or more I/O requests,
18
+ which are processed asynchronously without blocking the calling process.
19
+ .B io_uring
20
+ gets its name from ring buffers which are shared between user space and
21
+ kernel space. This arrangement allows for efficient I/O,
22
+ while avoiding the overhead of copying buffers between them,
23
+ where possible.
24
+ This interface makes
25
+ .B io_uring
26
+ different from other UNIX I/O APIs,
27
+ wherein,
28
+ rather than just communicate between kernel and user space with system calls,
29
+ ring buffers are used as the main mode of communication.
30
+ This arrangement has various performance benefits which are discussed in a
31
+ separate section below.
32
+ This man page uses the terms shared buffers, shared ring buffers and
33
+ queues interchangeably.
34
+ .PP
35
+ The general programming model you need to follow for
36
+ .B io_uring
37
+ is outlined below
38
+ .IP \(bu
39
+ Set up shared buffers with
40
+ .BR io_uring_setup (2)
41
+ and
42
+ .BR mmap (2),
43
+ mapping into user space shared buffers for the submission queue (SQ) and the
44
+ completion queue (CQ).
45
+ You place I/O requests you want to make on the SQ,
46
+ while the kernel places the results of those operations on the CQ.
47
+ .IP \(bu
48
+ For every I/O request you need to make (like to read a file, write a file,
49
+ accept a socket connection, etc), you create a submission queue entry,
50
+ or SQE,
51
+ describe the I/O operation you need to get done and add it to the tail of
52
+ the submission queue (SQ).
53
+ Each I/O operation is,
54
+ in essence,
55
+ the equivalent of a system call you would have made otherwise,
56
+ if you were not using
57
+ .BR io_uring .
58
+ You can add more than one SQE to the queue depending on the number of
59
+ operations you want to request.
60
+ .IP \(bu
61
+ After you add one or more SQEs,
62
+ you need to call
63
+ .BR io_uring_enter (2)
64
+ to tell the kernel to dequeue your I/O requests off the SQ and begin
65
+ processing them.
66
+ .IP \(bu
67
+ For each SQE you submit,
68
+ once it is done processing the request,
69
+ the kernel places a completion queue event or CQE at the tail of the
70
+ completion queue or CQ.
71
+ The kernel places exactly one matching CQE in the CQ for every SQE you
72
+ submit on the SQ.
73
+ After you retrieve a CQE,
74
+ minimally,
75
+ you might be interested in checking the
76
+ .I res
77
+ field of the CQE structure,
78
+ which corresponds to the return value of the system
79
+ call's equivalent,
80
+ had you used it directly without using
81
+ .BR io_uring .
82
+ For instance,
83
+ a read operation under
84
+ .BR io_uring ,
85
+ started with the
86
+ .BR IORING_OP_READ
87
+ operation, issues the equivalent of the
88
+ .BR read (2)
89
+ system call. In practice, it mixes the semantics of
90
+ .BR pread (2)
91
+ and
92
+ .BR preadv2 (2)
93
+ in that it takes an explicit offset, and supports using -1 for the offset to
94
+ indicate that the current file position should be used instead of passing in
95
+ an explicit offset. See the opcode documentation for more details. Given that
96
+ io_uring is an async interface,
97
+ .I errno
98
+ is never used for passing back error information. Instead,
99
+ .I res
100
+ will contain what the equivalent system call would have returned in case
101
+ of success, and in case of error
102
+ .I res
103
+ will contain
104
+ .I -errno .
105
+ For example, if the normal read system call would have returned -1 and set
106
+ .I errno
107
+ to
108
+ .B EINVAL ,
109
+ then
110
+ .I res
111
+ would contain
112
+ .B -EINVAL .
113
+ If the normal system call would have returned a read size of 1024, then
114
+ .I res
115
+ would contain 1024.
116
+ .IP \(bu
117
+ Optionally,
118
+ .BR io_uring_enter (2)
119
+ can also wait for a specified number of requests to be processed by the kernel
120
+ before it returns.
121
+ If you specified a certain number of completions to wait for,
122
+ the kernel would have placed at least that many CQEs on the CQ,
123
+ which you can then readily read,
124
+ right after the return from
125
+ .BR io_uring_enter (2).
126
+ .IP \(bu
127
+ It is important to remember that I/O requests submitted to the kernel can
128
+ complete in any order.
129
+ It is not necessary for the kernel to process one request after another,
130
+ in the order you placed them.
131
+ Given that the interface is a ring,
132
+ the requests are attempted in order,
133
+ however that doesn't imply any sort of ordering on their completion.
134
+ When more than one request is in flight,
135
+ it is not possible to determine which one will complete first.
136
+ When you dequeue CQEs off the CQ,
137
+ you should always check which submitted request it corresponds to.
138
+ The most common method for doing so is utilizing the
139
+ .I user_data
140
+ field in the request, which is passed back on the completion side.
141
+ .PP
142
+ Adding to and reading from the queues:
143
+ .IP \(bu
144
+ You add SQEs to the tail of the SQ.
145
+ The kernel reads SQEs off the head of the queue.
146
+ .IP \(bu
147
+ The kernel adds CQEs to the tail of the CQ.
148
+ You read CQEs off the head of the queue.
149
+ .SS Submission queue polling
150
+ One of the goals of
151
+ .B io_uring
152
+ is to provide a means for efficient I/O.
153
+ To this end,
154
+ .B io_uring
155
+ supports a polling mode that lets you avoid the call to
156
+ .BR io_uring_enter (2),
157
+ which you use to inform the kernel that you have queued SQEs on to the SQ.
158
+ With SQ Polling,
159
+ .B io_uring
160
+ starts a kernel thread that polls the submission queue for any I/O
161
+ requests you submit by adding SQEs.
162
+ With SQ Polling enabled,
163
+ there is no need for you to call
164
+ .BR io_uring_enter (2),
165
+ letting you avoid the overhead of system calls.
166
+ A designated kernel thread dequeues SQEs off the SQ as you add them and
167
+ dispatches them for asynchronous processing.
168
+ .SS Setting up io_uring
169
+ .PP
170
+ The main steps in setting up
171
+ .B io_uring
172
+ consist of mapping in the shared buffers with
173
+ .BR mmap (2)
174
+ calls.
175
+ In the example program included in this man page,
176
+ the function
177
+ .BR app_setup_uring ()
178
+ sets up
179
+ .B io_uring
180
+ with a QUEUE_DEPTH deep submission queue.
181
+ Pay attention to the 2
182
+ .BR mmap (2)
183
+ calls that set up the shared submission and completion queues.
184
+ If your kernel is older than version 5.4,
185
+ three
186
+ .BR mmap (2)
187
+ calls are required.
188
+ .PP
189
+ .SS Submitting I/O requests
190
+ The process of submitting a request consists of describing the I/O
191
+ operation you need to get done using an
192
+ .B io_uring_sqe
193
+ structure instance.
194
+ These details describe the equivalent system call and its parameters.
195
+ Because the range of I/O operations Linux supports are very varied and the
196
+ .B io_uring_sqe
197
+ structure needs to be able to describe them,
198
+ it has several fields,
199
+ some packed into unions for space efficiency.
200
+ Here is a simplified version of struct
201
+ .B io_uring_sqe
202
+ with some of the most often used fields:
203
+ .PP
204
+ .in +4n
205
+ .EX
206
+ struct io_uring_sqe {
207
+ __u8 opcode; /* type of operation for this sqe */
208
+ __s32 fd; /* file descriptor to do IO on */
209
+ __u64 off; /* offset into file */
210
+ __u64 addr; /* pointer to buffer or iovecs */
211
+ __u32 len; /* buffer size or number of iovecs */
212
+ __u64 user_data; /* data to be passed back at completion time */
213
+ __u8 flags; /* IOSQE_ flags */
214
+ ...
215
+ };
216
+ .EE
217
+ .in
218
+
219
+ Here is struct
220
+ .B io_uring_sqe
221
+ in full:
222
+
223
+ .in +4n
224
+ .EX
225
+ struct io_uring_sqe {
226
+ __u8 opcode; /* type of operation for this sqe */
227
+ __u8 flags; /* IOSQE_ flags */
228
+ __u16 ioprio; /* ioprio for the request */
229
+ __s32 fd; /* file descriptor to do IO on */
230
+ union {
231
+ __u64 off; /* offset into file */
232
+ __u64 addr2;
233
+ };
234
+ union {
235
+ __u64 addr; /* pointer to buffer or iovecs */
236
+ __u64 splice_off_in;
237
+ };
238
+ __u32 len; /* buffer size or number of iovecs */
239
+ union {
240
+ __kernel_rwf_t rw_flags;
241
+ __u32 fsync_flags;
242
+ __u16 poll_events; /* compatibility */
243
+ __u32 poll32_events; /* word-reversed for BE */
244
+ __u32 sync_range_flags;
245
+ __u32 msg_flags;
246
+ __u32 timeout_flags;
247
+ __u32 accept_flags;
248
+ __u32 cancel_flags;
249
+ __u32 open_flags;
250
+ __u32 statx_flags;
251
+ __u32 fadvise_advice;
252
+ __u32 splice_flags;
253
+ };
254
+ __u64 user_data; /* data to be passed back at completion time */
255
+ union {
256
+ struct {
257
+ /* pack this to avoid bogus arm OABI complaints */
258
+ union {
259
+ /* index into fixed buffers, if used */
260
+ __u16 buf_index;
261
+ /* for grouped buffer selection */
262
+ __u16 buf_group;
263
+ } __attribute__((packed));
264
+ /* personality to use, if used */
265
+ __u16 personality;
266
+ __s32 splice_fd_in;
267
+ };
268
+ __u64 __pad2[3];
269
+ };
270
+ };
271
+ .EE
272
+ .in
273
+ .PP
274
+ To submit an I/O request to
275
+ .BR io_uring ,
276
+ you need to acquire a submission queue entry (SQE) from the submission
277
+ queue (SQ),
278
+ fill it up with details of the operation you want to submit and call
279
+ .BR io_uring_enter (2).
280
+ If you want to avoid calling
281
+ .BR io_uring_enter (2),
282
+ you have the option of setting up Submission Queue Polling.
283
+ .PP
284
+ SQEs are added to the tail of the submission queue.
285
+ The kernel picks up SQEs off the head of the SQ.
286
+ The general algorithm to get the next available SQE and update the tail is
287
+ as follows.
288
+ .PP
289
+ .in +4n
290
+ .EX
291
+ struct io_uring_sqe *sqe;
292
+ unsigned tail, index;
293
+ tail = *sqring->tail;
294
+ index = tail & (*sqring->ring_mask);
295
+ sqe = &sqring->sqes[index];
296
+ /* fill up details about this I/O request */
297
+ describe_io(sqe);
298
+ /* fill the sqe index into the SQ ring array */
299
+ sqring->array[index] = index;
300
+ tail++;
301
+ atomic_store_release(sqring->tail, tail);
302
+ .EE
303
+ .in
304
+ .PP
305
+ To get the index of an entry,
306
+ the application must mask the current tail index with the size mask of the
307
+ ring.
308
+ This holds true for both SQs and CQs.
309
+ Once the SQE is acquired,
310
+ the necessary fields are filled in,
311
+ describing the request.
312
+ While the CQ ring directly indexes the shared array of CQEs,
313
+ the submission side has an indirection array between them.
314
+ The submission side ring buffer is an index into this array,
315
+ which in turn contains the index into the SQEs.
316
+ .PP
317
+ The following code snippet demonstrates how a read operation,
318
+ an equivalent of a
319
+ .BR preadv2 (2)
320
+ system call is described by filling up an SQE with the necessary
321
+ parameters.
322
+ .PP
323
+ .in +4n
324
+ .EX
325
+ struct iovec iovecs[16];
326
+ ...
327
+ sqe->opcode = IORING_OP_READV;
328
+ sqe->fd = fd;
329
+ sqe->addr = (unsigned long) iovecs;
330
+ sqe->len = 16;
331
+ sqe->off = offset;
332
+ sqe->flags = 0;
333
+ .EE
334
+ .in
335
+ .TP
336
+ .B Memory ordering
337
+ Modern compilers and CPUs freely reorder reads and writes without
338
+ affecting the program's outcome to optimize performance.
339
+ Some aspects of this need to be kept in mind on SMP systems since
340
+ .B io_uring
341
+ involves buffers shared between kernel and user space.
342
+ These buffers are both visible and modifiable from kernel and user space.
343
+ As heads and tails belonging to these shared buffers are updated by kernel
344
+ and user space,
345
+ changes need to be coherently visible on either side,
346
+ irrespective of whether a CPU switch took place after the kernel-user mode
347
+ switch happened.
348
+ We use memory barriers to enforce this coherency.
349
+ Being significantly large subjects on their own,
350
+ memory barriers are out of scope for further discussion on this man page.
351
+ .TP
352
+ .B Letting the kernel know about I/O submissions
353
+ Once you place one or more SQEs on to the SQ,
354
+ you need to let the kernel know that you've done so.
355
+ You can do this by calling the
356
+ .BR io_uring_enter (2)
357
+ system call.
358
+ This system call is also capable of waiting for a specified count of
359
+ events to complete.
360
+ This way,
361
+ you can be sure to find completion events in the completion queue without
362
+ having to poll it for events later.
363
+ .SS Reading completion events
364
+ Similar to the submission queue (SQ),
365
+ the completion queue (CQ) is a shared buffer between the kernel and user
366
+ space.
367
+ Whereas you placed submission queue entries on the tail of the SQ and the
368
+ kernel read off the head,
369
+ when it comes to the CQ,
370
+ the kernel places completion queue events or CQEs on the tail of the CQ and
371
+ you read off its head.
372
+ .PP
373
+ Submission is flexible (and thus a bit more complicated) since it needs to
374
+ be able to encode different types of system calls that take various
375
+ parameters.
376
+ Completion,
377
+ on the other hand is simpler since we're looking only for a return value
378
+ back from the kernel.
379
+ This is easily understood by looking at the completion queue event
380
+ structure,
381
+ struct
382
+ .BR io_uring_cqe :
383
+ .PP
384
+ .in +4n
385
+ .EX
386
+ struct io_uring_cqe {
387
+ __u64 user_data; /* sqe->data submission passed back */
388
+ __s32 res; /* result code for this event */
389
+ __u32 flags;
390
+ };
391
+ .EE
392
+ .in
393
+ .PP
394
+ Here,
395
+ .I user_data
396
+ is custom data that is passed unchanged from submission to completion.
397
+ That is,
398
+ from SQEs to CQEs.
399
+ This field can be used to set context,
400
+ uniquely identifying submissions that got completed.
401
+ Given that I/O requests can complete in any order,
402
+ this field can be used to correlate a submission with a completion.
403
+ .I res
404
+ is the result from the system call that was performed as part of the
405
+ submission;
406
+ its return value.
407
+ The
408
+ .I flags
409
+ field could carry request-specific metadata in the future,
410
+ but is currently unused.
411
+ .PP
412
+ The general sequence to read completion events off the completion queue is
413
+ as follows:
414
+ .PP
415
+ .in +4n
416
+ .EX
417
+ unsigned head;
418
+ head = *cqring->head;
419
+ if (head != atomic_load_acquire(cqring->tail)) {
420
+ struct io_uring_cqe *cqe;
421
+ unsigned index;
422
+ index = head & (cqring->mask);
423
+ cqe = &cqring->cqes[index];
424
+ /* process completed CQE */
425
+ process_cqe(cqe);
426
+ /* CQE consumption complete */
427
+ head++;
428
+ }
429
+ atomic_store_release(cqring->head, head);
430
+ .EE
431
+ .in
432
+ .PP
433
+ It helps to be reminded that the kernel adds CQEs to the tail of the CQ,
434
+ while you need to dequeue them off the head.
435
+ To get the index of an entry at the head,
436
+ the application must mask the current head index with the size mask of the
437
+ ring.
438
+ Once the CQE has been consumed or processed,
439
+ the head needs to be updated to reflect the consumption of the CQE.
440
+ Attention should be paid to the read and write barriers to ensure
441
+ successful read and update of the head.
442
+ .SS io_uring performance
443
+ Because of the shared ring buffers between kernel and user space,
444
+ .B io_uring
445
+ can be a zero-copy system.
446
+ Copying buffers to and from becomes necessary when system calls that
447
+ transfer data between kernel and user space are involved.
448
+ But since the bulk of the communication in
449
+ .B io_uring
450
+ is via buffers shared between the kernel and user space,
451
+ this huge performance overhead is completely avoided.
452
+ .PP
453
+ While system calls may not seem like a significant overhead,
454
+ in high performance applications,
455
+ making a lot of them will begin to matter.
456
+ While workarounds the operating system has in place to deal with Spectre
457
+ and Meltdown are ideally best done away with,
458
+ unfortunately,
459
+ some of these workarounds are around the system call interface,
460
+ making system calls not as cheap as before on affected hardware.
461
+ While newer hardware should not need these workarounds,
462
+ hardware with these vulnerabilities can be expected to be in the wild for a
463
+ long time.
464
+ While using synchronous programming interfaces or even when using
465
+ asynchronous programming interfaces under Linux,
466
+ there is at least one system call involved in the submission of each
467
+ request.
468
+ In
469
+ .BR io_uring ,
470
+ on the other hand,
471
+ you can batch several requests in one go,
472
+ simply by queueing up multiple SQEs,
473
+ each describing an I/O operation you want and make a single call to
474
+ .BR io_uring_enter (2).
475
+ This is possible due to
476
+ .BR io_uring 's
477
+ shared buffers based design.
478
+ .PP
479
+ While this batching in itself can avoid the overhead associated with
480
+ potentially multiple and frequent system calls,
481
+ you can reduce even this overhead further with Submission Queue Polling,
482
+ by having the kernel poll and pick up your SQEs for processing as you add
483
+ them to the submission queue. This avoids the
484
+ .BR io_uring_enter (2)
485
+ call you need to make to tell the kernel to pick SQEs up.
486
+ For high-performance applications,
487
+ this means even lower system call overhead.
488
+ .SH CONFORMING TO
489
+ .B io_uring
490
+ is Linux-specific.
491
+ .SH EXAMPLES
492
+ The following example uses
493
+ .B io_uring
494
+ to copy stdin to stdout.
495
+ Using shell redirection,
496
+ you should be able to copy files with this example.
497
+ Because it uses a queue depth of only one,
498
+ this example processes I/O requests one after the other.
499
+ It is purposefully kept this way to aid understanding.
500
+ In real-world scenarios however,
501
+ you'll want to have a larger queue depth to parallelize I/O request
502
+ processing so as to gain the kind of performance benefits
503
+ .B io_uring
504
+ provides with its asynchronous processing of requests.
505
+ .PP
506
+ .EX
507
+ #include <stdio.h>
508
+ #include <stdlib.h>
509
+ #include <sys/stat.h>
510
+ #include <sys/ioctl.h>
511
+ #include <sys/syscall.h>
512
+ #include <sys/mman.h>
513
+ #include <sys/uio.h>
514
+ #include <linux/fs.h>
515
+ #include <fcntl.h>
516
+ #include <unistd.h>
517
+ #include <string.h>
518
+ #include <stdatomic.h>
519
+
520
+ #include <linux/io_uring.h>
521
+
522
+ #define QUEUE_DEPTH 1
523
+ #define BLOCK_SZ 1024
524
+
525
+ /* Macros for barriers needed by io_uring */
526
+ #define io_uring_smp_store_release(p, v) \\
527
+ atomic_store_explicit((_Atomic typeof(*(p)) *)(p), (v), \\
528
+ memory_order_release)
529
+ #define io_uring_smp_load_acquire(p) \\
530
+ atomic_load_explicit((_Atomic typeof(*(p)) *)(p), \\
531
+ memory_order_acquire)
532
+
533
+ int ring_fd;
534
+ unsigned *sring_tail, *sring_mask, *sring_array,
535
+ *cring_head, *cring_tail, *cring_mask;
536
+ struct io_uring_sqe *sqes;
537
+ struct io_uring_cqe *cqes;
538
+ char buff[BLOCK_SZ];
539
+ off_t offset;
540
+
541
+ /*
542
+ * System call wrappers provided since glibc does not yet
543
+ * provide wrappers for io_uring system calls.
544
+ * */
545
+
546
+ int io_uring_setup(unsigned entries, struct io_uring_params *p)
547
+ {
548
+ return (int) syscall(__NR_io_uring_setup, entries, p);
549
+ }
550
+
551
+ int io_uring_enter(int ring_fd, unsigned int to_submit,
552
+ unsigned int min_complete, unsigned int flags)
553
+ {
554
+ return (int) syscall(__NR_io_uring_enter, ring_fd, to_submit, min_complete,
555
+ flags, NULL, 0);
556
+ }
557
+
558
+ int app_setup_uring(void) {
559
+ struct io_uring_params p;
560
+ void *sq_ptr, *cq_ptr;
561
+
562
+ /* See io_uring_setup(2) for io_uring_params.flags you can set */
563
+ memset(&p, 0, sizeof(p));
564
+ ring_fd = io_uring_setup(QUEUE_DEPTH, &p);
565
+ if (ring_fd < 0) {
566
+ perror("io_uring_setup");
567
+ return 1;
568
+ }
569
+
570
+ /*
571
+ * io_uring communication happens via 2 shared kernel-user space ring
572
+ * buffers, which can be jointly mapped with a single mmap() call in
573
+ * kernels >= 5.4.
574
+ */
575
+
576
+ int sring_sz = p.sq_off.array + p.sq_entries * sizeof(unsigned);
577
+ int cring_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);
578
+
579
+ /* Rather than check for kernel version, the recommended way is to
580
+ * check the features field of the io_uring_params structure, which is a
581
+ * bitmask. If IORING_FEAT_SINGLE_MMAP is set, we can do away with the
582
+ * second mmap() call to map in the completion ring separately.
583
+ */
584
+ if (p.features & IORING_FEAT_SINGLE_MMAP) {
585
+ if (cring_sz > sring_sz)
586
+ sring_sz = cring_sz;
587
+ cring_sz = sring_sz;
588
+ }
589
+
590
+ /* Map in the submission and completion queue ring buffers.
591
+ * Kernels < 5.4 only map in the submission queue, though.
592
+ */
593
+ sq_ptr = mmap(0, sring_sz, PROT_READ | PROT_WRITE,
594
+ MAP_SHARED | MAP_POPULATE,
595
+ ring_fd, IORING_OFF_SQ_RING);
596
+ if (sq_ptr == MAP_FAILED) {
597
+ perror("mmap");
598
+ return 1;
599
+ }
600
+
601
+ if (p.features & IORING_FEAT_SINGLE_MMAP) {
602
+ cq_ptr = sq_ptr;
603
+ } else {
604
+ /* Map in the completion queue ring buffer in older kernels separately */
605
+ cq_ptr = mmap(0, cring_sz, PROT_READ | PROT_WRITE,
606
+ MAP_SHARED | MAP_POPULATE,
607
+ ring_fd, IORING_OFF_CQ_RING);
608
+ if (cq_ptr == MAP_FAILED) {
609
+ perror("mmap");
610
+ return 1;
611
+ }
612
+ }
613
+ /* Save useful fields for later easy reference */
614
+ sring_tail = sq_ptr + p.sq_off.tail;
615
+ sring_mask = sq_ptr + p.sq_off.ring_mask;
616
+ sring_array = sq_ptr + p.sq_off.array;
617
+
618
+ /* Map in the submission queue entries array */
619
+ sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe),
620
+ PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
621
+ ring_fd, IORING_OFF_SQES);
622
+ if (sqes == MAP_FAILED) {
623
+ perror("mmap");
624
+ return 1;
625
+ }
626
+
627
+ /* Save useful fields for later easy reference */
628
+ cring_head = cq_ptr + p.cq_off.head;
629
+ cring_tail = cq_ptr + p.cq_off.tail;
630
+ cring_mask = cq_ptr + p.cq_off.ring_mask;
631
+ cqes = cq_ptr + p.cq_off.cqes;
632
+
633
+ return 0;
634
+ }
635
+
636
+ /*
637
+ * Read from completion queue.
638
+ * In this function, we read completion events from the completion queue.
639
+ * We dequeue the CQE, update the head and return the result of the operation.
640
+ * */
641
+
642
+ int read_from_cq() {
643
+ struct io_uring_cqe *cqe;
644
+ unsigned head;
645
+
646
+ /* Read barrier */
647
+ head = io_uring_smp_load_acquire(cring_head);
648
+ /*
649
+ * Remember, this is a ring buffer. If head == tail, it means that the
650
+ * buffer is empty.
651
+ * */
652
+ if (head == *cring_tail)
653
+ return -1;
654
+
655
+ /* Get the entry */
656
+ cqe = &cqes[head & (*cring_mask)];
657
+ if (cqe->res < 0)
658
+ fprintf(stderr, "Error: %s\\n", strerror(abs(cqe->res)));
659
+
660
+ head++;
661
+
662
+ /* Write barrier so that updates to the head are made visible */
663
+ io_uring_smp_store_release(cring_head, head);
664
+
665
+ return cqe->res;
666
+ }
667
+
668
+ /*
669
+ * Submit a read or a write request to the submission queue.
670
+ * */
671
+
672
+ int submit_to_sq(int fd, int op) {
673
+ unsigned index, tail;
674
+
675
+ /* Add our submission queue entry to the tail of the SQE ring buffer */
676
+ tail = *sring_tail;
677
+ index = tail & *sring_mask;
678
+ struct io_uring_sqe *sqe = &sqes[index];
679
+ /* Fill in the parameters required for the read or write operation */
680
+ sqe->opcode = op;
681
+ sqe->fd = fd;
682
+ sqe->addr = (unsigned long) buff;
683
+ if (op == IORING_OP_READ) {
684
+ memset(buff, 0, sizeof(buff));
685
+ sqe->len = BLOCK_SZ;
686
+ }
687
+ else {
688
+ sqe->len = strlen(buff);
689
+ }
690
+ sqe->off = offset;
691
+
692
+ sring_array[index] = index;
693
+ tail++;
694
+
695
+ /* Update the tail */
696
+ io_uring_smp_store_release(sring_tail, tail);
697
+
698
+ /*
699
+ * Tell the kernel we have submitted events with the io_uring_enter() system
700
+ * call. We also pass in the IORING_ENTER_GETEVENTS flag which causes the
701
+ * io_uring_enter() call to wait until min_complete (the 3rd param) events
702
+ * complete.
703
+ * */
704
+ int ret = io_uring_enter(ring_fd, 1,1,
705
+ IORING_ENTER_GETEVENTS);
706
+ if(ret < 0) {
707
+ perror("io_uring_enter");
708
+ return -1;
709
+ }
710
+
711
+ return ret;
712
+ }
713
+
714
+ int main(int argc, char *argv[]) {
715
+ int res;
716
+
717
+ /* Setup io_uring for use */
718
+ if(app_setup_uring()) {
719
+ fprintf(stderr, "Unable to setup uring!\\n");
720
+ return 1;
721
+ }
722
+
723
+ /*
724
+ * A while loop that reads from stdin and writes to stdout.
725
+ * Breaks on EOF.
726
+ */
727
+ while (1) {
728
+ /* Initiate read from stdin and wait for it to complete */
729
+ submit_to_sq(STDIN_FILENO, IORING_OP_READ);
730
+ /* Read completion queue entry */
731
+ res = read_from_cq();
732
+ if (res > 0) {
733
+ /* Read successful. Write to stdout. */
734
+ submit_to_sq(STDOUT_FILENO, IORING_OP_WRITE);
735
+ read_from_cq();
736
+ } else if (res == 0) {
737
+ /* reached EOF */
738
+ break;
739
+ }
740
+ else if (res < 0) {
741
+ /* Error reading file */
742
+ fprintf(stderr, "Error: %s\\n", strerror(abs(res)));
743
+ break;
744
+ }
745
+ offset += res;
746
+ }
747
+
748
+ return 0;
749
+ }
750
+ .EE
751
+ .SH SEE ALSO
752
+ .BR io_uring_enter (2)
753
+ .BR io_uring_register (2)
754
+ .BR io_uring_setup (2)