polyphony 0.84 → 0.86

Sign up to get free protection for your applications and to get access to all the features.
Files changed (241) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +14 -0
  3. data/Gemfile.lock +1 -1
  4. data/Rakefile +1 -1
  5. data/examples/core/multi_suspend.rb +39 -0
  6. data/examples/core/shutdown_all_children.rb +41 -0
  7. data/examples/io/gzip.rb +8 -0
  8. data/examples/io/splice_echo_server.rb +15 -0
  9. data/ext/polyphony/backend_io_uring.c +57 -31
  10. data/ext/polyphony/io_extensions.c +137 -26
  11. data/lib/polyphony/extensions/fiber.rb +3 -1
  12. data/lib/polyphony/extensions/io.rb +4 -0
  13. data/lib/polyphony/extensions/pipe.rb +4 -0
  14. data/lib/polyphony/extensions/socket.rb +4 -0
  15. data/lib/polyphony/version.rb +1 -1
  16. data/polyphony.gemspec +1 -1
  17. data/test/test_backend.rb +1 -1
  18. data/test/test_fiber.rb +5 -2
  19. data/test/test_signal.rb +3 -3
  20. data/vendor/liburing/.github/pull_request_template.md +86 -0
  21. data/vendor/liburing/.github/workflows/build.yml +85 -0
  22. data/vendor/liburing/.github/workflows/shellcheck.yml +20 -0
  23. data/vendor/liburing/.gitignore +149 -0
  24. data/vendor/liburing/COPYING +502 -0
  25. data/vendor/liburing/COPYING.GPL +339 -0
  26. data/vendor/liburing/LICENSE +7 -0
  27. data/vendor/liburing/Makefile +82 -0
  28. data/vendor/liburing/Makefile.common +5 -0
  29. data/vendor/liburing/Makefile.quiet +11 -0
  30. data/vendor/liburing/README +46 -0
  31. data/vendor/liburing/configure +486 -0
  32. data/vendor/liburing/debian/README.Debian +7 -0
  33. data/vendor/liburing/debian/changelog +27 -0
  34. data/vendor/liburing/debian/compat +1 -0
  35. data/vendor/liburing/debian/control +48 -0
  36. data/vendor/liburing/debian/copyright +49 -0
  37. data/vendor/liburing/debian/liburing-dev.install +4 -0
  38. data/vendor/liburing/debian/liburing-dev.manpages +6 -0
  39. data/vendor/liburing/debian/liburing1-udeb.install +1 -0
  40. data/vendor/liburing/debian/liburing1.install +1 -0
  41. data/vendor/liburing/debian/liburing1.symbols +32 -0
  42. data/vendor/liburing/debian/patches/series +1 -0
  43. data/vendor/liburing/debian/rules +81 -0
  44. data/vendor/liburing/debian/source/format +1 -0
  45. data/vendor/liburing/debian/source/local-options +2 -0
  46. data/vendor/liburing/debian/source/options +1 -0
  47. data/vendor/liburing/debian/watch +3 -0
  48. data/vendor/liburing/examples/Makefile +38 -0
  49. data/vendor/liburing/examples/io_uring-cp.c +282 -0
  50. data/vendor/liburing/examples/io_uring-test.c +112 -0
  51. data/vendor/liburing/examples/link-cp.c +193 -0
  52. data/vendor/liburing/examples/ucontext-cp.c +273 -0
  53. data/vendor/liburing/liburing.pc.in +12 -0
  54. data/vendor/liburing/liburing.spec +66 -0
  55. data/vendor/liburing/make-debs.sh +53 -0
  56. data/vendor/liburing/man/io_uring.7 +754 -0
  57. data/vendor/liburing/man/io_uring_cq_advance.3 +35 -0
  58. data/vendor/liburing/man/io_uring_cq_ready.3 +25 -0
  59. data/vendor/liburing/man/io_uring_cqe_get_data.3 +34 -0
  60. data/vendor/liburing/man/io_uring_cqe_seen.3 +32 -0
  61. data/vendor/liburing/man/io_uring_enter.2 +1483 -0
  62. data/vendor/liburing/man/io_uring_free_probe.3 +24 -0
  63. data/vendor/liburing/man/io_uring_get_probe.3 +29 -0
  64. data/vendor/liburing/man/io_uring_get_sqe.3 +38 -0
  65. data/vendor/liburing/man/io_uring_opcode_supported.3 +29 -0
  66. data/vendor/liburing/man/io_uring_prep_msg_ring.3 +58 -0
  67. data/vendor/liburing/man/io_uring_prep_read.3 +50 -0
  68. data/vendor/liburing/man/io_uring_prep_read_fixed.3 +54 -0
  69. data/vendor/liburing/man/io_uring_prep_readv.3 +51 -0
  70. data/vendor/liburing/man/io_uring_prep_readv2.3 +79 -0
  71. data/vendor/liburing/man/io_uring_prep_write.3 +50 -0
  72. data/vendor/liburing/man/io_uring_prep_write_fixed.3 +54 -0
  73. data/vendor/liburing/man/io_uring_prep_writev.3 +51 -0
  74. data/vendor/liburing/man/io_uring_prep_writev2.3 +78 -0
  75. data/vendor/liburing/man/io_uring_queue_exit.3 +27 -0
  76. data/vendor/liburing/man/io_uring_queue_init.3 +44 -0
  77. data/vendor/liburing/man/io_uring_register.2 +688 -0
  78. data/vendor/liburing/man/io_uring_register_buffers.3 +41 -0
  79. data/vendor/liburing/man/io_uring_register_files.3 +35 -0
  80. data/vendor/liburing/man/io_uring_setup.2 +534 -0
  81. data/vendor/liburing/man/io_uring_sq_ready.3 +25 -0
  82. data/vendor/liburing/man/io_uring_sq_space_left.3 +25 -0
  83. data/vendor/liburing/man/io_uring_sqe_set_data.3 +30 -0
  84. data/vendor/liburing/man/io_uring_sqe_set_flags.3 +60 -0
  85. data/vendor/liburing/man/io_uring_sqring_wait.3 +30 -0
  86. data/vendor/liburing/man/io_uring_submit.3 +29 -0
  87. data/vendor/liburing/man/io_uring_submit_and_wait.3 +34 -0
  88. data/vendor/liburing/man/io_uring_submit_and_wait_timeout.3 +49 -0
  89. data/vendor/liburing/man/io_uring_unregister_buffers.3 +26 -0
  90. data/vendor/liburing/man/io_uring_unregister_files.3 +26 -0
  91. data/vendor/liburing/man/io_uring_wait_cqe.3 +33 -0
  92. data/vendor/liburing/man/io_uring_wait_cqe_nr.3 +36 -0
  93. data/vendor/liburing/man/io_uring_wait_cqe_timeout.3 +39 -0
  94. data/vendor/liburing/man/io_uring_wait_cqes.3 +46 -0
  95. data/vendor/liburing/src/Makefile +89 -0
  96. data/vendor/liburing/src/arch/aarch64/syscall.h +95 -0
  97. data/vendor/liburing/src/arch/generic/lib.h +21 -0
  98. data/vendor/liburing/src/arch/generic/syscall.h +87 -0
  99. data/vendor/liburing/src/arch/syscall-defs.h +67 -0
  100. data/vendor/liburing/src/arch/x86/lib.h +32 -0
  101. data/vendor/liburing/src/arch/x86/syscall.h +160 -0
  102. data/vendor/liburing/src/include/liburing/barrier.h +81 -0
  103. data/vendor/liburing/src/include/liburing/io_uring.h +442 -0
  104. data/vendor/liburing/src/include/liburing.h +921 -0
  105. data/vendor/liburing/src/int_flags.h +8 -0
  106. data/vendor/liburing/src/lib.h +57 -0
  107. data/vendor/liburing/src/liburing.map +53 -0
  108. data/vendor/liburing/src/nolibc.c +48 -0
  109. data/vendor/liburing/src/queue.c +403 -0
  110. data/vendor/liburing/src/register.c +293 -0
  111. data/vendor/liburing/src/setup.c +332 -0
  112. data/vendor/liburing/src/syscall.c +47 -0
  113. data/vendor/liburing/src/syscall.h +103 -0
  114. data/vendor/liburing/test/232c93d07b74-test.c +306 -0
  115. data/vendor/liburing/test/35fa71a030ca-test.c +329 -0
  116. data/vendor/liburing/test/500f9fbadef8-test.c +89 -0
  117. data/vendor/liburing/test/7ad0e4b2f83c-test.c +93 -0
  118. data/vendor/liburing/test/8a9973408177-test.c +106 -0
  119. data/vendor/liburing/test/917257daa0fe-test.c +53 -0
  120. data/vendor/liburing/test/Makefile +244 -0
  121. data/vendor/liburing/test/a0908ae19763-test.c +58 -0
  122. data/vendor/liburing/test/a4c0b3decb33-test.c +180 -0
  123. data/vendor/liburing/test/accept-link.c +254 -0
  124. data/vendor/liburing/test/accept-reuse.c +164 -0
  125. data/vendor/liburing/test/accept-test.c +79 -0
  126. data/vendor/liburing/test/accept.c +477 -0
  127. data/vendor/liburing/test/across-fork.c +283 -0
  128. data/vendor/liburing/test/b19062a56726-test.c +53 -0
  129. data/vendor/liburing/test/b5837bd5311d-test.c +77 -0
  130. data/vendor/liburing/test/ce593a6c480a-test.c +136 -0
  131. data/vendor/liburing/test/close-opath.c +122 -0
  132. data/vendor/liburing/test/config +10 -0
  133. data/vendor/liburing/test/connect.c +398 -0
  134. data/vendor/liburing/test/cq-full.c +96 -0
  135. data/vendor/liburing/test/cq-overflow.c +294 -0
  136. data/vendor/liburing/test/cq-peek-batch.c +102 -0
  137. data/vendor/liburing/test/cq-ready.c +94 -0
  138. data/vendor/liburing/test/cq-size.c +64 -0
  139. data/vendor/liburing/test/d4ae271dfaae-test.c +96 -0
  140. data/vendor/liburing/test/d77a67ed5f27-test.c +65 -0
  141. data/vendor/liburing/test/defer.c +307 -0
  142. data/vendor/liburing/test/double-poll-crash.c +185 -0
  143. data/vendor/liburing/test/drop-submit.c +92 -0
  144. data/vendor/liburing/test/eeed8b54e0df-test.c +114 -0
  145. data/vendor/liburing/test/empty-eownerdead.c +45 -0
  146. data/vendor/liburing/test/eventfd-disable.c +151 -0
  147. data/vendor/liburing/test/eventfd-reg.c +76 -0
  148. data/vendor/liburing/test/eventfd-ring.c +97 -0
  149. data/vendor/liburing/test/eventfd.c +112 -0
  150. data/vendor/liburing/test/exec-target.c +6 -0
  151. data/vendor/liburing/test/exit-no-cleanup.c +117 -0
  152. data/vendor/liburing/test/fadvise.c +202 -0
  153. data/vendor/liburing/test/fallocate.c +249 -0
  154. data/vendor/liburing/test/fc2a85cb02ef-test.c +131 -0
  155. data/vendor/liburing/test/file-register.c +858 -0
  156. data/vendor/liburing/test/file-update.c +173 -0
  157. data/vendor/liburing/test/file-verify.c +629 -0
  158. data/vendor/liburing/test/files-exit-hang-poll.c +128 -0
  159. data/vendor/liburing/test/files-exit-hang-timeout.c +134 -0
  160. data/vendor/liburing/test/fixed-link.c +90 -0
  161. data/vendor/liburing/test/fpos.c +252 -0
  162. data/vendor/liburing/test/fsync.c +224 -0
  163. data/vendor/liburing/test/hardlink.c +136 -0
  164. data/vendor/liburing/test/helpers.c +135 -0
  165. data/vendor/liburing/test/helpers.h +67 -0
  166. data/vendor/liburing/test/io-cancel.c +550 -0
  167. data/vendor/liburing/test/io_uring_enter.c +296 -0
  168. data/vendor/liburing/test/io_uring_register.c +676 -0
  169. data/vendor/liburing/test/io_uring_setup.c +192 -0
  170. data/vendor/liburing/test/iopoll.c +372 -0
  171. data/vendor/liburing/test/lfs-openat-write.c +119 -0
  172. data/vendor/liburing/test/lfs-openat.c +275 -0
  173. data/vendor/liburing/test/link-timeout.c +1107 -0
  174. data/vendor/liburing/test/link.c +496 -0
  175. data/vendor/liburing/test/link_drain.c +229 -0
  176. data/vendor/liburing/test/madvise.c +195 -0
  177. data/vendor/liburing/test/mkdir.c +108 -0
  178. data/vendor/liburing/test/msg-ring.c +234 -0
  179. data/vendor/liburing/test/multicqes_drain.c +387 -0
  180. data/vendor/liburing/test/nop-all-sizes.c +99 -0
  181. data/vendor/liburing/test/nop.c +115 -0
  182. data/vendor/liburing/test/open-close.c +261 -0
  183. data/vendor/liburing/test/openat2.c +308 -0
  184. data/vendor/liburing/test/personality.c +204 -0
  185. data/vendor/liburing/test/pipe-eof.c +83 -0
  186. data/vendor/liburing/test/pipe-reuse.c +105 -0
  187. data/vendor/liburing/test/poll-cancel-ton.c +135 -0
  188. data/vendor/liburing/test/poll-cancel.c +228 -0
  189. data/vendor/liburing/test/poll-link.c +230 -0
  190. data/vendor/liburing/test/poll-many.c +208 -0
  191. data/vendor/liburing/test/poll-mshot-update.c +273 -0
  192. data/vendor/liburing/test/poll-ring.c +48 -0
  193. data/vendor/liburing/test/poll-v-poll.c +353 -0
  194. data/vendor/liburing/test/poll.c +109 -0
  195. data/vendor/liburing/test/pollfree.c +426 -0
  196. data/vendor/liburing/test/probe.c +135 -0
  197. data/vendor/liburing/test/read-write.c +876 -0
  198. data/vendor/liburing/test/register-restrictions.c +633 -0
  199. data/vendor/liburing/test/rename.c +135 -0
  200. data/vendor/liburing/test/ring-leak.c +173 -0
  201. data/vendor/liburing/test/ring-leak2.c +249 -0
  202. data/vendor/liburing/test/rsrc_tags.c +449 -0
  203. data/vendor/liburing/test/runtests-loop.sh +16 -0
  204. data/vendor/liburing/test/runtests.sh +170 -0
  205. data/vendor/liburing/test/rw_merge_test.c +97 -0
  206. data/vendor/liburing/test/self.c +91 -0
  207. data/vendor/liburing/test/send_recv.c +286 -0
  208. data/vendor/liburing/test/send_recvmsg.c +345 -0
  209. data/vendor/liburing/test/sendmsg_fs_cve.c +200 -0
  210. data/vendor/liburing/test/shared-wq.c +84 -0
  211. data/vendor/liburing/test/short-read.c +75 -0
  212. data/vendor/liburing/test/shutdown.c +165 -0
  213. data/vendor/liburing/test/sigfd-deadlock.c +74 -0
  214. data/vendor/liburing/test/skip-cqe.c +429 -0
  215. data/vendor/liburing/test/socket-rw-eagain.c +158 -0
  216. data/vendor/liburing/test/socket-rw-offset.c +157 -0
  217. data/vendor/liburing/test/socket-rw.c +145 -0
  218. data/vendor/liburing/test/splice.c +512 -0
  219. data/vendor/liburing/test/sq-full-cpp.cc +45 -0
  220. data/vendor/liburing/test/sq-full.c +45 -0
  221. data/vendor/liburing/test/sq-poll-dup.c +204 -0
  222. data/vendor/liburing/test/sq-poll-kthread.c +169 -0
  223. data/vendor/liburing/test/sq-poll-share.c +137 -0
  224. data/vendor/liburing/test/sq-space_left.c +159 -0
  225. data/vendor/liburing/test/sqpoll-cancel-hang.c +157 -0
  226. data/vendor/liburing/test/sqpoll-disable-exit.c +196 -0
  227. data/vendor/liburing/test/sqpoll-exit-hang.c +78 -0
  228. data/vendor/liburing/test/sqpoll-sleep.c +69 -0
  229. data/vendor/liburing/test/statx.c +172 -0
  230. data/vendor/liburing/test/stdout.c +232 -0
  231. data/vendor/liburing/test/submit-link-fail.c +154 -0
  232. data/vendor/liburing/test/submit-reuse.c +239 -0
  233. data/vendor/liburing/test/symlink.c +116 -0
  234. data/vendor/liburing/test/teardowns.c +58 -0
  235. data/vendor/liburing/test/thread-exit.c +143 -0
  236. data/vendor/liburing/test/timeout-new.c +252 -0
  237. data/vendor/liburing/test/timeout-overflow.c +204 -0
  238. data/vendor/liburing/test/timeout.c +1523 -0
  239. data/vendor/liburing/test/unlink.c +112 -0
  240. data/vendor/liburing/test/wakeup-hang.c +162 -0
  241. metadata +227 -2
@@ -0,0 +1,1483 @@
1
+ .\" Copyright (C) 2019 Jens Axboe <axboe@kernel.dk>
2
+ .\" Copyright (C) 2019 Red Hat, Inc.
3
+ .\"
4
+ .\" SPDX-License-Identifier: LGPL-2.0-or-later
5
+ .\"
6
+ .TH IO_URING_ENTER 2 2019-01-22 "Linux" "Linux Programmer's Manual"
7
+ .SH NAME
8
+ io_uring_enter \- initiate and/or complete asynchronous I/O
9
+ .SH SYNOPSIS
10
+ .nf
11
+ .BR "#include <linux/io_uring.h>"
12
+ .PP
13
+ .BI "int io_uring_enter(unsigned int " fd ", unsigned int " to_submit ,
14
+ .BI " unsigned int " min_complete ", unsigned int " flags ,
15
+ .BI " sigset_t *" sig );
16
+ .fi
17
+ .PP
18
+ .SH DESCRIPTION
19
+ .PP
20
+ .BR io_uring_enter ()
21
+ is used to initiate and complete I/O using the shared submission and
22
+ completion queues setup by a call to
23
+ .BR io_uring_setup (2).
24
+ A single call can both submit new I/O and wait for completions of I/O
25
+ initiated by this call or previous calls to
26
+ .BR io_uring_enter ().
27
+
28
+ .I fd
29
+ is the file descriptor returned by
30
+ .BR io_uring_setup (2).
31
+ .I to_submit
32
+ specifies the number of I/Os to submit from the submission queue.
33
+ .I flags
34
+ is a bitmask of the following values:
35
+ .TP
36
+ .B IORING_ENTER_GETEVENTS
37
+ If this flag is set, then the system call will wait for the specified
38
+ number of events in
39
+ .I min_complete
40
+ before returning. This flag can be set along with
41
+ .I to_submit
42
+ to both submit and complete events in a single system call.
43
+ .TP
44
+ .B IORING_ENTER_SQ_WAKEUP
45
+ If the ring has been created with
46
+ .B IORING_SETUP_SQPOLL,
47
+ then this flag asks the kernel to wakeup the SQ kernel thread to submit IO.
48
+ .TP
49
+ .B IORING_ENTER_SQ_WAIT
50
+ If the ring has been created with
51
+ .B IORING_SETUP_SQPOLL,
52
+ then the application has no real insight into when the SQ kernel thread has
53
+ consumed entries from the SQ ring. This can lead to a situation where the
54
+ application can no longer get a free SQE entry to submit, without knowing
55
+ when one becomes available as the SQ kernel thread consumes them. If
56
+ the system call is used with this flag set, then it will wait until at least
57
+ one entry is free in the SQ ring.
58
+ .TP
59
+ .B IORING_ENTER_EXT_ARG
60
+ Since kernel 5.11, the system call's arguments have been modified to look like
61
+ the following:
62
+
63
+ .nf
64
+ .BI "int io_uring_enter(unsigned int " fd ", unsigned int " to_submit ,
65
+ .BI " unsigned int " min_complete ", unsigned int " flags ,
66
+ .BI " const void *" arg ", size_t " argsz );
67
+ .fi
68
+
69
+ which behaves just like the original definition by default. However, if
70
+ .B IORING_ENTER_EXT_ARG
71
+ is set, then instead of a
72
+ .I sigset_t
73
+ being passed in, a pointer to a
74
+ .I struct io_uring_getevents_arg
75
+ is used instead and
76
+ .I argsz
77
+ must be set to the size of this structure. The definition is as follows:
78
+
79
+ .nf
80
+ .BI "struct io_uring_getevents_arg {
81
+ .BI " __u64 sigmask;
82
+ .BI " __u32 sigmask_sz;
83
+ .BI " __u32 pad;
84
+ .BI " __u64 ts;
85
+ .BI "};
86
+ .fi
87
+
88
+ which allows passing in both a signal mask as well as a pointer to a
89
+ .I struct __kernel_timespec
90
+ timeout value. If
91
+ .I ts
92
+ is set to a valid pointer, then this time value indicates the timeout for
93
+ waiting on events. If an application is waiting on events and wishes to
94
+ stop waiting after a specified amount of time, then this can be accomplished
95
+ directly in version 5.11 and newer by using this feature.
96
+
97
+ .PP
98
+ .PP
99
+ If the io_uring instance was configured for polling, by specifying
100
+ .B IORING_SETUP_IOPOLL
101
+ in the call to
102
+ .BR io_uring_setup (2),
103
+ then min_complete has a slightly different meaning. Passing a value
104
+ of 0 instructs the kernel to return any events which are already complete,
105
+ without blocking. If
106
+ .I min_complete
107
+ is a non-zero value, the kernel will still return immediately if any
108
+ completion events are available. If no event completions are
109
+ available, then the call will poll either until one or more
110
+ completions become available, or until the process has exceeded its
111
+ scheduler time slice.
112
+
113
+ Note that, for interrupt driven I/O (where
114
+ .B IORING_SETUP_IOPOLL
115
+ was not specified in the call to
116
+ .BR io_uring_setup (2)),
117
+ an application may check the completion queue for event completions
118
+ without entering the kernel at all.
119
+ .PP
120
+ When the system call returns that a certain amount of SQEs have been
121
+ consumed and submitted, it's safe to reuse SQE entries in the ring. This is
122
+ true even if the actual IO submission had to be punted to async context,
123
+ which means that the SQE may in fact not have been submitted yet. If the
124
+ kernel requires later use of a particular SQE entry, it will have made a
125
+ private copy of it.
126
+
127
+ .I sig
128
+ is a pointer to a signal mask (see
129
+ .BR sigprocmask (2));
130
+ if
131
+ .I sig
132
+ is not NULL,
133
+ .BR io_uring_enter ()
134
+ first replaces the current signal mask by the one pointed to by
135
+ .IR sig ,
136
+ then waits for events to become available in the completion queue, and
137
+ then restores the original signal mask. The following
138
+ .BR io_uring_enter ()
139
+ call:
140
+ .PP
141
+ .in +4n
142
+ .EX
143
+ ret = io_uring_enter(fd, 0, 1, IORING_ENTER_GETEVENTS, &sig);
144
+ .EE
145
+ .in
146
+ .PP
147
+ is equivalent to
148
+ .I atomically
149
+ executing the following calls:
150
+ .PP
151
+ .in +4n
152
+ .EX
153
+ pthread_sigmask(SIG_SETMASK, &sig, &orig);
154
+ ret = io_uring_enter(fd, 0, 1, IORING_ENTER_GETEVENTS, NULL);
155
+ pthread_sigmask(SIG_SETMASK, &orig, NULL);
156
+ .EE
157
+ .in
158
+ .PP
159
+ See the description of
160
+ .BR pselect (2)
161
+ for an explanation of why the
162
+ .I sig
163
+ parameter is necessary.
164
+
165
+ Submission queue entries are represented using the following data
166
+ structure:
167
+ .PP
168
+ .in +4n
169
+ .EX
170
+ /*
171
+ * IO submission data structure (Submission Queue Entry)
172
+ */
173
+ struct io_uring_sqe {
174
+ __u8 opcode; /* type of operation for this sqe */
175
+ __u8 flags; /* IOSQE_ flags */
176
+ __u16 ioprio; /* ioprio for the request */
177
+ __s32 fd; /* file descriptor to do IO on */
178
+ union {
179
+ __u64 off; /* offset into file */
180
+ __u64 addr2;
181
+ };
182
+ union {
183
+ __u64 addr; /* pointer to buffer or iovecs */
184
+ __u64 splice_off_in;
185
+ };
186
+ __u32 len; /* buffer size or number of iovecs */
187
+ union {
188
+ __kernel_rwf_t rw_flags;
189
+ __u32 fsync_flags;
190
+ __u16 poll_events; /* compatibility */
191
+ __u32 poll32_events; /* word-reversed for BE */
192
+ __u32 sync_range_flags;
193
+ __u32 msg_flags;
194
+ __u32 timeout_flags;
195
+ __u32 accept_flags;
196
+ __u32 cancel_flags;
197
+ __u32 open_flags;
198
+ __u32 statx_flags;
199
+ __u32 fadvise_advice;
200
+ __u32 splice_flags;
201
+ __u32 rename_flags;
202
+ __u32 unlink_flags;
203
+ __u32 hardlink_flags;
204
+ };
205
+ __u64 user_data; /* data to be passed back at completion time */
206
+ union {
207
+ struct {
208
+ /* index into fixed buffers, if used */
209
+ union {
210
+ /* index into fixed buffers, if used */
211
+ __u16 buf_index;
212
+ /* for grouped buffer selection */
213
+ __u16 buf_group;
214
+ };
215
+ /* personality to use, if used */
216
+ __u16 personality;
217
+ union {
218
+ __s32 splice_fd_in;
219
+ __u32 file_index;
220
+ };
221
+ };
222
+ __u64 __pad2[3];
223
+ };
224
+ };
225
+ .EE
226
+ .in
227
+ .PP
228
+ The
229
+ .I opcode
230
+ describes the operation to be performed. It can be one of:
231
+ .TP
232
+ .B IORING_OP_NOP
233
+ Do not perform any I/O. This is useful for testing the performance of
234
+ the io_uring implementation itself.
235
+ .TP
236
+ .B IORING_OP_READV
237
+ .TP
238
+ .B IORING_OP_WRITEV
239
+ Vectored read and write operations, similar to
240
+ .BR preadv2 (2)
241
+ and
242
+ .BR pwritev2 (2).
243
+ If the file is not seekable,
244
+ .I off
245
+ must be set to zero.
246
+
247
+ .TP
248
+ .B IORING_OP_READ_FIXED
249
+ .TP
250
+ .B IORING_OP_WRITE_FIXED
251
+ Read from or write to pre-mapped buffers. See
252
+ .BR io_uring_register (2)
253
+ for details on how to setup a context for fixed reads and writes.
254
+
255
+ .TP
256
+ .B IORING_OP_FSYNC
257
+ File sync. See also
258
+ .BR fsync (2).
259
+ Note that, while I/O is initiated in the order in which it appears in
260
+ the submission queue, completions are unordered. For example, an
261
+ application which places a write I/O followed by an fsync in the
262
+ submission queue cannot expect the fsync to apply to the write. The
263
+ two operations execute in parallel, so the fsync may complete before
264
+ the write is issued to the storage. The same is also true for
265
+ previously issued writes that have not completed prior to the fsync.
266
+
267
+ .TP
268
+ .B IORING_OP_POLL_ADD
269
+ Poll the
270
+ .I fd
271
+ specified in the submission queue entry for the events
272
+ specified in the
273
+ .I poll_events
274
+ field. Unlike poll or epoll without
275
+ .BR EPOLLONESHOT ,
276
+ by default this interface always works in one shot mode. That is, once the poll
277
+ operation is completed, it will have to be resubmitted.
278
+
279
+ If
280
+ .B IORING_POLL_ADD_MULTI
281
+ is set in the SQE
282
+ .I len
283
+ field, then the poll will work in multi shot mode instead. That means it'll
284
+ repeatedly trigger when the requested event becomes true, and hence multiple
285
+ CQEs can be generated from this single SQE. The CQE
286
+ .I flags
287
+ field will have
288
+ .B IORING_CQE_F_MORE
289
+ set on completion if the application should expect further CQE entries from
290
+ the original request. If this flag isn't set on completion, then the poll
291
+ request has been terminated and no further events will be generated. This mode
292
+ is available since 5.13.
293
+
294
+ If
295
+ .B IORING_POLL_UPDATE_EVENTS
296
+ is set in the SQE
297
+ .I len
298
+ field, then the request will update an existing poll request with the mask of
299
+ events passed in with this request. The lookup is based on the
300
+ .I user_data
301
+ field of the original SQE submitted, and this value is passed in the
302
+ .I addr
303
+ field of the SQE. This mode is available since 5.13.
304
+
305
+ If
306
+ .B IORING_POLL_UPDATE_USER_DATA
307
+ is set in the SQE
308
+ .I len
309
+ field, then the request will update the
310
+ .I user_data
311
+ of an existing poll request based on the value passed in the
312
+ .I off
313
+ field. This mode is available since 5.13.
314
+
315
+ This command works like
316
+ an async
317
+ .BR poll (2)
318
+ and the completion event result is the returned mask of events. For the
319
+ variants that update
320
+ .I user_data
321
+ or
322
+ .IR events ,
323
+ the completion result will be similar to
324
+ .B IORING_OP_POLL_REMOVE.
325
+
326
+ .TP
327
+ .B IORING_OP_POLL_REMOVE
328
+ Remove an existing poll request. If found, the
329
+ .I res
330
+ field of the
331
+ .I "struct io_uring_cqe"
332
+ will contain 0. If not found,
333
+ .I res
334
+ will contain
335
+ .B -ENOENT,
336
+ or
337
+ .B -EALREADY
338
+ if the poll request was in the process of completing already.
339
+
340
+ .TP
341
+ .B IORING_OP_EPOLL_CTL
342
+ Add, remove or modify entries in the interest list of
343
+ .BR epoll (7).
344
+ See
345
+ .BR epoll_ctl (2)
346
+ for details of the system call.
347
+ .I fd
348
+ holds the file descriptor that represents the epoll instance,
349
+ .I addr
350
+ holds the file descriptor to add, remove or modify,
351
+ .I len
352
+ holds the operation (EPOLL_CTL_ADD, EPOLL_CTL_DEL, EPOLL_CTL_MOD) to perform and,
353
+ .I off
354
+ holds a pointer to the
355
+ .I epoll_events
356
+ structure. Available since 5.6.
357
+
358
+ .TP
359
+ .B IORING_OP_SYNC_FILE_RANGE
360
+ Issue the equivalent of a \fBsync_file_range\fR (2) on the file descriptor. The
361
+ .I fd
362
+ field is the file descriptor to sync, the
363
+ .I off
364
+ field holds the offset in bytes, the
365
+ .I len
366
+ field holds the length in bytes, and the
367
+ .I sync_range_flags
368
+ field holds the flags for the command. See also
369
+ .BR sync_file_range (2)
370
+ for the general description of the related system call. Available since 5.2.
371
+
372
+ .TP
373
+ .B IORING_OP_SENDMSG
374
+ Issue the equivalent of a
375
+ .BR sendmsg (2)
376
+ system call.
377
+ .I fd
378
+ must be set to the socket file descriptor,
379
+ .I addr
380
+ must contain a pointer to the msghdr structure, and
381
+ .I msg_flags
382
+ holds the flags associated with the system call. See also
383
+ .BR sendmsg (2)
384
+ for the general description of the related system call. Available since 5.3.
385
+
386
+ .TP
387
+ .B IORING_OP_RECVMSG
388
+ Works just like IORING_OP_SENDMSG, except for
389
+ .BR recvmsg (2)
390
+ instead. See the description of IORING_OP_SENDMSG. Available since 5.3.
391
+
392
+ .TP
393
+ .B IORING_OP_SEND
394
+ Issue the equivalent of a
395
+ .BR send (2)
396
+ system call.
397
+ .I fd
398
+ must be set to the socket file descriptor,
399
+ .I addr
400
+ must contain a pointer to the buffer,
401
+ .I len
402
+ denotes the length of the buffer to send, and
403
+ .I msg_flags
404
+ holds the flags associated with the system call. See also
405
+ .BR send (2)
406
+ for the general description of the related system call. Available since 5.6.
407
+
408
+ .TP
409
+ .B IORING_OP_RECV
410
+ Works just like IORING_OP_SEND, except for
411
+ .BR recv (2)
412
+ instead. See the description of IORING_OP_SEND. Available since 5.6.
413
+
414
+ .TP
415
+ .B IORING_OP_TIMEOUT
416
+ This command will register a timeout operation. The
417
+ .I addr
418
+ field must contain a pointer to a struct timespec64 structure,
419
+ .I len
420
+ must contain 1 to signify one timespec64 structure,
421
+ .I timeout_flags
422
+ may contain IORING_TIMEOUT_ABS
423
+ for an absolute timeout value, or 0 for a relative timeout.
424
+ .I off
425
+ may contain a completion event count. A timeout
426
+ will trigger a wakeup event on the completion ring for anyone waiting for
427
+ events. A timeout condition is met when either the specified timeout expires,
428
+ or the specified number of events have completed. Either condition will
429
+ trigger the event. If set to 0, completed events are not counted, which
430
+ effectively acts like a timer. io_uring timeouts use the
431
+ .B CLOCK_MONOTONIC
432
+ clock source. The request will complete with
433
+ .I -ETIME
434
+ if the timeout got completed through expiration of the timer, or
435
+ .I 0
436
+ if the timeout got completed through requests completing on their own. If
437
+ the timeout was cancelled before it expired, the request will complete with
438
+ .I -ECANCELED.
439
+ Available since 5.4.
440
+
441
+ Since 5.15, this command also supports the following modifiers in
442
+ .I timeout_flags:
443
+
444
+ .PP
445
+ .in +12
446
+ .B IORING_TIMEOUT_BOOTTIME
447
+ If set, then the clocksource used is
448
+ .I CLOCK_BOOTTIME
449
+ instead of
450
+ .I CLOCK_MONOTONIC.
451
+ This clocksource differs in that it includes time elapsed if the system was
452
+ suspended while having a timeout request in-flight.
453
+
454
+ .B IORING_TIMEOUT_REALTIME
455
+ If set, then the clocksource used is
456
+ .I CLOCK_REALTIME
457
+ instead of
458
+ .I CLOCK_MONOTONIC.
459
+ .EE
460
+ .in
461
+ .PP
462
+
463
+ .TP
464
+ .B IORING_OP_TIMEOUT_REMOVE
465
+ If
466
+ .IR timeout_flags " are zero, then it attempts to remove an existing timeout"
467
+ operation.
468
+ .I addr
469
+ must contain the
470
+ .I user_data
471
+ field of the previously issued timeout operation. If the specified timeout
472
+ request is found and cancelled successfully, this request will terminate
473
+ with a result value of
474
+ .IR 0 .
475
+ If the timeout request was found but expiration was already in progress,
476
+ this request will terminate with a result value of
477
+ .IR -EBUSY .
478
+ If the timeout request wasn't found, the request will terminate with a result
479
+ value of
480
+ .IR -ENOENT .
481
+ Available since 5.5.
482
+
483
+ If
484
+ .I timeout_flags
485
+ contain
486
+ .I IORING_TIMEOUT_UPDATE,
487
+ instead of removing an existing operation, it updates it.
488
+ .I addr
489
+ and return values are the same as before.
490
+ .I addr2
491
+ field must contain a pointer to a struct timespec64 structure.
492
+ .I timeout_flags
493
+ may also contain IORING_TIMEOUT_ABS, in which case the value given is an
494
+ absolute one, not a relative one.
495
+ Available since 5.11.
496
+
497
+ .TP
498
+ .B IORING_OP_ACCEPT
499
+ Issue the equivalent of an
500
+ .BR accept4 (2)
501
+ system call.
502
+ .I fd
503
+ must be set to the socket file descriptor,
504
+ .I addr
505
+ must contain the pointer to the sockaddr structure, and
506
+ .I addr2
507
+ must contain a pointer to the socklen_t addrlen field. Flags can be passed using
508
+ the
509
+ .I accept_flags
510
+ field. See also
511
+ .BR accept4 (2)
512
+ for the general description of the related system call. Available since 5.5.
513
+
514
+ If the
515
+ .I file_index
516
+ field is set to a positive number, the file won't be installed into the
517
+ normal file table as usual but will be placed into the fixed file table at index
518
+ .I file_index - 1.
519
+ In this case, instead of returning a file descriptor, the result will contain
520
+ either 0 on success or an error. If the index points to a valid empty slot, the
521
+ installation is guaranteed to not fail. If there is already a file in the slot,
522
+ it will be replaced, similar to
523
+ .B IORING_OP_FILES_UPDATE.
524
+ Please note that only io_uring has access to such files and no other syscall
525
+ can use them. See
526
+ .B IOSQE_FIXED_FILE
527
+ and
528
+ .B IORING_REGISTER_FILES.
529
+
530
+ Available since 5.5.
531
+
532
+ .TP
533
+ .B IORING_OP_ASYNC_CANCEL
534
+ Attempt to cancel an already issued request.
535
+ .I addr
536
+ must contain the
537
+ .I user_data
538
+ field of the request that should be cancelled. The cancellation request will
539
+ complete with one of the following result codes. If found, the
540
+ .I res
541
+ field of the cqe will contain 0. If not found,
542
+ .I res
543
+ will contain -ENOENT. If found and attempted cancelled, the
544
+ .I res
545
+ field will contain -EALREADY. In this case, the request may or may not
546
+ terminate. In general, requests that are interruptible (like socket IO) will
547
+ get cancelled, while disk IO requests cannot be cancelled if already started.
548
+ Available since 5.5.
549
+
550
+ .TP
551
+ .B IORING_OP_LINK_TIMEOUT
552
+ This request must be linked with another request through
553
+ .I IOSQE_IO_LINK
554
+ which is described below. Unlike
555
+ .I IORING_OP_TIMEOUT,
556
+ .I IORING_OP_LINK_TIMEOUT
557
+ acts on the linked request, not the completion queue. The format of the command
558
+ is otherwise like
559
+ .I IORING_OP_TIMEOUT,
560
+ except there's no completion event count as it's tied to a specific request.
561
+ If used, the timeout specified in the command will cancel the linked command,
562
+ unless the linked command completes before the timeout. The timeout will
563
+ complete with
564
+ .I -ETIME
565
+ if the timer expired and cancellation of the linked request was attempted, or
566
+ .I -ECANCELED
567
+ if the timer got cancelled because of completion of the linked request. Like
568
+ .B IORING_OP_TIMEOUT
569
+ the clock source used is
570
+ .BR CLOCK_MONOTONIC .
571
+ Available since 5.5.
572
+
573
+
574
+ .TP
575
+ .B IORING_OP_CONNECT
576
+ Issue the equivalent of a
577
+ .BR connect(2)
578
+ system call.
579
+ .I fd
580
+ must be set to the socket file descriptor,
581
+ .I addr
582
+ must contain the const pointer to the sockaddr structure, and
583
+ .I off
584
+ must contain the socklen_t addrlen field. See also
585
+ .BR connect(2)
586
+ for the general description of the related system call. Available since 5.5.
587
+
588
+ .TP
589
+ .B IORING_OP_FALLOCATE
590
+ Issue the equivalent of a
591
+ .BR fallocate(2)
592
+ system call.
593
+ .I fd
594
+ must be set to the file descriptor,
595
+ .I len
596
+ must contain the mode associated with the operation,
597
+ .I off
598
+ must contain the offset on which to operate, and
599
+ .I addr
600
+ must contain the length. See also
601
+ .BR fallocate(2)
602
+ for the general description of the related system call. Available since 5.6.
603
+
604
+ .TP
605
+ .B IORING_OP_FADVISE
606
+ Issue the equivalent of a
607
+ .BR posix_fadvise(2)
608
+ system call.
609
+ .I fd
610
+ must be set to the file descriptor,
611
+ .I off
612
+ must contain the offset on which to operate,
613
+ .I len
614
+ must contain the length, and
615
+ .I fadvise_advice
616
+ must contain the advice associated with the operation. See also
617
+ .BR posix_fadvise(2)
618
+ for the general description of the related system call. Available since 5.6.
619
+
620
+ .TP
621
+ .B IORING_OP_MADVISE
622
+ Issue the equivalent of a
623
+ .BR madvise(2)
624
+ system call.
625
+ .I addr
626
+ must contain the address to operate on,
627
+ .I len
628
+ must contain the length on which to operate,
629
+ and
630
+ .I fadvise_advice
631
+ must contain the advice associated with the operation. See also
632
+ .BR madvise(2)
633
+ for the general description of the related system call. Available since 5.6.
634
+
635
+ .TP
636
+ .B IORING_OP_OPENAT
637
+ Issue the equivalent of a
638
+ .BR openat(2)
639
+ system call.
640
+ .I fd
641
+ is the
642
+ .I dirfd
643
+ argument,
644
+ .I addr
645
+ must contain a pointer to the
646
+ .I *pathname
647
+ argument,
648
+ .I open_flags
649
+ should contain any flags passed in, and
650
+ .I len
651
+ is access mode of the file. See also
652
+ .BR openat(2)
653
+ for the general description of the related system call. Available since 5.6.
654
+
655
+ If the
656
+ .I file_index
657
+ field is set to a positive number, the file won't be installed into the
658
+ normal file table as usual but will be placed into the fixed file table at index
659
+ .I file_index - 1.
660
+ In this case, instead of returning a file descriptor, the result will contain
661
+ either 0 on success or an error. If the index points to a valid empty slot, the
662
+ installation is guaranteed to not fail. If there is already a file in the slot,
663
+ it will be replaced, similar to
664
+ .B IORING_OP_FILES_UPDATE.
665
+ Please note that only io_uring has access to such files and no other syscall
666
+ can use them. See
667
+ .B IOSQE_FIXED_FILE
668
+ and
669
+ .B IORING_REGISTER_FILES.
670
+
671
+ Available since 5.15.
672
+
673
+ .TP
674
+ .B IORING_OP_OPENAT2
675
+ Issue the equivalent of a
676
+ .BR openat2(2)
677
+ system call.
678
+ .I fd
679
+ is the
680
+ .I dirfd
681
+ argument,
682
+ .I addr
683
+ must contain a pointer to the
684
+ .I *pathname
685
+ argument,
686
+ .I len
687
+ should contain the size of the open_how structure, and
688
+ .I off
689
+ should be set to the address of the open_how structure. See also
690
+ .BR openat2(2)
691
+ for the general description of the related system call. Available since 5.6.
692
+
693
+ If the
694
+ .I file_index
695
+ field is set to a positive number, the file won't be installed into the
696
+ normal file table as usual but will be placed into the fixed file table at index
697
+ .I file_index - 1.
698
+ In this case, instead of returning a file descriptor, the result will contain
699
+ either 0 on success or an error. If the index points to a valid empty slot, the
700
+ installation is guaranteed to not fail. If there is already a file in the slot,
701
+ it will be replaced, similar to
702
+ .B IORING_OP_FILES_UPDATE.
703
+ Please note that only io_uring has access to such files and no other syscall
704
+ can use them. See
705
+ .B IOSQE_FIXED_FILE
706
+ and
707
+ .B IORING_REGISTER_FILES.
708
+
709
+ Available since 5.15.
710
+
711
+ .TP
712
+ .B IORING_OP_CLOSE
713
+ Issue the equivalent of a
714
+ .BR close(2)
715
+ system call.
716
+ .I fd
717
+ is the file descriptor to be closed. See also
718
+ .BR close(2)
719
+ for the general description of the related system call. Available since 5.6.
720
+ If the
721
+ .I file_index
722
+ field is set to a positive number, this command can be used to close files
723
+ that were direct opened through
724
+ .B IORING_OP_OPENAT
725
+ ,
726
+ .B IORING_OP_OPENAT2
727
+ , or
728
+ .B IORING_OP_ACCEPT
729
+ using the io_uring specific direct descriptors. Note that only one of the
730
+ descriptor fields may be set. The direct close feature is available since
731
+ the 5.15 kernel, where direct descriptors were introduced.
732
+
733
+ .TP
734
+ .B IORING_OP_STATX
735
+ Issue the equivalent of a
736
+ .BR statx(2)
737
+ system call.
738
+ .I fd
739
+ is the
740
+ .I dirfd
741
+ argument,
742
+ .I addr
743
+ must contain a pointer to the
744
+ .I *pathname
745
+ string,
746
+ .I statx_flags
747
+ is the
748
+ .I flags
749
+ argument,
750
+ .I len
751
+ should be the
752
+ .I mask
753
+ argument, and
754
+ .I off
755
+ must contain a pointer to the
756
+ .I statxbuf
757
+ to be filled in. See also
758
+ .BR statx(2)
759
+ for the general description of the related system call. Available since 5.6.
760
+
761
+ .TP
762
+ .B IORING_OP_READ
763
+ .TP
764
+ .B IORING_OP_WRITE
765
+ Issue the equivalent of a
766
+ .BR pread(2)
767
+ or
768
+ .BR pwrite(2)
769
+ system call.
770
+ .I fd
771
+ is the file descriptor to be operated on,
772
+ .I addr
773
+ contains the buffer in question,
774
+ .I len
775
+ contains the length of the IO operation, and
776
+ .I off
777
+ contains the read or write offset. If
778
+ .I fd
779
+ does not refer to a seekable file,
780
+ .I off
781
+ must be set to zero. If
782
+ .I off
783
+ is set to
784
+ .B -1
785
+ , the offset will use (and advance) the file position, like the
786
+ .BR read(2)
787
+ and
788
+ .BR write(2)
789
+ system calls. These are non-vectored versions of the
790
+ .B IORING_OP_READV
791
+ and
792
+ .B IORING_OP_WRITEV
793
+ opcodes. See also
794
+ .BR read(2)
795
+ and
796
+ .BR write(2)
797
+ for the general description of the related system call. Available since 5.6.
798
+
799
+ .TP
800
+ .B IORING_OP_SPLICE
801
+ Issue the equivalent of a
802
+ .BR splice(2)
803
+ system call.
804
+ .I splice_fd_in
805
+ is the file descriptor to read from,
806
+ .I splice_off_in
807
+ is an offset to read from,
808
+ .I fd
809
+ is the file descriptor to write to,
810
+ .I off
811
+ is an offset from which to start writing to. A sentinel value of
812
+ .B -1
813
+ is used to pass the equivalent of a NULL for the offsets to
814
+ .BR splice(2).
815
+ .I len
816
+ contains the number of bytes to copy.
817
+ .I splice_flags
818
+ contains a bit mask for the flag field associated with the system call.
819
+ Please note that one of the file descriptors must refer to a pipe.
820
+ See also
821
+ .BR splice(2)
822
+ for the general description of the related system call. Available since 5.7.
823
+
824
+ .TP
825
+ .B IORING_OP_TEE
826
+ Issue the equivalent of a
827
+ .BR tee(2)
828
+ system call.
829
+ .I splice_fd_in
830
+ is the file descriptor to read from,
831
+ .I fd
832
+ is the file descriptor to write to,
833
+ .I len
834
+ contains the number of bytes to copy, and
835
+ .I splice_flags
836
+ contains a bit mask for the flag field associated with the system call.
837
+ Please note that both of the file descriptors must refer to a pipe.
838
+ See also
839
+ .BR tee(2)
840
+ for the general description of the related system call. Available since 5.8.
841
+
842
+ .TP
843
+ .B IORING_OP_FILES_UPDATE
844
+ This command is an alternative to using
845
+ .B IORING_REGISTER_FILES_UPDATE
846
+ which then works in an async fashion, like the rest of the io_uring commands.
847
+ The arguments passed in are the same.
848
+ .I addr
849
+ must contain a pointer to the array of file descriptors,
850
+ .I len
851
+ must contain the length of the array, and
852
+ .I off
853
+ must contain the offset at which to operate. Note that the array of file
854
+ descriptors pointed to in
855
+ .I addr
856
+ must remain valid until this operation has completed. Available since 5.6.
857
+
858
+ .TP
859
+ .B IORING_OP_PROVIDE_BUFFERS
860
+ This command allows an application to register a group of buffers to be used
861
+ by commands that read/receive data. Using buffers in this manner can eliminate
862
+ the need to separate the poll + read, which provides a convenient point in
863
+ time to allocate a buffer for a given request. It's often infeasible to have
864
+ as many buffers available as pending reads or receives. With this feature, the
865
+ application can have its pool of buffers ready in the kernel, and when the
866
+ file or socket is ready to read/receive data, a buffer can be selected for the
867
+ operation.
868
+ .I fd
869
+ must contain the number of buffers to provide,
870
+ .I addr
871
+ must contain the starting address to add buffers from,
872
+ .I len
873
+ must contain the length of each buffer to add from the range,
874
+ .I buf_group
875
+ must contain the group ID of this range of buffers, and
876
+ .I off
877
+ must contain the starting buffer ID of this range of buffers. With that set,
878
+ the kernel adds buffers starting with the memory address in
879
+ .I addr,
880
+ each with a length of
881
+ .I len.
882
+ Hence the application should provide
883
+ .I len * fd
884
+ worth of memory in
885
+ .I addr.
886
+ Buffers are grouped by the group ID, and each buffer within this group will be
887
+ identical in size according to the above arguments. This allows the application
888
+ to provide different groups of buffers, and this is often used to have
889
+ differently sized buffers available depending on what the expectations are of
890
+ the individual request. When submitting a request that should use a provided
891
+ buffer, the
892
+ .B IOSQE_BUFFER_SELECT
893
+ flag must be set, and
894
+ .I buf_group
895
+ must be set to the desired buffer group ID where the buffer should be selected
896
+ from. Available since 5.7.
897
+
898
+ .TP
899
+ .B IORING_OP_REMOVE_BUFFERS
900
+ Remove buffers previously registered with
901
+ .B IORING_OP_PROVIDE_BUFFERS.
902
+ .I fd
903
+ must contain the number of buffers to remove, and
904
+ .I buf_group
905
+ must contain the buffer group ID from which to remove the buffers. Available
906
+ since 5.7.
907
+
908
+ .TP
909
+ .B IORING_OP_SHUTDOWN
910
+ Issue the equivalent of a
911
+ .BR shutdown(2)
912
+ system call.
913
+ .I fd
914
+ is the file descriptor to the socket being shut down, and
915
+ .I len
916
+ must be set to the
917
+ .I how
918
+ argument. No other fields should be set. Available since 5.11.
919
+
920
+ .TP
921
+ .B IORING_OP_RENAMEAT
922
+ Issue the equivalent of a
923
+ .BR renameat2(2)
924
+ system call.
925
+ .I fd
926
+ should be set to the
927
+ .I olddirfd,
928
+ .I addr
929
+ should be set to the
930
+ .I oldpath,
931
+ .I len
932
+ should be set to the
933
+ .I newdirfd,
934
+ .I addr
935
+ should be set to the
936
+ .I oldpath,
937
+ .I addr2
938
+ should be set to the
939
+ .I newpath,
940
+ and finally
941
+ .I rename_flags
942
+ should be set to the
943
+ .I flags
944
+ passed in to
945
+ .BR renameat2(2).
946
+ Available since 5.11.
947
+
948
+ .TP
949
+ .B IORING_OP_UNLINKAT
950
+ Issue the equivalent of a
951
+ .BR unlinkat(2)
952
+ system call.
953
+ .I fd
954
+ should be set to the
955
+ .I dirfd,
956
+ .I addr
957
+ should be set to the
958
+ .I pathname,
959
+ and
960
+ .I unlink_flags
961
+ should be set to the
962
+ .I flags
963
+ being passed in to
964
+ .BR unlinkat(2).
965
+ Available since 5.11.
966
+
967
+ .TP
968
+ .B IORING_OP_MKDIRAT
969
+ Issue the equivalent of a
970
+ .BR mkdirat(2)
971
+ system call.
972
+ .I fd
973
+ should be set to the
974
+ .I dirfd,
975
+ .I addr
976
+ should be set to the
977
+ .I pathname,
978
+ and
979
+ .I len
980
+ should be set to the
981
+ .I mode
982
+ being passed in to
983
+ .BR mkdirat(2).
984
+ Available since 5.15.
985
+
986
+ .TP
987
+ .B IORING_OP_SYMLINKAT
988
+ Issue the equivalent of a
989
+ .BR symlinkat(2)
990
+ system call.
991
+ .I fd
992
+ should be set to the
993
+ .I newdirfd,
994
+ .I addr
995
+ should be set to the
996
+ .I target
997
+ and
998
+ .I addr2
999
+ should be set to the
1000
+ .I linkpath
1001
+ being passed in to
1002
+ .BR symlinkat(2).
1003
+ Available since 5.15.
1004
+
1005
+ .TP
1006
+ .B IORING_OP_LINKAT
1007
+ Issue the equivalent of a
1008
+ .BR linkat(2)
1009
+ system call.
1010
+ .I fd
1011
+ should be set to the
1012
+ .I olddirfd,
1013
+ .I addr
1014
+ should be set to the
1015
+ .I oldpath,
1016
+ .I len
1017
+ should be set to the
1018
+ .I newdirfd,
1019
+ .I addr2
1020
+ should be set to the
1021
+ .I newpath,
1022
+ and
1023
+ .I hardlink_flags
1024
+ should be set to the
1025
+ .I flags
1026
+ being passed in to
1027
+ .BR linkat(2).
1028
+ Available since 5.15.
1029
+
1030
+ .TP
1031
+ .B IORING_OP_MSG_RING
1032
+ Send a message to an io_uring.
1033
+ .I fd
1034
+ must be set to a file descriptor of a ring that the application has access to,
1035
+ .I len
1036
+ can be set to any 32-bit value that the application wishes to pass on, and
1037
+ .I off
1038
+ should be set to any 64-bit value that the application wishes to send. On the
1039
+ target ring, a CQE will be posted with the
1040
+ .I res
1041
+ field matching the
1042
+ .I len
1043
+ set, and a
1044
+ .I user_data
1045
+ field matching the
1046
+ .I off
1047
+ value being passed in. This request type can be used to either just wake or
1048
+ interrupt anyone waiting for completions on the target ring, or it can be used
1049
+ to pass messages via the two fields. Available since 5.18.
1050
+
1051
+ .PP
1052
+ The
1053
+ .I flags
1054
+ field is a bit mask. The supported flags are:
1055
+ .TP
1056
+ .B IOSQE_FIXED_FILE
1057
+ When this flag is specified,
1058
+ .I fd
1059
+ is an index into the files array registered with the io_uring instance (see the
1060
+ .B IORING_REGISTER_FILES
1061
+ section of the
1062
+ .BR io_uring_register (2)
1063
+ man page). Note that this isn't always available for all commands. If used on
1064
+ a command that doesn't support fixed files, the SQE will error with
1065
+ .B -EBADF.
1066
+ Available since 5.1.
1067
+ .TP
1068
+ .B IOSQE_IO_DRAIN
1069
+ When this flag is specified, the SQE will not be started before previously
1070
+ submitted SQEs have completed, and new SQEs will not be started before this
1071
+ one completes. Available since 5.2.
1072
+ .TP
1073
+ .B IOSQE_IO_LINK
1074
+ When this flag is specified, the SQE forms a link with the next SQE in the
1075
+ submission ring. That next SQE will not be started before the previous request
1076
+ completes. This, in effect, forms a chain of SQEs, which can be arbitrarily
1077
+ long. The tail of the chain is denoted by the first SQE that does not have this
1078
+ flag set. Chains are not supported across submission boundaries. Even if the
1079
+ last SQE in a submission has this flag set, it will still terminate the current
1080
+ chain. This flag has no effect on previous SQE submissions, nor does it impact
1081
+ SQEs that are outside of the chain tail. This means that multiple chains can be
1082
+ executing in parallel, or chains and individual SQEs. Only members inside the
1083
+ chain are serialized. A chain of SQEs will be broken, if any request in that
1084
+ chain ends in error. io_uring considers any unexpected result an error. This
1085
+ means that, eg, a short read will also terminate the remainder of the chain.
1086
+ If a chain of SQE links is broken, the remaining unstarted part of the chain
1087
+ will be terminated and completed with
1088
+ .B -ECANCELED
1089
+ as the error code. Available since 5.3.
1090
+ .TP
1091
+ .B IOSQE_IO_HARDLINK
1092
+ Like IOSQE_IO_LINK, but it doesn't sever regardless of the completion result.
1093
+ Note that the link will still sever if we fail submitting the parent request,
1094
+ hard links are only resilient in the presence of completion results for
1095
+ requests that did submit correctly. IOSQE_IO_HARDLINK implies IOSQE_IO_LINK.
1096
+ Available since 5.5.
1097
+ .TP
1098
+ .B IOSQE_ASYNC
1099
+ Normal operation for io_uring is to try and issue an sqe as non-blocking first,
1100
+ and if that fails, execute it in an async manner. To support more efficient
1101
+ overlapped operation of requests that the application knows/assumes will
1102
+ always (or most of the time) block, the application can ask for an sqe to be
1103
+ issued async from the start. Available since 5.6.
1104
+ .TP
1105
+ .B IOSQE_BUFFER_SELECT
1106
+ Used in conjunction with the
1107
+ .B IORING_OP_PROVIDE_BUFFERS
1108
+ command, which registers a pool of buffers to be used by commands that read
1109
+ or receive data. When buffers are registered for this use case, and this
1110
+ flag is set in the command, io_uring will grab a buffer from this pool when
1111
+ the request is ready to receive or read data. If successful, the resulting CQE
1112
+ will have
1113
+ .B IORING_CQE_F_BUFFER
1114
+ set in the flags part of the struct, and the upper
1115
+ .B IORING_CQE_BUFFER_SHIFT
1116
+ bits will contain the ID of the selected buffers. This allows the application
1117
+ to know exactly which buffer was selected for the operation. If no buffers
1118
+ are available and this flag is set, then the request will fail with
1119
+ .B -ENOBUFS
1120
+ as the error code. Once a buffer has been used, it is no longer available in
1121
+ the kernel pool. The application must re-register the given buffer again when
1122
+ it is ready to recycle it (eg has completed using it). Available since 5.7.
1123
+ .TP
1124
+ .B IOSQE_CQE_SKIP_SUCCESS
1125
+ Don't generate a CQE if the request completes successfully. If the request
1126
+ fails, an appropriate CQE will be posted as usual and if there is no
1127
+ .B IOSQE_IO_HARDLINK,
1128
+ CQEs for all linked requests will be omitted. The notion of failure/success is
1129
+ opcode specific and is the same as with breaking chains of
1130
+ .B IOSQE_IO_LINK.
1131
+ One special case is when the request has a linked timeout, then the CQE
1132
+ generation for the linked timeout is decided solely by whether it has
1133
+ .B IOSQE_CQE_SKIP_SUCCESS
1134
+ set, regardless of whether it timed out or was cancelled. In other words, if a
1135
+ linked timeout has the flag set, it's guaranteed to not post a CQE.
1136
+
1137
+ The semantics are chosen to accommodate several use cases. First, when all but
1138
+ the last request of a normal link without linked timeouts are marked with the
1139
+ flag, only one CQE per link is posted. Additionally, it enables suppression of
1140
+ CQEs in cases where the side effects of a successfully executed operation are
1141
+ enough for userspace to know the state of the system. One such example would
1142
+ be writing to a synchronisation file.
1143
+
1144
+ This flag is incompatible with
1145
+ .B IOSQE_IO_DRAIN.
1146
+ Using both of them in a single ring is undefined behavior, even when they are
1147
+ not used together in a single request. Currently, after the first request with
1148
+ .B IOSQE_CQE_SKIP_SUCCESS,
1149
+ all subsequent requests marked with drain will be failed at submission time.
1150
+ Note that the error reporting is best effort only, and restrictions may change
1151
+ in the future.
1152
+
1153
+ Available since 5.17.
1154
+
1155
+ .PP
1156
+ .I ioprio
1157
+ specifies the I/O priority. See
1158
+ .BR ioprio_get (2)
1159
+ for a description of Linux I/O priorities.
1160
+
1161
+ .I fd
1162
+ specifies the file descriptor against which the operation will be
1163
+ performed, with the exception noted above.
1164
+
1165
+ If the operation is one of
1166
+ .B IORING_OP_READ_FIXED
1167
+ or
1168
+ .BR IORING_OP_WRITE_FIXED ,
1169
+ .I addr
1170
+ and
1171
+ .I len
1172
+ must fall within the buffer located at
1173
+ .I buf_index
1174
+ in the fixed buffer array. If the operation is either
1175
+ .B IORING_OP_READV
1176
+ or
1177
+ .BR IORING_OP_WRITEV ,
1178
+ then
1179
+ .I addr
1180
+ points to an iovec array of
1181
+ .I len
1182
+ entries.
1183
+
1184
+ .IR rw_flags ,
1185
+ specified for read and write operations, contains a bitwise OR of
1186
+ per-I/O flags, as described in the
1187
+ .BR preadv2 (2)
1188
+ man page.
1189
+
1190
+ The
1191
+ .I fsync_flags
1192
+ bit mask may contain either 0, for a normal file integrity sync, or
1193
+ .B IORING_FSYNC_DATASYNC
1194
+ to provide data sync only semantics. See the descriptions of
1195
+ .B O_SYNC
1196
+ and
1197
+ .B O_DSYNC
1198
+ in the
1199
+ .BR open (2)
1200
+ manual page for more information.
1201
+
1202
+ The bits that may be set in
1203
+ .I poll_events
1204
+ are defined in \fI<poll.h>\fP, and documented in
1205
+ .BR poll (2).
1206
+
1207
+ .I user_data
1208
+ is an application-supplied value that will be copied into
1209
+ the completion queue entry (see below).
1210
+ .I buf_index
1211
+ is an index into an array of fixed buffers, and is only valid if fixed
1212
+ buffers were registered.
1213
+ .I personality
1214
+ is the credentials id to use for this operation. See
1215
+ .BR io_uring_register(2)
1216
+ for how to register personalities with io_uring. If set to 0, the current
1217
+ personality of the submitting task is used.
1218
+ .PP
1219
+ Once the submission queue entry is initialized, I/O is submitted by
1220
+ placing the index of the submission queue entry into the tail of the
1221
+ submission queue. After one or more indexes are added to the queue,
1222
+ and the queue tail is advanced, the
1223
+ .BR io_uring_enter (2)
1224
+ system call can be invoked to initiate the I/O.
1225
+
1226
+ Completions use the following data structure:
1227
+ .PP
1228
+ .in +4n
1229
+ .EX
1230
+ /*
1231
+ * IO completion data structure (Completion Queue Entry)
1232
+ */
1233
+ struct io_uring_cqe {
1234
+ __u64 user_data; /* sqe->data submission passed back */
1235
+ __s32 res; /* result code for this event */
1236
+ __u32 flags;
1237
+ };
1238
+ .EE
1239
+ .in
1240
+ .PP
1241
+ .I user_data
1242
+ is copied from the field of the same name in the submission queue
1243
+ entry. The primary use case is to store data that the application
1244
+ will need to access upon completion of this particular I/O. The
1245
+ .I flags
1246
+ is used for certain commands, like
1247
+ .B IORING_OP_POLL_ADD
1248
+ or in conjunction with
1249
+ .B IOSQE_BUFFER_SELECT
1250
+ , see those entries.
1251
+ .I res
1252
+ is the operation-specific result, but io_uring-specific errors
1253
+ (e.g. flags or opcode invalid) are returned through this field.
1254
+ They are described in section
1255
+ .B CQE ERRORS.
1256
+ .PP
1257
+ For read and write opcodes, the
1258
+ return values match
1259
+ .I errno
1260
+ values documented in the
1261
+ .BR preadv2 (2)
1262
+ and
1263
+ .BR pwritev2 (2)
1264
+ man pages, with
1265
+ .I
1266
+ res
1267
+ holding the equivalent of
1268
+ .I -errno
1269
+ for error cases, or the transferred number of bytes in case the operation
1270
+ is successful. Hence both error and success return can be found in that
1271
+ field in the CQE. For other request types, the return values are documented
1272
+ in the matching man page for that type, or in the opcodes section above for
1273
+ io_uring-specific opcodes.
1274
+ .PP
1275
+ .SH RETURN VALUE
1276
+ .BR io_uring_enter ()
1277
+ returns the number of I/Os successfully consumed. This can be zero
1278
+ if
1279
+ .I to_submit
1280
+ was zero or if the submission queue was empty. Note that if the ring was
1281
+ created with
1282
+ .B IORING_SETUP_SQPOLL
1283
+ specified, then the return value will generally be the same as
1284
+ .I to_submit
1285
+ as submission happens outside the context of the system call.
1286
+
1287
+ The errors related to a submission queue entry will be returned through a
1288
+ completion queue entry (see section
1289
+ .B CQE ERRORS),
1290
+ rather than through the system call itself.
1291
+
1292
+ Errors that occur not on behalf of a submission queue entry are returned via the
1293
+ system call directly. On such an error,
1294
+ .B -1
1295
+ is returned and
1296
+ .I errno
1297
+ is set appropriately.
1298
+ .PP
1299
+ .SH ERRORS
1300
+ These are the errors returned by the
1301
+ .BR io_uring_enter ()
1302
+ system call.
1303
+ .TP
1304
+ .B EAGAIN
1305
+ The kernel was unable to allocate memory for the request, or otherwise ran out
1306
+ of resources to handle it. The application should wait for some completions and
1307
+ try again.
1308
+ .TP
1309
+ .B EBADF
1310
+ .I fd
1311
+ is not a valid file descriptor.
1312
+ .TP
1313
+ .B EBADFD
1314
+ .I fd
1315
+ is a valid file descriptor, but the io_uring ring is not in the right state
1316
+ (enabled). See
1317
+ .BR io_uring_register (2)
1318
+ for details on how to enable the ring.
1319
+ .TP
1320
+ .B EBUSY
1321
+ The application is attempting to overcommit the number of requests it can have
1322
+ pending. The application should wait for some completions and try again. May
1323
+ occur if the application tries to queue more requests than we have room for in
1324
+ the CQ ring, or if the application attempts to wait for more events without
1325
+ having reaped the ones already present in the CQ ring.
1326
+ .TP
1327
+ .B EINVAL
1328
+ Some bits in the
1329
+ .I flags
1330
+ argument are invalid.
1331
+ .TP
1332
+ .B EFAULT
1333
+ An invalid user space address was specified for the
1334
+ .I sig
1335
+ argument.
1336
+ .TP
1337
+ .B ENXIO
1338
+ The io_uring instance is in the process of being torn down.
1339
+ .TP
1340
+ .B EOPNOTSUPP
1341
+ .I fd
1342
+ does not refer to an io_uring instance.
1343
+ .TP
1344
+ .B EINTR
1345
+ The operation was interrupted by a delivery of a signal before it could
1346
+ complete; see
1347
+ .BR signal(7).
1348
+ Can happen while waiting for events with
1349
+ .B IORING_ENTER_GETEVENTS.
1350
+
1351
+ .SH CQE ERRORS
1352
+ These io_uring-specific errors are returned as a negative value in the
1353
+ .I res
1354
+ field of the completion queue entry.
1355
+ .TP
1356
+ .B EACCES
1357
+ The
1358
+ .I flags
1359
+ field or
1360
+ .I opcode
1361
+ in a submission queue entry is not allowed due to registered restrictions.
1362
+ See
1363
+ .BR io_uring_register (2)
1364
+ for details on how restrictions work.
1365
+ .TP
1366
+ .B EBADF
1367
+ The
1368
+ .I fd
1369
+ field in the submission queue entry is invalid, or the
1370
+ .B IOSQE_FIXED_FILE
1371
+ flag was set in the submission queue entry, but no files were registered
1372
+ with the io_uring instance.
1373
+ .TP
1374
+ .B EFAULT
1375
+ buffer is outside of the process' accessible address space
1376
+ .TP
1377
+ .B EFAULT
1378
+ .B IORING_OP_READ_FIXED
1379
+ or
1380
+ .B IORING_OP_WRITE_FIXED
1381
+ was specified in the
1382
+ .I opcode
1383
+ field of the submission queue entry, but either buffers were not
1384
+ registered for this io_uring instance, or the address range described
1385
+ by
1386
+ .I addr
1387
+ and
1388
+ .I len
1389
+ does not fit within the buffer registered at
1390
+ .IR buf_index .
1391
+ .TP
1392
+ .B EINVAL
1393
+ The
1394
+ .I flags
1395
+ field or
1396
+ .I opcode
1397
+ in a submission queue entry is invalid.
1398
+ .TP
1399
+ .B EINVAL
1400
+ The
1401
+ .I buf_index
1402
+ member of the submission queue entry is invalid.
1403
+ .TP
1404
+ .B EINVAL
1405
+ The
1406
+ .I personality
1407
+ field in a submission queue entry is invalid.
1408
+ .TP
1409
+ .B EINVAL
1410
+ .B IORING_OP_NOP
1411
+ was specified in the submission queue entry, but the io_uring context
1412
+ was set up for polling
1413
+ .RB ( IORING_SETUP_IOPOLL
1414
+ was specified in the call to io_uring_setup).
1415
+ .TP
1416
+ .B EINVAL
1417
+ .B IORING_OP_READV
1418
+ or
1419
+ .B IORING_OP_WRITEV
1420
+ was specified in the submission queue entry, but the io_uring instance
1421
+ has fixed buffers registered.
1422
+ .TP
1423
+ .B EINVAL
1424
+ .B IORING_OP_READ_FIXED
1425
+ or
1426
+ .B IORING_OP_WRITE_FIXED
1427
+ was specified in the submission queue entry, and the
1428
+ .I buf_index
1429
+ is invalid.
1430
+ .TP
1431
+ .B EINVAL
1432
+ .BR IORING_OP_READV ,
1433
+ .BR IORING_OP_WRITEV ,
1434
+ .BR IORING_OP_READ_FIXED ,
1435
+ .B IORING_OP_WRITE_FIXED
1436
+ or
1437
+ .B IORING_OP_FSYNC
1438
+ was specified in the submission queue entry, but the io_uring instance
1439
+ was configured for IOPOLLing, or any of
1440
+ .IR addr ,
1441
+ .IR ioprio ,
1442
+ .IR off ,
1443
+ .IR len ,
1444
+ or
1445
+ .I buf_index
1446
+ was set in the submission queue entry.
1447
+ .TP
1448
+ .B EINVAL
1449
+ .B IORING_OP_POLL_ADD
1450
+ or
1451
+ .B IORING_OP_POLL_REMOVE
1452
+ was specified in the
1453
+ .I opcode
1454
+ field of the submission queue entry, but the io_uring instance was
1455
+ configured for busy-wait polling
1456
+ .RB ( IORING_SETUP_IOPOLL ),
1457
+ or any of
1458
+ .IR ioprio ,
1459
+ .IR off ,
1460
+ .IR len ,
1461
+ or
1462
+ .I buf_index
1463
+ was non-zero in the submission queue entry.
1464
+ .TP
1465
+ .B EINVAL
1466
+ .B IORING_OP_POLL_ADD
1467
+ was specified in the
1468
+ .I opcode
1469
+ field of the submission queue entry, and the
1470
+ .I addr
1471
+ field was non-zero.
1472
+ .TP
1473
+ .B EOPNOTSUPP
1474
+ .I opcode
1475
+ is valid, but not supported by this kernel.
1476
+ .TP
1477
+ .B EOPNOTSUPP
1478
+ .B IOSQE_BUFFER_SELECT
1479
+ was set in the
1480
+ .I flags
1481
+ field of the submission queue entry, but the
1482
+ .I opcode
1483
+ doesn't support buffer selection.