polyphony 0.85 → 0.86

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (230) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/Gemfile.lock +1 -1
  4. data/ext/polyphony/io_extensions.c +2 -3
  5. data/lib/polyphony/version.rb +1 -1
  6. data/polyphony.gemspec +1 -1
  7. data/test/test_backend.rb +1 -1
  8. data/test/test_signal.rb +3 -3
  9. data/vendor/liburing/.github/pull_request_template.md +86 -0
  10. data/vendor/liburing/.github/workflows/build.yml +85 -0
  11. data/vendor/liburing/.github/workflows/shellcheck.yml +20 -0
  12. data/vendor/liburing/.gitignore +149 -0
  13. data/vendor/liburing/COPYING +502 -0
  14. data/vendor/liburing/COPYING.GPL +339 -0
  15. data/vendor/liburing/LICENSE +7 -0
  16. data/vendor/liburing/Makefile +82 -0
  17. data/vendor/liburing/Makefile.common +5 -0
  18. data/vendor/liburing/Makefile.quiet +11 -0
  19. data/vendor/liburing/README +46 -0
  20. data/vendor/liburing/configure +486 -0
  21. data/vendor/liburing/debian/README.Debian +7 -0
  22. data/vendor/liburing/debian/changelog +27 -0
  23. data/vendor/liburing/debian/compat +1 -0
  24. data/vendor/liburing/debian/control +48 -0
  25. data/vendor/liburing/debian/copyright +49 -0
  26. data/vendor/liburing/debian/liburing-dev.install +4 -0
  27. data/vendor/liburing/debian/liburing-dev.manpages +6 -0
  28. data/vendor/liburing/debian/liburing1-udeb.install +1 -0
  29. data/vendor/liburing/debian/liburing1.install +1 -0
  30. data/vendor/liburing/debian/liburing1.symbols +32 -0
  31. data/vendor/liburing/debian/patches/series +1 -0
  32. data/vendor/liburing/debian/rules +81 -0
  33. data/vendor/liburing/debian/source/format +1 -0
  34. data/vendor/liburing/debian/source/local-options +2 -0
  35. data/vendor/liburing/debian/source/options +1 -0
  36. data/vendor/liburing/debian/watch +3 -0
  37. data/vendor/liburing/examples/Makefile +38 -0
  38. data/vendor/liburing/examples/io_uring-cp.c +282 -0
  39. data/vendor/liburing/examples/io_uring-test.c +112 -0
  40. data/vendor/liburing/examples/link-cp.c +193 -0
  41. data/vendor/liburing/examples/ucontext-cp.c +273 -0
  42. data/vendor/liburing/liburing.pc.in +12 -0
  43. data/vendor/liburing/liburing.spec +66 -0
  44. data/vendor/liburing/make-debs.sh +53 -0
  45. data/vendor/liburing/man/io_uring.7 +754 -0
  46. data/vendor/liburing/man/io_uring_cq_advance.3 +35 -0
  47. data/vendor/liburing/man/io_uring_cq_ready.3 +25 -0
  48. data/vendor/liburing/man/io_uring_cqe_get_data.3 +34 -0
  49. data/vendor/liburing/man/io_uring_cqe_seen.3 +32 -0
  50. data/vendor/liburing/man/io_uring_enter.2 +1483 -0
  51. data/vendor/liburing/man/io_uring_free_probe.3 +24 -0
  52. data/vendor/liburing/man/io_uring_get_probe.3 +29 -0
  53. data/vendor/liburing/man/io_uring_get_sqe.3 +38 -0
  54. data/vendor/liburing/man/io_uring_opcode_supported.3 +29 -0
  55. data/vendor/liburing/man/io_uring_prep_msg_ring.3 +58 -0
  56. data/vendor/liburing/man/io_uring_prep_read.3 +50 -0
  57. data/vendor/liburing/man/io_uring_prep_read_fixed.3 +54 -0
  58. data/vendor/liburing/man/io_uring_prep_readv.3 +51 -0
  59. data/vendor/liburing/man/io_uring_prep_readv2.3 +79 -0
  60. data/vendor/liburing/man/io_uring_prep_write.3 +50 -0
  61. data/vendor/liburing/man/io_uring_prep_write_fixed.3 +54 -0
  62. data/vendor/liburing/man/io_uring_prep_writev.3 +51 -0
  63. data/vendor/liburing/man/io_uring_prep_writev2.3 +78 -0
  64. data/vendor/liburing/man/io_uring_queue_exit.3 +27 -0
  65. data/vendor/liburing/man/io_uring_queue_init.3 +44 -0
  66. data/vendor/liburing/man/io_uring_register.2 +688 -0
  67. data/vendor/liburing/man/io_uring_register_buffers.3 +41 -0
  68. data/vendor/liburing/man/io_uring_register_files.3 +35 -0
  69. data/vendor/liburing/man/io_uring_setup.2 +534 -0
  70. data/vendor/liburing/man/io_uring_sq_ready.3 +25 -0
  71. data/vendor/liburing/man/io_uring_sq_space_left.3 +25 -0
  72. data/vendor/liburing/man/io_uring_sqe_set_data.3 +30 -0
  73. data/vendor/liburing/man/io_uring_sqe_set_flags.3 +60 -0
  74. data/vendor/liburing/man/io_uring_sqring_wait.3 +30 -0
  75. data/vendor/liburing/man/io_uring_submit.3 +29 -0
  76. data/vendor/liburing/man/io_uring_submit_and_wait.3 +34 -0
  77. data/vendor/liburing/man/io_uring_submit_and_wait_timeout.3 +49 -0
  78. data/vendor/liburing/man/io_uring_unregister_buffers.3 +26 -0
  79. data/vendor/liburing/man/io_uring_unregister_files.3 +26 -0
  80. data/vendor/liburing/man/io_uring_wait_cqe.3 +33 -0
  81. data/vendor/liburing/man/io_uring_wait_cqe_nr.3 +36 -0
  82. data/vendor/liburing/man/io_uring_wait_cqe_timeout.3 +39 -0
  83. data/vendor/liburing/man/io_uring_wait_cqes.3 +46 -0
  84. data/vendor/liburing/src/Makefile +89 -0
  85. data/vendor/liburing/src/arch/aarch64/syscall.h +95 -0
  86. data/vendor/liburing/src/arch/generic/lib.h +21 -0
  87. data/vendor/liburing/src/arch/generic/syscall.h +87 -0
  88. data/vendor/liburing/src/arch/syscall-defs.h +67 -0
  89. data/vendor/liburing/src/arch/x86/lib.h +32 -0
  90. data/vendor/liburing/src/arch/x86/syscall.h +160 -0
  91. data/vendor/liburing/src/include/liburing/barrier.h +81 -0
  92. data/vendor/liburing/src/include/liburing/io_uring.h +442 -0
  93. data/vendor/liburing/src/include/liburing.h +921 -0
  94. data/vendor/liburing/src/int_flags.h +8 -0
  95. data/vendor/liburing/src/lib.h +57 -0
  96. data/vendor/liburing/src/liburing.map +53 -0
  97. data/vendor/liburing/src/nolibc.c +48 -0
  98. data/vendor/liburing/src/queue.c +403 -0
  99. data/vendor/liburing/src/register.c +293 -0
  100. data/vendor/liburing/src/setup.c +332 -0
  101. data/vendor/liburing/src/syscall.c +47 -0
  102. data/vendor/liburing/src/syscall.h +103 -0
  103. data/vendor/liburing/test/232c93d07b74-test.c +306 -0
  104. data/vendor/liburing/test/35fa71a030ca-test.c +329 -0
  105. data/vendor/liburing/test/500f9fbadef8-test.c +89 -0
  106. data/vendor/liburing/test/7ad0e4b2f83c-test.c +93 -0
  107. data/vendor/liburing/test/8a9973408177-test.c +106 -0
  108. data/vendor/liburing/test/917257daa0fe-test.c +53 -0
  109. data/vendor/liburing/test/Makefile +244 -0
  110. data/vendor/liburing/test/a0908ae19763-test.c +58 -0
  111. data/vendor/liburing/test/a4c0b3decb33-test.c +180 -0
  112. data/vendor/liburing/test/accept-link.c +254 -0
  113. data/vendor/liburing/test/accept-reuse.c +164 -0
  114. data/vendor/liburing/test/accept-test.c +79 -0
  115. data/vendor/liburing/test/accept.c +477 -0
  116. data/vendor/liburing/test/across-fork.c +283 -0
  117. data/vendor/liburing/test/b19062a56726-test.c +53 -0
  118. data/vendor/liburing/test/b5837bd5311d-test.c +77 -0
  119. data/vendor/liburing/test/ce593a6c480a-test.c +136 -0
  120. data/vendor/liburing/test/close-opath.c +122 -0
  121. data/vendor/liburing/test/config +10 -0
  122. data/vendor/liburing/test/connect.c +398 -0
  123. data/vendor/liburing/test/cq-full.c +96 -0
  124. data/vendor/liburing/test/cq-overflow.c +294 -0
  125. data/vendor/liburing/test/cq-peek-batch.c +102 -0
  126. data/vendor/liburing/test/cq-ready.c +94 -0
  127. data/vendor/liburing/test/cq-size.c +64 -0
  128. data/vendor/liburing/test/d4ae271dfaae-test.c +96 -0
  129. data/vendor/liburing/test/d77a67ed5f27-test.c +65 -0
  130. data/vendor/liburing/test/defer.c +307 -0
  131. data/vendor/liburing/test/double-poll-crash.c +185 -0
  132. data/vendor/liburing/test/drop-submit.c +92 -0
  133. data/vendor/liburing/test/eeed8b54e0df-test.c +114 -0
  134. data/vendor/liburing/test/empty-eownerdead.c +45 -0
  135. data/vendor/liburing/test/eventfd-disable.c +151 -0
  136. data/vendor/liburing/test/eventfd-reg.c +76 -0
  137. data/vendor/liburing/test/eventfd-ring.c +97 -0
  138. data/vendor/liburing/test/eventfd.c +112 -0
  139. data/vendor/liburing/test/exec-target.c +6 -0
  140. data/vendor/liburing/test/exit-no-cleanup.c +117 -0
  141. data/vendor/liburing/test/fadvise.c +202 -0
  142. data/vendor/liburing/test/fallocate.c +249 -0
  143. data/vendor/liburing/test/fc2a85cb02ef-test.c +131 -0
  144. data/vendor/liburing/test/file-register.c +858 -0
  145. data/vendor/liburing/test/file-update.c +173 -0
  146. data/vendor/liburing/test/file-verify.c +629 -0
  147. data/vendor/liburing/test/files-exit-hang-poll.c +128 -0
  148. data/vendor/liburing/test/files-exit-hang-timeout.c +134 -0
  149. data/vendor/liburing/test/fixed-link.c +90 -0
  150. data/vendor/liburing/test/fpos.c +252 -0
  151. data/vendor/liburing/test/fsync.c +224 -0
  152. data/vendor/liburing/test/hardlink.c +136 -0
  153. data/vendor/liburing/test/helpers.c +135 -0
  154. data/vendor/liburing/test/helpers.h +67 -0
  155. data/vendor/liburing/test/io-cancel.c +550 -0
  156. data/vendor/liburing/test/io_uring_enter.c +296 -0
  157. data/vendor/liburing/test/io_uring_register.c +676 -0
  158. data/vendor/liburing/test/io_uring_setup.c +192 -0
  159. data/vendor/liburing/test/iopoll.c +372 -0
  160. data/vendor/liburing/test/lfs-openat-write.c +119 -0
  161. data/vendor/liburing/test/lfs-openat.c +275 -0
  162. data/vendor/liburing/test/link-timeout.c +1107 -0
  163. data/vendor/liburing/test/link.c +496 -0
  164. data/vendor/liburing/test/link_drain.c +229 -0
  165. data/vendor/liburing/test/madvise.c +195 -0
  166. data/vendor/liburing/test/mkdir.c +108 -0
  167. data/vendor/liburing/test/msg-ring.c +234 -0
  168. data/vendor/liburing/test/multicqes_drain.c +387 -0
  169. data/vendor/liburing/test/nop-all-sizes.c +99 -0
  170. data/vendor/liburing/test/nop.c +115 -0
  171. data/vendor/liburing/test/open-close.c +261 -0
  172. data/vendor/liburing/test/openat2.c +308 -0
  173. data/vendor/liburing/test/personality.c +204 -0
  174. data/vendor/liburing/test/pipe-eof.c +83 -0
  175. data/vendor/liburing/test/pipe-reuse.c +105 -0
  176. data/vendor/liburing/test/poll-cancel-ton.c +135 -0
  177. data/vendor/liburing/test/poll-cancel.c +228 -0
  178. data/vendor/liburing/test/poll-link.c +230 -0
  179. data/vendor/liburing/test/poll-many.c +208 -0
  180. data/vendor/liburing/test/poll-mshot-update.c +273 -0
  181. data/vendor/liburing/test/poll-ring.c +48 -0
  182. data/vendor/liburing/test/poll-v-poll.c +353 -0
  183. data/vendor/liburing/test/poll.c +109 -0
  184. data/vendor/liburing/test/pollfree.c +426 -0
  185. data/vendor/liburing/test/probe.c +135 -0
  186. data/vendor/liburing/test/read-write.c +876 -0
  187. data/vendor/liburing/test/register-restrictions.c +633 -0
  188. data/vendor/liburing/test/rename.c +135 -0
  189. data/vendor/liburing/test/ring-leak.c +173 -0
  190. data/vendor/liburing/test/ring-leak2.c +249 -0
  191. data/vendor/liburing/test/rsrc_tags.c +449 -0
  192. data/vendor/liburing/test/runtests-loop.sh +16 -0
  193. data/vendor/liburing/test/runtests.sh +170 -0
  194. data/vendor/liburing/test/rw_merge_test.c +97 -0
  195. data/vendor/liburing/test/self.c +91 -0
  196. data/vendor/liburing/test/send_recv.c +286 -0
  197. data/vendor/liburing/test/send_recvmsg.c +345 -0
  198. data/vendor/liburing/test/sendmsg_fs_cve.c +200 -0
  199. data/vendor/liburing/test/shared-wq.c +84 -0
  200. data/vendor/liburing/test/short-read.c +75 -0
  201. data/vendor/liburing/test/shutdown.c +165 -0
  202. data/vendor/liburing/test/sigfd-deadlock.c +74 -0
  203. data/vendor/liburing/test/skip-cqe.c +429 -0
  204. data/vendor/liburing/test/socket-rw-eagain.c +158 -0
  205. data/vendor/liburing/test/socket-rw-offset.c +157 -0
  206. data/vendor/liburing/test/socket-rw.c +145 -0
  207. data/vendor/liburing/test/splice.c +512 -0
  208. data/vendor/liburing/test/sq-full-cpp.cc +45 -0
  209. data/vendor/liburing/test/sq-full.c +45 -0
  210. data/vendor/liburing/test/sq-poll-dup.c +204 -0
  211. data/vendor/liburing/test/sq-poll-kthread.c +169 -0
  212. data/vendor/liburing/test/sq-poll-share.c +137 -0
  213. data/vendor/liburing/test/sq-space_left.c +159 -0
  214. data/vendor/liburing/test/sqpoll-cancel-hang.c +157 -0
  215. data/vendor/liburing/test/sqpoll-disable-exit.c +196 -0
  216. data/vendor/liburing/test/sqpoll-exit-hang.c +78 -0
  217. data/vendor/liburing/test/sqpoll-sleep.c +69 -0
  218. data/vendor/liburing/test/statx.c +172 -0
  219. data/vendor/liburing/test/stdout.c +232 -0
  220. data/vendor/liburing/test/submit-link-fail.c +154 -0
  221. data/vendor/liburing/test/submit-reuse.c +239 -0
  222. data/vendor/liburing/test/symlink.c +116 -0
  223. data/vendor/liburing/test/teardowns.c +58 -0
  224. data/vendor/liburing/test/thread-exit.c +143 -0
  225. data/vendor/liburing/test/timeout-new.c +252 -0
  226. data/vendor/liburing/test/timeout-overflow.c +204 -0
  227. data/vendor/liburing/test/timeout.c +1523 -0
  228. data/vendor/liburing/test/unlink.c +112 -0
  229. data/vendor/liburing/test/wakeup-hang.c +162 -0
  230. metadata +223 -2
@@ -0,0 +1,1483 @@
1
+ .\" Copyright (C) 2019 Jens Axboe <axboe@kernel.dk>
2
+ .\" Copyright (C) 2019 Red Hat, Inc.
3
+ .\"
4
+ .\" SPDX-License-Identifier: LGPL-2.0-or-later
5
+ .\"
6
+ .TH IO_URING_ENTER 2 2019-01-22 "Linux" "Linux Programmer's Manual"
7
+ .SH NAME
8
+ io_uring_enter \- initiate and/or complete asynchronous I/O
9
+ .SH SYNOPSIS
10
+ .nf
11
+ .BR "#include <linux/io_uring.h>"
12
+ .PP
13
+ .BI "int io_uring_enter(unsigned int " fd ", unsigned int " to_submit ,
14
+ .BI " unsigned int " min_complete ", unsigned int " flags ,
15
+ .BI " sigset_t *" sig );
16
+ .fi
17
+ .PP
18
+ .SH DESCRIPTION
19
+ .PP
20
+ .BR io_uring_enter ()
21
+ is used to initiate and complete I/O using the shared submission and
22
+ completion queues set up by a call to
23
+ .BR io_uring_setup (2).
24
+ A single call can both submit new I/O and wait for completions of I/O
25
+ initiated by this call or previous calls to
26
+ .BR io_uring_enter ().
27
+
28
+ .I fd
29
+ is the file descriptor returned by
30
+ .BR io_uring_setup (2).
31
+ .I to_submit
32
+ specifies the number of I/Os to submit from the submission queue.
33
+ .I flags
34
+ is a bitmask of the following values:
35
+ .TP
36
+ .B IORING_ENTER_GETEVENTS
37
+ If this flag is set, then the system call will wait for the specified
38
+ number of events in
39
+ .I min_complete
40
+ before returning. This flag can be set along with
41
+ .I to_submit
42
+ to both submit and complete events in a single system call.
43
+ .TP
44
+ .B IORING_ENTER_SQ_WAKEUP
45
+ If the ring has been created with
46
+ .B IORING_SETUP_SQPOLL,
47
+ then this flag asks the kernel to wake up the SQ kernel thread to submit IO.
48
+ .TP
49
+ .B IORING_ENTER_SQ_WAIT
50
+ If the ring has been created with
51
+ .B IORING_SETUP_SQPOLL,
52
+ then the application has no real insight into when the SQ kernel thread has
53
+ consumed entries from the SQ ring. This can lead to a situation where the
54
+ application can no longer get a free SQE entry to submit, without knowing
55
+ when one becomes available as the SQ kernel thread consumes them. If
56
+ the system call is used with this flag set, then it will wait until at least
57
+ one entry is free in the SQ ring.
58
+ .TP
59
+ .B IORING_ENTER_EXT_ARG
60
+ Since kernel 5.11, the system call's arguments have been modified to look like
61
+ the following:
62
+
63
+ .nf
64
+ .BI "int io_uring_enter(unsigned int " fd ", unsigned int " to_submit ,
65
+ .BI " unsigned int " min_complete ", unsigned int " flags ,
66
+ .BI " const void *" arg ", size_t " argsz );
67
+ .fi
68
+
69
+ which behaves just like the original definition by default. However, if
70
+ .B IORING_ENTER_EXT_ARG
71
+ is set, then instead of a
72
+ .I sigset_t
73
+ being passed in, a pointer to a
74
+ .I struct io_uring_getevents_arg
75
+ is used instead and
76
+ .I argsz
77
+ must be set to the size of this structure. The definition is as follows:
78
+
79
+ .nf
80
+ .BI "struct io_uring_getevents_arg {
81
+ .BI " __u64 sigmask;
82
+ .BI " __u32 sigmask_sz;
83
+ .BI " __u32 pad;
84
+ .BI " __u64 ts;
85
+ .BI "};
86
+ .fi
87
+
88
+ which allows passing in both a signal mask as well as a pointer to a
89
+ .I struct __kernel_timespec
90
+ timeout value. If
91
+ .I ts
92
+ is set to a valid pointer, then this time value indicates the timeout for
93
+ waiting on events. If an application is waiting on events and wishes to
94
+ stop waiting after a specified amount of time, then this can be accomplished
95
+ directly in version 5.11 and newer by using this feature.
96
+
97
+ .PP
98
+ .PP
99
+ If the io_uring instance was configured for polling, by specifying
100
+ .B IORING_SETUP_IOPOLL
101
+ in the call to
102
+ .BR io_uring_setup (2),
103
+ then min_complete has a slightly different meaning. Passing a value
104
+ of 0 instructs the kernel to return any events which are already complete,
105
+ without blocking. If
106
+ .I min_complete
107
+ is a non-zero value, the kernel will still return immediately if any
108
+ completion events are available. If no event completions are
109
+ available, then the call will poll either until one or more
110
+ completions become available, or until the process has exceeded its
111
+ scheduler time slice.
112
+
113
+ Note that, for interrupt driven I/O (where
114
+ .B IORING_SETUP_IOPOLL
115
+ was not specified in the call to
116
+ .BR io_uring_setup (2)),
117
+ an application may check the completion queue for event completions
118
+ without entering the kernel at all.
119
+ .PP
120
+ When the system call returns that a certain amount of SQEs have been
121
+ consumed and submitted, it's safe to reuse SQE entries in the ring. This is
122
+ true even if the actual IO submission had to be punted to async context,
123
+ which means that the SQE may in fact not have been submitted yet. If the
124
+ kernel requires later use of a particular SQE entry, it will have made a
125
+ private copy of it.
126
+
127
+ .I sig
128
+ is a pointer to a signal mask (see
129
+ .BR sigprocmask (2));
130
+ if
131
+ .I sig
132
+ is not NULL,
133
+ .BR io_uring_enter ()
134
+ first replaces the current signal mask by the one pointed to by
135
+ .IR sig ,
136
+ then waits for events to become available in the completion queue, and
137
+ then restores the original signal mask. The following
138
+ .BR io_uring_enter ()
139
+ call:
140
+ .PP
141
+ .in +4n
142
+ .EX
143
+ ret = io_uring_enter(fd, 0, 1, IORING_ENTER_GETEVENTS, &sig);
144
+ .EE
145
+ .in
146
+ .PP
147
+ is equivalent to
148
+ .I atomically
149
+ executing the following calls:
150
+ .PP
151
+ .in +4n
152
+ .EX
153
+ pthread_sigmask(SIG_SETMASK, &sig, &orig);
154
+ ret = io_uring_enter(fd, 0, 1, IORING_ENTER_GETEVENTS, NULL);
155
+ pthread_sigmask(SIG_SETMASK, &orig, NULL);
156
+ .EE
157
+ .in
158
+ .PP
159
+ See the description of
160
+ .BR pselect (2)
161
+ for an explanation of why the
162
+ .I sig
163
+ parameter is necessary.
164
+
165
+ Submission queue entries are represented using the following data
166
+ structure:
167
+ .PP
168
+ .in +4n
169
+ .EX
170
+ /*
171
+ * IO submission data structure (Submission Queue Entry)
172
+ */
173
+ struct io_uring_sqe {
174
+ __u8 opcode; /* type of operation for this sqe */
175
+ __u8 flags; /* IOSQE_ flags */
176
+ __u16 ioprio; /* ioprio for the request */
177
+ __s32 fd; /* file descriptor to do IO on */
178
+ union {
179
+ __u64 off; /* offset into file */
180
+ __u64 addr2;
181
+ };
182
+ union {
183
+ __u64 addr; /* pointer to buffer or iovecs */
184
+ __u64 splice_off_in;
185
+ };
186
+ __u32 len; /* buffer size or number of iovecs */
187
+ union {
188
+ __kernel_rwf_t rw_flags;
189
+ __u32 fsync_flags;
190
+ __u16 poll_events; /* compatibility */
191
+ __u32 poll32_events; /* word-reversed for BE */
192
+ __u32 sync_range_flags;
193
+ __u32 msg_flags;
194
+ __u32 timeout_flags;
195
+ __u32 accept_flags;
196
+ __u32 cancel_flags;
197
+ __u32 open_flags;
198
+ __u32 statx_flags;
199
+ __u32 fadvise_advice;
200
+ __u32 splice_flags;
201
+ __u32 rename_flags;
202
+ __u32 unlink_flags;
203
+ __u32 hardlink_flags;
204
+ };
205
+ __u64 user_data; /* data to be passed back at completion time */
206
+ union {
207
+ struct {
208
+ /* index into fixed buffers, if used */
209
+ union {
210
+ /* index into fixed buffers, if used */
211
+ __u16 buf_index;
212
+ /* for grouped buffer selection */
213
+ __u16 buf_group;
214
+ };
215
+ /* personality to use, if used */
216
+ __u16 personality;
217
+ union {
218
+ __s32 splice_fd_in;
219
+ __u32 file_index;
220
+ };
221
+ };
222
+ __u64 __pad2[3];
223
+ };
224
+ };
225
+ .EE
226
+ .in
227
+ .PP
228
+ The
229
+ .I opcode
230
+ describes the operation to be performed. It can be one of:
231
+ .TP
232
+ .B IORING_OP_NOP
233
+ Do not perform any I/O. This is useful for testing the performance of
234
+ the io_uring implementation itself.
235
+ .TP
236
+ .B IORING_OP_READV
237
+ .TP
238
+ .B IORING_OP_WRITEV
239
+ Vectored read and write operations, similar to
240
+ .BR preadv2 (2)
241
+ and
242
+ .BR pwritev2 (2).
243
+ If the file is not seekable,
244
+ .I off
245
+ must be set to zero.
246
+
247
+ .TP
248
+ .B IORING_OP_READ_FIXED
249
+ .TP
250
+ .B IORING_OP_WRITE_FIXED
251
+ Read from or write to pre-mapped buffers. See
252
+ .BR io_uring_register (2)
253
+ for details on how to setup a context for fixed reads and writes.
254
+
255
+ .TP
256
+ .B IORING_OP_FSYNC
257
+ File sync. See also
258
+ .BR fsync (2).
259
+ Note that, while I/O is initiated in the order in which it appears in
260
+ the submission queue, completions are unordered. For example, an
261
+ application which places a write I/O followed by an fsync in the
262
+ submission queue cannot expect the fsync to apply to the write. The
263
+ two operations execute in parallel, so the fsync may complete before
264
+ the write is issued to the storage. The same is also true for
265
+ previously issued writes that have not completed prior to the fsync.
266
+
267
+ .TP
268
+ .B IORING_OP_POLL_ADD
269
+ Poll the
270
+ .I fd
271
+ specified in the submission queue entry for the events
272
+ specified in the
273
+ .I poll_events
274
+ field. Unlike poll or epoll without
275
+ .BR EPOLLONESHOT ,
276
+ by default this interface always works in one shot mode. That is, once the poll
277
+ operation is completed, it will have to be resubmitted.
278
+
279
+ If
280
+ .B IORING_POLL_ADD_MULTI
281
+ is set in the SQE
282
+ .I len
283
+ field, then the poll will work in multi shot mode instead. That means it'll
284
+ repeatedly trigger when the requested event becomes true, and hence multiple
285
+ CQEs can be generated from this single SQE. The CQE
286
+ .I flags
287
+ field will have
288
+ .B IORING_CQE_F_MORE
289
+ set on completion if the application should expect further CQE entries from
290
+ the original request. If this flag isn't set on completion, then the poll
291
+ request has been terminated and no further events will be generated. This mode
292
+ is available since 5.13.
293
+
294
+ If
295
+ .B IORING_POLL_UPDATE_EVENTS
296
+ is set in the SQE
297
+ .I len
298
+ field, then the request will update an existing poll request with the mask of
299
+ events passed in with this request. The lookup is based on the
300
+ .I user_data
301
+ field of the original SQE submitted, and this value is passed in the
302
+ .I addr
303
+ field of the SQE. This mode is available since 5.13.
304
+
305
+ If
306
+ .B IORING_POLL_UPDATE_USER_DATA
307
+ is set in the SQE
308
+ .I len
309
+ field, then the request will update the
310
+ .I user_data
311
+ of an existing poll request based on the value passed in the
312
+ .I off
313
+ field. This mode is available since 5.13.
314
+
315
+ This command works like
316
+ an async
317
+ .BR poll (2)
318
+ and the completion event result is the returned mask of events. For the
319
+ variants that update
320
+ .I user_data
321
+ or
322
+ .IR events ,
323
+ the completion result will be similar to
324
+ .B IORING_OP_POLL_REMOVE.
325
+
326
+ .TP
327
+ .B IORING_OP_POLL_REMOVE
328
+ Remove an existing poll request. If found, the
329
+ .I res
330
+ field of the
331
+ .I "struct io_uring_cqe"
332
+ will contain 0. If not found,
333
+ .I res
334
+ will contain
335
+ .B -ENOENT,
336
+ or
337
+ .B -EALREADY
338
+ if the poll request was in the process of completing already.
339
+
340
+ .TP
341
+ .B IORING_OP_EPOLL_CTL
342
+ Add, remove or modify entries in the interest list of
343
+ .BR epoll (7).
344
+ See
345
+ .BR epoll_ctl (2)
346
+ for details of the system call.
347
+ .I fd
348
+ holds the file descriptor that represents the epoll instance,
349
+ .I addr
350
+ holds the file descriptor to add, remove or modify,
351
+ .I len
352
+ holds the operation (EPOLL_CTL_ADD, EPOLL_CTL_DEL, EPOLL_CTL_MOD) to perform and,
353
+ .I off
354
+ holds a pointer to the
355
+ .I epoll_events
356
+ structure. Available since 5.6.
357
+
358
+ .TP
359
+ .B IORING_OP_SYNC_FILE_RANGE
360
+ Issue the equivalent of a \fBsync_file_range\fR (2) on the file descriptor. The
361
+ .I fd
362
+ field is the file descriptor to sync, the
363
+ .I off
364
+ field holds the offset in bytes, the
365
+ .I len
366
+ field holds the length in bytes, and the
367
+ .I sync_range_flags
368
+ field holds the flags for the command. See also
369
+ .BR sync_file_range (2)
370
+ for the general description of the related system call. Available since 5.2.
371
+
372
+ .TP
373
+ .B IORING_OP_SENDMSG
374
+ Issue the equivalent of a
375
+ .BR sendmsg (2)
376
+ system call.
377
+ .I fd
378
+ must be set to the socket file descriptor,
379
+ .I addr
380
+ must contain a pointer to the msghdr structure, and
381
+ .I msg_flags
382
+ holds the flags associated with the system call. See also
383
+ .BR sendmsg (2)
384
+ for the general description of the related system call. Available since 5.3.
385
+
386
+ .TP
387
+ .B IORING_OP_RECVMSG
388
+ Works just like IORING_OP_SENDMSG, except for
389
+ .BR recvmsg (2)
390
+ instead. See the description of IORING_OP_SENDMSG. Available since 5.3.
391
+
392
+ .TP
393
+ .B IORING_OP_SEND
394
+ Issue the equivalent of a
395
+ .BR send (2)
396
+ system call.
397
+ .I fd
398
+ must be set to the socket file descriptor,
399
+ .I addr
400
+ must contain a pointer to the buffer,
401
+ .I len
402
+ denotes the length of the buffer to send, and
403
+ .I msg_flags
404
+ holds the flags associated with the system call. See also
405
+ .BR send (2)
406
+ for the general description of the related system call. Available since 5.6.
407
+
408
+ .TP
409
+ .B IORING_OP_RECV
410
+ Works just like IORING_OP_SEND, except for
411
+ .BR recv (2)
412
+ instead. See the description of IORING_OP_SEND. Available since 5.6.
413
+
414
+ .TP
415
+ .B IORING_OP_TIMEOUT
416
+ This command will register a timeout operation. The
417
+ .I addr
418
+ field must contain a pointer to a struct timespec64 structure,
419
+ .I len
420
+ must contain 1 to signify one timespec64 structure,
421
+ .I timeout_flags
422
+ may contain IORING_TIMEOUT_ABS
423
+ for an absolute timeout value, or 0 for a relative timeout.
424
+ .I off
425
+ may contain a completion event count. A timeout
426
+ will trigger a wakeup event on the completion ring for anyone waiting for
427
+ events. A timeout condition is met when either the specified timeout expires,
428
+ or the specified number of events have completed. Either condition will
429
+ trigger the event. If set to 0, completed events are not counted, which
430
+ effectively acts like a timer. io_uring timeouts use the
431
+ .B CLOCK_MONOTONIC
432
+ clock source. The request will complete with
433
+ .I -ETIME
434
+ if the timeout got completed through expiration of the timer, or
435
+ .I 0
436
+ if the timeout got completed through requests completing on their own. If
437
+ the timeout was cancelled before it expired, the request will complete with
438
+ .I -ECANCELED.
439
+ Available since 5.4.
440
+
441
+ Since 5.15, this command also supports the following modifiers in
442
+ .I timeout_flags:
443
+
444
+ .PP
445
+ .in +12
446
+ .B IORING_TIMEOUT_BOOTTIME
447
+ If set, then the clocksource used is
448
+ .I CLOCK_BOOTTIME
449
+ instead of
450
+ .I CLOCK_MONOTONIC.
451
+ This clocksource differs in that it includes time elapsed if the system was
452
+ suspended while having a timeout request in-flight.
453
+
454
+ .B IORING_TIMEOUT_REALTIME
455
+ If set, then the clocksource used is
456
+ .I CLOCK_REALTIME
457
+ instead of
458
+ .I CLOCK_MONOTONIC.
459
+ .EE
460
+ .in
461
+ .PP
462
+
463
+ .TP
464
+ .B IORING_OP_TIMEOUT_REMOVE
465
+ If
466
+ \fItimeout_flags\fR are zero, then it attempts to remove an existing timeout
467
+ operation.
468
+ .I addr
469
+ must contain the
470
+ .I user_data
471
+ field of the previously issued timeout operation. If the specified timeout
472
+ request is found and cancelled successfully, this request will terminate
473
+ with a result value of
474
+ .IR 0 .
475
+ If the timeout request was found but expiration was already in progress,
476
+ this request will terminate with a result value of
477
+ .IR -EBUSY .
478
+ If the timeout request wasn't found, the request will terminate with a result
479
+ value of
480
+ .IR -ENOENT .
481
+ Available since 5.5.
482
+
483
+ If
484
+ .I timeout_flags
485
+ contain
486
+ .I IORING_TIMEOUT_UPDATE,
487
+ instead of removing an existing operation, it updates it.
488
+ .I addr
489
+ and return values are same as before.
490
+ .I addr2
491
+ field must contain a pointer to a struct timespec64 structure.
492
+ .I timeout_flags
493
+ may also contain IORING_TIMEOUT_ABS, in which case the value given is an
494
+ absolute one, not a relative one.
495
+ Available since 5.11.
496
+
497
+ .TP
498
+ .B IORING_OP_ACCEPT
499
+ Issue the equivalent of an
500
+ .BR accept4 (2)
501
+ system call.
502
+ .I fd
503
+ must be set to the socket file descriptor,
504
+ .I addr
505
+ must contain the pointer to the sockaddr structure, and
506
+ .I addr2
507
+ must contain a pointer to the socklen_t addrlen field. Flags can be passed using
508
+ the
509
+ .I accept_flags
510
+ field. See also
511
+ .BR accept4 (2)
512
+ for the general description of the related system call. Available since 5.5.
513
+
514
+ If the
515
+ .I file_index
516
+ field is set to a positive number, the file won't be installed into the
517
+ normal file table as usual but will be placed into the fixed file table at index
518
+ .I file_index - 1.
519
+ In this case, instead of returning a file descriptor, the result will contain
520
+ either 0 on success or an error. If the index points to a valid empty slot, the
521
+ installation is guaranteed to not fail. If there is already a file in the slot,
522
+ it will be replaced, similar to
523
+ .B IORING_OP_FILES_UPDATE.
524
+ Please note that only io_uring has access to such files and no other syscall
525
+ can use them. See
526
+ .B IOSQE_FIXED_FILE
527
+ and
528
+ .B IORING_REGISTER_FILES.
529
+
530
+ Available since 5.5.
531
+
532
+ .TP
533
+ .B IORING_OP_ASYNC_CANCEL
534
+ Attempt to cancel an already issued request.
535
+ .I addr
536
+ must contain the
537
+ .I user_data
538
+ field of the request that should be cancelled. The cancellation request will
539
+ complete with one of the following result codes. If found, the
540
+ .I res
541
+ field of the cqe will contain 0. If not found,
542
+ .I res
543
+ will contain -ENOENT. If found and cancellation was attempted, the
544
+ .I res
545
+ field will contain -EALREADY. In this case, the request may or may not
546
+ terminate. In general, requests that are interruptible (like socket IO) will
547
+ get cancelled, while disk IO requests cannot be cancelled if already started.
548
+ Available since 5.5.
549
+
550
+ .TP
551
+ .B IORING_OP_LINK_TIMEOUT
552
+ This request must be linked with another request through
553
+ .I IOSQE_IO_LINK
554
+ which is described below. Unlike
555
+ .I IORING_OP_TIMEOUT,
556
+ .I IORING_OP_LINK_TIMEOUT
557
+ acts on the linked request, not the completion queue. The format of the command
558
+ is otherwise like
559
+ .I IORING_OP_TIMEOUT,
560
+ except there's no completion event count as it's tied to a specific request.
561
+ If used, the timeout specified in the command will cancel the linked command,
562
+ unless the linked command completes before the timeout. The timeout will
563
+ complete with
564
+ .I -ETIME
565
+ if the timer expired and the linked request was attempted cancelled, or
566
+ .I -ECANCELED
567
+ if the timer got cancelled because of completion of the linked request. Like
568
+ .B IORING_OP_TIMEOUT
569
+ the clock source used is
570
+ .B CLOCK_MONOTONIC.
571
+ Available since 5.5.
572
+
573
+
574
+ .TP
575
+ .B IORING_OP_CONNECT
576
+ Issue the equivalent of a
577
+ .BR connect(2)
578
+ system call.
579
+ .I fd
580
+ must be set to the socket file descriptor,
581
+ .I addr
582
+ must contain the const pointer to the sockaddr structure, and
583
+ .I off
584
+ must contain the socklen_t addrlen field. See also
585
+ .BR connect(2)
586
+ for the general description of the related system call. Available since 5.5.
587
+
588
+ .TP
589
+ .B IORING_OP_FALLOCATE
590
+ Issue the equivalent of a
591
+ .BR fallocate(2)
592
+ system call.
593
+ .I fd
594
+ must be set to the file descriptor,
595
+ .I len
596
+ must contain the mode associated with the operation,
597
+ .I off
598
+ must contain the offset on which to operate, and
599
+ .I addr
600
+ must contain the length. See also
601
+ .BR fallocate(2)
602
+ for the general description of the related system call. Available since 5.6.
603
+
604
+ .TP
605
+ .B IORING_OP_FADVISE
606
+ Issue the equivalent of a
607
+ .BR posix_fadvise(2)
608
+ system call.
609
+ .I fd
610
+ must be set to the file descriptor,
611
+ .I off
612
+ must contain the offset on which to operate,
613
+ .I len
614
+ must contain the length, and
615
+ .I fadvise_advice
616
+ must contain the advice associated with the operation. See also
617
+ .BR posix_fadvise(2)
618
+ for the general description of the related system call. Available since 5.6.
619
+
620
+ .TP
621
+ .B IORING_OP_MADVISE
622
+ Issue the equivalent of a
623
+ .BR madvise(2)
624
+ system call.
625
+ .I addr
626
+ must contain the address to operate on,
627
+ .I len
628
+ must contain the length on which to operate,
629
+ and
630
+ .I fadvise_advice
631
+ must contain the advice associated with the operation. See also
632
+ .BR madvise(2)
633
+ for the general description of the related system call. Available since 5.6.
634
+
635
+ .TP
636
+ .B IORING_OP_OPENAT
637
+ Issue the equivalent of a
638
+ .BR openat(2)
639
+ system call.
640
+ .I fd
641
+ is the
642
+ .I dirfd
643
+ argument,
644
+ .I addr
645
+ must contain a pointer to the
646
+ .I *pathname
647
+ argument,
648
+ .I open_flags
649
+ should contain any flags passed in, and
650
+ .I len
651
+ is the access mode of the file. See also
652
+ .BR openat(2)
653
+ for the general description of the related system call. Available since 5.6.
654
+
655
+ If the
656
+ .I file_index
657
+ field is set to a positive number, the file won't be installed into the
658
+ normal file table as usual but will be placed into the fixed file table at index
659
+ .I file_index - 1.
660
+ In this case, instead of returning a file descriptor, the result will contain
661
+ either 0 on success or an error. If the index points to a valid empty slot, the
662
+ installation is guaranteed to not fail. If there is already a file in the slot,
663
+ it will be replaced, similar to
664
+ .B IORING_OP_FILES_UPDATE.
665
+ Please note that only io_uring has access to such files and no other syscall
666
+ can use them. See
667
+ .B IOSQE_FIXED_FILE
668
+ and
669
+ .B IORING_REGISTER_FILES.
670
+
671
+ Available since 5.15.
672
+
673
+ .TP
674
+ .B IORING_OP_OPENAT2
675
+ Issue the equivalent of a
676
+ .BR openat2(2)
677
+ system call.
678
+ .I fd
679
+ is the
680
+ .I dirfd
681
+ argument,
682
+ .I addr
683
+ must contain a pointer to the
684
+ .I *pathname
685
+ argument,
686
+ .I len
687
+ should contain the size of the open_how structure, and
688
+ .I off
689
+ should be set to the address of the open_how structure. See also
690
+ .BR openat2(2)
691
+ for the general description of the related system call. Available since 5.6.
692
+
693
+ If the
694
+ .I file_index
695
+ field is set to a positive number, the file won't be installed into the
696
+ normal file table as usual but will be placed into the fixed file table at index
697
+ .I file_index - 1.
698
+ In this case, instead of returning a file descriptor, the result will contain
699
+ either 0 on success or an error. If the index points to a valid empty slot, the
700
+ installation is guaranteed to not fail. If there is already a file in the slot,
701
+ it will be replaced, similar to
702
+ .B IORING_OP_FILES_UPDATE.
703
+ Please note that only io_uring has access to such files and no other syscall
704
+ can use them. See
705
+ .B IOSQE_FIXED_FILE
706
+ and
707
+ .B IORING_REGISTER_FILES.
708
+
709
+ Available since 5.15.
710
+
711
+ .TP
712
+ .B IORING_OP_CLOSE
713
+ Issue the equivalent of a
714
+ .BR close(2)
715
+ system call.
716
+ .I fd
717
+ is the file descriptor to be closed. See also
718
+ .BR close(2)
719
+ for the general description of the related system call. Available since 5.6.
720
+ If the
721
+ .I file_index
722
+ field is set to a positive number, this command can be used to close files
723
+ that were direct opened through
724
+ .B IORING_OP_OPENAT
725
+ ,
726
+ .B IORING_OP_OPENAT2
727
+ , or
728
+ .B IORING_OP_ACCEPT
729
+ using the io_uring specific direct descriptors. Note that only one of the
730
+ descriptor fields may be set. The direct close feature is available since
731
+ the 5.15 kernel, where direct descriptors were introduced.
732
+
733
+ .TP
734
+ .B IORING_OP_STATX
735
+ Issue the equivalent of a
736
+ .BR statx(2)
737
+ system call.
738
+ .I fd
739
+ is the
740
+ .I dirfd
741
+ argument,
742
+ .I addr
743
+ must contain a pointer to the
744
+ .I *pathname
745
+ string,
746
+ .I statx_flags
747
+ is the
748
+ .I flags
749
+ argument,
750
+ .I len
751
+ should be the
752
+ .I mask
753
+ argument, and
754
+ .I off
755
+ must contain a pointer to the
756
+ .I statxbuf
757
+ to be filled in. See also
758
+ .BR statx(2)
759
+ for the general description of the related system call. Available since 5.6.
760
+
761
+ .TP
762
+ .B IORING_OP_READ
763
+ .TP
764
+ .B IORING_OP_WRITE
765
+ Issue the equivalent of a
766
+ .BR pread(2)
767
+ or
768
+ .BR pwrite(2)
769
+ system call.
770
+ .I fd
771
+ is the file descriptor to be operated on,
772
+ .I addr
773
+ contains the buffer in question,
774
+ .I len
775
+ contains the length of the IO operation, and
776
+ .I off
777
+ contains the read or write offset. If
778
+ .I fd
779
+ does not refer to a seekable file,
780
+ .I off
781
+ must be set to zero. If
782
+ .I off
783
+ is set to
784
+ .B -1
785
+ , the offset will use (and advance) the file position, like the
786
+ .BR read(2)
787
+ and
788
+ .BR write(2)
789
+ system calls. These are non-vectored versions of the
790
+ .B IORING_OP_READV
791
+ and
792
+ .B IORING_OP_WRITEV
793
+ opcodes. See also
794
+ .BR read(2)
795
+ and
796
+ .BR write(2)
797
+ for the general description of the related system call. Available since 5.6.
798
+
799
+ .TP
800
+ .B IORING_OP_SPLICE
801
+ Issue the equivalent of a
802
+ .BR splice(2)
803
+ system call.
804
+ .I splice_fd_in
805
+ is the file descriptor to read from,
806
+ .I splice_off_in
807
+ is an offset to read from,
808
+ .I fd
809
+ is the file descriptor to write to,
810
+ .I off
811
+ is an offset from which to start writing to. A sentinel value of
812
+ .B -1
813
+ is used to pass the equivalent of a NULL for the offsets to
814
+ .BR splice(2).
815
+ .I len
816
+ contains the number of bytes to copy.
817
+ .I splice_flags
818
+ contains a bit mask for the flag field associated with the system call.
819
+ Please note that one of the file descriptors must refer to a pipe.
820
+ See also
821
+ .BR splice(2)
822
+ for the general description of the related system call. Available since 5.7.
823
+
824
+ .TP
825
+ .B IORING_OP_TEE
826
+ Issue the equivalent of a
827
+ .BR tee(2)
828
+ system call.
829
+ .I splice_fd_in
830
+ is the file descriptor to read from,
831
+ .I fd
832
+ is the file descriptor to write to,
833
+ .I len
834
+ contains the number of bytes to copy, and
835
+ .I splice_flags
836
+ contains a bit mask for the flag field associated with the system call.
837
+ Please note that both of the file descriptors must refer to a pipe.
838
+ See also
839
+ .BR tee(2)
840
+ for the general description of the related system call. Available since 5.8.
841
+
842
+ .TP
843
+ .B IORING_OP_FILES_UPDATE
844
+ This command is an alternative to using
845
+ .B IORING_REGISTER_FILES_UPDATE
846
+ which then works in an async fashion, like the rest of the io_uring commands.
847
+ The arguments passed in are the same.
848
+ .I addr
849
+ must contain a pointer to the array of file descriptors,
850
+ .I len
851
+ must contain the length of the array, and
852
+ .I off
853
+ must contain the offset at which to operate. Note that the array of file
854
+ descriptors pointed to in
855
+ .I addr
856
+ must remain valid until this operation has completed. Available since 5.6.
857
+
858
+ .TP
859
+ .B IORING_OP_PROVIDE_BUFFERS
860
+ This command allows an application to register a group of buffers to be used
861
+ by commands that read/receive data. Using buffers in this manner can eliminate
862
+ the need to separate the poll + read, which provides a convenient point in
863
+ time to allocate a buffer for a given request. It's often infeasible to have
864
+ as many buffers available as pending reads or receive. With this feature, the
865
+ application can have its pool of buffers ready in the kernel, and when the
866
+ file or socket is ready to read/receive data, a buffer can be selected for the
867
+ operation.
868
+ .I fd
869
+ must contain the number of buffers to provide,
870
+ .I addr
871
+ must contain the starting address to add buffers from,
872
+ .I len
873
+ must contain the length of each buffer to add from the range,
874
+ .I buf_group
875
+ must contain the group ID of this range of buffers, and
876
+ .I off
877
+ must contain the starting buffer ID of this range of buffers. With that set,
878
+ the kernel adds buffers starting with the memory address in
879
+ .I addr,
880
+ each with a length of
881
+ .I len.
882
+ Hence the application should provide
883
+ .I len * fd
884
+ worth of memory in
885
+ .I addr.
886
+ Buffers are grouped by the group ID, and each buffer within this group will be
887
+ identical in size according to the above arguments. This allows the application
888
+ to provide different groups of buffers, and this is often used to have
889
+ differently sized buffers available depending on what the expectations are of
890
+ the individual request. When submitting a request that should use a provided
891
+ buffer, the
892
+ .B IOSQE_BUFFER_SELECT
893
+ flag must be set, and
894
+ .I buf_group
895
+ must be set to the desired buffer group ID where the buffer should be selected
896
+ from. Available since 5.7.
897
+
898
+ .TP
899
+ .B IORING_OP_REMOVE_BUFFERS
900
+ Remove buffers previously registered with
901
+ .B IORING_OP_PROVIDE_BUFFERS.
902
+ .I fd
903
+ must contain the number of buffers to remove, and
904
+ .I buf_group
905
+ must contain the buffer group ID from which to remove the buffers. Available
906
+ since 5.7.
907
+
908
+ .TP
909
+ .B IORING_OP_SHUTDOWN
910
+ Issue the equivalent of a
911
+ .BR shutdown(2)
912
+ system call.
913
+ .I fd
914
+ is the file descriptor to the socket being shutdown, and
915
+ .I len
916
+ must be set to the
917
+ .I how
918
+ argument. No other fields should be set. Available since 5.11.
919
+
920
+ .TP
921
+ .B IORING_OP_RENAMEAT
922
+ Issue the equivalent of a
923
+ .BR renameat2(2)
924
+ system call.
925
+ .I fd
926
+ should be set to the
927
+ .I olddirfd,
928
+ .I addr
929
+ should be set to the
930
+ .I oldpath,
931
+ .I len
932
+ should be set to the
933
+ .I newdirfd,
934
+ .I addr
935
+ should be set to the
936
+ .I oldpath,
937
+ .I addr2
938
+ should be set to the
939
+ .I newpath,
940
+ and finally
941
+ .I rename_flags
942
+ should be set to the
943
+ .I flags
944
+ passed in to
945
+ .BR renameat2(2).
946
+ Available since 5.11.
947
+
948
+ .TP
949
+ .B IORING_OP_UNLINKAT
950
+ Issue the equivalent of a
951
+ .BR unlinkat(2)
952
+ system call.
953
+ .I fd
954
+ should be set to the
955
+ .I dirfd,
956
+ .I addr
957
+ should be set to the
958
+ .I pathname,
959
+ and
960
+ .I unlink_flags
961
+ should be set to the
962
+ .I flags
963
+ being passed in to
964
+ .BR unlinkat(2).
965
+ Available since 5.11.
966
+
967
+ .TP
968
+ .B IORING_OP_MKDIRAT
969
+ Issue the equivalent of a
970
+ .BR mkdirat(2)
971
+ system call.
972
+ .I fd
973
+ should be set to the
974
+ .I dirfd,
975
+ .I addr
976
+ should be set to the
977
+ .I pathname,
978
+ and
979
+ .I len
980
+ should be set to the
981
+ .I mode
982
+ being passed in to
983
+ .BR mkdirat(2).
984
+ Available since 5.15.
985
+
986
+ .TP
987
+ .B IORING_OP_SYMLINKAT
988
+ Issue the equivalent of a
989
+ .BR symlinkat(2)
990
+ system call.
991
+ .I fd
992
+ should be set to the
993
+ .I newdirfd,
994
+ .I addr
995
+ should be set to the
996
+ .I target
997
+ and
998
+ .I addr2
999
+ should be set to the
1000
+ .I linkpath
1001
+ being passed in to
1002
+ .BR symlinkat(2).
1003
+ Available since 5.15.
1004
+
1005
+ .TP
1006
+ .B IORING_OP_LINKAT
1007
+ Issue the equivalent of a
1008
+ .BR linkat(2)
1009
+ system call.
1010
+ .I fd
1011
+ should be set to the
1012
+ .I olddirfd,
1013
+ .I addr
1014
+ should be set to the
1015
+ .I oldpath,
1016
+ .I len
1017
+ should be set to the
1018
+ .I newdirfd,
1019
+ .I addr2
1020
+ should be set to the
1021
+ .I newpath,
1022
+ and
1023
+ .I hardlink_flags
1024
+ should be set to the
1025
+ .I flags
1026
+ being passed in to
1027
+ .BR linkat(2).
1028
+ Available since 5.15.
1029
+
1030
+ .TP
1031
+ .B IORING_OP_MSG_RING
1032
+ Send a message to an io_uring.
1033
+ .I fd
1034
+ must be set to a file descriptor of a ring that the application has access to,
1035
+ .I len
1036
+ can be set to any 32-bit value that the application wishes to pass on, and
1037
+ .I off
1038
+ should be set to any 64-bit value that the application wishes to send. On the
1039
+ target ring, a CQE will be posted with the
1040
+ .I res
1041
+ field matching the
1042
+ .I len
1043
+ set, and a
1044
+ .I user_data
1045
+ field matching the
1046
+ .I off
1047
+ value being passed in. This request type can be used to either just wake or
1048
+ interrupt anyone waiting for completions on the target ring, or it can be used
1049
+ to pass messages via the two fields. Available since 5.18.
1050
+
1051
+ .PP
1052
+ The
1053
+ .I flags
1054
+ field is a bit mask. The supported flags are:
1055
+ .TP
1056
+ .B IOSQE_FIXED_FILE
1057
+ When this flag is specified,
1058
+ .I fd
1059
+ is an index into the files array registered with the io_uring instance (see the
1060
+ .B IORING_REGISTER_FILES
1061
+ section of the
1062
+ .BR io_uring_register (2)
1063
+ man page). Note that this isn't always available for all commands. If used on
1064
+ a command that doesn't support fixed files, the SQE will error with
1065
+ .B -EBADF.
1066
+ Available since 5.1.
1067
+ .TP
1068
+ .B IOSQE_IO_DRAIN
1069
+ When this flag is specified, the SQE will not be started before previously
1070
+ submitted SQEs have completed, and new SQEs will not be started before this
1071
+ one completes. Available since 5.2.
1072
+ .TP
1073
+ .B IOSQE_IO_LINK
1074
+ When this flag is specified, the SQE forms a link with the next SQE in the
1075
+ submission ring. That next SQE will not be started before the previous request
1076
+ completes. This, in effect, forms a chain of SQEs, which can be arbitrarily
1077
+ long. The tail of the chain is denoted by the first SQE that does not have this
1078
+ flag set. Chains are not supported across submission boundaries. Even if the
1079
+ last SQE in a submission has this flag set, it will still terminate the current
1080
+ chain. This flag has no effect on previous SQE submissions, nor does it impact
1081
+ SQEs that are outside of the chain tail. This means that multiple chains can be
1082
+ executing in parallel, or chains and individual SQEs. Only members inside the
1083
+ chain are serialized. A chain of SQEs will be broken, if any request in that
1084
+ chain ends in error. io_uring considers any unexpected result an error. This
1085
+ means that, eg, a short read will also terminate the remainder of the chain.
1086
+ If a chain of SQE links is broken, the remaining unstarted part of the chain
1087
+ will be terminated and completed with
1088
+ .B -ECANCELED
1089
+ as the error code. Available since 5.3.
1090
+ .TP
1091
+ .B IOSQE_IO_HARDLINK
1092
+ Like IOSQE_IO_LINK, but it doesn't sever regardless of the completion result.
1093
+ Note that the link will still sever if we fail submitting the parent request,
1094
+ hard links are only resilient in the presence of completion results for
1095
+ requests that did submit correctly. IOSQE_IO_HARDLINK implies IOSQE_IO_LINK.
1096
+ Available since 5.5.
1097
+ .TP
1098
+ .B IOSQE_ASYNC
1099
+ Normal operation for io_uring is to try and issue an sqe as non-blocking first,
1100
+ and if that fails, execute it in an async manner. To support more efficient
1101
+ overlapped operation of requests that the application knows/assumes will
1102
+ always (or most of the time) block, the application can ask for an sqe to be
1103
+ issued async from the start. Available since 5.6.
1104
+ .TP
1105
+ .B IOSQE_BUFFER_SELECT
1106
+ Used in conjunction with the
1107
+ .B IORING_OP_PROVIDE_BUFFERS
1108
+ command, which registers a pool of buffers to be used by commands that read
1109
+ or receive data. When buffers are registered for this use case, and this
1110
+ flag is set in the command, io_uring will grab a buffer from this pool when
1111
+ the request is ready to receive or read data. If successful, the resulting CQE
1112
+ will have
1113
+ .B IORING_CQE_F_BUFFER
1114
+ set in the flags part of the struct, and the upper
1115
+ .B IORING_CQE_BUFFER_SHIFT
1116
+ bits will contain the ID of the selected buffers. This allows the application
1117
+ to know exactly which buffer was selected for the operation. If no buffers
1118
+ are available and this flag is set, then the request will fail with
1119
+ .B -ENOBUFS
1120
+ as the error code. Once a buffer has been used, it is no longer available in
1121
+ the kernel pool. The application must re-register the given buffer again when
1122
+ it is ready to recycle it (eg has completed using it). Available since 5.7.
1123
+ .TP
1124
+ .B IOSQE_CQE_SKIP_SUCCESS
1125
+ Don't generate a CQE if the request completes successfully. If the request
1126
+ fails, an appropriate CQE will be posted as usual and if there is no
1127
+ .B IOSQE_IO_HARDLINK,
1128
+ CQEs for all linked requests will be omitted. The notion of failure/success is
1129
+ opcode specific and is the same as with breaking chains of
1130
+ .B IOSQE_IO_LINK.
1131
+ One special case is when the request has a linked timeout, then the CQE
1132
+ generation for the linked timeout is decided solely by whether it has
1133
+ .B IOSQE_CQE_SKIP_SUCCESS
1134
+ set, regardless of whether it timed out or was cancelled. In other words, if a
1135
+ linked timeout has the flag set, it's guaranteed to not post a CQE.
1136
+
1137
+ The semantics are chosen to accommodate several use cases. First, when all but
1138
+ the last request of a normal link without linked timeouts are marked with the
1139
+ flag, only one CQE per link is posted. Additionally, it enables suppression of
1140
+ CQEs in cases where the side effects of a successfully executed operation are
1141
+ enough for userspace to know the state of the system. One such example would
1142
+ be writing to a synchronisation file.
1143
+
1144
+ This flag is incompatible with
1145
+ .B IOSQE_IO_DRAIN.
1146
+ Using both of them in a single ring is undefined behavior, even when they are
1147
+ not used together in a single request. Currently, after the first request with
1148
+ .B IOSQE_CQE_SKIP_SUCCESS,
1149
+ all subsequent requests marked with drain will be failed at submission time.
1150
+ Note that the error reporting is best effort only, and restrictions may change
1151
+ in the future.
1152
+
1153
+ Available since 5.17.
1154
+
1155
+ .PP
1156
+ .I ioprio
1157
+ specifies the I/O priority. See
1158
+ .BR ioprio_get (2)
1159
+ for a description of Linux I/O priorities.
1160
+
1161
+ .I fd
1162
+ specifies the file descriptor against which the operation will be
1163
+ performed, with the exception noted above.
1164
+
1165
+ If the operation is one of
1166
+ .B IORING_OP_READ_FIXED
1167
+ or
1168
+ .BR IORING_OP_WRITE_FIXED ,
1169
+ .I addr
1170
+ and
1171
+ .I len
1172
+ must fall within the buffer located at
1173
+ .I buf_index
1174
+ in the fixed buffer array. If the operation is either
1175
+ .B IORING_OP_READV
1176
+ or
1177
+ .BR IORING_OP_WRITEV ,
1178
+ then
1179
+ .I addr
1180
+ points to an iovec array of
1181
+ .I len
1182
+ entries.
1183
+
1184
+ .IR rw_flags ,
1185
+ specified for read and write operations, contains a bitwise OR of
1186
+ per-I/O flags, as described in the
1187
+ .BR preadv2 (2)
1188
+ man page.
1189
+
1190
+ The
1191
+ .I fsync_flags
1192
+ bit mask may contain either 0, for a normal file integrity sync, or
1193
+ .B IORING_FSYNC_DATASYNC
1194
+ to provide data sync only semantics. See the descriptions of
1195
+ .B O_SYNC
1196
+ and
1197
+ .B O_DSYNC
1198
+ in the
1199
+ .BR open (2)
1200
+ manual page for more information.
1201
+
1202
+ The bits that may be set in
1203
+ .I poll_events
1204
+ are defined in \fI<poll.h>\fP, and documented in
1205
+ .BR poll (2).
1206
+
1207
+ .I user_data
1208
+ is an application-supplied value that will be copied into
1209
+ the completion queue entry (see below).
1210
+ .I buf_index
1211
+ is an index into an array of fixed buffers, and is only valid if fixed
1212
+ buffers were registered.
1213
+ .I personality
1214
+ is the credentials id to use for this operation. See
1215
+ .BR io_uring_register(2)
1216
+ for how to register personalities with io_uring. If set to 0, the current
1217
+ personality of the submitting task is used.
1218
+ .PP
1219
+ Once the submission queue entry is initialized, I/O is submitted by
1220
+ placing the index of the submission queue entry into the tail of the
1221
+ submission queue. After one or more indexes are added to the queue,
1222
+ and the queue tail is advanced, the
1223
+ .BR io_uring_enter (2)
1224
+ system call can be invoked to initiate the I/O.
1225
+
1226
+ Completions use the following data structure:
1227
+ .PP
1228
+ .in +4n
1229
+ .EX
1230
+ /*
1231
+ * IO completion data structure (Completion Queue Entry)
1232
+ */
1233
+ struct io_uring_cqe {
1234
+ __u64 user_data; /* sqe->data submission passed back */
1235
+ __s32 res; /* result code for this event */
1236
+ __u32 flags;
1237
+ };
1238
+ .EE
1239
+ .in
1240
+ .PP
1241
+ .I user_data
1242
+ is copied from the field of the same name in the submission queue
1243
+ entry. The primary use case is to store data that the application
1244
+ will need to access upon completion of this particular I/O. The
1245
+ .I flags
1246
+ is used for certain commands, like
1247
+ .B IORING_OP_POLL_ADD
1248
+ or in conjunction with
1249
+ .B IOSQE_BUFFER_SELECT
1250
+ , see those entries.
1251
+ .I res
1252
+ is the operation-specific result, but io_uring-specific errors
1253
+ (e.g. flags or opcode invalid) are returned through this field.
1254
+ They are described in section
1255
+ .B CQE ERRORS.
1256
+ .PP
1257
+ For read and write opcodes, the
1258
+ return values match
1259
+ .I errno
1260
+ values documented in the
1261
+ .BR preadv2 (2)
1262
+ and
1263
+ .BR pwritev2 (2)
1264
+ man pages, with
1265
+ .I
1266
+ res
1267
+ holding the equivalent of
1268
+ .I -errno
1269
+ for error cases, or the transferred number of bytes in case the operation
1270
+ is successful. Hence both error and success return can be found in that
1271
+ field in the CQE. For other request types, the return values are documented
1272
+ in the matching man page for that type, or in the opcodes section above for
1273
+ io_uring-specific opcodes.
1274
+ .PP
1275
+ .SH RETURN VALUE
1276
+ .BR io_uring_enter ()
1277
+ returns the number of I/Os successfully consumed. This can be zero
1278
+ if
1279
+ .I to_submit
1280
+ was zero or if the submission queue was empty. Note that if the ring was
1281
+ created with
1282
+ .B IORING_SETUP_SQPOLL
1283
+ specified, then the return value will generally be the same as
1284
+ .I to_submit
1285
+ as submission happens outside the context of the system call.
1286
+
1287
+ The errors related to a submission queue entry will be returned through a
1288
+ completion queue entry (see section
1289
+ .B CQE ERRORS),
1290
+ rather than through the system call itself.
1291
+
1292
+ Errors that occur not on behalf of a submission queue entry are returned via the
1293
+ system call directly. On such an error,
1294
+ .B -1
1295
+ is returned and
1296
+ .I errno
1297
+ is set appropriately.
1298
+ .PP
1299
+ .SH ERRORS
1300
+ These are the errors returned by
1301
+ .BR io_uring_enter ()
1302
+ system call.
1303
+ .TP
1304
+ .B EAGAIN
1305
+ The kernel was unable to allocate memory for the request, or otherwise ran out
1306
+ of resources to handle it. The application should wait for some completions and
1307
+ try again.
1308
+ .TP
1309
+ .B EBADF
1310
+ .I fd
1311
+ is not a valid file descriptor.
1312
+ .TP
1313
+ .B EBADFD
1314
+ .I fd
1315
+ is a valid file descriptor, but the io_uring ring is not in the right state
1316
+ (enabled). See
1317
+ .BR io_uring_register (2)
1318
+ for details on how to enable the ring.
1319
+ .TP
1320
+ .B EBUSY
1321
+ The application is attempting to overcommit the number of requests it can have
1322
+ pending. The application should wait for some completions and try again. May
1323
+ occur if the application tries to queue more requests than we have room for in
1324
+ the CQ ring, or if the application attempts to wait for more events without
1325
+ having reaped the ones already present in the CQ ring.
1326
+ .TP
1327
+ .B EINVAL
1328
+ Some bits in the
1329
+ .I flags
1330
+ argument are invalid.
1331
+ .TP
1332
+ .B EFAULT
1333
+ An invalid user space address was specified for the
1334
+ .I sig
1335
+ argument.
1336
+ .TP
1337
+ .B ENXIO
1338
+ The io_uring instance is in the process of being torn down.
1339
+ .TP
1340
+ .B EOPNOTSUPP
1341
+ .I fd
1342
+ does not refer to an io_uring instance.
1343
+ .TP
1344
+ .B EINTR
1345
+ The operation was interrupted by a delivery of a signal before it could
1346
+ complete; see
1347
+ .BR signal(7).
1348
+ Can happen while waiting for events with
1349
+ .B IORING_ENTER_GETEVENTS.
1350
+
1351
+ .SH CQE ERRORS
1352
+ These io_uring-specific errors are returned as a negative value in the
1353
+ .I res
1354
+ field of the completion queue entry.
1355
+ .TP
1356
+ .B EACCES
1357
+ The
1358
+ .I flags
1359
+ field or
1360
+ .I opcode
1361
+ in a submission queue entry is not allowed due to registered restrictions.
1362
+ See
1363
+ .BR io_uring_register (2)
1364
+ for details on how restrictions work.
1365
+ .TP
1366
+ .B EBADF
1367
+ The
1368
+ .I fd
1369
+ field in the submission queue entry is invalid, or the
1370
+ .B IOSQE_FIXED_FILE
1371
+ flag was set in the submission queue entry, but no files were registered
1372
+ with the io_uring instance.
1373
+ .TP
1374
+ .B EFAULT
1375
+ The buffer is outside of the process' accessible address space.
1376
+ .TP
1377
+ .B EFAULT
1378
+ .B IORING_OP_READ_FIXED
1379
+ or
1380
+ .B IORING_OP_WRITE_FIXED
1381
+ was specified in the
1382
+ .I opcode
1383
+ field of the submission queue entry, but either buffers were not
1384
+ registered for this io_uring instance, or the address range described
1385
+ by
1386
+ .I addr
1387
+ and
1388
+ .I len
1389
+ does not fit within the buffer registered at
1390
+ .IR buf_index .
1391
+ .TP
1392
+ .B EINVAL
1393
+ The
1394
+ .I flags
1395
+ field or
1396
+ .I opcode
1397
+ in a submission queue entry is invalid.
1398
+ .TP
1399
+ .B EINVAL
1400
+ The
1401
+ .I buf_index
1402
+ member of the submission queue entry is invalid.
1403
+ .TP
1404
+ .B EINVAL
1405
+ The
1406
+ .I personality
1407
+ field in a submission queue entry is invalid.
1408
+ .TP
1409
+ .B EINVAL
1410
+ .B IORING_OP_NOP
1411
+ was specified in the submission queue entry, but the io_uring context
1412
+ was setup for polling
1413
+ .RB ( IORING_SETUP_IOPOLL
1414
+ was specified in the call to io_uring_setup).
1415
+ .TP
1416
+ .B EINVAL
1417
+ .B IORING_OP_READV
1418
+ or
1419
+ .B IORING_OP_WRITEV
1420
+ was specified in the submission queue entry, but the io_uring instance
1421
+ has fixed buffers registered.
1422
+ .TP
1423
+ .B EINVAL
1424
+ .B IORING_OP_READ_FIXED
1425
+ or
1426
+ .B IORING_OP_WRITE_FIXED
1427
+ was specified in the submission queue entry, and the
1428
+ .I buf_index
1429
+ is invalid.
1430
+ .TP
1431
+ .B EINVAL
1432
+ .BR IORING_OP_READV ,
1433
+ .BR IORING_OP_WRITEV ,
1434
+ .BR IORING_OP_READ_FIXED ,
1435
+ .B IORING_OP_WRITE_FIXED
1436
+ or
1437
+ .B IORING_OP_FSYNC
1438
+ was specified in the submission queue entry, but the io_uring instance
1439
+ was configured for IOPOLLing, or any of
1440
+ .IR addr ,
1441
+ .IR ioprio ,
1442
+ .IR off ,
1443
+ .IR len ,
1444
+ or
1445
+ .I buf_index
1446
+ was set in the submission queue entry.
1447
+ .TP
1448
+ .B EINVAL
1449
+ .B IORING_OP_POLL_ADD
1450
+ or
1451
+ .B IORING_OP_POLL_REMOVE
1452
+ was specified in the
1453
+ .I opcode
1454
+ field of the submission queue entry, but the io_uring instance was
1455
+ configured for busy-wait polling
1456
+ .RB ( IORING_SETUP_IOPOLL ),
1457
+ or any of
1458
+ .IR ioprio ,
1459
+ .IR off ,
1460
+ .IR len ,
1461
+ or
1462
+ .I buf_index
1463
+ was non-zero in the submission queue entry.
1464
+ .TP
1465
+ .B EINVAL
1466
+ .B IORING_OP_POLL_ADD
1467
+ was specified in the
1468
+ .I opcode
1469
+ field of the submission queue entry, and the
1470
+ .I addr
1471
+ field was non-zero.
1472
+ .TP
1473
+ .B EOPNOTSUPP
1474
+ .I opcode
1475
+ is valid, but not supported by this kernel.
1476
+ .TP
1477
+ .B EOPNOTSUPP
1478
+ .B IOSQE_BUFFER_SELECT
1479
+ was set in the
1480
+ .I flags
1481
+ field of the submission queue entry, but the
1482
+ .I opcode
1483
+ doesn't support buffer selection.