polyphony 1.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170) hide show
  1. checksums.yaml +4 -4
  2. data/.yardopts +1 -0
  3. data/CHANGELOG.md +16 -3
  4. data/README.md +1 -0
  5. data/TODO.md +5 -13
  6. data/docs/cheat-sheet.md +248 -0
  7. data/docs/design-principles.md +59 -3
  8. data/docs/faq.md +15 -32
  9. data/docs/fiber-scheduling.md +14 -12
  10. data/docs/overview.md +140 -35
  11. data/docs/readme.md +4 -3
  12. data/docs/tutorial.md +19 -149
  13. data/examples/core/debug.rb +12 -0
  14. data/examples/core/rpc_benchmark.rb +136 -0
  15. data/ext/polyphony/polyphony.c +2 -1
  16. data/lib/polyphony/extensions/fiber.rb +1 -0
  17. data/lib/polyphony/extensions/io.rb +171 -161
  18. data/lib/polyphony/extensions/pipe.rb +3 -5
  19. data/lib/polyphony/extensions/socket.rb +45 -54
  20. data/lib/polyphony/version.rb +1 -1
  21. data/polyphony.gemspec +3 -1
  22. data/test/test_socket.rb +1 -1
  23. metadata +33 -149
  24. data/vendor/liburing/man/IO_URING_CHECK_VERSION.3 +0 -1
  25. data/vendor/liburing/man/IO_URING_VERSION_MAJOR.3 +0 -1
  26. data/vendor/liburing/man/IO_URING_VERSION_MINOR.3 +0 -1
  27. data/vendor/liburing/man/io_uring.7 +0 -781
  28. data/vendor/liburing/man/io_uring_buf_ring_add.3 +0 -53
  29. data/vendor/liburing/man/io_uring_buf_ring_advance.3 +0 -31
  30. data/vendor/liburing/man/io_uring_buf_ring_cq_advance.3 +0 -41
  31. data/vendor/liburing/man/io_uring_buf_ring_init.3 +0 -30
  32. data/vendor/liburing/man/io_uring_buf_ring_mask.3 +0 -27
  33. data/vendor/liburing/man/io_uring_check_version.3 +0 -72
  34. data/vendor/liburing/man/io_uring_close_ring_fd.3 +0 -43
  35. data/vendor/liburing/man/io_uring_cq_advance.3 +0 -49
  36. data/vendor/liburing/man/io_uring_cq_has_overflow.3 +0 -25
  37. data/vendor/liburing/man/io_uring_cq_ready.3 +0 -26
  38. data/vendor/liburing/man/io_uring_cqe_get_data.3 +0 -53
  39. data/vendor/liburing/man/io_uring_cqe_get_data64.3 +0 -1
  40. data/vendor/liburing/man/io_uring_cqe_seen.3 +0 -42
  41. data/vendor/liburing/man/io_uring_enter.2 +0 -1700
  42. data/vendor/liburing/man/io_uring_enter2.2 +0 -1
  43. data/vendor/liburing/man/io_uring_free_probe.3 +0 -27
  44. data/vendor/liburing/man/io_uring_get_events.3 +0 -33
  45. data/vendor/liburing/man/io_uring_get_probe.3 +0 -30
  46. data/vendor/liburing/man/io_uring_get_sqe.3 +0 -57
  47. data/vendor/liburing/man/io_uring_major_version.3 +0 -1
  48. data/vendor/liburing/man/io_uring_minor_version.3 +0 -1
  49. data/vendor/liburing/man/io_uring_opcode_supported.3 +0 -30
  50. data/vendor/liburing/man/io_uring_peek_cqe.3 +0 -38
  51. data/vendor/liburing/man/io_uring_prep_accept.3 +0 -197
  52. data/vendor/liburing/man/io_uring_prep_accept_direct.3 +0 -1
  53. data/vendor/liburing/man/io_uring_prep_cancel.3 +0 -118
  54. data/vendor/liburing/man/io_uring_prep_cancel64.3 +0 -1
  55. data/vendor/liburing/man/io_uring_prep_close.3 +0 -59
  56. data/vendor/liburing/man/io_uring_prep_close_direct.3 +0 -1
  57. data/vendor/liburing/man/io_uring_prep_connect.3 +0 -66
  58. data/vendor/liburing/man/io_uring_prep_fadvise.3 +0 -59
  59. data/vendor/liburing/man/io_uring_prep_fallocate.3 +0 -59
  60. data/vendor/liburing/man/io_uring_prep_fgetxattr.3 +0 -1
  61. data/vendor/liburing/man/io_uring_prep_files_update.3 +0 -92
  62. data/vendor/liburing/man/io_uring_prep_fsetxattr.3 +0 -1
  63. data/vendor/liburing/man/io_uring_prep_fsync.3 +0 -70
  64. data/vendor/liburing/man/io_uring_prep_getxattr.3 +0 -61
  65. data/vendor/liburing/man/io_uring_prep_link.3 +0 -1
  66. data/vendor/liburing/man/io_uring_prep_link_timeout.3 +0 -94
  67. data/vendor/liburing/man/io_uring_prep_linkat.3 +0 -91
  68. data/vendor/liburing/man/io_uring_prep_madvise.3 +0 -56
  69. data/vendor/liburing/man/io_uring_prep_mkdir.3 +0 -1
  70. data/vendor/liburing/man/io_uring_prep_mkdirat.3 +0 -83
  71. data/vendor/liburing/man/io_uring_prep_msg_ring.3 +0 -92
  72. data/vendor/liburing/man/io_uring_prep_msg_ring_cqe_flags.3 +0 -1
  73. data/vendor/liburing/man/io_uring_prep_multishot_accept.3 +0 -1
  74. data/vendor/liburing/man/io_uring_prep_multishot_accept_direct.3 +0 -1
  75. data/vendor/liburing/man/io_uring_prep_nop.3 +0 -28
  76. data/vendor/liburing/man/io_uring_prep_openat.3 +0 -117
  77. data/vendor/liburing/man/io_uring_prep_openat2.3 +0 -117
  78. data/vendor/liburing/man/io_uring_prep_openat2_direct.3 +0 -1
  79. data/vendor/liburing/man/io_uring_prep_openat_direct.3 +0 -1
  80. data/vendor/liburing/man/io_uring_prep_poll_add.3 +0 -72
  81. data/vendor/liburing/man/io_uring_prep_poll_multishot.3 +0 -1
  82. data/vendor/liburing/man/io_uring_prep_poll_remove.3 +0 -55
  83. data/vendor/liburing/man/io_uring_prep_poll_update.3 +0 -89
  84. data/vendor/liburing/man/io_uring_prep_provide_buffers.3 +0 -140
  85. data/vendor/liburing/man/io_uring_prep_read.3 +0 -69
  86. data/vendor/liburing/man/io_uring_prep_read_fixed.3 +0 -72
  87. data/vendor/liburing/man/io_uring_prep_readv.3 +0 -85
  88. data/vendor/liburing/man/io_uring_prep_readv2.3 +0 -111
  89. data/vendor/liburing/man/io_uring_prep_recv.3 +0 -105
  90. data/vendor/liburing/man/io_uring_prep_recv_multishot.3 +0 -1
  91. data/vendor/liburing/man/io_uring_prep_recvmsg.3 +0 -124
  92. data/vendor/liburing/man/io_uring_prep_recvmsg_multishot.3 +0 -1
  93. data/vendor/liburing/man/io_uring_prep_remove_buffers.3 +0 -52
  94. data/vendor/liburing/man/io_uring_prep_rename.3 +0 -1
  95. data/vendor/liburing/man/io_uring_prep_renameat.3 +0 -96
  96. data/vendor/liburing/man/io_uring_prep_send.3 +0 -66
  97. data/vendor/liburing/man/io_uring_prep_send_set_addr.3 +0 -38
  98. data/vendor/liburing/man/io_uring_prep_send_zc.3 +0 -96
  99. data/vendor/liburing/man/io_uring_prep_send_zc_fixed.3 +0 -1
  100. data/vendor/liburing/man/io_uring_prep_sendmsg.3 +0 -89
  101. data/vendor/liburing/man/io_uring_prep_sendmsg_zc.3 +0 -1
  102. data/vendor/liburing/man/io_uring_prep_setxattr.3 +0 -64
  103. data/vendor/liburing/man/io_uring_prep_shutdown.3 +0 -53
  104. data/vendor/liburing/man/io_uring_prep_socket.3 +0 -118
  105. data/vendor/liburing/man/io_uring_prep_socket_direct.3 +0 -1
  106. data/vendor/liburing/man/io_uring_prep_socket_direct_alloc.3 +0 -1
  107. data/vendor/liburing/man/io_uring_prep_splice.3 +0 -120
  108. data/vendor/liburing/man/io_uring_prep_statx.3 +0 -74
  109. data/vendor/liburing/man/io_uring_prep_symlink.3 +0 -1
  110. data/vendor/liburing/man/io_uring_prep_symlinkat.3 +0 -85
  111. data/vendor/liburing/man/io_uring_prep_sync_file_range.3 +0 -59
  112. data/vendor/liburing/man/io_uring_prep_tee.3 +0 -74
  113. data/vendor/liburing/man/io_uring_prep_timeout.3 +0 -95
  114. data/vendor/liburing/man/io_uring_prep_timeout_remove.3 +0 -1
  115. data/vendor/liburing/man/io_uring_prep_timeout_update.3 +0 -98
  116. data/vendor/liburing/man/io_uring_prep_unlink.3 +0 -1
  117. data/vendor/liburing/man/io_uring_prep_unlinkat.3 +0 -82
  118. data/vendor/liburing/man/io_uring_prep_write.3 +0 -67
  119. data/vendor/liburing/man/io_uring_prep_write_fixed.3 +0 -72
  120. data/vendor/liburing/man/io_uring_prep_writev.3 +0 -85
  121. data/vendor/liburing/man/io_uring_prep_writev2.3 +0 -111
  122. data/vendor/liburing/man/io_uring_queue_exit.3 +0 -26
  123. data/vendor/liburing/man/io_uring_queue_init.3 +0 -89
  124. data/vendor/liburing/man/io_uring_queue_init_params.3 +0 -1
  125. data/vendor/liburing/man/io_uring_recvmsg_cmsg_firsthdr.3 +0 -1
  126. data/vendor/liburing/man/io_uring_recvmsg_cmsg_nexthdr.3 +0 -1
  127. data/vendor/liburing/man/io_uring_recvmsg_name.3 +0 -1
  128. data/vendor/liburing/man/io_uring_recvmsg_out.3 +0 -82
  129. data/vendor/liburing/man/io_uring_recvmsg_payload.3 +0 -1
  130. data/vendor/liburing/man/io_uring_recvmsg_payload_length.3 +0 -1
  131. data/vendor/liburing/man/io_uring_recvmsg_validate.3 +0 -1
  132. data/vendor/liburing/man/io_uring_register.2 +0 -834
  133. data/vendor/liburing/man/io_uring_register_buf_ring.3 +0 -140
  134. data/vendor/liburing/man/io_uring_register_buffers.3 +0 -104
  135. data/vendor/liburing/man/io_uring_register_buffers_sparse.3 +0 -1
  136. data/vendor/liburing/man/io_uring_register_buffers_tags.3 +0 -1
  137. data/vendor/liburing/man/io_uring_register_buffers_update_tag.3 +0 -1
  138. data/vendor/liburing/man/io_uring_register_eventfd.3 +0 -51
  139. data/vendor/liburing/man/io_uring_register_eventfd_async.3 +0 -1
  140. data/vendor/liburing/man/io_uring_register_file_alloc_range.3 +0 -52
  141. data/vendor/liburing/man/io_uring_register_files.3 +0 -112
  142. data/vendor/liburing/man/io_uring_register_files_sparse.3 +0 -1
  143. data/vendor/liburing/man/io_uring_register_files_tags.3 +0 -1
  144. data/vendor/liburing/man/io_uring_register_files_update.3 +0 -1
  145. data/vendor/liburing/man/io_uring_register_files_update_tag.3 +0 -1
  146. data/vendor/liburing/man/io_uring_register_iowq_aff.3 +0 -61
  147. data/vendor/liburing/man/io_uring_register_iowq_max_workers.3 +0 -71
  148. data/vendor/liburing/man/io_uring_register_ring_fd.3 +0 -49
  149. data/vendor/liburing/man/io_uring_register_sync_cancel.3 +0 -71
  150. data/vendor/liburing/man/io_uring_setup.2 +0 -669
  151. data/vendor/liburing/man/io_uring_sq_ready.3 +0 -31
  152. data/vendor/liburing/man/io_uring_sq_space_left.3 +0 -25
  153. data/vendor/liburing/man/io_uring_sqe_set_data.3 +0 -48
  154. data/vendor/liburing/man/io_uring_sqe_set_data64.3 +0 -1
  155. data/vendor/liburing/man/io_uring_sqe_set_flags.3 +0 -87
  156. data/vendor/liburing/man/io_uring_sqring_wait.3 +0 -34
  157. data/vendor/liburing/man/io_uring_submit.3 +0 -46
  158. data/vendor/liburing/man/io_uring_submit_and_get_events.3 +0 -31
  159. data/vendor/liburing/man/io_uring_submit_and_wait.3 +0 -38
  160. data/vendor/liburing/man/io_uring_submit_and_wait_timeout.3 +0 -56
  161. data/vendor/liburing/man/io_uring_unregister_buf_ring.3 +0 -30
  162. data/vendor/liburing/man/io_uring_unregister_buffers.3 +0 -27
  163. data/vendor/liburing/man/io_uring_unregister_eventfd.3 +0 -1
  164. data/vendor/liburing/man/io_uring_unregister_files.3 +0 -27
  165. data/vendor/liburing/man/io_uring_unregister_iowq_aff.3 +0 -1
  166. data/vendor/liburing/man/io_uring_unregister_ring_fd.3 +0 -32
  167. data/vendor/liburing/man/io_uring_wait_cqe.3 +0 -40
  168. data/vendor/liburing/man/io_uring_wait_cqe_nr.3 +0 -43
  169. data/vendor/liburing/man/io_uring_wait_cqe_timeout.3 +0 -53
  170. data/vendor/liburing/man/io_uring_wait_cqes.3 +0 -56
@@ -1,1700 +0,0 @@
1
- .\" Copyright (C) 2019 Jens Axboe <axboe@kernel.dk>
2
- .\" Copyright (C) 2019 Red Hat, Inc.
3
- .\"
4
- .\" SPDX-License-Identifier: LGPL-2.0-or-later
5
- .\"
6
- .TH io_uring_enter 2 2019-01-22 "Linux" "Linux Programmer's Manual"
7
- .SH NAME
8
- io_uring_enter \- initiate and/or complete asynchronous I/O
9
- .SH SYNOPSIS
10
- .nf
11
- .BR "#include <liburing.h>"
12
- .PP
13
- .BI "int io_uring_enter(unsigned int " fd ", unsigned int " to_submit ,
14
- .BI " unsigned int " min_complete ", unsigned int " flags ,
15
- .BI " sigset_t *" sig );
16
- .PP
17
- .BI "int io_uring_enter2(unsigned int " fd ", unsigned int " to_submit ,
18
- .BI " unsigned int " min_complete ", unsigned int " flags ,
19
- .BI " sigset_t *" sig ", size_t " sz );
20
- .fi
21
- .PP
22
- .SH DESCRIPTION
23
- .PP
24
- .BR io_uring_enter (2)
25
- is used to initiate and complete I/O using the shared submission and
26
- completion queues setup by a call to
27
- .BR io_uring_setup (2).
28
- A single call can both submit new I/O and wait for completions of I/O
29
- initiated by this call or previous calls to
30
- .BR io_uring_enter (2).
31
-
32
- .I fd
33
- is the file descriptor returned by
34
- .BR io_uring_setup (2).
35
- .I to_submit
36
- specifies the number of I/Os to submit from the submission queue.
37
- .I flags
38
- is a bitmask of the following values:
39
- .TP
40
- .B IORING_ENTER_GETEVENTS
41
- If this flag is set, then the system call will wait for the specified
42
- number of events in
43
- .I min_complete
44
- before returning. This flag can be set along with
45
- .I to_submit
46
- to both submit and complete events in a single system call.
47
- .TP
48
- .B IORING_ENTER_SQ_WAKEUP
49
- If the ring has been created with
50
- .B IORING_SETUP_SQPOLL,
51
- then this flag asks the kernel to wakeup the SQ kernel thread to submit IO.
52
- .TP
53
- .B IORING_ENTER_SQ_WAIT
54
- If the ring has been created with
55
- .B IORING_SETUP_SQPOLL,
56
- then the application has no real insight into when the SQ kernel thread has
57
- consumed entries from the SQ ring. This can lead to a situation where the
58
- application can no longer get a free SQE entry to submit, without knowing
59
- when one becomes available as the SQ kernel thread consumes them. If
60
- the system call is used with this flag set, then it will wait until at least
61
- one entry is free in the SQ ring.
62
- .TP
63
- .B IORING_ENTER_EXT_ARG
64
- Since kernel 5.11, the system call's arguments have been modified to look like
65
- the following:
66
-
67
- .nf
68
- .BI "int io_uring_enter2(unsigned int " fd ", unsigned int " to_submit ,
69
- .BI " unsigned int " min_complete ", unsigned int " flags ,
70
- .BI " const void *" arg ", size_t " argsz );
71
- .fi
72
-
73
- which behaves just like the original definition by default. However, if
74
- .B IORING_ENTER_EXT_ARG
75
- is set, then instead of a
76
- .I sigset_t
77
- being passed in, a pointer to a
78
- .I struct io_uring_getevents_arg
79
- is used instead and
80
- .I argsz
81
- must be set to the size of this structure. The definition is as follows:
82
-
83
- .nf
84
- .BI "struct io_uring_getevents_arg {
85
- .BI " __u64 sigmask;
86
- .BI " __u32 sigmask_sz;
87
- .BI " __u32 pad;
88
- .BI " __u64 ts;
89
- .BI "};
90
- .fi
91
-
92
- which allows passing in both a signal mask as well as a pointer to a
93
- .I struct __kernel_timespec
94
- timeout value. If
95
- .I ts
96
- is set to a valid pointer, then this time value indicates the timeout for
97
- waiting on events. If an application is waiting on events and wishes to
98
- stop waiting after a specified amount of time, then this can be accomplished
99
- directly in version 5.11 and newer by using this feature.
100
- .TP
101
- .B IORING_ENTER_REGISTERED_RING
102
- If the ring file descriptor has been registered through use of
103
- .B IORING_REGISTER_RING_FDS,
104
- then setting this flag will tell the kernel that the
105
- .I ring_fd
106
- passed in is the registered ring offset rather than a normal file descriptor.
107
-
108
- .PP
109
- .PP
110
- If the io_uring instance was configured for polling, by specifying
111
- .B IORING_SETUP_IOPOLL
112
- in the call to
113
- .BR io_uring_setup (2),
114
- then min_complete has a slightly different meaning. Passing a value
115
- of 0 instructs the kernel to return any events which are already complete,
116
- without blocking. If
117
- .I min_complete
118
- is a non-zero value, the kernel will still return immediately if any
119
- completion events are available. If no event completions are
120
- available, then the call will poll either until one or more
121
- completions become available, or until the process has exceeded its
122
- scheduler time slice.
123
-
124
- Note that, for interrupt driven I/O (where
125
- .B IORING_SETUP_IOPOLL
126
- was not specified in the call to
127
- .BR io_uring_setup (2)),
128
- an application may check the completion queue for event completions
129
- without entering the kernel at all.
130
- .PP
131
- When the system call returns that a certain amount of SQEs have been
132
- consumed and submitted, it's safe to reuse SQE entries in the ring. This is
133
- true even if the actual IO submission had to be punted to async context,
134
- which means that the SQE may in fact not have been submitted yet. If the
135
- kernel requires later use of a particular SQE entry, it will have made a
136
- private copy of it.
137
-
138
- .I sig
139
- is a pointer to a signal mask (see
140
- .BR sigprocmask (2));
141
- if
142
- .I sig
143
- is not NULL,
144
- .BR io_uring_enter (2)
145
- first replaces the current signal mask by the one pointed to by
146
- .IR sig ,
147
- then waits for events to become available in the completion queue, and
148
- then restores the original signal mask. The following
149
- .BR io_uring_enter (2)
150
- call:
151
- .PP
152
- .in +4n
153
- .EX
154
- ret = io_uring_enter(fd, 0, 1, IORING_ENTER_GETEVENTS, &sig);
155
- .EE
156
- .in
157
- .PP
158
- is equivalent to
159
- .I atomically
160
- executing the following calls:
161
- .PP
162
- .in +4n
163
- .EX
164
- pthread_sigmask(SIG_SETMASK, &sig, &orig);
165
- ret = io_uring_enter(fd, 0, 1, IORING_ENTER_GETEVENTS, NULL);
166
- pthread_sigmask(SIG_SETMASK, &orig, NULL);
167
- .EE
168
- .in
169
- .PP
170
- See the description of
171
- .BR pselect (2)
172
- for an explanation of why the
173
- .I sig
174
- parameter is necessary.
175
-
176
- Submission queue entries are represented using the following data
177
- structure:
178
- .PP
179
- .in +4n
180
- .EX
181
- /*
182
- * IO submission data structure (Submission Queue Entry)
183
- */
184
- struct io_uring_sqe {
185
- __u8 opcode; /* type of operation for this sqe */
186
- __u8 flags; /* IOSQE_ flags */
187
- __u16 ioprio; /* ioprio for the request */
188
- __s32 fd; /* file descriptor to do IO on */
189
- union {
190
- __u64 off; /* offset into file */
191
- __u64 addr2;
192
- };
193
- union {
194
- __u64 addr; /* pointer to buffer or iovecs */
195
- __u64 splice_off_in;
196
- };
197
- __u32 len; /* buffer size or number of iovecs */
198
- union {
199
- __kernel_rwf_t rw_flags;
200
- __u32 fsync_flags;
201
- __u16 poll_events; /* compatibility */
202
- __u32 poll32_events; /* word-reversed for BE */
203
- __u32 sync_range_flags;
204
- __u32 msg_flags;
205
- __u32 timeout_flags;
206
- __u32 accept_flags;
207
- __u32 cancel_flags;
208
- __u32 open_flags;
209
- __u32 statx_flags;
210
- __u32 fadvise_advice;
211
- __u32 splice_flags;
212
- __u32 rename_flags;
213
- __u32 unlink_flags;
214
- __u32 hardlink_flags;
215
- };
216
- __u64 user_data; /* data to be passed back at completion time */
217
- union {
218
- struct {
219
- /* index into fixed buffers, if used */
220
- union {
221
- /* index into fixed buffers, if used */
222
- __u16 buf_index;
223
- /* for grouped buffer selection */
224
- __u16 buf_group;
225
- };
226
- /* personality to use, if used */
227
- __u16 personality;
228
- union {
229
- __s32 splice_fd_in;
230
- __u32 file_index;
231
- };
232
- };
233
- __u64 __pad2[3];
234
- };
235
- };
236
- .EE
237
- .in
238
- .PP
239
- The
240
- .I opcode
241
- describes the operation to be performed. It can be one of:
242
- .TP
243
- .B IORING_OP_NOP
244
- Do not perform any I/O. This is useful for testing the performance of
245
- the io_uring implementation itself.
246
- .TP
247
- .B IORING_OP_READV
248
- .TP
249
- .B IORING_OP_WRITEV
250
- Vectored read and write operations, similar to
251
- .BR preadv2 (2)
252
- and
253
- .BR pwritev2 (2).
254
- If the file is not seekable,
255
- .I off
256
- must be set to zero or -1.
257
-
258
- .TP
259
- .B IORING_OP_READ_FIXED
260
- .TP
261
- .B IORING_OP_WRITE_FIXED
262
- Read from or write to pre-mapped buffers. See
263
- .BR io_uring_register (2)
264
- for details on how to setup a context for fixed reads and writes.
265
-
266
- .TP
267
- .B IORING_OP_FSYNC
268
- File sync. See also
269
- .BR fsync (2).
270
- Note that, while I/O is initiated in the order in which it appears in
271
- the submission queue, completions are unordered. For example, an
272
- application which places a write I/O followed by an fsync in the
273
- submission queue cannot expect the fsync to apply to the write. The
274
- two operations execute in parallel, so the fsync may complete before
275
- the write is issued to the storage. The same is also true for
276
- previously issued writes that have not completed prior to the fsync.
277
-
278
- .TP
279
- .B IORING_OP_POLL_ADD
280
- Poll the
281
- .I fd
282
- specified in the submission queue entry for the events
283
- specified in the
284
- .I poll_events
285
- field. Unlike poll or epoll without
286
- .BR EPOLLONESHOT ,
287
- by default this interface always works in one shot mode. That is, once the poll
288
- operation is completed, it will have to be resubmitted.
289
-
290
- If
291
- .B IORING_POLL_ADD_MULTI
292
- is set in the SQE
293
- .I len
294
- field, then the poll will work in multi shot mode instead. That means it'll
295
- repeatedly trigger when the requested event becomes true, and hence multiple
296
- CQEs can be generated from this single SQE. The CQE
297
- .I flags
298
- field will have
299
- .B IORING_CQE_F_MORE
300
- set on completion if the application should expect further CQE entries from
301
- the original request. If this flag isn't set on completion, then the poll
302
- request has been terminated and no further events will be generated. This mode
303
- is available since 5.13.
304
-
305
- If
306
- .B IORING_POLL_UPDATE_EVENTS
307
- is set in the SQE
308
- .I len
309
- field, then the request will update an existing poll request with the mask of
310
- events passed in with this request. The lookup is based on the
311
- .I user_data
312
- field of the original SQE submitted, and this value is passed in the
313
- .I addr
314
- field of the SQE. This mode is available since 5.13.
315
-
316
- If
317
- .B IORING_POLL_UPDATE_USER_DATA
318
- is set in the SQE
319
- .I len
320
- field, then the request will update the
321
- .I user_data
322
- of an existing poll request based on the value passed in the
323
- .I off
324
- field. This mode is available since 5.13.
325
-
326
- This command works like
327
- an async
328
- .BR poll (2)
329
- and the completion event result is the returned mask of events. For the
330
- variants that update
331
- .I user_data
332
- or
333
- .I events
334
- , the completion result will be similar to
335
- .B IORING_OP_POLL_REMOVE.
336
-
337
- .TP
338
- .B IORING_OP_POLL_REMOVE
339
- Remove an existing poll request. If found, the
340
- .I res
341
- field of the
342
- .I "struct io_uring_cqe"
343
- will contain 0. If not found,
344
- .I res
345
- will contain
346
- .B -ENOENT,
347
- or
348
- .B -EALREADY
349
- if the poll request was in the process of completing already.
350
-
351
- .TP
352
- .B IORING_OP_EPOLL_CTL
353
- Add, remove or modify entries in the interest list of
354
- .BR epoll (7).
355
- See
356
- .BR epoll_ctl (2)
357
- for details of the system call.
358
- .I fd
359
- holds the file descriptor that represents the epoll instance,
360
- .I addr
361
- holds the file descriptor to add, remove or modify,
362
- .I len
363
- holds the operation (EPOLL_CTL_ADD, EPOLL_CTL_DEL, EPOLL_CTL_MOD) to perform and,
364
- .I off
365
- holds a pointer to the
366
- .I epoll_event
367
- structure. Available since 5.6.
368
-
369
- .TP
370
- .B IORING_OP_SYNC_FILE_RANGE
371
- Issue the equivalent of a \fBsync_file_range\fR (2) on the file descriptor. The
372
- .I fd
373
- field is the file descriptor to sync, the
374
- .I off
375
- field holds the offset in bytes, the
376
- .I len
377
- field holds the length in bytes, and the
378
- .I sync_range_flags
379
- field holds the flags for the command. See also
380
- .BR sync_file_range (2)
381
- for the general description of the related system call. Available since 5.2.
382
-
383
- .TP
384
- .B IORING_OP_SENDMSG
385
- Issue the equivalent of a
386
- .BR sendmsg (2)
387
- system call.
388
- .I fd
389
- must be set to the socket file descriptor,
390
- .I addr
391
- must contain a pointer to the msghdr structure, and
392
- .I msg_flags
393
- holds the flags associated with the system call. See also
394
- .BR sendmsg (2)
395
- for the general description of the related system call. Available since 5.3.
396
-
397
- This command also supports the following modifiers in
398
- .I ioprio:
399
-
400
- .PP
401
- .in +12
402
- .B IORING_RECVSEND_POLL_FIRST
403
- If set, io_uring will assume the socket is currently full and attempting to
404
- send data will be unsuccessful. For this case, io_uring will arm internal
405
- poll and trigger a send of the data when there is enough space available.
406
- This initial send attempt can be wasteful for the case where the socket
407
- is expected to be full, setting this flag will bypass the initial send
408
- attempt and go straight to arming poll. If poll does indicate that data can
409
- be sent, the operation will proceed.
410
- .EE
411
- .in
412
- .PP
413
-
414
- .TP
415
- .B IORING_OP_RECVMSG
416
- Works just like IORING_OP_SENDMSG, except for
417
- .BR recvmsg (2)
418
- instead. See the description of IORING_OP_SENDMSG. Available since 5.3.
419
-
420
- This command also supports the following modifiers in
421
- .I ioprio:
422
-
423
- .PP
424
- .in +12
425
- .B IORING_RECVSEND_POLL_FIRST
426
- If set, io_uring will assume the socket is currently empty and attempting to
427
- receive data will be unsuccessful. For this case, io_uring will arm internal
428
- poll and trigger a receive of the data when the socket has data to be read.
429
- This initial receive attempt can be wasteful for the case where the socket
430
- is expected to be empty, setting this flag will bypass the initial receive
431
- attempt and go straight to arming poll. If poll does indicate that data is
432
- ready to be received, the operation will proceed.
433
- .EE
434
- .in
435
- .PP
436
-
437
- .TP
438
- .B IORING_OP_SEND
439
- Issue the equivalent of a
440
- .BR send (2)
441
- system call.
442
- .I fd
443
- must be set to the socket file descriptor,
444
- .I addr
445
- must contain a pointer to the buffer,
446
- .I len
447
- denotes the length of the buffer to send, and
448
- .I msg_flags
449
- holds the flags associated with the system call. See also
450
- .BR send (2)
451
- for the general description of the related system call. Available since 5.6.
452
-
453
- This command also supports the following modifiers in
454
- .I ioprio:
455
-
456
- .PP
457
- .in +12
458
- .B IORING_RECVSEND_POLL_FIRST
459
- If set, io_uring will assume the socket is currently full and attempting to
460
- send data will be unsuccessful. For this case, io_uring will arm internal
461
- poll and trigger a send of the data when there is enough space available.
462
- This initial send attempt can be wasteful for the case where the socket
463
- is expected to be full, setting this flag will bypass the initial send
464
- attempt and go straight to arming poll. If poll does indicate that data can
465
- be sent, the operation will proceed.
466
- .EE
467
- .in
468
- .PP
469
-
470
- .TP
471
- .B IORING_OP_RECV
472
- Works just like IORING_OP_SEND, except for
473
- .BR recv (2)
474
- instead. See the description of IORING_OP_SEND. Available since 5.6.
475
-
476
- This command also supports the following modifiers in
477
- .I ioprio:
478
-
479
- .PP
480
- .in +12
481
- .B IORING_RECVSEND_POLL_FIRST
482
- If set, io_uring will assume the socket is currently empty and attempting to
483
- receive data will be unsuccessful. For this case, io_uring will arm internal
484
- poll and trigger a receive of the data when the socket has data to be read.
485
- This initial receive attempt can be wasteful for the case where the socket
486
- is expected to be empty, setting this flag will bypass the initial receive
487
- attempt and go straight to arming poll. If poll does indicate that data is
488
- ready to be received, the operation will proceed.
489
- .EE
490
- .in
491
- .PP
492
-
493
- .TP
494
- .B IORING_OP_TIMEOUT
495
- This command will register a timeout operation. The
496
- .I addr
497
- field must contain a pointer to a struct timespec64 structure,
498
- .I len
499
- must contain 1 to signify one timespec64 structure,
500
- .I timeout_flags
501
- may contain IORING_TIMEOUT_ABS
502
- for an absolute timeout value, or 0 for a relative timeout.
503
- .I off
504
- may contain a completion event count. A timeout
505
- will trigger a wakeup event on the completion ring for anyone waiting for
506
- events. A timeout condition is met when either the specified timeout expires,
507
- or the specified number of events have completed. Either condition will
508
- trigger the event. If set to 0, completed events are not counted, which
509
- effectively acts like a timer. io_uring timeouts use the
510
- .B CLOCK_MONOTONIC
511
- clock source. The request will complete with
512
- .I -ETIME
513
- if the timeout got completed through expiration of the timer, or
514
- .I 0
515
- if the timeout got completed through requests completing on their own. If
516
- the timeout was canceled before it expired, the request will complete with
517
- .I -ECANCELED.
518
- Available since 5.4.
519
-
520
- Since 5.15, this command also supports the following modifiers in
521
- .I timeout_flags:
522
-
523
- .PP
524
- .in +12
525
- .B IORING_TIMEOUT_BOOTTIME
526
- If set, then the clocksource used is
527
- .I CLOCK_BOOTTIME
528
- instead of
529
- .I CLOCK_MONOTONIC.
530
- This clocksource differs in that it includes time elapsed if the system was
531
- suspended while having a timeout request in-flight.
532
-
533
- .B IORING_TIMEOUT_REALTIME
534
- If set, then the clocksource used is
535
- .I CLOCK_REALTIME
536
- instead of
537
- .I CLOCK_MONOTONIC.
538
- .EE
539
- .in
540
- .PP
541
-
542
- .TP
543
- .B IORING_OP_TIMEOUT_REMOVE
544
- If
545
- \fItimeout_flags\fR are zero, then it attempts to remove an existing timeout
546
- operation.
547
- .I addr
548
- must contain the
549
- .I user_data
550
- field of the previously issued timeout operation. If the specified timeout
551
- request is found and canceled successfully, this request will terminate
552
- with a result value of
553
- .I 0.
554
- If the timeout request was found but expiration was already in progress,
555
- this request will terminate with a result value of
556
- .I -EBUSY.
557
- If the timeout request wasn't found, the request will terminate with a result
558
- value of
559
- .I -ENOENT.
560
- Available since 5.5.
561
-
562
- If
563
- .I timeout_flags
564
- contain
565
- .I IORING_TIMEOUT_UPDATE,
566
- instead of removing an existing operation, it updates it.
567
- .I addr
568
- and return values are same as before.
569
- .I addr2
570
- field must contain a pointer to a struct timespec64 structure.
571
- .I timeout_flags
572
- may also contain IORING_TIMEOUT_ABS, in which case the value given is an
573
- absolute one, not a relative one.
574
- Available since 5.11.
575
-
576
- .TP
577
- .B IORING_OP_ACCEPT
578
- Issue the equivalent of an
579
- .BR accept4 (2)
580
- system call.
581
- .I fd
582
- must be set to the socket file descriptor,
583
- .I addr
584
- must contain the pointer to the sockaddr structure, and
585
- .I addr2
586
- must contain a pointer to the socklen_t addrlen field. Flags can be passed using
587
- the
588
- .I accept_flags
589
- field. See also
590
- .BR accept4 (2)
591
- for the general description of the related system call. Available since 5.5.
592
-
593
- If the
594
- .I file_index
595
- field is set to a positive number, the file won't be installed into the
596
- normal file table as usual but will be placed into the fixed file table at index
597
- .I file_index - 1.
598
- In this case, instead of returning a file descriptor, the result will contain
599
- either 0 on success or an error. If the index points to a valid empty slot, the
600
- installation is guaranteed to not fail. If there is already a file in the slot,
601
- it will be replaced, similar to
602
- .B IORING_OP_FILES_UPDATE.
603
- Please note that only io_uring has access to such files and no other syscall
604
- can use them. See
605
- .B IOSQE_FIXED_FILE
606
- and
607
- .B IORING_REGISTER_FILES.
608
-
609
- Available since 5.5.
610
-
611
- .TP
612
- .B IORING_OP_ASYNC_CANCEL
613
- Attempt to cancel an already issued request.
614
- .I addr
615
- must contain the
616
- .I user_data
617
- field of the request that should be canceled. The cancelation request will
618
- complete with one of the following result codes. If found, the
619
- .I res
620
- field of the cqe will contain 0. If not found,
621
- .I res
622
- will contain -ENOENT. If found and attempted canceled, the
623
- .I res
624
- field will contain -EALREADY. In this case, the request may or may not
625
- terminate. In general, requests that are interruptible (like socket IO) will
626
- get canceled, while disk IO requests cannot be canceled if already started.
627
- Available since 5.5.
628
-
629
- .TP
630
- .B IORING_OP_LINK_TIMEOUT
631
- This request must be linked with another request through
632
- .I IOSQE_IO_LINK
633
- which is described below. Unlike
634
- .I IORING_OP_TIMEOUT,
635
- .I IORING_OP_LINK_TIMEOUT
636
- acts on the linked request, not the completion queue. The format of the command
637
- is otherwise like
638
- .I IORING_OP_TIMEOUT,
639
- except there's no completion event count as it's tied to a specific request.
640
- If used, the timeout specified in the command will cancel the linked command,
641
- unless the linked command completes before the timeout. The timeout will
642
- complete with
643
- .I -ETIME
644
- if the timer expired and the linked request was attempted canceled, or
645
- .I -ECANCELED
646
- if the timer got canceled because of completion of the linked request. Like
647
- .B IORING_OP_TIMEOUT
648
- the clock source used is
649
- .BR CLOCK_MONOTONIC .
650
- Available since 5.5.
651
-
652
-
653
- .TP
654
- .B IORING_OP_CONNECT
655
- Issue the equivalent of a
656
- .BR connect(2)
657
- system call.
658
- .I fd
659
- must be set to the socket file descriptor,
660
- .I addr
661
- must contain the const pointer to the sockaddr structure, and
662
- .I off
663
- must contain the socklen_t addrlen field. See also
664
- .BR connect(2)
665
- for the general description of the related system call. Available since 5.5.
666
-
667
- .TP
668
- .B IORING_OP_FALLOCATE
669
- Issue the equivalent of a
670
- .BR fallocate(2)
671
- system call.
672
- .I fd
673
- must be set to the file descriptor,
674
- .I len
675
- must contain the mode associated with the operation,
676
- .I off
677
- must contain the offset on which to operate, and
678
- .I addr
679
- must contain the length. See also
680
- .BR fallocate(2)
681
- for the general description of the related system call. Available since 5.6.
682
-
683
- .TP
684
- .B IORING_OP_FADVISE
685
- Issue the equivalent of a
686
- .BR posix_fadvise(2)
687
- system call.
688
- .I fd
689
- must be set to the file descriptor,
690
- .I off
691
- must contain the offset on which to operate,
692
- .I len
693
- must contain the length, and
694
- .I fadvise_advice
695
- must contain the advice associated with the operation. See also
696
- .BR posix_fadvise(2)
697
- for the general description of the related system call. Available since 5.6.
698
-
699
- .TP
700
- .B IORING_OP_MADVISE
701
- Issue the equivalent of a
702
- .BR madvise(2)
703
- system call.
704
- .I addr
705
- must contain the address to operate on,
706
- .I len
707
- must contain the length on which to operate,
708
- and
709
- .I fadvise_advice
710
- must contain the advice associated with the operation. See also
711
- .BR madvise(2)
712
- for the general description of the related system call. Available since 5.6.
713
-
714
- .TP
715
- .B IORING_OP_OPENAT
716
- Issue the equivalent of a
717
- .BR openat(2)
718
- system call.
719
- .I fd
720
- is the
721
- .I dirfd
722
- argument,
723
- .I addr
724
- must contain a pointer to the
725
- .I *pathname
726
- argument,
727
- .I open_flags
728
- should contain any flags passed in, and
729
- .I len
730
- is the access mode of the file. See also
731
- .BR openat(2)
732
- for the general description of the related system call. Available since 5.6.
733
-
734
- If the
735
- .I file_index
736
- field is set to a positive number, the file won't be installed into the
737
- normal file table as usual but will be placed into the fixed file table at index
738
- .I file_index - 1.
739
- In this case, instead of returning a file descriptor, the result will contain
740
- either 0 on success or an error. If the index points to a valid empty slot, the
741
- installation is guaranteed to not fail. If there is already a file in the slot,
742
- it will be replaced, similar to
743
- .B IORING_OP_FILES_UPDATE.
744
- Please note that only io_uring has access to such files and no other syscall
745
- can use them. See
746
- .B IOSQE_FIXED_FILE
747
- and
748
- .B IORING_REGISTER_FILES.
749
-
750
- Available since 5.15.
751
-
752
- .TP
753
- .B IORING_OP_OPENAT2
754
- Issue the equivalent of a
755
- .BR openat2(2)
756
- system call.
757
- .I fd
758
- is the
759
- .I dirfd
760
- argument,
761
- .I addr
762
- must contain a pointer to the
763
- .I *pathname
764
- argument,
765
- .I len
766
- should contain the size of the open_how structure, and
767
- .I off
768
- should be set to the address of the open_how structure. See also
769
- .BR openat2(2)
770
- for the general description of the related system call. Available since 5.6.
771
-
772
- If the
773
- .I file_index
774
- field is set to a positive number, the file won't be installed into the
775
- normal file table as usual but will be placed into the fixed file table at index
776
- .I file_index - 1.
777
- In this case, instead of returning a file descriptor, the result will contain
778
- either 0 on success or an error. If the index points to a valid empty slot, the
779
- installation is guaranteed to not fail. If there is already a file in the slot,
780
- it will be replaced, similar to
781
- .B IORING_OP_FILES_UPDATE.
782
- Please note that only io_uring has access to such files and no other syscall
783
- can use them. See
784
- .B IOSQE_FIXED_FILE
785
- and
786
- .B IORING_REGISTER_FILES.
787
-
788
- Available since 5.15.
789
-
790
- .TP
791
- .B IORING_OP_CLOSE
792
- Issue the equivalent of a
793
- .BR close(2)
794
- system call.
795
- .I fd
796
- is the file descriptor to be closed. See also
797
- .BR close(2)
798
- for the general description of the related system call. Available since 5.6.
799
- If the
800
- .I file_index
801
- field is set to a positive number, this command can be used to close files
802
- that were direct opened through
803
- .B IORING_OP_OPENAT
804
- ,
805
- .B IORING_OP_OPENAT2
806
- , or
807
- .B IORING_OP_ACCEPT
808
- using the io_uring specific direct descriptors. Note that only one of the
809
- descriptor fields may be set. The direct close feature is available since
810
- the 5.15 kernel, where direct descriptors were introduced.
811
-
812
- .TP
813
- .B IORING_OP_STATX
814
- Issue the equivalent of a
815
- .BR statx(2)
816
- system call.
817
- .I fd
818
- is the
819
- .I dirfd
820
- argument,
821
- .I addr
822
- must contain a pointer to the
823
- .I *pathname
824
- string,
825
- .I statx_flags
826
- is the
827
- .I flags
828
- argument,
829
- .I len
830
- should be the
831
- .I mask
832
- argument, and
833
- .I off
834
- must contain a pointer to the
835
- .I statxbuf
836
- to be filled in. See also
837
- .BR statx(2)
838
- for the general description of the related system call. Available since 5.6.
839
-
840
- .TP
841
- .B IORING_OP_READ
842
- .TP
843
- .B IORING_OP_WRITE
844
- Issue the equivalent of a
845
- .BR pread(2)
846
- or
847
- .BR pwrite(2)
848
- system call.
849
- .I fd
850
- is the file descriptor to be operated on,
851
- .I addr
852
- contains the buffer in question,
853
- .I len
854
- contains the length of the IO operation, and
855
- .I off
856
- contains the read or write offset. If
857
- .I fd
858
- does not refer to a seekable file,
859
- .I off
860
- must be set to zero or -1. If
861
- .I off
862
- is set to
863
- .B -1
864
- , the offset will use (and advance) the file position, like the
865
- .BR read(2)
866
- and
867
- .BR write(2)
868
- system calls. These are non-vectored versions of the
869
- .B IORING_OP_READV
870
- and
871
- .B IORING_OP_WRITEV
872
- opcodes. See also
873
- .BR read(2)
874
- and
875
- .BR write(2)
876
- for the general description of the related system call. Available since 5.6.
877
-
878
- .TP
879
- .B IORING_OP_SPLICE
880
- Issue the equivalent of a
881
- .BR splice(2)
882
- system call.
883
- .I splice_fd_in
884
- is the file descriptor to read from,
885
- .I splice_off_in
886
- is an offset to read from,
887
- .I fd
888
- is the file descriptor to write to,
889
- .I off
890
- is an offset from which to start writing to. A sentinel value of
891
- .B -1
892
- is used to pass the equivalent of a NULL for the offsets to
893
- .BR splice(2).
894
- .I len
895
- contains the number of bytes to copy.
896
- .I splice_flags
897
- contains a bit mask for the flag field associated with the system call.
898
- Please note that one of the file descriptors must refer to a pipe.
899
- See also
900
- .BR splice(2)
901
- for the general description of the related system call. Available since 5.7.
902
-
903
- .TP
904
- .B IORING_OP_TEE
905
- Issue the equivalent of a
906
- .BR tee(2)
907
- system call.
908
- .I splice_fd_in
909
- is the file descriptor to read from,
910
- .I fd
911
- is the file descriptor to write to,
912
- .I len
913
- contains the number of bytes to copy, and
914
- .I splice_flags
915
- contains a bit mask for the flag field associated with the system call.
916
- Please note that both of the file descriptors must refer to a pipe.
917
- See also
918
- .BR tee(2)
919
- for the general description of the related system call. Available since 5.8.
920
-
921
- .TP
922
- .B IORING_OP_FILES_UPDATE
923
- This command is an alternative to using
924
- .B IORING_REGISTER_FILES_UPDATE
925
- which then works in an async fashion, like the rest of the io_uring commands.
926
- The arguments passed in are the same.
927
- .I addr
928
- must contain a pointer to the array of file descriptors,
929
- .I len
930
- must contain the length of the array, and
931
- .I off
932
- must contain the offset at which to operate. Note that the array of file
933
- descriptors pointed to in
934
- .I addr
935
- must remain valid until this operation has completed. Available since 5.6.
936
-
937
- .TP
938
- .B IORING_OP_PROVIDE_BUFFERS
939
- This command allows an application to register a group of buffers to be used
940
- by commands that read/receive data. Using buffers in this manner can eliminate
941
- the need to separate the poll + read, which provides a convenient point in
942
- time to allocate a buffer for a given request. It's often infeasible to have
943
- as many buffers available as pending reads or receives. With this feature, the
944
- application can have its pool of buffers ready in the kernel, and when the
945
- file or socket is ready to read/receive data, a buffer can be selected for the
946
- operation.
947
- .I fd
948
- must contain the number of buffers to provide,
949
- .I addr
950
- must contain the starting address to add buffers from,
951
- .I len
952
- must contain the length of each buffer to add from the range,
953
- .I buf_group
954
- must contain the group ID of this range of buffers, and
955
- .I off
956
- must contain the starting buffer ID of this range of buffers. With that set,
957
- the kernel adds buffers starting with the memory address in
958
- .I addr,
959
- each with a length of
960
- .I len.
961
- Hence the application should provide
962
- .I len * fd
963
- worth of memory in
964
- .I addr.
965
- Buffers are grouped by the group ID, and each buffer within this group will be
966
- identical in size according to the above arguments. This allows the application
967
- to provide different groups of buffers, and this is often used to have
968
- differently sized buffers available depending on what the expectations are of
969
- the individual request. When submitting a request that should use a provided
970
- buffer, the
971
- .B IOSQE_BUFFER_SELECT
972
- flag must be set, and
973
- .I buf_group
974
- must be set to the desired buffer group ID where the buffer should be selected
975
- from. Available since 5.7.
976
-
977
- .TP
978
- .B IORING_OP_REMOVE_BUFFERS
979
- Remove buffers previously registered with
980
- .B IORING_OP_PROVIDE_BUFFERS.
981
- .I fd
982
- must contain the number of buffers to remove, and
983
- .I buf_group
984
- must contain the buffer group ID from which to remove the buffers. Available
985
- since 5.7.
986
-
987
- .TP
988
- .B IORING_OP_SHUTDOWN
989
- Issue the equivalent of a
990
- .BR shutdown(2)
991
- system call.
992
- .I fd
993
- is the file descriptor to the socket being shut down, and
994
- .I len
995
- must be set to the
996
- .I how
997
- argument. No other fields should be set. Available since 5.11.
998
-
999
- .TP
1000
- .B IORING_OP_RENAMEAT
1001
- Issue the equivalent of a
1002
- .BR renameat2(2)
1003
- system call.
1004
- .I fd
1005
- should be set to the
1006
- .I olddirfd,
1007
- .I addr
1008
- should be set to the
1009
- .I oldpath,
1010
- .I len
1011
- should be set to the
1012
- .I newdirfd,
1013
- .I addr
1014
- should be set to the
1015
- .I oldpath,
1016
- .I addr2
1017
- should be set to the
1018
- .I newpath,
1019
- and finally
1020
- .I rename_flags
1021
- should be set to the
1022
- .I flags
1023
- passed in to
1024
- .BR renameat2(2).
1025
- Available since 5.11.
1026
-
1027
- .TP
1028
- .B IORING_OP_UNLINKAT
1029
- Issue the equivalent of a
1030
- .BR unlinkat(2)
1031
- system call.
1032
- .I fd
1033
- should be set to the
1034
- .I dirfd,
1035
- .I addr
1036
- should be set to the
1037
- .I pathname,
1038
- and
1039
- .I unlink_flags
1040
- should be set to the
1041
- .I flags
1042
- being passed in to
1043
- .BR unlinkat(2).
1044
- Available since 5.11.
1045
-
1046
- .TP
1047
- .B IORING_OP_MKDIRAT
1048
- Issue the equivalent of a
1049
- .BR mkdirat(2)
1050
- system call.
1051
- .I fd
1052
- should be set to the
1053
- .I dirfd,
1054
- .I addr
1055
- should be set to the
1056
- .I pathname,
1057
- and
1058
- .I len
1059
- should be set to the
1060
- .I mode
1061
- being passed in to
1062
- .BR mkdirat(2).
1063
- Available since 5.15.
1064
-
1065
- .TP
1066
- .B IORING_OP_SYMLINKAT
1067
- Issue the equivalent of a
1068
- .BR symlinkat(2)
1069
- system call.
1070
- .I fd
1071
- should be set to the
1072
- .I newdirfd,
1073
- .I addr
1074
- should be set to the
1075
- .I target
1076
- and
1077
- .I addr2
1078
- should be set to the
1079
- .I linkpath
1080
- being passed in to
1081
- .BR symlinkat(2).
1082
- Available since 5.15.
1083
-
1084
- .TP
1085
- .B IORING_OP_LINKAT
1086
- Issue the equivalent of a
1087
- .BR linkat(2)
1088
- system call.
1089
- .I fd
1090
- should be set to the
1091
- .I olddirfd,
1092
- .I addr
1093
- should be set to the
1094
- .I oldpath,
1095
- .I len
1096
- should be set to the
1097
- .I newdirfd,
1098
- .I addr2
1099
- should be set to the
1100
- .I newpath,
1101
- and
1102
- .I hardlink_flags
1103
- should be set to the
1104
- .I flags
1105
- being passed in to
1106
- .BR linkat(2).
1107
- Available since 5.15.
1108
-
1109
- .TP
1110
- .B IORING_OP_MSG_RING
1111
- Send a message to an io_uring.
1112
- .I fd
1113
- must be set to a file descriptor of a ring that the application has access to,
1114
- .I len
1115
- can be set to any 32-bit value that the application wishes to pass on, and
1116
- .I off
1117
- should be set to any 64-bit value that the application wishes to send. On the
1118
- target ring, a CQE will be posted with the
1119
- .I res
1120
- field matching the
1121
- .I len
1122
- set, and a
1123
- .I user_data
1124
- field matching the
1125
- .I off
1126
- value being passed in. This request type can be used to either just wake or
1127
- interrupt anyone waiting for completions on the target ring, or it can be used
1128
- to pass messages via the two fields. Available since 5.18.
1129
-
1130
- .TP
1131
- .B IORING_OP_SOCKET
1132
- Issue the equivalent of a
1133
- .BR socket(2)
1134
- system call.
1135
- .I fd
1136
- must contain the communication domain,
1137
- .I off
1138
- must contain the communication type,
1139
- .I len
1140
- must contain the protocol, and
1141
- .I rw_flags
1142
- is currently unused and must be set to zero. See also
1143
- .BR socket(2)
1144
- for the general description of the related system call. Available since 5.19.
1145
-
1146
- If the
1147
- .I file_index
1148
- field is set to a positive number, the file won't be installed into the
1149
- normal file table as usual but will be placed into the fixed file table at index
1150
- .I file_index - 1.
1151
- In this case, instead of returning a file descriptor, the result will contain
1152
- either 0 on success or an error. If the index points to a valid empty slot, the
1153
- installation is guaranteed to not fail. If there is already a file in the slot,
1154
- it will be replaced, similar to
1155
- .B IORING_OP_FILES_UPDATE.
1156
- Please note that only io_uring has access to such files and no other syscall
1157
- can use them. See
1158
- .B IOSQE_FIXED_FILE
1159
- and
1160
- .B IORING_REGISTER_FILES.
1161
-
1162
- Available since 5.19.
1163
-
1164
- .TP
1165
- .B IORING_OP_SEND_ZC
1166
- Issue the zerocopy equivalent of a
1167
- .BR send(2)
1168
- system call. Similar to IORING_OP_SEND, but tries to avoid making intermediate
1169
- copies of data. Zerocopy execution is not guaranteed and may fall back to
1170
- copying. The request may also fail with
1171
- .B -EOPNOTSUPP ,
1172
- when a protocol doesn't support zerocopy, in which case users are recommended
1173
- to use copying sends instead.
1174
-
1175
- The
1176
- .I flags
1177
- field of the first
1178
- .I "struct io_uring_cqe"
1179
- may likely contain
1180
- .B IORING_CQE_F_MORE ,
1181
- which means that there will be a second completion event / notification for
1182
- the request, with the
1183
- .I user_data
1184
- field set to the same value. The user must not modify the data buffer until the
1185
- notification is posted. The first cqe follows the usual rules and so its
1186
- .I res
1187
- field will contain the number of bytes sent or a negative error code. The
1188
- notification's
1189
- .I res
1190
- field will be set to zero and the
1191
- .I flags
1192
- field will contain
1193
- .B IORING_CQE_F_NOTIF .
1194
- The two step model is needed because the kernel may hold on to buffers for a
1195
- long time, e.g. waiting for a TCP ACK, and having a separate cqe for request
1196
- completions allows userspace to push more data without extra delays. Note,
1197
- notifications are only responsible for controlling the lifetime of the buffers,
1198
- and as such don't mean anything about whether the data has actually been sent
1199
- out or received by the other end. Even errored requests may generate a
1200
- notification, and the user must check for
1201
- .B IORING_CQE_F_MORE
1202
- rather than relying on the result.
1203
-
1204
- .I fd
1205
- must be set to the socket file descriptor,
1206
- .I addr
1207
- must contain a pointer to the buffer,
1208
- .I len
1209
- denotes the length of the buffer to send, and
1210
- .I msg_flags
1211
- holds the flags associated with the system call. When
1212
- .I addr2
1213
- is non-zero it points to the address of the target with
1214
- .I addr_len
1215
- specifying its size, turning the request into a
1216
- .BR sendto(2)
1217
- system call equivalent.
1218
-
1219
- Available since 6.0.
1220
-
1221
- This command also supports the following modifiers in
1222
- .I ioprio:
1223
-
1224
- .PP
1225
- .in +12
1226
- .B IORING_RECVSEND_POLL_FIRST
1227
- If set, io_uring will assume the socket is currently full and attempting to
1228
- send data will be unsuccessful. For this case, io_uring will arm internal
1229
- poll and trigger a send of the data when there is enough space available.
1230
- This initial send attempt can be wasteful for the case where the socket
1231
- is expected to be full, setting this flag will bypass the initial send
1232
- attempt and go straight to arming poll. If poll does indicate that data can
1233
- be sent, the operation will proceed.
1234
-
1235
- .B IORING_RECVSEND_FIXED_BUF
1236
- If set, instructs io_uring to use a pre-mapped buffer. The
1237
- .I buf_index
1238
- field should contain an index into an array of fixed buffers. See
1239
- .BR io_uring_register (2)
1240
- for details on how to setup a context for fixed buffer I/O.
1241
- .EE
1242
- .in
1243
- .PP
1244
-
1245
- .PP
1246
- The
1247
- .I flags
1248
- field is a bit mask. The supported flags are:
1249
- .TP
1250
- .B IOSQE_FIXED_FILE
1251
- When this flag is specified,
1252
- .I fd
1253
- is an index into the files array registered with the io_uring instance (see the
1254
- .B IORING_REGISTER_FILES
1255
- section of the
1256
- .BR io_uring_register (2)
1257
- man page). Note that this isn't always available for all commands. If used on
1258
- a command that doesn't support fixed files, the SQE will error with
1259
- .B -EBADF.
1260
- Available since 5.1.
1261
- .TP
1262
- .B IOSQE_IO_DRAIN
1263
- When this flag is specified, the SQE will not be started before previously
1264
- submitted SQEs have completed, and new SQEs will not be started before this
1265
- one completes. Available since 5.2.
1266
- .TP
1267
- .B IOSQE_IO_LINK
1268
- When this flag is specified, the SQE forms a link with the next SQE in the
1269
- submission ring. That next SQE will not be started before the previous request
1270
- completes. This, in effect, forms a chain of SQEs, which can be arbitrarily
1271
- long. The tail of the chain is denoted by the first SQE that does not have this
1272
- flag set. Chains are not supported across submission boundaries. Even if the
1273
- last SQE in a submission has this flag set, it will still terminate the current
1274
- chain. This flag has no effect on previous SQE submissions, nor does it impact
1275
- SQEs that are outside of the chain tail. This means that multiple chains can be
1276
- executing in parallel, or chains and individual SQEs. Only members inside the
1277
- chain are serialized. A chain of SQEs will be broken, if any request in that
1278
- chain ends in error. io_uring considers any unexpected result an error. This
1279
- means that, eg, a short read will also terminate the remainder of the chain.
1280
- If a chain of SQE links is broken, the remaining unstarted part of the chain
1281
- will be terminated and completed with
1282
- .B -ECANCELED
1283
- as the error code. Available since 5.3.
1284
- .TP
1285
- .B IOSQE_IO_HARDLINK
1286
- Like IOSQE_IO_LINK, but it doesn't sever regardless of the completion result.
1287
- Note that the link will still sever if we fail submitting the parent request,
1288
- hard links are only resilient in the presence of completion results for
1289
- requests that did submit correctly. IOSQE_IO_HARDLINK implies IOSQE_IO_LINK.
1290
- Available since 5.5.
1291
- .TP
1292
- .B IOSQE_ASYNC
1293
- Normal operation for io_uring is to try and issue an sqe as non-blocking first,
1294
- and if that fails, execute it in an async manner. To support more efficient
1295
- overlapped operation of requests that the application knows/assumes will
1296
- always (or most of the time) block, the application can ask for an sqe to be
1297
- issued async from the start. Available since 5.6.
1298
- .TP
1299
- .B IOSQE_BUFFER_SELECT
1300
- Used in conjunction with the
1301
- .B IORING_OP_PROVIDE_BUFFERS
1302
- command, which registers a pool of buffers to be used by commands that read
1303
- or receive data. When buffers are registered for this use case, and this
1304
- flag is set in the command, io_uring will grab a buffer from this pool when
1305
- the request is ready to receive or read data. If successful, the resulting CQE
1306
- will have
1307
- .B IORING_CQE_F_BUFFER
1308
- set in the flags part of the struct, and the upper
1309
- .B IORING_CQE_BUFFER_SHIFT
1310
- bits will contain the ID of the selected buffers. This allows the application
1311
- to know exactly which buffer was selected for the operation. If no buffers
1312
- are available and this flag is set, then the request will fail with
1313
- .B -ENOBUFS
1314
- as the error code. Once a buffer has been used, it is no longer available in
1315
- the kernel pool. The application must re-register the given buffer again when
1316
- it is ready to recycle it (eg has completed using it). Available since 5.7.
1317
- .TP
1318
- .B IOSQE_CQE_SKIP_SUCCESS
1319
- Don't generate a CQE if the request completes successfully. If the request
1320
- fails, an appropriate CQE will be posted as usual and if there is no
1321
- .B IOSQE_IO_HARDLINK,
1322
- CQEs for all linked requests will be omitted. The notion of failure/success is
1323
- opcode specific and is the same as with breaking chains of
1324
- .B IOSQE_IO_LINK.
1325
- One special case is when the request has a linked timeout, then the CQE
1326
- generation for the linked timeout is decided solely by whether it has
1327
- .B IOSQE_CQE_SKIP_SUCCESS
1328
- set, regardless of whether it timed out or was canceled. In other words, if a
1329
- linked timeout has the flag set, it's guaranteed to not post a CQE.
1330
-
1331
- The semantics are chosen to accommodate several use cases. First, when all but
1332
- the last request of a normal link without linked timeouts are marked with the
1333
- flag, only one CQE per link is posted. Additionally, it enables suppression of
1334
- CQEs in cases where the side effects of a successfully executed operation are
1335
- enough for userspace to know the state of the system. One such example would
1336
- be writing to a synchronisation file.
1337
-
1338
- This flag is incompatible with
1339
- .B IOSQE_IO_DRAIN.
1340
- Using both of them in a single ring is undefined behavior, even when they are
1341
- not used together in a single request. Currently, after the first request with
1342
- .B IOSQE_CQE_SKIP_SUCCESS,
1343
- all subsequent requests marked with drain will be failed at submission time.
1344
- Note that the error reporting is best effort only, and restrictions may change
1345
- in the future.
1346
-
1347
- Available since 5.17.
1348
-
1349
- .PP
1350
- .I ioprio
1351
- specifies the I/O priority. See
1352
- .BR ioprio_get (2)
1353
- for a description of Linux I/O priorities.
1354
-
1355
- .I fd
1356
- specifies the file descriptor against which the operation will be
1357
- performed, with the exception noted above.
1358
-
1359
- If the operation is one of
1360
- .B IORING_OP_READ_FIXED
1361
- or
1362
- .BR IORING_OP_WRITE_FIXED ,
1363
- .I addr
1364
- and
1365
- .I len
1366
- must fall within the buffer located at
1367
- .I buf_index
1368
- in the fixed buffer array. If the operation is either
1369
- .B IORING_OP_READV
1370
- or
1371
- .BR IORING_OP_WRITEV ,
1372
- then
1373
- .I addr
1374
- points to an iovec array of
1375
- .I len
1376
- entries.
1377
-
1378
- .IR rw_flags ,
1379
- specified for read and write operations, contains a bitwise OR of
1380
- per-I/O flags, as described in the
1381
- .BR preadv2 (2)
1382
- man page.
1383
-
1384
- The
1385
- .I fsync_flags
1386
- bit mask may contain either 0, for a normal file integrity sync, or
1387
- .B IORING_FSYNC_DATASYNC
1388
- to provide data sync only semantics. See the descriptions of
1389
- .B O_SYNC
1390
- and
1391
- .B O_DSYNC
1392
- in the
1393
- .BR open (2)
1394
- manual page for more information.
1395
-
1396
- The bits that may be set in
1397
- .I poll_events
1398
- are defined in \fI<poll.h>\fP, and documented in
1399
- .BR poll (2).
1400
-
1401
- .I user_data
1402
- is an application-supplied value that will be copied into
1403
- the completion queue entry (see below).
1404
- .I buf_index
1405
- is an index into an array of fixed buffers, and is only valid if fixed
1406
- buffers were registered.
1407
- .I personality
1408
- is the credentials id to use for this operation. See
1409
- .BR io_uring_register(2)
1410
- for how to register personalities with io_uring. If set to 0, the current
1411
- personality of the submitting task is used.
1412
- .PP
1413
- Once the submission queue entry is initialized, I/O is submitted by
1414
- placing the index of the submission queue entry into the tail of the
1415
- submission queue. After one or more indexes are added to the queue,
1416
- and the queue tail is advanced, the
1417
- .BR io_uring_enter (2)
1418
- system call can be invoked to initiate the I/O.
1419
-
1420
- Completions use the following data structure:
1421
- .PP
1422
- .in +4n
1423
- .EX
1424
- /*
1425
- * IO completion data structure (Completion Queue Entry)
1426
- */
1427
- struct io_uring_cqe {
1428
- __u64 user_data; /* sqe->data submission passed back */
1429
- __s32 res; /* result code for this event */
1430
- __u32 flags;
1431
- };
1432
- .EE
1433
- .in
1434
- .PP
1435
- .I user_data
1436
- is copied from the field of the same name in the submission queue
1437
- entry. The primary use case is to store data that the application
1438
- will need to access upon completion of this particular I/O. The
1439
- .I flags
1440
- is used for certain commands, like
1441
- .B IORING_OP_POLL_ADD
1442
- or in conjunction with
1443
- .B IOSQE_BUFFER_SELECT
1444
- or
1445
- .B IORING_OP_MSG_RING,
1446
- see those entries for details.
1447
- .I res
1448
- is the operation-specific result, but io_uring-specific errors
1449
- (e.g. flags or opcode invalid) are returned through this field.
1450
- They are described in section
1451
- .B CQE ERRORS.
1452
- .PP
1453
- For read and write opcodes, the
1454
- return values match
1455
- .I errno
1456
- values documented in the
1457
- .BR preadv2 (2)
1458
- and
1459
- .BR pwritev2 (2)
1460
- man pages, with
1461
- .I
1462
- res
1463
- holding the equivalent of
1464
- .I -errno
1465
- for error cases, or the transferred number of bytes in case the operation
1466
- is successful. Hence both error and success return can be found in that
1467
- field in the CQE. For other request types, the return values are documented
1468
- in the matching man page for that type, or in the opcodes section above for
1469
- io_uring-specific opcodes.
1470
- .PP
1471
- .SH RETURN VALUE
1472
- .BR io_uring_enter (2)
1473
- returns the number of I/Os successfully consumed. This can be zero
1474
- if
1475
- .I to_submit
1476
- was zero or if the submission queue was empty. Note that if the ring was
1477
- created with
1478
- .B IORING_SETUP_SQPOLL
1479
- specified, then the return value will generally be the same as
1480
- .I to_submit
1481
- as submission happens outside the context of the system call.
1482
-
1483
- The errors related to a submission queue entry will be returned through a
1484
- completion queue entry (see section
1485
- .B CQE ERRORS),
1486
- rather than through the system call itself.
1487
-
1488
- Errors that occur not on behalf of a submission queue entry are returned via the
1489
- system call directly. On such an error, a negative error code is returned. The
1490
- caller should not rely on the
1491
- .I errno
1492
- variable.
1493
- .PP
1494
- .SH ERRORS
1495
- These are the errors returned by
1496
- .BR io_uring_enter (2)
1497
- system call.
1498
- .TP
1499
- .B EAGAIN
1500
- The kernel was unable to allocate memory for the request, or otherwise ran out
1501
- of resources to handle it. The application should wait for some completions and
1502
- try again.
1503
- .TP
1504
- .B EBADF
1505
- .I fd
1506
- is not a valid file descriptor.
1507
- .TP
1508
- .B EBADFD
1509
- .I fd
1510
- is a valid file descriptor, but the io_uring ring is not in the right state
1511
- (enabled). See
1512
- .BR io_uring_register (2)
1513
- for details on how to enable the ring.
1514
- .TP
1515
- .B EBADR
1516
- At least one CQE was dropped even with the
1517
- .B IORING_FEAT_NODROP
1518
- feature, and there are no otherwise available CQEs. This clears the error state
1519
- and so with no other changes the next call to
1520
- .BR io_uring_enter (2)
1521
- will not have this error. This error should be extremely rare and indicates the
1522
- machine is running critically low on memory. It may be reasonable for the
1523
- application to terminate running unless it is able to safely handle any CQE
1524
- being lost.
1525
- .TP
1526
- .B EBUSY
1527
- If the
1528
- .B IORING_FEAT_NODROP
1529
- feature flag is set, then
1530
- .B EBUSY
1531
- will be returned if there were overflow entries, the
1532
- .B IORING_ENTER_GETEVENTS
1533
- flag is set and not all of the overflow entries were able to be flushed to
1534
- the CQ ring.
1535
-
1536
- Without
1537
- .B IORING_FEAT_NODROP
1538
- the application is attempting to overcommit the number of requests it can have
1539
- pending. The application should wait for some completions and try again. May
1540
- occur if the application tries to queue more requests than we have room for in
1541
- the CQ ring, or if the application attempts to wait for more events without
1542
- having reaped the ones already present in the CQ ring.
1543
- .TP
1544
- .B EINVAL
1545
- Some bits in the
1546
- .I flags
1547
- argument are invalid.
1548
- .TP
1549
- .B EFAULT
1550
- An invalid user space address was specified for the
1551
- .I sig
1552
- argument.
1553
- .TP
1554
- .B ENXIO
1555
- The io_uring instance is in the process of being torn down.
1556
- .TP
1557
- .B EOPNOTSUPP
1558
- .I fd
1559
- does not refer to an io_uring instance.
1560
- .TP
1561
- .B EINTR
1562
- The operation was interrupted by a delivery of a signal before it could
1563
- complete; see
1564
- .BR signal (7).
1565
- Can happen while waiting for events with
1566
- .BR IORING_ENTER_GETEVENTS .
1567
-
1568
- .SH CQE ERRORS
1569
- These io_uring-specific errors are returned as a negative value in the
1570
- .I res
1571
- field of the completion queue entry.
1572
- .TP
1573
- .B EACCES
1574
- The
1575
- .I flags
1576
- field or
1577
- .I opcode
1578
- in a submission queue entry is not allowed due to registered restrictions.
1579
- See
1580
- .BR io_uring_register (2)
1581
- for details on how restrictions work.
1582
- .TP
1583
- .B EBADF
1584
- The
1585
- .I fd
1586
- field in the submission queue entry is invalid, or the
1587
- .B IOSQE_FIXED_FILE
1588
- flag was set in the submission queue entry, but no files were registered
1589
- with the io_uring instance.
1590
- .TP
1591
- .B EFAULT
1592
- The buffer is outside of the process' accessible address space.
1593
- .TP
1594
- .B EFAULT
1595
- .B IORING_OP_READ_FIXED
1596
- or
1597
- .B IORING_OP_WRITE_FIXED
1598
- was specified in the
1599
- .I opcode
1600
- field of the submission queue entry, but either buffers were not
1601
- registered for this io_uring instance, or the address range described
1602
- by
1603
- .I addr
1604
- and
1605
- .I len
1606
- does not fit within the buffer registered at
1607
- .IR buf_index .
1608
- .TP
1609
- .B EINVAL
1610
- The
1611
- .I flags
1612
- field or
1613
- .I opcode
1614
- in a submission queue entry is invalid.
1615
- .TP
1616
- .B EINVAL
1617
- The
1618
- .I buf_index
1619
- member of the submission queue entry is invalid.
1620
- .TP
1621
- .B EINVAL
1622
- The
1623
- .I personality
1624
- field in a submission queue entry is invalid.
1625
- .TP
1626
- .B EINVAL
1627
- .B IORING_OP_NOP
1628
- was specified in the submission queue entry, but the io_uring context
1629
- was set up for polling
1630
- .RB ( IORING_SETUP_IOPOLL
1631
- was specified in the call to io_uring_setup).
1632
- .TP
1633
- .B EINVAL
1634
- .B IORING_OP_READV
1635
- or
1636
- .B IORING_OP_WRITEV
1637
- was specified in the submission queue entry, but the io_uring instance
1638
- has fixed buffers registered.
1639
- .TP
1640
- .B EINVAL
1641
- .B IORING_OP_READ_FIXED
1642
- or
1643
- .B IORING_OP_WRITE_FIXED
1644
- was specified in the submission queue entry, and the
1645
- .I buf_index
1646
- is invalid.
1647
- .TP
1648
- .B EINVAL
1649
- .BR IORING_OP_READV ,
1650
- .BR IORING_OP_WRITEV ,
1651
- .BR IORING_OP_READ_FIXED ,
1652
- .B IORING_OP_WRITE_FIXED
1653
- or
1654
- .B IORING_OP_FSYNC
1655
- was specified in the submission queue entry, but the io_uring instance
1656
- was configured for IOPOLLing, or any of
1657
- .IR addr ,
1658
- .IR ioprio ,
1659
- .IR off ,
1660
- .IR len ,
1661
- or
1662
- .I buf_index
1663
- was set in the submission queue entry.
1664
- .TP
1665
- .B EINVAL
1666
- .B IORING_OP_POLL_ADD
1667
- or
1668
- .B IORING_OP_POLL_REMOVE
1669
- was specified in the
1670
- .I opcode
1671
- field of the submission queue entry, but the io_uring instance was
1672
- configured for busy-wait polling
1673
- .RB ( IORING_SETUP_IOPOLL ),
1674
- or any of
1675
- .IR ioprio ,
1676
- .IR off ,
1677
- .IR len ,
1678
- or
1679
- .I buf_index
1680
- was non-zero in the submission queue entry.
1681
- .TP
1682
- .B EINVAL
1683
- .B IORING_OP_POLL_ADD
1684
- was specified in the
1685
- .I opcode
1686
- field of the submission queue entry, and the
1687
- .I addr
1688
- field was non-zero.
1689
- .TP
1690
- .B EOPNOTSUPP
1691
- .I opcode
1692
- is valid, but not supported by this kernel.
1693
- .TP
1694
- .B EOPNOTSUPP
1695
- .B IOSQE_BUFFER_SELECT
1696
- was set in the
1697
- .I flags
1698
- field of the submission queue entry, but the
1699
- .I opcode
1700
- doesn't support buffer selection.