polyphony 1.0.1 → 1.0.2

Files changed (158)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/TODO.md +4 -0
  4. data/examples/core/debug.rb +12 -0
  5. data/examples/core/rpc_benchmark.rb +136 -0
  6. data/lib/polyphony/extensions/fiber.rb +1 -0
  7. data/lib/polyphony/extensions/socket.rb +42 -42
  8. data/lib/polyphony/version.rb +1 -1
  9. data/polyphony.gemspec +3 -1
  10. data/test/test_socket.rb +1 -1
  11. metadata +32 -149
  12. data/vendor/liburing/man/IO_URING_CHECK_VERSION.3 +0 -1
  13. data/vendor/liburing/man/IO_URING_VERSION_MAJOR.3 +0 -1
  14. data/vendor/liburing/man/IO_URING_VERSION_MINOR.3 +0 -1
  15. data/vendor/liburing/man/io_uring.7 +0 -781
  16. data/vendor/liburing/man/io_uring_buf_ring_add.3 +0 -53
  17. data/vendor/liburing/man/io_uring_buf_ring_advance.3 +0 -31
  18. data/vendor/liburing/man/io_uring_buf_ring_cq_advance.3 +0 -41
  19. data/vendor/liburing/man/io_uring_buf_ring_init.3 +0 -30
  20. data/vendor/liburing/man/io_uring_buf_ring_mask.3 +0 -27
  21. data/vendor/liburing/man/io_uring_check_version.3 +0 -72
  22. data/vendor/liburing/man/io_uring_close_ring_fd.3 +0 -43
  23. data/vendor/liburing/man/io_uring_cq_advance.3 +0 -49
  24. data/vendor/liburing/man/io_uring_cq_has_overflow.3 +0 -25
  25. data/vendor/liburing/man/io_uring_cq_ready.3 +0 -26
  26. data/vendor/liburing/man/io_uring_cqe_get_data.3 +0 -53
  27. data/vendor/liburing/man/io_uring_cqe_get_data64.3 +0 -1
  28. data/vendor/liburing/man/io_uring_cqe_seen.3 +0 -42
  29. data/vendor/liburing/man/io_uring_enter.2 +0 -1700
  30. data/vendor/liburing/man/io_uring_enter2.2 +0 -1
  31. data/vendor/liburing/man/io_uring_free_probe.3 +0 -27
  32. data/vendor/liburing/man/io_uring_get_events.3 +0 -33
  33. data/vendor/liburing/man/io_uring_get_probe.3 +0 -30
  34. data/vendor/liburing/man/io_uring_get_sqe.3 +0 -57
  35. data/vendor/liburing/man/io_uring_major_version.3 +0 -1
  36. data/vendor/liburing/man/io_uring_minor_version.3 +0 -1
  37. data/vendor/liburing/man/io_uring_opcode_supported.3 +0 -30
  38. data/vendor/liburing/man/io_uring_peek_cqe.3 +0 -38
  39. data/vendor/liburing/man/io_uring_prep_accept.3 +0 -197
  40. data/vendor/liburing/man/io_uring_prep_accept_direct.3 +0 -1
  41. data/vendor/liburing/man/io_uring_prep_cancel.3 +0 -118
  42. data/vendor/liburing/man/io_uring_prep_cancel64.3 +0 -1
  43. data/vendor/liburing/man/io_uring_prep_close.3 +0 -59
  44. data/vendor/liburing/man/io_uring_prep_close_direct.3 +0 -1
  45. data/vendor/liburing/man/io_uring_prep_connect.3 +0 -66
  46. data/vendor/liburing/man/io_uring_prep_fadvise.3 +0 -59
  47. data/vendor/liburing/man/io_uring_prep_fallocate.3 +0 -59
  48. data/vendor/liburing/man/io_uring_prep_fgetxattr.3 +0 -1
  49. data/vendor/liburing/man/io_uring_prep_files_update.3 +0 -92
  50. data/vendor/liburing/man/io_uring_prep_fsetxattr.3 +0 -1
  51. data/vendor/liburing/man/io_uring_prep_fsync.3 +0 -70
  52. data/vendor/liburing/man/io_uring_prep_getxattr.3 +0 -61
  53. data/vendor/liburing/man/io_uring_prep_link.3 +0 -1
  54. data/vendor/liburing/man/io_uring_prep_link_timeout.3 +0 -94
  55. data/vendor/liburing/man/io_uring_prep_linkat.3 +0 -91
  56. data/vendor/liburing/man/io_uring_prep_madvise.3 +0 -56
  57. data/vendor/liburing/man/io_uring_prep_mkdir.3 +0 -1
  58. data/vendor/liburing/man/io_uring_prep_mkdirat.3 +0 -83
  59. data/vendor/liburing/man/io_uring_prep_msg_ring.3 +0 -92
  60. data/vendor/liburing/man/io_uring_prep_msg_ring_cqe_flags.3 +0 -1
  61. data/vendor/liburing/man/io_uring_prep_multishot_accept.3 +0 -1
  62. data/vendor/liburing/man/io_uring_prep_multishot_accept_direct.3 +0 -1
  63. data/vendor/liburing/man/io_uring_prep_nop.3 +0 -28
  64. data/vendor/liburing/man/io_uring_prep_openat.3 +0 -117
  65. data/vendor/liburing/man/io_uring_prep_openat2.3 +0 -117
  66. data/vendor/liburing/man/io_uring_prep_openat2_direct.3 +0 -1
  67. data/vendor/liburing/man/io_uring_prep_openat_direct.3 +0 -1
  68. data/vendor/liburing/man/io_uring_prep_poll_add.3 +0 -72
  69. data/vendor/liburing/man/io_uring_prep_poll_multishot.3 +0 -1
  70. data/vendor/liburing/man/io_uring_prep_poll_remove.3 +0 -55
  71. data/vendor/liburing/man/io_uring_prep_poll_update.3 +0 -89
  72. data/vendor/liburing/man/io_uring_prep_provide_buffers.3 +0 -140
  73. data/vendor/liburing/man/io_uring_prep_read.3 +0 -69
  74. data/vendor/liburing/man/io_uring_prep_read_fixed.3 +0 -72
  75. data/vendor/liburing/man/io_uring_prep_readv.3 +0 -85
  76. data/vendor/liburing/man/io_uring_prep_readv2.3 +0 -111
  77. data/vendor/liburing/man/io_uring_prep_recv.3 +0 -105
  78. data/vendor/liburing/man/io_uring_prep_recv_multishot.3 +0 -1
  79. data/vendor/liburing/man/io_uring_prep_recvmsg.3 +0 -124
  80. data/vendor/liburing/man/io_uring_prep_recvmsg_multishot.3 +0 -1
  81. data/vendor/liburing/man/io_uring_prep_remove_buffers.3 +0 -52
  82. data/vendor/liburing/man/io_uring_prep_rename.3 +0 -1
  83. data/vendor/liburing/man/io_uring_prep_renameat.3 +0 -96
  84. data/vendor/liburing/man/io_uring_prep_send.3 +0 -66
  85. data/vendor/liburing/man/io_uring_prep_send_set_addr.3 +0 -38
  86. data/vendor/liburing/man/io_uring_prep_send_zc.3 +0 -96
  87. data/vendor/liburing/man/io_uring_prep_send_zc_fixed.3 +0 -1
  88. data/vendor/liburing/man/io_uring_prep_sendmsg.3 +0 -89
  89. data/vendor/liburing/man/io_uring_prep_sendmsg_zc.3 +0 -1
  90. data/vendor/liburing/man/io_uring_prep_setxattr.3 +0 -64
  91. data/vendor/liburing/man/io_uring_prep_shutdown.3 +0 -53
  92. data/vendor/liburing/man/io_uring_prep_socket.3 +0 -118
  93. data/vendor/liburing/man/io_uring_prep_socket_direct.3 +0 -1
  94. data/vendor/liburing/man/io_uring_prep_socket_direct_alloc.3 +0 -1
  95. data/vendor/liburing/man/io_uring_prep_splice.3 +0 -120
  96. data/vendor/liburing/man/io_uring_prep_statx.3 +0 -74
  97. data/vendor/liburing/man/io_uring_prep_symlink.3 +0 -1
  98. data/vendor/liburing/man/io_uring_prep_symlinkat.3 +0 -85
  99. data/vendor/liburing/man/io_uring_prep_sync_file_range.3 +0 -59
  100. data/vendor/liburing/man/io_uring_prep_tee.3 +0 -74
  101. data/vendor/liburing/man/io_uring_prep_timeout.3 +0 -95
  102. data/vendor/liburing/man/io_uring_prep_timeout_remove.3 +0 -1
  103. data/vendor/liburing/man/io_uring_prep_timeout_update.3 +0 -98
  104. data/vendor/liburing/man/io_uring_prep_unlink.3 +0 -1
  105. data/vendor/liburing/man/io_uring_prep_unlinkat.3 +0 -82
  106. data/vendor/liburing/man/io_uring_prep_write.3 +0 -67
  107. data/vendor/liburing/man/io_uring_prep_write_fixed.3 +0 -72
  108. data/vendor/liburing/man/io_uring_prep_writev.3 +0 -85
  109. data/vendor/liburing/man/io_uring_prep_writev2.3 +0 -111
  110. data/vendor/liburing/man/io_uring_queue_exit.3 +0 -26
  111. data/vendor/liburing/man/io_uring_queue_init.3 +0 -89
  112. data/vendor/liburing/man/io_uring_queue_init_params.3 +0 -1
  113. data/vendor/liburing/man/io_uring_recvmsg_cmsg_firsthdr.3 +0 -1
  114. data/vendor/liburing/man/io_uring_recvmsg_cmsg_nexthdr.3 +0 -1
  115. data/vendor/liburing/man/io_uring_recvmsg_name.3 +0 -1
  116. data/vendor/liburing/man/io_uring_recvmsg_out.3 +0 -82
  117. data/vendor/liburing/man/io_uring_recvmsg_payload.3 +0 -1
  118. data/vendor/liburing/man/io_uring_recvmsg_payload_length.3 +0 -1
  119. data/vendor/liburing/man/io_uring_recvmsg_validate.3 +0 -1
  120. data/vendor/liburing/man/io_uring_register.2 +0 -834
  121. data/vendor/liburing/man/io_uring_register_buf_ring.3 +0 -140
  122. data/vendor/liburing/man/io_uring_register_buffers.3 +0 -104
  123. data/vendor/liburing/man/io_uring_register_buffers_sparse.3 +0 -1
  124. data/vendor/liburing/man/io_uring_register_buffers_tags.3 +0 -1
  125. data/vendor/liburing/man/io_uring_register_buffers_update_tag.3 +0 -1
  126. data/vendor/liburing/man/io_uring_register_eventfd.3 +0 -51
  127. data/vendor/liburing/man/io_uring_register_eventfd_async.3 +0 -1
  128. data/vendor/liburing/man/io_uring_register_file_alloc_range.3 +0 -52
  129. data/vendor/liburing/man/io_uring_register_files.3 +0 -112
  130. data/vendor/liburing/man/io_uring_register_files_sparse.3 +0 -1
  131. data/vendor/liburing/man/io_uring_register_files_tags.3 +0 -1
  132. data/vendor/liburing/man/io_uring_register_files_update.3 +0 -1
  133. data/vendor/liburing/man/io_uring_register_files_update_tag.3 +0 -1
  134. data/vendor/liburing/man/io_uring_register_iowq_aff.3 +0 -61
  135. data/vendor/liburing/man/io_uring_register_iowq_max_workers.3 +0 -71
  136. data/vendor/liburing/man/io_uring_register_ring_fd.3 +0 -49
  137. data/vendor/liburing/man/io_uring_register_sync_cancel.3 +0 -71
  138. data/vendor/liburing/man/io_uring_setup.2 +0 -669
  139. data/vendor/liburing/man/io_uring_sq_ready.3 +0 -31
  140. data/vendor/liburing/man/io_uring_sq_space_left.3 +0 -25
  141. data/vendor/liburing/man/io_uring_sqe_set_data.3 +0 -48
  142. data/vendor/liburing/man/io_uring_sqe_set_data64.3 +0 -1
  143. data/vendor/liburing/man/io_uring_sqe_set_flags.3 +0 -87
  144. data/vendor/liburing/man/io_uring_sqring_wait.3 +0 -34
  145. data/vendor/liburing/man/io_uring_submit.3 +0 -46
  146. data/vendor/liburing/man/io_uring_submit_and_get_events.3 +0 -31
  147. data/vendor/liburing/man/io_uring_submit_and_wait.3 +0 -38
  148. data/vendor/liburing/man/io_uring_submit_and_wait_timeout.3 +0 -56
  149. data/vendor/liburing/man/io_uring_unregister_buf_ring.3 +0 -30
  150. data/vendor/liburing/man/io_uring_unregister_buffers.3 +0 -27
  151. data/vendor/liburing/man/io_uring_unregister_eventfd.3 +0 -1
  152. data/vendor/liburing/man/io_uring_unregister_files.3 +0 -27
  153. data/vendor/liburing/man/io_uring_unregister_iowq_aff.3 +0 -1
  154. data/vendor/liburing/man/io_uring_unregister_ring_fd.3 +0 -32
  155. data/vendor/liburing/man/io_uring_wait_cqe.3 +0 -40
  156. data/vendor/liburing/man/io_uring_wait_cqe_nr.3 +0 -43
  157. data/vendor/liburing/man/io_uring_wait_cqe_timeout.3 +0 -53
  158. data/vendor/liburing/man/io_uring_wait_cqes.3 +0 -56
data/vendor/liburing/man/io_uring.7 (deleted)
@@ -1,781 +0,0 @@
- .\" Copyright (C) 2020 Shuveb Hussain <shuveb@gmail.com>
- .\" SPDX-License-Identifier: LGPL-2.0-or-later
- .\"
-
- .TH io_uring 7 2020-07-26 "Linux" "Linux Programmer's Manual"
- .SH NAME
- io_uring \- Asynchronous I/O facility
- .SH SYNOPSIS
- .nf
- .B "#include <linux/io_uring.h>"
- .fi
- .PP
- .SH DESCRIPTION
- .PP
- .B io_uring
- is a Linux-specific API for asynchronous I/O.
- It allows the user to submit one or more I/O requests,
- which are processed asynchronously without blocking the calling process.
- .B io_uring
- gets its name from ring buffers which are shared between user space and
- kernel space. This arrangement allows for efficient I/O,
- while avoiding the overhead of copying buffers between them,
- where possible.
- This interface makes
- .B io_uring
- different from other UNIX I/O APIs,
- in that,
- rather than just communicating between kernel and user space with system calls,
- ring buffers are used as the main mode of communication.
- This arrangement has various performance benefits which are discussed in a
- separate section below.
- This man page uses the terms shared buffers, shared ring buffers and
- queues interchangeably.
- .PP
- The general programming model you need to follow for
- .B io_uring
- is outlined below.
- .IP \(bu
- Set up shared buffers with
- .BR io_uring_setup (2)
- and
- .BR mmap (2),
- mapping into user space shared buffers for the submission queue (SQ) and the
- completion queue (CQ).
- You place I/O requests you want to make on the SQ,
- while the kernel places the results of those operations on the CQ.
- .IP \(bu
- For every I/O request you need to make (like to read a file, write a file,
- accept a socket connection, etc), you create a submission queue entry,
- or SQE,
- describe the I/O operation you need to get done and add it to the tail of
- the submission queue (SQ).
- Each I/O operation is,
- in essence,
- the equivalent of a system call you would have made otherwise,
- if you were not using
- .BR io_uring .
- You can add more than one SQE to the queue depending on the number of
- operations you want to request.
- .IP \(bu
- After you add one or more SQEs,
- you need to call
- .BR io_uring_enter (2)
- to tell the kernel to dequeue your I/O requests off the SQ and begin
- processing them.
- .IP \(bu
- For each SQE you submit,
- once it is done processing the request,
- the kernel places a completion queue event or CQE at the tail of the
- completion queue or CQ.
- The kernel places exactly one matching CQE in the CQ for every SQE you
- submit on the SQ.
- After you retrieve a CQE,
- minimally,
- you might be interested in checking the
- .I res
- field of the CQE structure,
- which corresponds to the return value of the system
- call's equivalent,
- had you used it directly without using
- .BR io_uring .
- For instance,
- a read operation under
- .BR io_uring ,
- started with the
- .BR IORING_OP_READ
- operation, issues the equivalent of the
- .BR read (2)
- system call. In practice, it mixes the semantics of
- .BR pread (2)
- and
- .BR preadv2 (2)
- in that it takes an explicit offset, and supports using -1 for the offset to
- indicate that the current file position should be used instead of passing in
- an explicit offset. See the opcode documentation for more details. Given that
- io_uring is an async interface,
- .I errno
- is never used for passing back error information. Instead,
- .I res
- will contain what the equivalent system call would have returned in case
- of success, and in case of error
- .I res
- will contain
- .I -errno .
- For example, if the normal read system call would have returned -1 and set
- .I errno
- to
- .B EINVAL ,
- then
- .I res
- would contain
- .B -EINVAL .
- If the normal system call would have returned a read size of 1024, then
- .I res
- would contain 1024.
- .IP \(bu
- Optionally,
- .BR io_uring_enter (2)
- can also wait for a specified number of requests to be processed by the kernel
- before it returns.
- If you specified a certain number of completions to wait for,
- the kernel would have placed at least that many CQEs on the CQ,
- which you can then readily read,
- right after the return from
- .BR io_uring_enter (2).
- .IP \(bu
- It is important to remember that I/O requests submitted to the kernel can
- complete in any order.
- It is not necessary for the kernel to process one request after another,
- in the order you placed them.
- Given that the interface is a ring,
- the requests are attempted in order,
- however that doesn't imply any sort of ordering on their completion.
- When more than one request is in flight,
- it is not possible to determine which one will complete first.
- When you dequeue CQEs off the CQ,
- you should always check which submitted request each one corresponds to.
- The most common method for doing so is utilizing the
- .I user_data
- field in the request, which is passed back on the completion side.
- .PP
- Adding to and reading from the queues:
- .IP \(bu
- You add SQEs to the tail of the SQ.
- The kernel reads SQEs off the head of the queue.
- .IP \(bu
- The kernel adds CQEs to the tail of the CQ.
- You read CQEs off the head of the queue.
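
The cycle described above (get an SQE, describe the operation, submit, then reap and correlate the CQE) is what liburing, vendored under data/vendor/liburing in this gem, wraps. A minimal sketch, assuming liburing is installed and linked with -luring; the file path and buffer size here are arbitrary illustration:

    #include <liburing.h>
    #include <fcntl.h>
    #include <stdio.h>

    int main(void) {
        struct io_uring ring;
        struct io_uring_cqe *cqe;
        char buf[4096];

        if (io_uring_queue_init(8, &ring, 0) < 0)           /* set up the SQ/CQ rings */
            return 1;

        int fd = open("/etc/hostname", O_RDONLY);           /* any readable file */
        struct io_uring_sqe *sqe = io_uring_get_sqe(&ring); /* SQE off the SQ tail */
        io_uring_prep_read(sqe, fd, buf, sizeof(buf), 0);   /* describe the operation */
        io_uring_sqe_set_data64(sqe, 1);                    /* user_data, echoed in the CQE */
        io_uring_submit(&ring);                             /* io_uring_enter(2) under the hood */

        io_uring_wait_cqe(&ring, &cqe);                     /* reap the CQE off the CQ head */
        printf("res=%d user_data=%llu\n", cqe->res,
               (unsigned long long) io_uring_cqe_get_data64(cqe));
        io_uring_cqe_seen(&ring, cqe);                      /* advance the CQ head */
        io_uring_queue_exit(&ring);
        return 0;
    }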
- .SS Submission queue polling
- One of the goals of
- .B io_uring
- is to provide a means for efficient I/O.
- To this end,
- .B io_uring
- supports a polling mode that lets you avoid the call to
- .BR io_uring_enter (2),
- which you use to inform the kernel that you have queued SQEs on to the SQ.
- With SQ Polling,
- .B io_uring
- starts a kernel thread that polls the submission queue for any I/O
- requests you submit by adding SQEs.
- With SQ Polling enabled,
- there is no need for you to call
- .BR io_uring_enter (2),
- letting you avoid the overhead of system calls.
- A designated kernel thread dequeues SQEs off the SQ as you add them and
- dispatches them for asynchronous processing.
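
SQ polling is requested at ring setup time via the IORING_SETUP_SQPOLL flag. A hedged sketch in terms of liburing (the raw io_uring_setup(2) interface takes the same io_uring_params; the idle timeout value here is an arbitrary example):

    #include <liburing.h>

    /* Sketch: ask the kernel for an SQ polling thread at setup time.
     * May require elevated privileges on older kernels; illustration only. */
    int setup_sqpoll_ring(struct io_uring *ring)
    {
        struct io_uring_params p = { 0 };
        p.flags = IORING_SETUP_SQPOLL;  /* kernel thread polls the SQ for new SQEs */
        p.sq_thread_idle = 2000;        /* ms of inactivity before the thread sleeps */
        /* Once the thread has gone idle, the next io_uring_submit() issues an
         * io_uring_enter(2) with IORING_ENTER_SQ_WAKEUP to revive it. */
        return io_uring_queue_init_params(8, ring, &p);
    }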
- .SS Setting up io_uring
- .PP
- The main steps in setting up
- .B io_uring
- consist of mapping in the shared buffers with
- .BR mmap (2)
- calls.
- In the example program included in this man page,
- the function
- .BR app_setup_uring ()
- sets up
- .B io_uring
- with a QUEUE_DEPTH deep submission queue.
- Pay attention to the two
- .BR mmap (2)
- calls that set up the shared submission and completion queues.
- If your kernel is older than version 5.4,
- three
- .BR mmap (2)
- calls are required.
- .PP
- .SS Submitting I/O requests
- The process of submitting a request consists of describing the I/O
- operation you need to get done using an
- .B io_uring_sqe
- structure instance.
- These details describe the equivalent system call and its parameters.
- Because the range of I/O operations Linux supports is very varied and the
- .B io_uring_sqe
- structure needs to be able to describe them,
- it has several fields,
- some packed into unions for space efficiency.
- Here is a simplified version of struct
- .B io_uring_sqe
- with some of the most often used fields:
- .PP
- .in +4n
- .EX
- struct io_uring_sqe {
-     __u8    opcode;     /* type of operation for this sqe */
-     __s32   fd;         /* file descriptor to do IO on */
-     __u64   off;        /* offset into file */
-     __u64   addr;       /* pointer to buffer or iovecs */
-     __u32   len;        /* buffer size or number of iovecs */
-     __u64   user_data;  /* data to be passed back at completion time */
-     __u8    flags;      /* IOSQE_ flags */
-     ...
- };
- .EE
- .in
-
- Here is struct
- .B io_uring_sqe
- in full:
-
- .in +4n
- .EX
- struct io_uring_sqe {
-     __u8    opcode;     /* type of operation for this sqe */
-     __u8    flags;      /* IOSQE_ flags */
-     __u16   ioprio;     /* ioprio for the request */
-     __s32   fd;         /* file descriptor to do IO on */
-     union {
-         __u64   off;    /* offset into file */
-         __u64   addr2;
-     };
-     union {
-         __u64   addr;   /* pointer to buffer or iovecs */
-         __u64   splice_off_in;
-     };
-     __u32   len;        /* buffer size or number of iovecs */
-     union {
-         __kernel_rwf_t  rw_flags;
-         __u32   fsync_flags;
-         __u16   poll_events;    /* compatibility */
-         __u32   poll32_events;  /* word-reversed for BE */
-         __u32   sync_range_flags;
-         __u32   msg_flags;
-         __u32   timeout_flags;
-         __u32   accept_flags;
-         __u32   cancel_flags;
-         __u32   open_flags;
-         __u32   statx_flags;
-         __u32   fadvise_advice;
-         __u32   splice_flags;
-     };
-     __u64   user_data;  /* data to be passed back at completion time */
-     union {
-         struct {
-             /* pack this to avoid bogus arm OABI complaints */
-             union {
-                 /* index into fixed buffers, if used */
-                 __u16   buf_index;
-                 /* for grouped buffer selection */
-                 __u16   buf_group;
-             } __attribute__((packed));
-             /* personality to use, if used */
-             __u16   personality;
-             __s32   splice_fd_in;
-         };
-         __u64   __pad2[3];
-     };
- };
- .EE
- .in
- .PP
- To submit an I/O request to
- .BR io_uring ,
- you need to acquire a submission queue entry (SQE) from the submission
- queue (SQ),
- fill it up with details of the operation you want to submit and call
- .BR io_uring_enter (2).
- There are helper functions of the form io_uring_prep_X to enable proper
- setup of the SQE. If you want to avoid calling
- .BR io_uring_enter (2),
- you have the option of setting up Submission Queue Polling.
- .PP
- SQEs are added to the tail of the submission queue.
- The kernel picks up SQEs off the head of the SQ.
- The general algorithm to get the next available SQE and update the tail is
- as follows.
- .PP
- .in +4n
- .EX
- struct io_uring_sqe *sqe;
- unsigned tail, index;
- tail = *sqring->tail;
- index = tail & (*sqring->ring_mask);
- sqe = &sqring->sqes[index];
- /* fill up details about this I/O request */
- describe_io(sqe);
- /* fill the sqe index into the SQ ring array */
- sqring->array[index] = index;
- tail++;
- atomic_store_release(sqring->tail, tail);
- .EE
- .in
- .PP
- To get the index of an entry,
- the application must mask the current tail index with the size mask of the
- ring.
- This holds true for both SQs and CQs.
- Once the SQE is acquired,
- the necessary fields are filled in,
- describing the request.
- While the CQ ring directly indexes the shared array of CQEs,
- the submission side has an indirection array between them.
- The submission side ring buffer is an index into this array,
- which in turn contains the index into the SQEs.
- .PP
- The following code snippet demonstrates how a read operation,
- an equivalent of a
- .BR preadv2 (2)
- system call,
- is described by filling up an SQE with the necessary
- parameters.
- .PP
- .in +4n
- .EX
- struct iovec iovecs[16];
- ...
- sqe->opcode = IORING_OP_READV;
- sqe->fd = fd;
- sqe->addr = (unsigned long) iovecs;
- sqe->len = 16;
- sqe->off = offset;
- sqe->flags = 0;
- .EE
- .in
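
For comparison, the io_uring_prep_X helpers mentioned earlier fill in exactly these fields. A sketch with liburing's io_uring_prep_readv(3), assuming the same ring, fd, iovecs and offset as in the snippet above:

    /* Sketch: the same IORING_OP_READV submission via liburing's prep helper,
     * which sets opcode, fd, addr, len and off behind the scenes. */
    struct io_uring_sqe *sqe = io_uring_get_sqe(&ring); /* next free SQE, or NULL if full */
    io_uring_prep_readv(sqe, fd, iovecs, 16, offset);   /* describe the vectored read */
    io_uring_sqe_set_data64(sqe, 42);                   /* tag echoed back in the CQE */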
- .TP
- .B Memory ordering
- Modern compilers and CPUs freely reorder reads and writes without
- affecting the program's outcome to optimize performance.
- Some aspects of this need to be kept in mind on SMP systems since
- .B io_uring
- involves buffers shared between kernel and user space.
- These buffers are both visible and modifiable from kernel and user space.
- As heads and tails belonging to these shared buffers are updated by kernel
- and user space,
- changes need to be coherently visible on either side,
- irrespective of whether a CPU switch took place after the kernel-user mode
- switch happened.
- We use memory barriers to enforce this coherency.
- Memory barriers are a large subject in their own right,
- and are out of scope for further discussion on this man page.
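
Concretely, the barriers boil down to release/acquire pairs on the ring indices. A minimal C11 sketch of the same idea as the io_uring_smp_* macros defined in the EXAMPLES program further down (the cast through _Atomic mirrors those macros):

    #include <stdatomic.h>

    /* Publish an SQE: write the entry first, then release-store the new tail,
     * so the kernel can never observe the index before the entry it points to. */
    static void publish_sq_tail(unsigned *sq_tail, unsigned new_tail)
    {
        atomic_store_explicit((_Atomic unsigned *)sq_tail, new_tail,
                              memory_order_release);
    }

    /* Consume CQEs: acquire-load the tail first, so any CQE read afterwards is
     * at least as new as the index that made it visible. */
    static unsigned load_cq_tail(unsigned *cq_tail)
    {
        return atomic_load_explicit((_Atomic unsigned *)cq_tail,
                                    memory_order_acquire);
    }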
- .TP
- .B Letting the kernel know about I/O submissions
- Once you place one or more SQEs onto the SQ,
- you need to let the kernel know that you've done so.
- You can do this by calling the
- .BR io_uring_enter (2)
- system call.
- This system call is also capable of waiting for a specified count of
- events to complete.
- This way,
- you can be sure to find completion events in the completion queue without
- having to poll it for events later.
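
Since glibc provides no wrapper, io_uring_enter(2) is reached through syscall(2). A sketch that both submits and waits in one call, mirroring the wrapper defined in the EXAMPLES program below:

    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/io_uring.h>

    /* Submit to_submit SQEs and block until at least min_complete CQEs are
     * available, all in a single system call. */
    int enter_and_wait(int ring_fd, unsigned to_submit, unsigned min_complete)
    {
        return (int) syscall(__NR_io_uring_enter, ring_fd, to_submit,
                             min_complete, IORING_ENTER_GETEVENTS, NULL, 0);
    }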
- .SS Reading completion events
- Similar to the submission queue (SQ),
- the completion queue (CQ) is a shared buffer between the kernel and user
- space.
- Whereas you placed submission queue entries on the tail of the SQ and the
- kernel read off the head,
- when it comes to the CQ,
- the kernel places completion queue events or CQEs on the tail of the CQ and
- you read off its head.
- .PP
- Submission is flexible (and thus a bit more complicated) since it needs to
- be able to encode different types of system calls that take various
- parameters.
- Completion,
- on the other hand,
- is simpler since we're looking only for a return value
- back from the kernel.
- This is easily understood by looking at the completion queue event
- structure,
- struct
- .BR io_uring_cqe :
- .PP
- .in +4n
- .EX
- struct io_uring_cqe {
-     __u64   user_data;  /* sqe->data submission passed back */
-     __s32   res;        /* result code for this event */
-     __u32   flags;
- };
- .EE
- .in
- .PP
- Here,
- .I user_data
- is custom data that is passed unchanged from submission to completion.
- That is,
- from SQEs to CQEs.
- This field can be used to set context,
- uniquely identifying submissions that got completed.
- Given that I/O requests can complete in any order,
- this field can be used to correlate a submission with a completion.
- .I res
- is the result from the system call that was performed as part of the
- submission;
- its return value.
-
- The
- .I flags
- field carries request-specific information. As of the 6.0 kernel, the following
- flags are defined:
-
- .TP
- .B IORING_CQE_F_BUFFER
- If set, the upper 16 bits of the flags field carry the buffer ID that was
- chosen for this request. The request must have been issued with
- .B IOSQE_BUFFER_SELECT
- set, and used with a request type that supports buffer selection. Additionally,
- buffers must have been provided upfront either via the
- .B IORING_OP_PROVIDE_BUFFERS
- or the
- .B IORING_REGISTER_PBUF_RING
- methods.
- .TP
- .B IORING_CQE_F_MORE
- If set, the application should expect more completions from the request. This
- is used for requests that can generate multiple completions, such as multi-shot
- requests, receive, or accept.
- .TP
- .B IORING_CQE_F_SOCK_NONEMPTY
- If set, the socket still had data left to read when this receive request
- completed.
- .TP
- .B IORING_CQE_F_NOTIF
- Set for notification CQEs, as seen with the zero-copy networking send and
- receive support.
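
As a hedged sketch of how these flags are consumed, the buffer ID announced by IORING_CQE_F_BUFFER sits in the upper 16 bits of flags, exposed as IORING_CQE_BUFFER_SHIFT in <linux/io_uring.h>:

    #include <linux/io_uring.h>
    #include <stdio.h>

    /* Sketch: decode the per-request information carried in cqe->flags. */
    static void inspect_cqe(const struct io_uring_cqe *cqe)
    {
        if (cqe->flags & IORING_CQE_F_BUFFER) {
            unsigned buf_id = cqe->flags >> IORING_CQE_BUFFER_SHIFT;
            printf("data landed in provided buffer %u\n", buf_id);
        }
        if (cqe->flags & IORING_CQE_F_MORE)
            printf("more CQEs will follow for this multi-shot request\n");
    }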
- .PP
- The general sequence to read completion events off the completion queue is
- as follows:
- .PP
- .in +4n
- .EX
- unsigned head;
- head = *cqring->head;
- if (head != atomic_load_acquire(cqring->tail)) {
-     struct io_uring_cqe *cqe;
-     unsigned index;
-     index = head & (cqring->mask);
-     cqe = &cqring->cqes[index];
-     /* process completed CQE */
-     process_cqe(cqe);
-     /* CQE consumption complete */
-     head++;
- }
- atomic_store_release(cqring->head, head);
- .EE
- .in
- .PP
- It helps to be reminded that the kernel adds CQEs to the tail of the CQ,
- while you need to dequeue them off the head.
- To get the index of an entry at the head,
- the application must mask the current head index with the size mask of the
- ring.
- Once the CQE has been consumed or processed,
- the head needs to be updated to reflect the consumption of the CQE.
- Attention should be paid to the read and write barriers to ensure
- successful read and update of the head.
- .SS io_uring performance
- Because of the shared ring buffers between kernel and user space,
- .B io_uring
- can be a zero-copy system.
- Copying buffers back and forth becomes necessary when system calls that
- transfer data between kernel and user space are involved.
- But since the bulk of the communication in
- .B io_uring
- is via buffers shared between the kernel and user space,
- this huge performance overhead is completely avoided.
- .PP
- While system calls may not seem like a significant overhead,
- in high performance applications,
- making a lot of them will begin to matter.
- Ideally, the workarounds the operating system has in place to deal with
- Spectre and Meltdown would be done away with,
- but unfortunately,
- some of these workarounds sit around the system call interface,
- making system calls not as cheap as before on affected hardware.
- While newer hardware should not need these workarounds,
- hardware with these vulnerabilities can be expected to be in the wild for a
- long time.
- While using synchronous programming interfaces or even when using
- asynchronous programming interfaces under Linux,
- there is at least one system call involved in the submission of each
- request.
- In
- .BR io_uring ,
- on the other hand,
- you can batch several requests in one go,
- simply by queueing up multiple SQEs,
- each describing an I/O operation you want,
- and making a single call to
- .BR io_uring_enter (2).
- This is possible due to
- .BR io_uring 's
- shared-buffer-based design.
- .PP
- While this batching in itself can avoid the overhead associated with
- potentially multiple and frequent system calls,
- you can reduce even this overhead further with Submission Queue Polling,
- by having the kernel poll and pick up your SQEs for processing as you add
- them to the submission queue. This avoids the
- .BR io_uring_enter (2)
- call you need to make to tell the kernel to pick SQEs up.
- For high-performance applications,
- this means even less system call overhead.
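
A sketch of that batching, again in liburing terms (an assumption; the raw interface batches the same way by bumping the SQ tail several times before one io_uring_enter(2)). fd and bufs are assumed to be set up by the caller:

    #include <liburing.h>

    /* Sketch: queue four independent reads, then submit them all with a single
     * system call. */
    static void submit_batch(struct io_uring *ring, int fd, char bufs[4][4096])
    {
        for (int i = 0; i < 4; i++) {
            struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
            io_uring_prep_read(sqe, fd, bufs[i], 4096, (__u64) i * 4096);
            io_uring_sqe_set_data64(sqe, i);    /* identify each read in its CQE */
        }
        io_uring_submit(ring);                  /* one io_uring_enter(2) for all four */
    }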
- .SH CONFORMING TO
- .B io_uring
- is Linux-specific.
- .SH EXAMPLES
- The following example uses
- .B io_uring
- to copy stdin to stdout.
- Using shell redirection,
- you should be able to copy files with this example.
- Because it uses a queue depth of only one,
- this example processes I/O requests one after the other.
- It is purposefully kept this way to aid understanding.
- In real-world scenarios, however,
- you'll want to have a larger queue depth to parallelize I/O request
- processing so as to gain the kind of performance benefits
- .B io_uring
- provides with its asynchronous processing of requests.
- .PP
- .EX
- #include <stdio.h>
- #include <stdlib.h>
- #include <sys/stat.h>
- #include <sys/ioctl.h>
- #include <sys/syscall.h>
- #include <sys/mman.h>
- #include <sys/uio.h>
- #include <linux/fs.h>
- #include <fcntl.h>
- #include <unistd.h>
- #include <string.h>
- #include <stdatomic.h>
-
- #include <linux/io_uring.h>
-
- #define QUEUE_DEPTH 1
- #define BLOCK_SZ    1024
-
- /* Macros for barriers needed by io_uring */
- #define io_uring_smp_store_release(p, v)                     \\
-     atomic_store_explicit((_Atomic typeof(*(p)) *)(p), (v), \\
-                           memory_order_release)
- #define io_uring_smp_load_acquire(p)                     \\
-     atomic_load_explicit((_Atomic typeof(*(p)) *)(p),    \\
-                          memory_order_acquire)
-
- int ring_fd;
- unsigned *sring_tail, *sring_mask, *sring_array,
-          *cring_head, *cring_tail, *cring_mask;
- struct io_uring_sqe *sqes;
- struct io_uring_cqe *cqes;
- char buff[BLOCK_SZ];
- off_t offset;
-
- /*
-  * System call wrappers provided since glibc does not yet
-  * provide wrappers for io_uring system calls.
-  */
-
- int io_uring_setup(unsigned entries, struct io_uring_params *p)
- {
-     return (int) syscall(__NR_io_uring_setup, entries, p);
- }
-
- int io_uring_enter(int ring_fd, unsigned int to_submit,
-                    unsigned int min_complete, unsigned int flags)
- {
-     return (int) syscall(__NR_io_uring_enter, ring_fd, to_submit,
-                          min_complete, flags, NULL, 0);
- }
-
- int app_setup_uring(void) {
-     struct io_uring_params p;
-     void *sq_ptr, *cq_ptr;
-
-     /* See io_uring_setup(2) for io_uring_params.flags you can set */
-     memset(&p, 0, sizeof(p));
-     ring_fd = io_uring_setup(QUEUE_DEPTH, &p);
-     if (ring_fd < 0) {
-         perror("io_uring_setup");
-         return 1;
-     }
-
-     /*
-      * io_uring communication happens via 2 shared kernel-user space ring
-      * buffers, which can be jointly mapped with a single mmap() call in
-      * kernels >= 5.4.
-      */
-
-     int sring_sz = p.sq_off.array + p.sq_entries * sizeof(unsigned);
-     int cring_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);
-
-     /* Rather than check for kernel version, the recommended way is to
-      * check the features field of the io_uring_params structure, which is a
-      * bitmask. If IORING_FEAT_SINGLE_MMAP is set, we can do away with the
-      * second mmap() call to map in the completion ring separately.
-      */
-     if (p.features & IORING_FEAT_SINGLE_MMAP) {
-         if (cring_sz > sring_sz)
-             sring_sz = cring_sz;
-         cring_sz = sring_sz;
-     }
-
-     /* Map in the submission and completion queue ring buffers.
-      * Kernels < 5.4 only map in the submission queue, though.
-      */
-     sq_ptr = mmap(0, sring_sz, PROT_READ | PROT_WRITE,
-                   MAP_SHARED | MAP_POPULATE,
-                   ring_fd, IORING_OFF_SQ_RING);
-     if (sq_ptr == MAP_FAILED) {
-         perror("mmap");
-         return 1;
-     }
-
-     if (p.features & IORING_FEAT_SINGLE_MMAP) {
-         cq_ptr = sq_ptr;
-     } else {
-         /* Map in the completion queue ring buffer in older kernels separately */
-         cq_ptr = mmap(0, cring_sz, PROT_READ | PROT_WRITE,
-                       MAP_SHARED | MAP_POPULATE,
-                       ring_fd, IORING_OFF_CQ_RING);
-         if (cq_ptr == MAP_FAILED) {
-             perror("mmap");
-             return 1;
-         }
-     }
-     /* Save useful fields for later easy reference */
-     sring_tail = sq_ptr + p.sq_off.tail;
-     sring_mask = sq_ptr + p.sq_off.ring_mask;
-     sring_array = sq_ptr + p.sq_off.array;
-
-     /* Map in the submission queue entries array */
-     sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe),
-                 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
-                 ring_fd, IORING_OFF_SQES);
-     if (sqes == MAP_FAILED) {
-         perror("mmap");
-         return 1;
-     }
-
-     /* Save useful fields for later easy reference */
-     cring_head = cq_ptr + p.cq_off.head;
-     cring_tail = cq_ptr + p.cq_off.tail;
-     cring_mask = cq_ptr + p.cq_off.ring_mask;
-     cqes = cq_ptr + p.cq_off.cqes;
-
-     return 0;
- }
-
- /*
-  * Read from completion queue.
-  * In this function, we read completion events from the completion queue.
-  * We dequeue the CQE, update the head and return the result of the operation.
-  */
-
- int read_from_cq() {
-     struct io_uring_cqe *cqe;
-     unsigned head;
-
-     /* Read barrier */
-     head = io_uring_smp_load_acquire(cring_head);
-     /*
-      * Remember, this is a ring buffer. If head == tail, it means that the
-      * buffer is empty.
-      */
-     if (head == *cring_tail)
-         return -1;
-
-     /* Get the entry */
-     cqe = &cqes[head & (*cring_mask)];
-     if (cqe->res < 0)
-         fprintf(stderr, "Error: %s\\n", strerror(abs(cqe->res)));
-
-     head++;
-
-     /* Write barrier so that updates to the head are made visible */
-     io_uring_smp_store_release(cring_head, head);
-
-     return cqe->res;
- }
-
- /*
-  * Submit a read or a write request to the submission queue.
-  */
-
- int submit_to_sq(int fd, int op) {
-     unsigned index, tail;
-
-     /* Add our submission queue entry to the tail of the SQE ring buffer */
-     tail = *sring_tail;
-     index = tail & *sring_mask;
-     struct io_uring_sqe *sqe = &sqes[index];
-     /* Fill in the parameters required for the read or write operation */
-     sqe->opcode = op;
-     sqe->fd = fd;
-     sqe->addr = (unsigned long) buff;
-     if (op == IORING_OP_READ) {
-         memset(buff, 0, sizeof(buff));
-         sqe->len = BLOCK_SZ;
-     } else {
-         sqe->len = strlen(buff);
-     }
-     sqe->off = offset;
-
-     sring_array[index] = index;
-     tail++;
-
-     /* Update the tail */
-     io_uring_smp_store_release(sring_tail, tail);
-
-     /*
-      * Tell the kernel we have submitted events with the io_uring_enter()
-      * system call. We also pass in the IORING_ENTER_GETEVENTS flag which
-      * causes the io_uring_enter() call to wait until min_complete
-      * (the 3rd param) events complete.
-      */
-     int ret = io_uring_enter(ring_fd, 1, 1,
-                              IORING_ENTER_GETEVENTS);
-     if (ret < 0) {
-         perror("io_uring_enter");
-         return -1;
-     }
-
-     return ret;
- }
-
- int main(int argc, char *argv[]) {
-     int res;
-
-     /* Setup io_uring for use */
-     if (app_setup_uring()) {
-         fprintf(stderr, "Unable to setup uring!\\n");
-         return 1;
-     }
-
-     /*
-      * A while loop that reads from stdin and writes to stdout.
-      * Breaks on EOF.
-      */
-     while (1) {
-         /* Initiate read from stdin and wait for it to complete */
-         submit_to_sq(STDIN_FILENO, IORING_OP_READ);
-         /* Read completion queue entry */
-         res = read_from_cq();
-         if (res > 0) {
-             /* Read successful. Write to stdout. */
-             submit_to_sq(STDOUT_FILENO, IORING_OP_WRITE);
-             read_from_cq();
-         } else if (res == 0) {
-             /* reached EOF */
-             break;
-         } else if (res < 0) {
-             /* Error reading file */
-             fprintf(stderr, "Error: %s\\n", strerror(abs(res)));
-             break;
-         }
-         offset += res;
-     }
-
-     return 0;
- }
- .EE
- .SH SEE ALSO
- .BR io_uring_enter (2)
- .BR io_uring_register (2)
- .BR io_uring_setup (2)