polyphony 1.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170)
  1. checksums.yaml +4 -4
  2. data/.yardopts +1 -0
  3. data/CHANGELOG.md +16 -3
  4. data/README.md +1 -0
  5. data/TODO.md +5 -13
  6. data/docs/cheat-sheet.md +248 -0
  7. data/docs/design-principles.md +59 -3
  8. data/docs/faq.md +15 -32
  9. data/docs/fiber-scheduling.md +14 -12
  10. data/docs/overview.md +140 -35
  11. data/docs/readme.md +4 -3
  12. data/docs/tutorial.md +19 -149
  13. data/examples/core/debug.rb +12 -0
  14. data/examples/core/rpc_benchmark.rb +136 -0
  15. data/ext/polyphony/polyphony.c +2 -1
  16. data/lib/polyphony/extensions/fiber.rb +1 -0
  17. data/lib/polyphony/extensions/io.rb +171 -161
  18. data/lib/polyphony/extensions/pipe.rb +3 -5
  19. data/lib/polyphony/extensions/socket.rb +45 -54
  20. data/lib/polyphony/version.rb +1 -1
  21. data/polyphony.gemspec +3 -1
  22. data/test/test_socket.rb +1 -1
  23. metadata +33 -149
  24. data/vendor/liburing/man/IO_URING_CHECK_VERSION.3 +0 -1
  25. data/vendor/liburing/man/IO_URING_VERSION_MAJOR.3 +0 -1
  26. data/vendor/liburing/man/IO_URING_VERSION_MINOR.3 +0 -1
  27. data/vendor/liburing/man/io_uring.7 +0 -781
  28. data/vendor/liburing/man/io_uring_buf_ring_add.3 +0 -53
  29. data/vendor/liburing/man/io_uring_buf_ring_advance.3 +0 -31
  30. data/vendor/liburing/man/io_uring_buf_ring_cq_advance.3 +0 -41
  31. data/vendor/liburing/man/io_uring_buf_ring_init.3 +0 -30
  32. data/vendor/liburing/man/io_uring_buf_ring_mask.3 +0 -27
  33. data/vendor/liburing/man/io_uring_check_version.3 +0 -72
  34. data/vendor/liburing/man/io_uring_close_ring_fd.3 +0 -43
  35. data/vendor/liburing/man/io_uring_cq_advance.3 +0 -49
  36. data/vendor/liburing/man/io_uring_cq_has_overflow.3 +0 -25
  37. data/vendor/liburing/man/io_uring_cq_ready.3 +0 -26
  38. data/vendor/liburing/man/io_uring_cqe_get_data.3 +0 -53
  39. data/vendor/liburing/man/io_uring_cqe_get_data64.3 +0 -1
  40. data/vendor/liburing/man/io_uring_cqe_seen.3 +0 -42
  41. data/vendor/liburing/man/io_uring_enter.2 +0 -1700
  42. data/vendor/liburing/man/io_uring_enter2.2 +0 -1
  43. data/vendor/liburing/man/io_uring_free_probe.3 +0 -27
  44. data/vendor/liburing/man/io_uring_get_events.3 +0 -33
  45. data/vendor/liburing/man/io_uring_get_probe.3 +0 -30
  46. data/vendor/liburing/man/io_uring_get_sqe.3 +0 -57
  47. data/vendor/liburing/man/io_uring_major_version.3 +0 -1
  48. data/vendor/liburing/man/io_uring_minor_version.3 +0 -1
  49. data/vendor/liburing/man/io_uring_opcode_supported.3 +0 -30
  50. data/vendor/liburing/man/io_uring_peek_cqe.3 +0 -38
  51. data/vendor/liburing/man/io_uring_prep_accept.3 +0 -197
  52. data/vendor/liburing/man/io_uring_prep_accept_direct.3 +0 -1
  53. data/vendor/liburing/man/io_uring_prep_cancel.3 +0 -118
  54. data/vendor/liburing/man/io_uring_prep_cancel64.3 +0 -1
  55. data/vendor/liburing/man/io_uring_prep_close.3 +0 -59
  56. data/vendor/liburing/man/io_uring_prep_close_direct.3 +0 -1
  57. data/vendor/liburing/man/io_uring_prep_connect.3 +0 -66
  58. data/vendor/liburing/man/io_uring_prep_fadvise.3 +0 -59
  59. data/vendor/liburing/man/io_uring_prep_fallocate.3 +0 -59
  60. data/vendor/liburing/man/io_uring_prep_fgetxattr.3 +0 -1
  61. data/vendor/liburing/man/io_uring_prep_files_update.3 +0 -92
  62. data/vendor/liburing/man/io_uring_prep_fsetxattr.3 +0 -1
  63. data/vendor/liburing/man/io_uring_prep_fsync.3 +0 -70
  64. data/vendor/liburing/man/io_uring_prep_getxattr.3 +0 -61
  65. data/vendor/liburing/man/io_uring_prep_link.3 +0 -1
  66. data/vendor/liburing/man/io_uring_prep_link_timeout.3 +0 -94
  67. data/vendor/liburing/man/io_uring_prep_linkat.3 +0 -91
  68. data/vendor/liburing/man/io_uring_prep_madvise.3 +0 -56
  69. data/vendor/liburing/man/io_uring_prep_mkdir.3 +0 -1
  70. data/vendor/liburing/man/io_uring_prep_mkdirat.3 +0 -83
  71. data/vendor/liburing/man/io_uring_prep_msg_ring.3 +0 -92
  72. data/vendor/liburing/man/io_uring_prep_msg_ring_cqe_flags.3 +0 -1
  73. data/vendor/liburing/man/io_uring_prep_multishot_accept.3 +0 -1
  74. data/vendor/liburing/man/io_uring_prep_multishot_accept_direct.3 +0 -1
  75. data/vendor/liburing/man/io_uring_prep_nop.3 +0 -28
  76. data/vendor/liburing/man/io_uring_prep_openat.3 +0 -117
  77. data/vendor/liburing/man/io_uring_prep_openat2.3 +0 -117
  78. data/vendor/liburing/man/io_uring_prep_openat2_direct.3 +0 -1
  79. data/vendor/liburing/man/io_uring_prep_openat_direct.3 +0 -1
  80. data/vendor/liburing/man/io_uring_prep_poll_add.3 +0 -72
  81. data/vendor/liburing/man/io_uring_prep_poll_multishot.3 +0 -1
  82. data/vendor/liburing/man/io_uring_prep_poll_remove.3 +0 -55
  83. data/vendor/liburing/man/io_uring_prep_poll_update.3 +0 -89
  84. data/vendor/liburing/man/io_uring_prep_provide_buffers.3 +0 -140
  85. data/vendor/liburing/man/io_uring_prep_read.3 +0 -69
  86. data/vendor/liburing/man/io_uring_prep_read_fixed.3 +0 -72
  87. data/vendor/liburing/man/io_uring_prep_readv.3 +0 -85
  88. data/vendor/liburing/man/io_uring_prep_readv2.3 +0 -111
  89. data/vendor/liburing/man/io_uring_prep_recv.3 +0 -105
  90. data/vendor/liburing/man/io_uring_prep_recv_multishot.3 +0 -1
  91. data/vendor/liburing/man/io_uring_prep_recvmsg.3 +0 -124
  92. data/vendor/liburing/man/io_uring_prep_recvmsg_multishot.3 +0 -1
  93. data/vendor/liburing/man/io_uring_prep_remove_buffers.3 +0 -52
  94. data/vendor/liburing/man/io_uring_prep_rename.3 +0 -1
  95. data/vendor/liburing/man/io_uring_prep_renameat.3 +0 -96
  96. data/vendor/liburing/man/io_uring_prep_send.3 +0 -66
  97. data/vendor/liburing/man/io_uring_prep_send_set_addr.3 +0 -38
  98. data/vendor/liburing/man/io_uring_prep_send_zc.3 +0 -96
  99. data/vendor/liburing/man/io_uring_prep_send_zc_fixed.3 +0 -1
  100. data/vendor/liburing/man/io_uring_prep_sendmsg.3 +0 -89
  101. data/vendor/liburing/man/io_uring_prep_sendmsg_zc.3 +0 -1
  102. data/vendor/liburing/man/io_uring_prep_setxattr.3 +0 -64
  103. data/vendor/liburing/man/io_uring_prep_shutdown.3 +0 -53
  104. data/vendor/liburing/man/io_uring_prep_socket.3 +0 -118
  105. data/vendor/liburing/man/io_uring_prep_socket_direct.3 +0 -1
  106. data/vendor/liburing/man/io_uring_prep_socket_direct_alloc.3 +0 -1
  107. data/vendor/liburing/man/io_uring_prep_splice.3 +0 -120
  108. data/vendor/liburing/man/io_uring_prep_statx.3 +0 -74
  109. data/vendor/liburing/man/io_uring_prep_symlink.3 +0 -1
  110. data/vendor/liburing/man/io_uring_prep_symlinkat.3 +0 -85
  111. data/vendor/liburing/man/io_uring_prep_sync_file_range.3 +0 -59
  112. data/vendor/liburing/man/io_uring_prep_tee.3 +0 -74
  113. data/vendor/liburing/man/io_uring_prep_timeout.3 +0 -95
  114. data/vendor/liburing/man/io_uring_prep_timeout_remove.3 +0 -1
  115. data/vendor/liburing/man/io_uring_prep_timeout_update.3 +0 -98
  116. data/vendor/liburing/man/io_uring_prep_unlink.3 +0 -1
  117. data/vendor/liburing/man/io_uring_prep_unlinkat.3 +0 -82
  118. data/vendor/liburing/man/io_uring_prep_write.3 +0 -67
  119. data/vendor/liburing/man/io_uring_prep_write_fixed.3 +0 -72
  120. data/vendor/liburing/man/io_uring_prep_writev.3 +0 -85
  121. data/vendor/liburing/man/io_uring_prep_writev2.3 +0 -111
  122. data/vendor/liburing/man/io_uring_queue_exit.3 +0 -26
  123. data/vendor/liburing/man/io_uring_queue_init.3 +0 -89
  124. data/vendor/liburing/man/io_uring_queue_init_params.3 +0 -1
  125. data/vendor/liburing/man/io_uring_recvmsg_cmsg_firsthdr.3 +0 -1
  126. data/vendor/liburing/man/io_uring_recvmsg_cmsg_nexthdr.3 +0 -1
  127. data/vendor/liburing/man/io_uring_recvmsg_name.3 +0 -1
  128. data/vendor/liburing/man/io_uring_recvmsg_out.3 +0 -82
  129. data/vendor/liburing/man/io_uring_recvmsg_payload.3 +0 -1
  130. data/vendor/liburing/man/io_uring_recvmsg_payload_length.3 +0 -1
  131. data/vendor/liburing/man/io_uring_recvmsg_validate.3 +0 -1
  132. data/vendor/liburing/man/io_uring_register.2 +0 -834
  133. data/vendor/liburing/man/io_uring_register_buf_ring.3 +0 -140
  134. data/vendor/liburing/man/io_uring_register_buffers.3 +0 -104
  135. data/vendor/liburing/man/io_uring_register_buffers_sparse.3 +0 -1
  136. data/vendor/liburing/man/io_uring_register_buffers_tags.3 +0 -1
  137. data/vendor/liburing/man/io_uring_register_buffers_update_tag.3 +0 -1
  138. data/vendor/liburing/man/io_uring_register_eventfd.3 +0 -51
  139. data/vendor/liburing/man/io_uring_register_eventfd_async.3 +0 -1
  140. data/vendor/liburing/man/io_uring_register_file_alloc_range.3 +0 -52
  141. data/vendor/liburing/man/io_uring_register_files.3 +0 -112
  142. data/vendor/liburing/man/io_uring_register_files_sparse.3 +0 -1
  143. data/vendor/liburing/man/io_uring_register_files_tags.3 +0 -1
  144. data/vendor/liburing/man/io_uring_register_files_update.3 +0 -1
  145. data/vendor/liburing/man/io_uring_register_files_update_tag.3 +0 -1
  146. data/vendor/liburing/man/io_uring_register_iowq_aff.3 +0 -61
  147. data/vendor/liburing/man/io_uring_register_iowq_max_workers.3 +0 -71
  148. data/vendor/liburing/man/io_uring_register_ring_fd.3 +0 -49
  149. data/vendor/liburing/man/io_uring_register_sync_cancel.3 +0 -71
  150. data/vendor/liburing/man/io_uring_setup.2 +0 -669
  151. data/vendor/liburing/man/io_uring_sq_ready.3 +0 -31
  152. data/vendor/liburing/man/io_uring_sq_space_left.3 +0 -25
  153. data/vendor/liburing/man/io_uring_sqe_set_data.3 +0 -48
  154. data/vendor/liburing/man/io_uring_sqe_set_data64.3 +0 -1
  155. data/vendor/liburing/man/io_uring_sqe_set_flags.3 +0 -87
  156. data/vendor/liburing/man/io_uring_sqring_wait.3 +0 -34
  157. data/vendor/liburing/man/io_uring_submit.3 +0 -46
  158. data/vendor/liburing/man/io_uring_submit_and_get_events.3 +0 -31
  159. data/vendor/liburing/man/io_uring_submit_and_wait.3 +0 -38
  160. data/vendor/liburing/man/io_uring_submit_and_wait_timeout.3 +0 -56
  161. data/vendor/liburing/man/io_uring_unregister_buf_ring.3 +0 -30
  162. data/vendor/liburing/man/io_uring_unregister_buffers.3 +0 -27
  163. data/vendor/liburing/man/io_uring_unregister_eventfd.3 +0 -1
  164. data/vendor/liburing/man/io_uring_unregister_files.3 +0 -27
  165. data/vendor/liburing/man/io_uring_unregister_iowq_aff.3 +0 -1
  166. data/vendor/liburing/man/io_uring_unregister_ring_fd.3 +0 -32
  167. data/vendor/liburing/man/io_uring_wait_cqe.3 +0 -40
  168. data/vendor/liburing/man/io_uring_wait_cqe_nr.3 +0 -43
  169. data/vendor/liburing/man/io_uring_wait_cqe_timeout.3 +0 -53
  170. data/vendor/liburing/man/io_uring_wait_cqes.3 +0 -56
data/vendor/liburing/man/io_uring.7
@@ -1,781 +0,0 @@
- .\" Copyright (C) 2020 Shuveb Hussain <shuveb@gmail.com>
- .\" SPDX-License-Identifier: LGPL-2.0-or-later
- .\"
-
- .TH io_uring 7 2020-07-26 "Linux" "Linux Programmer's Manual"
- .SH NAME
- io_uring \- Asynchronous I/O facility
- .SH SYNOPSIS
- .nf
- .B "#include <linux/io_uring.h>"
- .fi
- .PP
- .SH DESCRIPTION
- .PP
- .B io_uring
- is a Linux-specific API for asynchronous I/O.
- It allows the user to submit one or more I/O requests,
- which are processed asynchronously without blocking the calling process.
- .B io_uring
- gets its name from ring buffers which are shared between user space and
- kernel space. This arrangement allows for efficient I/O,
- while avoiding the overhead of copying buffers between them,
- where possible.
- This interface makes
- .B io_uring
- different from other UNIX I/O APIs,
- wherein,
- rather than just communicating between kernel and user space with system calls,
- ring buffers are used as the main mode of communication.
- This arrangement has various performance benefits which are discussed in a
- separate section below.
- This man page uses the terms shared buffers, shared ring buffers and
- queues interchangeably.
- .PP
- The general programming model you need to follow for
- .B io_uring
- is outlined below
- .IP \(bu
- Set up shared buffers with
- .BR io_uring_setup (2)
- and
- .BR mmap (2),
- mapping into user space shared buffers for the submission queue (SQ) and the
- completion queue (CQ).
- You place I/O requests you want to make on the SQ,
- while the kernel places the results of those operations on the CQ.
- .IP \(bu
- For every I/O request you need to make (like to read a file, write a file,
- accept a socket connection, etc), you create a submission queue entry,
- or SQE,
- describe the I/O operation you need to get done and add it to the tail of
- the submission queue (SQ).
- Each I/O operation is,
- in essence,
- the equivalent of a system call you would have made otherwise,
- if you were not using
- .BR io_uring .
- You can add more than one SQE to the queue depending on the number of
- operations you want to request.
- .IP \(bu
- After you add one or more SQEs,
- you need to call
- .BR io_uring_enter (2)
- to tell the kernel to dequeue your I/O requests off the SQ and begin
- processing them.
- .IP \(bu
- For each SQE you submit,
- once it is done processing the request,
- the kernel places a completion queue event or CQE at the tail of the
- completion queue or CQ.
- The kernel places exactly one matching CQE in the CQ for every SQE you
- submit on the SQ.
- After you retrieve a CQE,
- minimally,
- you might be interested in checking the
- .I res
- field of the CQE structure,
- which corresponds to the return value of the system
- call's equivalent,
- had you used it directly without using
- .BR io_uring .
- For instance,
- a read operation under
- .BR io_uring ,
- started with the
- .BR IORING_OP_READ
- operation, issues the equivalent of the
- .BR read (2)
- system call. In practice, it mixes the semantics of
- .BR pread (2)
- and
- .BR preadv2 (2)
- in that it takes an explicit offset, and supports using -1 for the offset to
- indicate that the current file position should be used instead of passing in
- an explicit offset. See the opcode documentation for more details. Given that
- io_uring is an async interface,
- .I errno
- is never used for passing back error information. Instead,
- .I res
- will contain what the equivalent system call would have returned in case
- of success, and in case of error
- .I res
- will contain
- .I -errno .
- For example, if the normal read system call would have returned -1 and set
- .I errno
- to
- .B EINVAL ,
- then
- .I res
- would contain
- .B -EINVAL .
- If the normal system call would have returned a read size of 1024, then
- .I res
- would contain 1024.
- .IP \(bu
- Optionally,
- .BR io_uring_enter (2)
- can also wait for a specified number of requests to be processed by the kernel
- before it returns.
- If you specified a certain number of completions to wait for,
- the kernel would have placed at least that many CQEs on the CQ,
- which you can then readily read,
- right after the return from
- .BR io_uring_enter (2).
- .IP \(bu
- It is important to remember that I/O requests submitted to the kernel can
- complete in any order.
- It is not necessary for the kernel to process one request after another,
- in the order you placed them.
- Given that the interface is a ring,
- the requests are attempted in order,
- however that doesn't imply any sort of ordering on their completion.
- When more than one request is in flight,
- it is not possible to determine which one will complete first.
- When you dequeue CQEs off the CQ,
- you should always check which submitted request it corresponds to.
- The most common method for doing so is utilizing the
- .I user_data
- field in the request, which is passed back on the completion side.
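Taken together, the steps above map directly onto the helpers in liburing (the library this vendored tree belongs to), which wrap the raw io_uring_setup(2)/mmap(2)/io_uring_enter(2) sequence described on this page. A minimal sketch of one read request, end to end; the input path is an arbitrary choice and error handling is abbreviated:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <liburing.h>

    int main(void)
    {
        struct io_uring ring;
        char buf[4096];

        /* io_uring_queue_init() performs the io_uring_setup(2)
         * and mmap(2) steps described above. */
        if (io_uring_queue_init(8, &ring, 0) < 0)
            return 1;

        int fd = open("/etc/hostname", O_RDONLY);  /* arbitrary input file */
        if (fd < 0)
            return 1;

        /* Acquire an SQE, describe the operation, submit it. */
        struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
        io_uring_prep_read(sqe, fd, buf, sizeof(buf), 0);
        io_uring_sqe_set_data(sqe, buf);  /* user_data, passed back in the CQE */
        io_uring_submit(&ring);           /* one io_uring_enter(2) call */

        /* Wait for the matching CQE; res mirrors the equivalent
         * system call's return value, or -errno on failure. */
        struct io_uring_cqe *cqe;
        io_uring_wait_cqe(&ring, &cqe);
        if (cqe->res < 0)
            fprintf(stderr, "read: %s\n", strerror(-cqe->res));
        else
            printf("read %d bytes\n", cqe->res);
        io_uring_cqe_seen(&ring, cqe);    /* advance the CQ head */

        io_uring_queue_exit(&ring);
        return 0;
    }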
- .PP
- Adding to and reading from the queues:
- .IP \(bu
- You add SQEs to the tail of the SQ.
- The kernel reads SQEs off the head of the queue.
- .IP \(bu
- The kernel adds CQEs to the tail of the CQ.
- You read CQEs off the head of the queue.
- .SS Submission queue polling
- One of the goals of
- .B io_uring
- is to provide a means for efficient I/O.
- To this end,
- .B io_uring
- supports a polling mode that lets you avoid the call to
- .BR io_uring_enter (2),
- which you use to inform the kernel that you have queued SQEs on to the SQ.
- With SQ Polling,
- .B io_uring
- starts a kernel thread that polls the submission queue for any I/O
- requests you submit by adding SQEs.
- With SQ Polling enabled,
- there is no need for you to call
- .BR io_uring_enter (2),
- letting you avoid the overhead of system calls.
- A designated kernel thread dequeues SQEs off the SQ as you add them and
- dispatches them for asynchronous processing.
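With liburing, the polling mode described above is requested at ring setup time. A minimal sketch using io_uring_queue_init_params(); note that kernels before roughly 5.11 restrict SQPOLL to privileged processes:

    #include <string.h>
    #include <liburing.h>

    /* Sketch: request an SQPOLL kernel thread at setup time. With
     * this flag set, the kernel thread picks up new SQEs as they
     * are added; io_uring_enter(2) is only needed again once the
     * thread has gone idle. */
    int setup_sqpoll_ring(struct io_uring *ring)
    {
        struct io_uring_params p;
        memset(&p, 0, sizeof(p));
        p.flags = IORING_SETUP_SQPOLL;
        p.sq_thread_idle = 2000;  /* ms before the poll thread sleeps */
        return io_uring_queue_init_params(8, ring, &p);
    }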
- .SS Setting up io_uring
- .PP
- The main steps in setting up
- .B io_uring
- consist of mapping in the shared buffers with
- .BR mmap (2)
- calls.
- In the example program included in this man page,
- the function
- .BR app_setup_uring ()
- sets up
- .B io_uring
- with a QUEUE_DEPTH deep submission queue.
- Pay attention to the 2
- .BR mmap (2)
- calls that set up the shared submission and completion queues.
- If your kernel is older than version 5.4,
- three
- .BR mmap (2)
- calls are required.
- .PP
- .SS Submitting I/O requests
- The process of submitting a request consists of describing the I/O
- operation you need to get done using an
- .B io_uring_sqe
- structure instance.
- These details describe the equivalent system call and its parameters.
- Because the range of I/O operations Linux supports is very varied and the
- .B io_uring_sqe
- structure needs to be able to describe them,
- it has several fields,
- some packed into unions for space efficiency.
- Here is a simplified version of struct
- .B io_uring_sqe
- with some of the most often used fields:
- .PP
- .in +4n
- .EX
- struct io_uring_sqe {
-     __u8    opcode;     /* type of operation for this sqe */
-     __s32   fd;         /* file descriptor to do IO on */
-     __u64   off;        /* offset into file */
-     __u64   addr;       /* pointer to buffer or iovecs */
-     __u32   len;        /* buffer size or number of iovecs */
-     __u64   user_data;  /* data to be passed back at completion time */
-     __u8    flags;      /* IOSQE_ flags */
-     ...
- };
- .EE
- .in
-
- Here is struct
- .B io_uring_sqe
- in full:
-
- .in +4n
- .EX
- struct io_uring_sqe {
-     __u8    opcode;     /* type of operation for this sqe */
-     __u8    flags;      /* IOSQE_ flags */
-     __u16   ioprio;     /* ioprio for the request */
-     __s32   fd;         /* file descriptor to do IO on */
-     union {
-         __u64   off;    /* offset into file */
-         __u64   addr2;
-     };
-     union {
-         __u64   addr;   /* pointer to buffer or iovecs */
-         __u64   splice_off_in;
-     };
-     __u32   len;        /* buffer size or number of iovecs */
-     union {
-         __kernel_rwf_t  rw_flags;
-         __u32           fsync_flags;
-         __u16           poll_events;    /* compatibility */
-         __u32           poll32_events;  /* word-reversed for BE */
-         __u32           sync_range_flags;
-         __u32           msg_flags;
-         __u32           timeout_flags;
-         __u32           accept_flags;
-         __u32           cancel_flags;
-         __u32           open_flags;
-         __u32           statx_flags;
-         __u32           fadvise_advice;
-         __u32           splice_flags;
-     };
-     __u64   user_data;  /* data to be passed back at completion time */
-     union {
-         struct {
-             /* pack this to avoid bogus arm OABI complaints */
-             union {
-                 /* index into fixed buffers, if used */
-                 __u16   buf_index;
-                 /* for grouped buffer selection */
-                 __u16   buf_group;
-             } __attribute__((packed));
-             /* personality to use, if used */
-             __u16   personality;
-             __s32   splice_fd_in;
-         };
-         __u64   __pad2[3];
-     };
- };
- .EE
- .in
- .PP
- To submit an I/O request to
- .BR io_uring ,
- you need to acquire a submission queue entry (SQE) from the submission
- queue (SQ),
- fill it up with details of the operation you want to submit and call
- .BR io_uring_enter (2).
- There are helper functions of the form io_uring_prep_X to enable proper
- setup of the SQE. If you want to avoid calling
- .BR io_uring_enter (2),
- you have the option of setting up Submission Queue Polling.
- .PP
- SQEs are added to the tail of the submission queue.
- The kernel picks up SQEs off the head of the SQ.
- The general algorithm to get the next available SQE and update the tail is
- as follows.
- .PP
- .in +4n
- .EX
- struct io_uring_sqe *sqe;
- unsigned tail, index;
- tail = *sqring->tail;
- index = tail & (*sqring->ring_mask);
- sqe = &sqring->sqes[index];
- /* fill up details about this I/O request */
- describe_io(sqe);
- /* fill the sqe index into the SQ ring array */
- sqring->array[index] = index;
- tail++;
- atomic_store_release(sqring->tail, tail);
- .EE
- .in
- .PP
- To get the index of an entry,
- the application must mask the current tail index with the size mask of the
- ring.
- This holds true for both SQs and CQs.
- Once the SQE is acquired,
- the necessary fields are filled in,
- describing the request.
- While the CQ ring directly indexes the shared array of CQEs,
- the submission side has an indirection array between them.
- The submission side ring buffer is an index into this array,
- which in turn contains the index into the SQEs.
- .PP
- The following code snippet demonstrates how a read operation,
- an equivalent of a
- .BR preadv2 (2)
- system call, is described by filling up an SQE with the necessary
- parameters.
- .PP
- .in +4n
- .EX
- struct iovec iovecs[16];
- ...
- sqe->opcode = IORING_OP_READV;
- sqe->fd = fd;
- sqe->addr = (unsigned long) iovecs;
- sqe->len = 16;
- sqe->off = offset;
- sqe->flags = 0;
- .EE
- .in
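In application code these fields are rarely filled in by hand; liburing's io_uring_prep_readv() helper performs the same assignments shown in the raw snippet above. A sketch, assuming a ring whose SQEs are handed out by io_uring_get_sqe():

    #include <sys/uio.h>
    #include <liburing.h>

    /* Sketch: queue the same IORING_OP_READV request via the
     * liburing prep helper, which fills the SQE fields above. */
    void queue_readv(struct io_uring *ring, int fd,
                     struct iovec *iovecs, unsigned nr, __u64 offset)
    {
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
        io_uring_prep_readv(sqe, fd, iovecs, nr, offset);
    }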
- .TP
- .B Memory ordering
- Modern compilers and CPUs freely reorder reads and writes without
- affecting the program's outcome to optimize performance.
- Some aspects of this need to be kept in mind on SMP systems since
- .B io_uring
- involves buffers shared between kernel and user space.
- These buffers are both visible and modifiable from kernel and user space.
- As heads and tails belonging to these shared buffers are updated by kernel
- and user space,
- changes need to be coherently visible on either side,
- irrespective of whether a CPU switch took place after the kernel-user mode
- switch happened.
- We use memory barriers to enforce this coherency.
- Being significantly large subjects on their own,
- memory barriers are out of scope for further discussion on this man page.
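The example program at the end of this page expresses these barriers with C11 atomics, via its io_uring_smp_store_release()/io_uring_smp_load_acquire() macros. A minimal sketch of the same acquire/release pairing on a shared ring index:

    #include <stdatomic.h>

    /* Sketch: the producer fully writes a ring entry, then
     * publishes the new tail with release semantics; the
     * consumer's acquire load of the tail guarantees the entry's
     * contents are visible once the new tail is observed. */
    static inline void publish_tail(unsigned *tail, unsigned value)
    {
        atomic_store_explicit((_Atomic unsigned *)tail, value,
                              memory_order_release);
    }

    static inline unsigned observe_tail(unsigned *tail)
    {
        return atomic_load_explicit((_Atomic unsigned *)tail,
                                    memory_order_acquire);
    }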
- .TP
- .B Letting the kernel know about I/O submissions
- Once you place one or more SQEs on to the SQ,
- you need to let the kernel know that you've done so.
- You can do this by calling the
- .BR io_uring_enter (2)
- system call.
- This system call is also capable of waiting for a specified count of
- events to complete.
- This way,
- you can be sure to find completion events in the completion queue without
- having to poll it for events later.
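liburing bundles this submit-and-wait combination into a single helper, io_uring_submit_and_wait(3). A sketch, assuming a ring with SQEs already queued:

    #include <liburing.h>

    /* Sketch: submit everything currently queued and, in the same
     * io_uring_enter(2) call, wait until at least one completion
     * has been posted to the CQ. */
    int flush_and_wait_one(struct io_uring *ring)
    {
        return io_uring_submit_and_wait(ring, 1);
    }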
- .SS Reading completion events
- Similar to the submission queue (SQ),
- the completion queue (CQ) is a shared buffer between the kernel and user
- space.
- Whereas you placed submission queue entries on the tail of the SQ and the
- kernel read off the head,
- when it comes to the CQ,
- the kernel places completion queue events or CQEs on the tail of the CQ and
- you read off its head.
- .PP
- Submission is flexible (and thus a bit more complicated) since it needs to
- be able to encode different types of system calls that take various
- parameters.
- Completion,
- on the other hand, is simpler since we're looking only for a return value
- back from the kernel.
- This is easily understood by looking at the completion queue event
- structure,
- struct
- .BR io_uring_cqe :
- .PP
- .in +4n
- .EX
- struct io_uring_cqe {
-     __u64   user_data;  /* sqe->data submission passed back */
-     __s32   res;        /* result code for this event */
-     __u32   flags;
- };
- .EE
- .in
- .PP
- Here,
- .I user_data
- is custom data that is passed unchanged from submission to completion.
- That is,
- from SQEs to CQEs.
- This field can be used to set context,
- uniquely identifying submissions that got completed.
- Given that I/O requests can complete in any order,
- this field can be used to correlate a submission with a completion.
- .I res
- is the result from the system call that was performed as part of the
- submission;
- its return value.
-
- The
- .I flags
- field carries request-specific information. As of the 6.0 kernel, the following
- flags are defined:
-
- .TP
- .B IORING_CQE_F_BUFFER
- If set, the upper 16 bits of the flags field carry the buffer ID that was
- chosen for this request. The request must have been issued with
- .B IOSQE_BUFFER_SELECT
- set, and used with a request type that supports buffer selection. Additionally,
- buffers must have been provided upfront either via the
- .B IORING_OP_PROVIDE_BUFFERS
- or the
- .B IORING_REGISTER_PBUF_RING
- methods.
- .TP
- .B IORING_CQE_F_MORE
- If set, the application should expect more completions from the request. This
- is used for requests that can generate multiple completions, such as multi-shot
- requests, receive, or accept.
- .TP
- .B IORING_CQE_F_SOCK_NONEMPTY
- If set, the socket still had data left to read when the receive for the
- current request completed.
- .TP
- .B IORING_CQE_F_NOTIF
- Set for notification CQEs, as seen with the zero-copy networking send and
- receive support.
- .PP
- The general sequence to read completion events off the completion queue is
- as follows:
- .PP
- .in +4n
- .EX
- unsigned head;
- head = *cqring->head;
- if (head != atomic_load_acquire(cqring->tail)) {
-     struct io_uring_cqe *cqe;
-     unsigned index;
-     index = head & (cqring->mask);
-     cqe = &cqring->cqes[index];
-     /* process completed CQE */
-     process_cqe(cqe);
-     /* CQE consumption complete */
-     head++;
- }
- atomic_store_release(cqring->head, head);
- .EE
- .in
- .PP
- It helps to be reminded that the kernel adds CQEs to the tail of the CQ,
- while you need to dequeue them off the head.
- To get the index of an entry at the head,
- the application must mask the current head index with the size mask of the
- ring.
- Once the CQE has been consumed or processed,
- the head needs to be updated to reflect the consumption of the CQE.
- Attention should be paid to the read and write barriers to ensure
- successful read and update of the head.
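A sketch of consuming the flags described above, assuming liburing's headers (which pull in the IORING_CQE_* constants) and a request issued with IOSQE_BUFFER_SELECT:

    #include <stdbool.h>
    #include <liburing.h>

    /* Sketch: decode the CQE flags. For a buffer-select request,
     * the chosen buffer ID sits in the upper 16 bits of flags;
     * IORING_CQE_F_MORE signals that further completions for the
     * same (multishot) request will follow. */
    void inspect_cqe(const struct io_uring_cqe *cqe)
    {
        if (cqe->flags & IORING_CQE_F_BUFFER) {
            unsigned buf_id = cqe->flags >> IORING_CQE_BUFFER_SHIFT;
            (void)buf_id;  /* return this buffer to the pool once consumed */
        }
        bool more = cqe->flags & IORING_CQE_F_MORE;
        (void)more;        /* keep per-request state alive if set */
    }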
- .SS io_uring performance
- Because of the shared ring buffers between kernel and user space,
- .B io_uring
- can be a zero-copy system.
- Copying buffers to and from becomes necessary when system calls that
- transfer data between kernel and user space are involved.
- But since the bulk of the communication in
- .B io_uring
- is via buffers shared between the kernel and user space,
- this huge performance overhead is completely avoided.
- .PP
- While system calls may not seem like a significant overhead,
- in high performance applications,
- making a lot of them will begin to matter.
- While the workarounds operating systems deploy against Spectre
- and Meltdown would ideally be unnecessary,
- unfortunately,
- some of these workarounds sit on the system call path,
- making system calls more expensive than before on affected hardware.
- While newer hardware should not need these workarounds,
- hardware with these vulnerabilities can be expected to be in the wild for a
- long time.
- While using synchronous programming interfaces or even when using
- asynchronous programming interfaces under Linux,
- there is at least one system call involved in the submission of each
- request.
- In
- .BR io_uring ,
- on the other hand,
- you can batch several requests in one go,
- simply by queueing up multiple SQEs,
- each describing an I/O operation you want, and making a single call to
- .BR io_uring_enter (2).
- This is possible due to
- .BR io_uring 's
- shared buffers based design.
- .PP
- While this batching in itself can avoid the overhead associated with
- potentially multiple and frequent system calls,
- you can reduce even this overhead further with Submission Queue Polling,
- by having the kernel poll and pick up your SQEs for processing as you add
- them to the submission queue. This avoids the
- .BR io_uring_enter (2)
- call you need to make to tell the kernel to pick SQEs up.
- For high-performance applications,
- this means even fewer system call overheads.
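Sketched with liburing helpers, the batching described above amounts to queueing several SQEs and then making a single submission call; the fixed 4096-byte block size is an arbitrary choice for illustration:

    #include <liburing.h>

    /* Sketch: batch n read requests and submit them all with a
     * single io_uring_enter(2), via one io_uring_submit() call. */
    int submit_batch(struct io_uring *ring, int fd,
                     char (*bufs)[4096], unsigned n)
    {
        for (unsigned i = 0; i < n; i++) {
            struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
            if (!sqe)
                break;  /* SQ full; submit what we have */
            io_uring_prep_read(sqe, fd, bufs[i], 4096, (__u64)i * 4096);
            io_uring_sqe_set_data64(sqe, i);  /* tag for correlating CQEs */
        }
        return io_uring_submit(ring);         /* one system call */
    }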
- .SH CONFORMING TO
- .B io_uring
- is Linux-specific.
- .SH EXAMPLES
- The following example uses
- .B io_uring
- to copy stdin to stdout.
- Using shell redirection,
- you should be able to copy files with this example.
- Because it uses a queue depth of only one,
- this example processes I/O requests one after the other.
- It is purposefully kept this way to aid understanding.
- In real-world scenarios however,
- you'll want to have a larger queue depth to parallelize I/O request
- processing so as to gain the kind of performance benefits
- .B io_uring
- provides with its asynchronous processing of requests.
- .PP
- .EX
- #include <stdio.h>
- #include <stdlib.h>
- #include <sys/stat.h>
- #include <sys/ioctl.h>
- #include <sys/syscall.h>
- #include <sys/mman.h>
- #include <sys/uio.h>
- #include <linux/fs.h>
- #include <fcntl.h>
- #include <unistd.h>
- #include <string.h>
- #include <stdatomic.h>
-
- #include <linux/io_uring.h>
-
- #define QUEUE_DEPTH 1
- #define BLOCK_SZ    1024
-
- /* Macros for barriers needed by io_uring */
- #define io_uring_smp_store_release(p, v)            \\
-     atomic_store_explicit((_Atomic typeof(*(p)) *)(p), (v), \\
-                           memory_order_release)
- #define io_uring_smp_load_acquire(p)                \\
-     atomic_load_explicit((_Atomic typeof(*(p)) *)(p),       \\
-                          memory_order_acquire)
-
- int ring_fd;
- unsigned *sring_tail, *sring_mask, *sring_array,
-          *cring_head, *cring_tail, *cring_mask;
- struct io_uring_sqe *sqes;
- struct io_uring_cqe *cqes;
- char buff[BLOCK_SZ];
- off_t offset;
-
- /*
-  * System call wrappers provided since glibc does not yet
-  * provide wrappers for io_uring system calls.
-  * */
-
- int io_uring_setup(unsigned entries, struct io_uring_params *p)
- {
-     return (int) syscall(__NR_io_uring_setup, entries, p);
- }
-
- int io_uring_enter(int ring_fd, unsigned int to_submit,
-                    unsigned int min_complete, unsigned int flags)
- {
-     return (int) syscall(__NR_io_uring_enter, ring_fd, to_submit,
-                          min_complete, flags, NULL, 0);
- }
-
- int app_setup_uring(void) {
-     struct io_uring_params p;
-     void *sq_ptr, *cq_ptr;
-
-     /* See io_uring_setup(2) for io_uring_params.flags you can set */
-     memset(&p, 0, sizeof(p));
-     ring_fd = io_uring_setup(QUEUE_DEPTH, &p);
-     if (ring_fd < 0) {
-         perror("io_uring_setup");
-         return 1;
-     }
-
-     /*
-      * io_uring communication happens via 2 shared kernel-user space ring
-      * buffers, which can be jointly mapped with a single mmap() call in
-      * kernels >= 5.4.
-      */
-
-     int sring_sz = p.sq_off.array + p.sq_entries * sizeof(unsigned);
-     int cring_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);
-
-     /* Rather than check for kernel version, the recommended way is to
-      * check the features field of the io_uring_params structure, which is a
-      * bitmask. If IORING_FEAT_SINGLE_MMAP is set, we can do away with the
-      * second mmap() call to map in the completion ring separately.
-      */
-     if (p.features & IORING_FEAT_SINGLE_MMAP) {
-         if (cring_sz > sring_sz)
-             sring_sz = cring_sz;
-         cring_sz = sring_sz;
-     }
-
-     /* Map in the submission and completion queue ring buffers.
-      * Kernels < 5.4 only map in the submission queue, though.
-      */
-     sq_ptr = mmap(0, sring_sz, PROT_READ | PROT_WRITE,
-                   MAP_SHARED | MAP_POPULATE,
-                   ring_fd, IORING_OFF_SQ_RING);
-     if (sq_ptr == MAP_FAILED) {
-         perror("mmap");
-         return 1;
-     }
-
-     if (p.features & IORING_FEAT_SINGLE_MMAP) {
-         cq_ptr = sq_ptr;
-     } else {
-         /* Map in the completion queue ring buffer in older kernels separately */
-         cq_ptr = mmap(0, cring_sz, PROT_READ | PROT_WRITE,
-                       MAP_SHARED | MAP_POPULATE,
-                       ring_fd, IORING_OFF_CQ_RING);
-         if (cq_ptr == MAP_FAILED) {
-             perror("mmap");
-             return 1;
-         }
-     }
-     /* Save useful fields for later easy reference */
-     sring_tail = sq_ptr + p.sq_off.tail;
-     sring_mask = sq_ptr + p.sq_off.ring_mask;
-     sring_array = sq_ptr + p.sq_off.array;
-
-     /* Map in the submission queue entries array */
-     sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe),
-                 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
-                 ring_fd, IORING_OFF_SQES);
-     if (sqes == MAP_FAILED) {
-         perror("mmap");
-         return 1;
-     }
-
-     /* Save useful fields for later easy reference */
-     cring_head = cq_ptr + p.cq_off.head;
-     cring_tail = cq_ptr + p.cq_off.tail;
-     cring_mask = cq_ptr + p.cq_off.ring_mask;
-     cqes = cq_ptr + p.cq_off.cqes;
-
-     return 0;
- }
-
- /*
-  * Read from completion queue.
-  * In this function, we read completion events from the completion queue.
-  * We dequeue the CQE, update the head, and return the result of the operation.
-  * */
-
- int read_from_cq() {
-     struct io_uring_cqe *cqe;
-     unsigned head;
-
-     /* Read barrier */
-     head = io_uring_smp_load_acquire(cring_head);
-     /*
-      * Remember, this is a ring buffer. If head == tail, it means that the
-      * buffer is empty.
-      * */
-     if (head == *cring_tail)
-         return -1;
-
-     /* Get the entry */
-     cqe = &cqes[head & (*cring_mask)];
-     if (cqe->res < 0)
-         fprintf(stderr, "Error: %s\\n", strerror(abs(cqe->res)));
-
-     head++;
-
-     /* Write barrier so that updates to the head are made visible */
-     io_uring_smp_store_release(cring_head, head);
-
-     return cqe->res;
- }
-
- /*
-  * Submit a read or a write request to the submission queue.
-  * */
-
- int submit_to_sq(int fd, int op) {
-     unsigned index, tail;
-
-     /* Add our submission queue entry to the tail of the SQE ring buffer */
-     tail = *sring_tail;
-     index = tail & *sring_mask;
-     struct io_uring_sqe *sqe = &sqes[index];
-     /* Fill in the parameters required for the read or write operation */
-     sqe->opcode = op;
-     sqe->fd = fd;
-     sqe->addr = (unsigned long) buff;
-     if (op == IORING_OP_READ) {
-         memset(buff, 0, sizeof(buff));
-         sqe->len = BLOCK_SZ;
-     }
-     else {
-         sqe->len = strlen(buff);
-     }
-     sqe->off = offset;
-
-     sring_array[index] = index;
-     tail++;
-
-     /* Update the tail */
-     io_uring_smp_store_release(sring_tail, tail);
-
-     /*
-      * Tell the kernel we have submitted events with the io_uring_enter()
-      * system call. We also pass in the IORING_ENTER_GETEVENTS flag which
-      * causes the io_uring_enter() call to wait until min_complete
-      * (the 3rd param) events complete.
-      * */
-     int ret = io_uring_enter(ring_fd, 1, 1,
-                              IORING_ENTER_GETEVENTS);
-     if (ret < 0) {
-         perror("io_uring_enter");
-         return -1;
-     }
-
-     return ret;
- }
-
- int main(int argc, char *argv[]) {
-     int res;
-
-     /* Set up io_uring for use */
-     if (app_setup_uring()) {
-         fprintf(stderr, "Unable to setup uring!\\n");
-         return 1;
-     }
-
-     /*
-      * A while loop that reads from stdin and writes to stdout.
-      * Breaks on EOF.
-      */
-     while (1) {
-         /* Initiate read from stdin and wait for it to complete */
-         submit_to_sq(STDIN_FILENO, IORING_OP_READ);
-         /* Read completion queue entry */
-         res = read_from_cq();
-         if (res > 0) {
-             /* Read successful. Write to stdout. */
-             submit_to_sq(STDOUT_FILENO, IORING_OP_WRITE);
-             read_from_cq();
-         } else if (res == 0) {
-             /* reached EOF */
-             break;
-         }
-         else if (res < 0) {
-             /* Error reading file */
-             fprintf(stderr, "Error: %s\\n", strerror(abs(res)));
-             break;
-         }
-         offset += res;
-     }
-
-     return 0;
- }
- .EE
- .SH SEE ALSO
- .BR io_uring_enter (2)
- .BR io_uring_register (2)
- .BR io_uring_setup (2)