polyphony 0.45.5 → 0.46.0

Files changed (53)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/test.yml +2 -0
  3. data/.gitmodules +0 -0
  4. data/CHANGELOG.md +4 -0
  5. data/Gemfile.lock +1 -1
  6. data/README.md +3 -3
  7. data/Rakefile +1 -1
  8. data/TODO.md +4 -4
  9. data/examples/performance/thread-vs-fiber/polyphony_server.rb +1 -2
  10. data/ext/liburing/liburing.h +585 -0
  11. data/ext/liburing/liburing/README.md +4 -0
  12. data/ext/liburing/liburing/barrier.h +73 -0
  13. data/ext/liburing/liburing/compat.h +15 -0
  14. data/ext/liburing/liburing/io_uring.h +343 -0
  15. data/ext/liburing/queue.c +333 -0
  16. data/ext/liburing/register.c +187 -0
  17. data/ext/liburing/setup.c +210 -0
  18. data/ext/liburing/syscall.c +54 -0
  19. data/ext/liburing/syscall.h +18 -0
  20. data/ext/polyphony/backend.h +0 -14
  21. data/ext/polyphony/backend_common.h +109 -0
  22. data/ext/polyphony/backend_io_uring.c +884 -0
  23. data/ext/polyphony/backend_io_uring_context.c +73 -0
  24. data/ext/polyphony/backend_io_uring_context.h +52 -0
  25. data/ext/polyphony/{libev_backend.c → backend_libev.c} +202 -294
  26. data/ext/polyphony/event.c +1 -1
  27. data/ext/polyphony/extconf.rb +31 -13
  28. data/ext/polyphony/fiber.c +29 -22
  29. data/ext/polyphony/libev.c +4 -0
  30. data/ext/polyphony/libev.h +8 -2
  31. data/ext/polyphony/liburing.c +8 -0
  32. data/ext/polyphony/playground.c +51 -0
  33. data/ext/polyphony/polyphony.c +5 -5
  34. data/ext/polyphony/polyphony.h +16 -12
  35. data/ext/polyphony/polyphony_ext.c +10 -4
  36. data/ext/polyphony/queue.c +1 -1
  37. data/ext/polyphony/thread.c +11 -9
  38. data/lib/polyphony/adapters/trace.rb +2 -2
  39. data/lib/polyphony/core/global_api.rb +1 -4
  40. data/lib/polyphony/extensions/debug.rb +13 -0
  41. data/lib/polyphony/extensions/fiber.rb +2 -2
  42. data/lib/polyphony/extensions/socket.rb +59 -10
  43. data/lib/polyphony/version.rb +1 -1
  44. data/test/helper.rb +36 -4
  45. data/test/io_uring_test.rb +55 -0
  46. data/test/stress.rb +5 -2
  47. data/test/test_backend.rb +4 -6
  48. data/test/test_ext.rb +1 -2
  49. data/test/test_fiber.rb +22 -16
  50. data/test/test_global_api.rb +33 -35
  51. data/test/test_throttler.rb +3 -6
  52. data/test/test_trace.rb +7 -5
  53. metadata +22 -3
data/ext/liburing/liburing/README.md
@@ -0,0 +1,4 @@
+ ## Updating the liburing source code
+
+ - copy liburing/src/**/* into ext/liburing
+ - move ext/liburing/include/**/* to ext/liburing/
data/ext/liburing/liburing/barrier.h
@@ -0,0 +1,73 @@
+ /* SPDX-License-Identifier: MIT */
+ #ifndef LIBURING_BARRIER_H
+ #define LIBURING_BARRIER_H
+
+ /*
+ From the kernel documentation file refcount-vs-atomic.rst:
+
+ A RELEASE memory ordering guarantees that all prior loads and
+ stores (all po-earlier instructions) on the same CPU are completed
+ before the operation. It also guarantees that all po-earlier
+ stores on the same CPU and all propagated stores from other CPUs
+ must propagate to all other CPUs before the release operation
+ (A-cumulative property). This is implemented using
+ :c:func:`smp_store_release`.
+
+ An ACQUIRE memory ordering guarantees that all post loads and
+ stores (all po-later instructions) on the same CPU are
+ completed after the acquire operation. It also guarantees that all
+ po-later stores on the same CPU must propagate to all other CPUs
+ after the acquire operation executes. This is implemented using
+ :c:func:`smp_acquire__after_ctrl_dep`.
+ */
+
+ #ifdef __cplusplus
+ #include <atomic>
+
+ template <typename T>
+ static inline void IO_URING_WRITE_ONCE(T &var, T val)
+ {
+ std::atomic_store_explicit(reinterpret_cast<std::atomic<T> *>(&var),
+ val, std::memory_order_relaxed);
+ }
+ template <typename T>
+ static inline T IO_URING_READ_ONCE(const T &var)
+ {
+ return std::atomic_load_explicit(
+ reinterpret_cast<const std::atomic<T> *>(&var),
+ std::memory_order_relaxed);
+ }
+
+ template <typename T>
+ static inline void io_uring_smp_store_release(T *p, T v)
+ {
+ std::atomic_store_explicit(reinterpret_cast<std::atomic<T> *>(p), v,
+ std::memory_order_release);
+ }
+
+ template <typename T>
+ static inline T io_uring_smp_load_acquire(const T *p)
+ {
+ return std::atomic_load_explicit(
+ reinterpret_cast<const std::atomic<T> *>(p),
+ std::memory_order_acquire);
+ }
+ #else
+ #include <stdatomic.h>
+
+ #define IO_URING_WRITE_ONCE(var, val) \
+ atomic_store_explicit((_Atomic typeof(var) *)&(var), \
+ (val), memory_order_relaxed)
+ #define IO_URING_READ_ONCE(var) \
+ atomic_load_explicit((_Atomic typeof(var) *)&(var), \
+ memory_order_relaxed)
+
+ #define io_uring_smp_store_release(p, v) \
+ atomic_store_explicit((_Atomic typeof(*(p)) *)(p), (v), \
+ memory_order_release)
+ #define io_uring_smp_load_acquire(p) \
+ atomic_load_explicit((_Atomic typeof(*(p)) *)(p), \
+ memory_order_acquire)
+ #endif
+
+ #endif /* defined(LIBURING_BARRIER_H) */
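The barrier.h shim above boils down to a release store on the producer side paired with an acquire load on the consumer side. Here is a minimal sketch of that pairing using plain C11 atomics instead of the macros above; the ring/produce/consume names are illustrative only and are not part of liburing or polyphony:

```c
/* Illustrative only: mirrors the release/acquire pairing that the barrier.h
 * macros provide, using C11 atomics directly. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define RING_SIZE 8                     /* power of two, like the io_uring rings */
#define RING_MASK (RING_SIZE - 1)

struct ring {
    _Atomic uint32_t tail;              /* written by producer, read by consumer */
    uint32_t head;                      /* consumer-private in this sketch */
    int slots[RING_SIZE];
};

/* Producer: fill the slot first, then publish it with a release store, so a
 * consumer that acquires the new tail also sees the slot contents. */
static void produce(struct ring *r, int value)
{
    uint32_t t = atomic_load_explicit(&r->tail, memory_order_relaxed);
    r->slots[t & RING_MASK] = value;
    atomic_store_explicit(&r->tail, t + 1, memory_order_release);
}

/* Consumer: an acquire load of tail guarantees that slot data written before
 * the matching release store is visible. */
static int consume(struct ring *r, int *value)
{
    uint32_t t = atomic_load_explicit(&r->tail, memory_order_acquire);
    if (r->head == t)
        return 0;                       /* nothing to consume */
    *value = r->slots[r->head++ & RING_MASK];
    return 1;
}

int main(void)
{
    struct ring r = {0};
    int v;

    produce(&r, 42);
    if (consume(&r, &v))
        printf("consumed %d\n", v);
    return 0;
}
```

queue.c below uses the same pattern: the SQ tail is published with io_uring_smp_store_release() only after the SQE slots have been filled, and the head is read with io_uring_smp_load_acquire().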
data/ext/liburing/liburing/compat.h
@@ -0,0 +1,15 @@
+ /* SPDX-License-Identifier: MIT */
+ #ifndef LIBURING_COMPAT_H
+ #define LIBURING_COMPAT_H
+
+ #include <linux/time_types.h>
+
+ #include <inttypes.h>
+
+ struct open_how {
+ uint64_t flags;
+ uint64_t mode;
+ uint64_t resolve;
+ };
+
+ #endif
data/ext/liburing/liburing/io_uring.h
@@ -0,0 +1,343 @@
+ /* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */
+ /*
+ * Header file for the io_uring interface.
+ *
+ * Copyright (C) 2019 Jens Axboe
+ * Copyright (C) 2019 Christoph Hellwig
+ */
+ #ifndef LINUX_IO_URING_H
+ #define LINUX_IO_URING_H
+
+ #include <linux/fs.h>
+ #include <linux/types.h>
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ /*
+ * IO submission data structure (Submission Queue Entry)
+ */
+ struct io_uring_sqe {
+ __u8 opcode; /* type of operation for this sqe */
+ __u8 flags; /* IOSQE_ flags */
+ __u16 ioprio; /* ioprio for the request */
+ __s32 fd; /* file descriptor to do IO on */
+ union {
+ __u64 off; /* offset into file */
+ __u64 addr2;
+ };
+ union {
+ __u64 addr; /* pointer to buffer or iovecs */
+ __u64 splice_off_in;
+ };
+ __u32 len; /* buffer size or number of iovecs */
+ union {
+ __kernel_rwf_t rw_flags;
+ __u32 fsync_flags;
+ __u16 poll_events; /* compatibility */
+ __u32 poll32_events; /* word-reversed for BE */
+ __u32 sync_range_flags;
+ __u32 msg_flags;
+ __u32 timeout_flags;
+ __u32 accept_flags;
+ __u32 cancel_flags;
+ __u32 open_flags;
+ __u32 statx_flags;
+ __u32 fadvise_advice;
+ __u32 splice_flags;
+ };
+ __u64 user_data; /* data to be passed back at completion time */
+ union {
+ struct {
+ /* pack this to avoid bogus arm OABI complaints */
+ union {
+ /* index into fixed buffers, if used */
+ __u16 buf_index;
+ /* for grouped buffer selection */
+ __u16 buf_group;
+ } __attribute__((packed));
+ /* personality to use, if used */
+ __u16 personality;
+ __s32 splice_fd_in;
+ };
+ __u64 __pad2[3];
+ };
+ };
+
+ enum {
+ IOSQE_FIXED_FILE_BIT,
+ IOSQE_IO_DRAIN_BIT,
+ IOSQE_IO_LINK_BIT,
+ IOSQE_IO_HARDLINK_BIT,
+ IOSQE_ASYNC_BIT,
+ IOSQE_BUFFER_SELECT_BIT,
+ };
+
+ /*
+ * sqe->flags
+ */
+ /* use fixed fileset */
+ #define IOSQE_FIXED_FILE (1U << IOSQE_FIXED_FILE_BIT)
+ /* issue after inflight IO */
+ #define IOSQE_IO_DRAIN (1U << IOSQE_IO_DRAIN_BIT)
+ /* links next sqe */
+ #define IOSQE_IO_LINK (1U << IOSQE_IO_LINK_BIT)
+ /* like LINK, but stronger */
+ #define IOSQE_IO_HARDLINK (1U << IOSQE_IO_HARDLINK_BIT)
+ /* always go async */
+ #define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT)
+ /* select buffer from sqe->buf_group */
+ #define IOSQE_BUFFER_SELECT (1U << IOSQE_BUFFER_SELECT_BIT)
+
+ /*
+ * io_uring_setup() flags
+ */
+ #define IORING_SETUP_IOPOLL (1U << 0) /* io_context is polled */
+ #define IORING_SETUP_SQPOLL (1U << 1) /* SQ poll thread */
+ #define IORING_SETUP_SQ_AFF (1U << 2) /* sq_thread_cpu is valid */
+ #define IORING_SETUP_CQSIZE (1U << 3) /* app defines CQ size */
+ #define IORING_SETUP_CLAMP (1U << 4) /* clamp SQ/CQ ring sizes */
+ #define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */
+ #define IORING_SETUP_R_DISABLED (1U << 6) /* start with ring disabled */
+
+ enum {
+ IORING_OP_NOP,
+ IORING_OP_READV,
+ IORING_OP_WRITEV,
+ IORING_OP_FSYNC,
+ IORING_OP_READ_FIXED,
+ IORING_OP_WRITE_FIXED,
+ IORING_OP_POLL_ADD,
+ IORING_OP_POLL_REMOVE,
+ IORING_OP_SYNC_FILE_RANGE,
+ IORING_OP_SENDMSG,
+ IORING_OP_RECVMSG,
+ IORING_OP_TIMEOUT,
+ IORING_OP_TIMEOUT_REMOVE,
+ IORING_OP_ACCEPT,
+ IORING_OP_ASYNC_CANCEL,
+ IORING_OP_LINK_TIMEOUT,
+ IORING_OP_CONNECT,
+ IORING_OP_FALLOCATE,
+ IORING_OP_OPENAT,
+ IORING_OP_CLOSE,
+ IORING_OP_FILES_UPDATE,
+ IORING_OP_STATX,
+ IORING_OP_READ,
+ IORING_OP_WRITE,
+ IORING_OP_FADVISE,
+ IORING_OP_MADVISE,
+ IORING_OP_SEND,
+ IORING_OP_RECV,
+ IORING_OP_OPENAT2,
+ IORING_OP_EPOLL_CTL,
+ IORING_OP_SPLICE,
+ IORING_OP_PROVIDE_BUFFERS,
+ IORING_OP_REMOVE_BUFFERS,
+ IORING_OP_TEE,
+ IORING_OP_SHUTDOWN,
+
+ /* this goes last, obviously */
+ IORING_OP_LAST,
+ };
+
+ /*
+ * sqe->fsync_flags
+ */
+ #define IORING_FSYNC_DATASYNC (1U << 0)
+
+ /*
+ * sqe->timeout_flags
+ */
+ #define IORING_TIMEOUT_ABS (1U << 0)
+
+ /*
+ * sqe->splice_flags
+ * extends splice(2) flags
+ */
+ #define SPLICE_F_FD_IN_FIXED (1U << 31) /* the last bit of __u32 */
+
+ /*
+ * IO completion data structure (Completion Queue Entry)
+ */
+ struct io_uring_cqe {
+ __u64 user_data; /* sqe->data submission passed back */
+ __s32 res; /* result code for this event */
+ __u32 flags;
+ };
+
+ /*
+ * cqe->flags
+ *
+ * IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID
+ */
+ #define IORING_CQE_F_BUFFER (1U << 0)
+
+ enum {
+ IORING_CQE_BUFFER_SHIFT = 16,
+ };
+
+ /*
+ * Magic offsets for the application to mmap the data it needs
+ */
+ #define IORING_OFF_SQ_RING 0ULL
+ #define IORING_OFF_CQ_RING 0x8000000ULL
+ #define IORING_OFF_SQES 0x10000000ULL
+
+ /*
+ * Filled with the offset for mmap(2)
+ */
+ struct io_sqring_offsets {
+ __u32 head;
+ __u32 tail;
+ __u32 ring_mask;
+ __u32 ring_entries;
+ __u32 flags;
+ __u32 dropped;
+ __u32 array;
+ __u32 resv1;
+ __u64 resv2;
+ };
+
+ /*
+ * sq_ring->flags
+ */
+ #define IORING_SQ_NEED_WAKEUP (1U << 0) /* needs io_uring_enter wakeup */
+ #define IORING_SQ_CQ_OVERFLOW (1U << 1) /* CQ ring is overflown */
+
+ struct io_cqring_offsets {
+ __u32 head;
+ __u32 tail;
+ __u32 ring_mask;
+ __u32 ring_entries;
+ __u32 overflow;
+ __u32 cqes;
+ __u32 flags;
+ __u32 resv1;
+ __u64 resv2;
+ };
+
+ /*
+ * cq_ring->flags
+ */
+
+ /* disable eventfd notifications */
+ #define IORING_CQ_EVENTFD_DISABLED (1U << 0)
+
+ /*
+ * io_uring_enter(2) flags
+ */
+ #define IORING_ENTER_GETEVENTS (1U << 0)
+ #define IORING_ENTER_SQ_WAKEUP (1U << 1)
+ #define IORING_ENTER_SQ_WAIT (1U << 2)
+
+ /*
+ * Passed in for io_uring_setup(2). Copied back with updated info on success
+ */
+ struct io_uring_params {
+ __u32 sq_entries;
+ __u32 cq_entries;
+ __u32 flags;
+ __u32 sq_thread_cpu;
+ __u32 sq_thread_idle;
+ __u32 features;
+ __u32 wq_fd;
+ __u32 resv[3];
+ struct io_sqring_offsets sq_off;
+ struct io_cqring_offsets cq_off;
+ };
+
+ /*
+ * io_uring_params->features flags
+ */
+ #define IORING_FEAT_SINGLE_MMAP (1U << 0)
+ #define IORING_FEAT_NODROP (1U << 1)
+ #define IORING_FEAT_SUBMIT_STABLE (1U << 2)
+ #define IORING_FEAT_RW_CUR_POS (1U << 3)
+ #define IORING_FEAT_CUR_PERSONALITY (1U << 4)
+ #define IORING_FEAT_FAST_POLL (1U << 5)
+ #define IORING_FEAT_POLL_32BITS (1U << 6)
+ #define IORING_FEAT_SQPOLL_NONFIXED (1U << 7)
+
+ /*
+ * io_uring_register(2) opcodes and arguments
+ */
+ enum {
+ IORING_REGISTER_BUFFERS = 0,
+ IORING_UNREGISTER_BUFFERS = 1,
+ IORING_REGISTER_FILES = 2,
+ IORING_UNREGISTER_FILES = 3,
+ IORING_REGISTER_EVENTFD = 4,
+ IORING_UNREGISTER_EVENTFD = 5,
+ IORING_REGISTER_FILES_UPDATE = 6,
+ IORING_REGISTER_EVENTFD_ASYNC = 7,
+ IORING_REGISTER_PROBE = 8,
+ IORING_REGISTER_PERSONALITY = 9,
+ IORING_UNREGISTER_PERSONALITY = 10,
+ IORING_REGISTER_RESTRICTIONS = 11,
+ IORING_REGISTER_ENABLE_RINGS = 12,
+
+ /* this goes last */
+ IORING_REGISTER_LAST
+ };
+
+ struct io_uring_files_update {
+ __u32 offset;
+ __u32 resv;
+ __aligned_u64 /* __s32 * */ fds;
+ };
+
+ #define IO_URING_OP_SUPPORTED (1U << 0)
+
+ struct io_uring_probe_op {
+ __u8 op;
+ __u8 resv;
+ __u16 flags; /* IO_URING_OP_* flags */
+ __u32 resv2;
+ };
+
+ struct io_uring_probe {
+ __u8 last_op; /* last opcode supported */
+ __u8 ops_len; /* length of ops[] array below */
+ __u16 resv;
+ __u32 resv2[3];
+ struct io_uring_probe_op ops[0];
+ };
+
+ struct io_uring_restriction {
+ __u16 opcode;
+ union {
+ __u8 register_op; /* IORING_RESTRICTION_REGISTER_OP */
+ __u8 sqe_op; /* IORING_RESTRICTION_SQE_OP */
+ __u8 sqe_flags; /* IORING_RESTRICTION_SQE_FLAGS_* */
+ };
+ __u8 resv;
+ __u32 resv2[3];
+ };
+
+ /*
+ * io_uring_restriction->opcode values
+ */
+ enum {
+ /* Allow an io_uring_register(2) opcode */
+ IORING_RESTRICTION_REGISTER_OP = 0,
+
+ /* Allow an sqe opcode */
+ IORING_RESTRICTION_SQE_OP = 1,
+
+ /* Allow sqe flags */
+ IORING_RESTRICTION_SQE_FLAGS_ALLOWED = 2,
+
+ /* Require sqe flags (these flags must be set on each submission) */
+ IORING_RESTRICTION_SQE_FLAGS_REQUIRED = 3,
+
+ IORING_RESTRICTION_LAST
+ };
+
+
+ #ifdef __cplusplus
+ }
+ #endif
+
+ #endif
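io_uring.h above only declares the kernel ABI: the SQE/CQE layouts, setup and enter flags, opcodes, and the io_uring_setup(2) result structure. As a hedged illustration of what that ABI looks like from userspace without any liburing helpers, the sketch below calls the raw io_uring_setup(2) syscall and inspects io_uring_params. It is not code from this gem (which goes through the bundled liburing), and it assumes kernel headers new enough to provide <linux/io_uring.h> and __NR_io_uring_setup:

```c
/* Minimal sketch, not from the gem: create a ring via the raw syscall and
 * read back the parameters the kernel fills in. */
#include <linux/io_uring.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
    struct io_uring_params p;
    memset(&p, 0, sizeof(p));

    /* Ask for a ring with 8 SQ entries; on success the kernel returns a ring
     * fd and copies back the mmap offsets (p.sq_off / p.cq_off) and features. */
    int fd = (int)syscall(__NR_io_uring_setup, 8, &p);
    if (fd < 0) {
        perror("io_uring_setup");
        return 1;
    }

    printf("sq_entries=%u cq_entries=%u\n", p.sq_entries, p.cq_entries);
    if (p.features & IORING_FEAT_SINGLE_MMAP)
        printf("kernel supports IORING_FEAT_SINGLE_MMAP\n");

    close(fd);
    return 0;
}
```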
data/ext/liburing/queue.c
@@ -0,0 +1,333 @@
+ /* SPDX-License-Identifier: MIT */
+ #include <sys/types.h>
+ #include <sys/stat.h>
+ #include <sys/mman.h>
+ #include <unistd.h>
+ #include <errno.h>
+ #include <string.h>
+ #include <stdbool.h>
+
+ #include "liburing/compat.h"
+ #include "liburing/io_uring.h"
+ #include "liburing.h"
+ #include "liburing/barrier.h"
+
+ #include "syscall.h"
+
+ /*
+ * Returns true if we're not using SQ thread (thus nobody submits but us)
+ * or if IORING_SQ_NEED_WAKEUP is set, so submit thread must be explicitly
+ * awakened. For the latter case, we set the thread wakeup flag.
+ */
+ static inline bool sq_ring_needs_enter(struct io_uring *ring, unsigned *flags)
+ {
+ if (!(ring->flags & IORING_SETUP_SQPOLL))
+ return true;
+ if (IO_URING_READ_ONCE(*ring->sq.kflags) & IORING_SQ_NEED_WAKEUP) {
+ *flags |= IORING_ENTER_SQ_WAKEUP;
+ return true;
+ }
+
+ return false;
+ }
+
+ static inline bool cq_ring_needs_flush(struct io_uring *ring)
+ {
+ return IO_URING_READ_ONCE(*ring->sq.kflags) & IORING_SQ_CQ_OVERFLOW;
+ }
+
+ static int __io_uring_peek_cqe(struct io_uring *ring,
+ struct io_uring_cqe **cqe_ptr)
+ {
+ struct io_uring_cqe *cqe;
+ unsigned head;
+ int err = 0;
+
+ do {
+ io_uring_for_each_cqe(ring, head, cqe)
+ break;
+ if (cqe) {
+ if (cqe->user_data == LIBURING_UDATA_TIMEOUT) {
+ if (cqe->res < 0)
+ err = cqe->res;
+ io_uring_cq_advance(ring, 1);
+ if (!err)
+ continue;
+ cqe = NULL;
+ }
+ }
+ break;
+ } while (1);
+
+ *cqe_ptr = cqe;
+ return err;
+ }
+
+ int __io_uring_get_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr,
+ unsigned submit, unsigned wait_nr, sigset_t *sigmask)
+ {
+ struct io_uring_cqe *cqe = NULL;
+ const int to_wait = wait_nr;
+ int ret = 0, err;
+
+ do {
+ bool cq_overflow_flush = false;
+ unsigned flags = 0;
+
+ err = __io_uring_peek_cqe(ring, &cqe);
+ if (err)
+ break;
+ if (!cqe && !to_wait && !submit) {
+ if (!cq_ring_needs_flush(ring)) {
+ err = -EAGAIN;
+ break;
+ }
+ cq_overflow_flush = true;
+ }
+ if (wait_nr && cqe)
+ wait_nr--;
+ if (wait_nr || cq_overflow_flush)
+ flags = IORING_ENTER_GETEVENTS;
+ if (submit)
+ sq_ring_needs_enter(ring, &flags);
+ if (wait_nr || submit || cq_overflow_flush)
+ ret = __sys_io_uring_enter(ring->ring_fd, submit,
+ wait_nr, flags, sigmask);
+ if (ret < 0) {
+ err = -errno;
+ } else if (ret == (int)submit) {
+ submit = 0;
+ /*
+ * When SETUP_IOPOLL is set, __sys_io_uring enter()
+ * must be called to reap new completions but the call
+ * won't be made if both wait_nr and submit are zero
+ * so preserve wait_nr.
+ */
+ if (!(ring->flags & IORING_SETUP_IOPOLL))
+ wait_nr = 0;
+ } else {
+ submit -= ret;
+ }
+ if (cqe)
+ break;
+ } while (!err);
+
+ *cqe_ptr = cqe;
+ return err;
+ }
+
+ /*
+ * Fill in an array of IO completions up to count, if any are available.
+ * Returns the amount of IO completions filled.
+ */
+ unsigned io_uring_peek_batch_cqe(struct io_uring *ring,
+ struct io_uring_cqe **cqes, unsigned count)
+ {
+ unsigned ready;
+ bool overflow_checked = false;
+
+ again:
+ ready = io_uring_cq_ready(ring);
+ if (ready) {
+ unsigned head = *ring->cq.khead;
+ unsigned mask = *ring->cq.kring_mask;
+ unsigned last;
+ int i = 0;
+
+ count = count > ready ? ready : count;
+ last = head + count;
+ for (;head != last; head++, i++)
+ cqes[i] = &ring->cq.cqes[head & mask];
+
+ return count;
+ }
+
+ if (overflow_checked)
+ goto done;
+
+ if (cq_ring_needs_flush(ring)) {
+ __sys_io_uring_enter(ring->ring_fd, 0, 0,
+ IORING_ENTER_GETEVENTS, NULL);
+ overflow_checked = true;
+ goto again;
+ }
+
+ done:
+ return 0;
+ }
+
+ /*
+ * Sync internal state with kernel ring state on the SQ side. Returns the
+ * number of pending items in the SQ ring, for the shared ring.
+ */
+ int __io_uring_flush_sq(struct io_uring *ring)
+ {
+ struct io_uring_sq *sq = &ring->sq;
+ const unsigned mask = *sq->kring_mask;
+ unsigned ktail, to_submit;
+
+ if (sq->sqe_head == sq->sqe_tail) {
+ ktail = *sq->ktail;
+ goto out;
+ }
+
+ /*
+ * Fill in sqes that we have queued up, adding them to the kernel ring
+ */
+ ktail = *sq->ktail;
+ to_submit = sq->sqe_tail - sq->sqe_head;
+ while (to_submit--) {
+ sq->array[ktail & mask] = sq->sqe_head & mask;
+ ktail++;
+ sq->sqe_head++;
+ }
+
+ /*
+ * Ensure that the kernel sees the SQE updates before it sees the tail
+ * update.
+ */
+ io_uring_smp_store_release(sq->ktail, ktail);
+ out:
+ return ktail - *sq->khead;
+ }
+
+ /*
+ * Like io_uring_wait_cqe(), except it accepts a timeout value as well. Note
+ * that an sqe is used internally to handle the timeout. Applications using
+ * this function must never set sqe->user_data to LIBURING_UDATA_TIMEOUT!
+ *
+ * If 'ts' is specified, the application need not call io_uring_submit() before
+ * calling this function, as we will do that on its behalf. From this it also
+ * follows that this function isn't safe to use for applications that split SQ
+ * and CQ handling between two threads and expect that to work without
+ * synchronization, as this function manipulates both the SQ and CQ side.
+ */
+ int io_uring_wait_cqes(struct io_uring *ring, struct io_uring_cqe **cqe_ptr,
+ unsigned wait_nr, struct __kernel_timespec *ts,
+ sigset_t *sigmask)
+ {
+ unsigned to_submit = 0;
+
+ if (ts) {
+ struct io_uring_sqe *sqe;
+ int ret;
+
+ /*
+ * If the SQ ring is full, we may need to submit IO first
+ */
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ ret = io_uring_submit(ring);
+ if (ret < 0)
+ return ret;
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe)
+ return -EAGAIN;
+ }
+ io_uring_prep_timeout(sqe, ts, wait_nr, 0);
+ sqe->user_data = LIBURING_UDATA_TIMEOUT;
+ to_submit = __io_uring_flush_sq(ring);
+ }
+
+ return __io_uring_get_cqe(ring, cqe_ptr, to_submit, wait_nr, sigmask);
+ }
+
+ /*
+ * See io_uring_wait_cqes() - this function is the same, it just always uses
+ * '1' as the wait_nr.
+ */
+ int io_uring_wait_cqe_timeout(struct io_uring *ring,
+ struct io_uring_cqe **cqe_ptr,
+ struct __kernel_timespec *ts)
+ {
+ return io_uring_wait_cqes(ring, cqe_ptr, 1, ts, NULL);
+ }
+
+ /*
+ * Submit sqes acquired from io_uring_get_sqe() to the kernel.
+ *
+ * Returns number of sqes submitted
+ */
+ static int __io_uring_submit(struct io_uring *ring, unsigned submitted,
+ unsigned wait_nr)
+ {
+ unsigned flags;
+ int ret;
+
+ flags = 0;
+ if (sq_ring_needs_enter(ring, &flags) || wait_nr) {
+ if (wait_nr || (ring->flags & IORING_SETUP_IOPOLL))
+ flags |= IORING_ENTER_GETEVENTS;
+
+ ret = __sys_io_uring_enter(ring->ring_fd, submitted, wait_nr,
+ flags, NULL);
+ if (ret < 0)
+ return -errno;
+ } else
+ ret = submitted;
+
+ return ret;
+ }
+
+ static int __io_uring_submit_and_wait(struct io_uring *ring, unsigned wait_nr)
+ {
+ return __io_uring_submit(ring, __io_uring_flush_sq(ring), wait_nr);
+ }
+
+ /*
+ * Submit sqes acquired from io_uring_get_sqe() to the kernel.
+ *
+ * Returns number of sqes submitted
+ */
+ int io_uring_submit(struct io_uring *ring)
+ {
+ return __io_uring_submit_and_wait(ring, 0);
+ }
+
+ /*
+ * Like io_uring_submit(), but allows waiting for events as well.
+ *
+ * Returns number of sqes submitted
+ */
+ int io_uring_submit_and_wait(struct io_uring *ring, unsigned wait_nr)
+ {
+ return __io_uring_submit_and_wait(ring, wait_nr);
+ }
+
+ static inline struct io_uring_sqe *
+ __io_uring_get_sqe(struct io_uring_sq *sq, unsigned int __head)
+ {
+ unsigned int __next = (sq)->sqe_tail + 1;
+ struct io_uring_sqe *__sqe = NULL;
+
+ if (__next - __head <= *(sq)->kring_entries) {
+ __sqe = &(sq)->sqes[(sq)->sqe_tail & *(sq)->kring_mask];
+ (sq)->sqe_tail = __next;
+ }
+ return __sqe;
+ }
+
+ /*
+ * Return an sqe to fill. Application must later call io_uring_submit()
+ * when it's ready to tell the kernel about it. The caller may call this
+ * function multiple times before calling io_uring_submit().
+ *
+ * Returns a vacant sqe, or NULL if we're full.
+ */
+ struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring)
+ {
+ struct io_uring_sq *sq = &ring->sq;
+
+ return __io_uring_get_sqe(sq, io_uring_smp_load_acquire(sq->khead));
+ }
+
+ int __io_uring_sqring_wait(struct io_uring *ring)
+ {
+ int ret;
+
+ ret = __sys_io_uring_enter(ring->ring_fd, 0, 0, IORING_ENTER_SQ_WAIT,
+ NULL);
+ if (ret < 0)
+ ret = -errno;
+ return ret;
+ }
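Taken together, the functions above implement the submit/complete cycle: io_uring_get_sqe() hands out vacant SQEs, __io_uring_flush_sq() publishes them to the kernel ring, __io_uring_submit() calls io_uring_enter(2), and __io_uring_get_cqe() reaps completions. The sketch below shows how that cycle is typically driven from application code against the bundled liburing API. It is not code from the gem itself, and it assumes the usual helpers from the accompanying liburing.h and setup.c in this same diff (io_uring_queue_init(), io_uring_prep_nop(), io_uring_wait_cqe(), and friends):

```c
/* Usage sketch, not part of the gem: submit a single NOP and reap its CQE. */
#include <stdio.h>
#include "liburing.h"

int main(void)
{
    struct io_uring ring;
    struct io_uring_sqe *sqe;
    struct io_uring_cqe *cqe;
    int ret;

    /* Create a ring with 8 SQ entries (io_uring_queue_init lives in setup.c). */
    ret = io_uring_queue_init(8, &ring, 0);
    if (ret < 0) {
        fprintf(stderr, "io_uring_queue_init failed: %d\n", ret);
        return 1;
    }

    /* Grab a vacant sqe, fill it, then flush it to the kernel;
     * io_uring_submit() ends up in __io_uring_flush_sq()/__io_uring_submit(). */
    sqe = io_uring_get_sqe(&ring);
    io_uring_prep_nop(sqe);
    io_uring_sqe_set_data(sqe, (void *)0x1234);
    io_uring_submit(&ring);

    /* Wait for the completion; io_uring_wait_cqe() funnels into
     * __io_uring_get_cqe() above. */
    ret = io_uring_wait_cqe(&ring, &cqe);
    if (ret == 0) {
        printf("res=%d user_data=0x%llx\n", cqe->res,
               (unsigned long long)cqe->user_data);
        io_uring_cqe_seen(&ring, cqe);
    }

    io_uring_queue_exit(&ring);
    return 0;
}
```

This is the same cycle the gem's new backend_io_uring.c drives internally for Polyphony's fiber-based I/O.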