polyphony 0.45.5 → 0.46.0

Files changed (53)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/test.yml +2 -0
  3. data/.gitmodules +0 -0
  4. data/CHANGELOG.md +4 -0
  5. data/Gemfile.lock +1 -1
  6. data/README.md +3 -3
  7. data/Rakefile +1 -1
  8. data/TODO.md +4 -4
  9. data/examples/performance/thread-vs-fiber/polyphony_server.rb +1 -2
  10. data/ext/liburing/liburing.h +585 -0
  11. data/ext/liburing/liburing/README.md +4 -0
  12. data/ext/liburing/liburing/barrier.h +73 -0
  13. data/ext/liburing/liburing/compat.h +15 -0
  14. data/ext/liburing/liburing/io_uring.h +343 -0
  15. data/ext/liburing/queue.c +333 -0
  16. data/ext/liburing/register.c +187 -0
  17. data/ext/liburing/setup.c +210 -0
  18. data/ext/liburing/syscall.c +54 -0
  19. data/ext/liburing/syscall.h +18 -0
  20. data/ext/polyphony/backend.h +0 -14
  21. data/ext/polyphony/backend_common.h +109 -0
  22. data/ext/polyphony/backend_io_uring.c +884 -0
  23. data/ext/polyphony/backend_io_uring_context.c +73 -0
  24. data/ext/polyphony/backend_io_uring_context.h +52 -0
  25. data/ext/polyphony/{libev_backend.c → backend_libev.c} +202 -294
  26. data/ext/polyphony/event.c +1 -1
  27. data/ext/polyphony/extconf.rb +31 -13
  28. data/ext/polyphony/fiber.c +29 -22
  29. data/ext/polyphony/libev.c +4 -0
  30. data/ext/polyphony/libev.h +8 -2
  31. data/ext/polyphony/liburing.c +8 -0
  32. data/ext/polyphony/playground.c +51 -0
  33. data/ext/polyphony/polyphony.c +5 -5
  34. data/ext/polyphony/polyphony.h +16 -12
  35. data/ext/polyphony/polyphony_ext.c +10 -4
  36. data/ext/polyphony/queue.c +1 -1
  37. data/ext/polyphony/thread.c +11 -9
  38. data/lib/polyphony/adapters/trace.rb +2 -2
  39. data/lib/polyphony/core/global_api.rb +1 -4
  40. data/lib/polyphony/extensions/debug.rb +13 -0
  41. data/lib/polyphony/extensions/fiber.rb +2 -2
  42. data/lib/polyphony/extensions/socket.rb +59 -10
  43. data/lib/polyphony/version.rb +1 -1
  44. data/test/helper.rb +36 -4
  45. data/test/io_uring_test.rb +55 -0
  46. data/test/stress.rb +5 -2
  47. data/test/test_backend.rb +4 -6
  48. data/test/test_ext.rb +1 -2
  49. data/test/test_fiber.rb +22 -16
  50. data/test/test_global_api.rb +33 -35
  51. data/test/test_throttler.rb +3 -6
  52. data/test/test_trace.rb +7 -5
  53. metadata +22 -3
--- /dev/null
+++ b/data/ext/liburing/liburing/README.md
@@ -0,0 +1,4 @@
+ ## Updating the liburing source code
+
+ - copy liburing/src/**/* into ext/liburing
+ - move ext/liburing/include/**/* to ext/liburing/
--- /dev/null
+++ b/data/ext/liburing/liburing/barrier.h
@@ -0,0 +1,73 @@
+ /* SPDX-License-Identifier: MIT */
+ #ifndef LIBURING_BARRIER_H
+ #define LIBURING_BARRIER_H
+
+ /*
+ From the kernel documentation file refcount-vs-atomic.rst:
+
+ A RELEASE memory ordering guarantees that all prior loads and
+ stores (all po-earlier instructions) on the same CPU are completed
+ before the operation. It also guarantees that all po-earlier
+ stores on the same CPU and all propagated stores from other CPUs
+ must propagate to all other CPUs before the release operation
+ (A-cumulative property). This is implemented using
+ :c:func:`smp_store_release`.
+
+ An ACQUIRE memory ordering guarantees that all post loads and
+ stores (all po-later instructions) on the same CPU are
+ completed after the acquire operation. It also guarantees that all
+ po-later stores on the same CPU must propagate to all other CPUs
+ after the acquire operation executes. This is implemented using
+ :c:func:`smp_acquire__after_ctrl_dep`.
+ */
+
+ #ifdef __cplusplus
+ #include <atomic>
+
+ template <typename T>
+ static inline void IO_URING_WRITE_ONCE(T &var, T val)
+ {
+ std::atomic_store_explicit(reinterpret_cast<std::atomic<T> *>(&var),
+ val, std::memory_order_relaxed);
+ }
+ template <typename T>
+ static inline T IO_URING_READ_ONCE(const T &var)
+ {
+ return std::atomic_load_explicit(
+ reinterpret_cast<const std::atomic<T> *>(&var),
+ std::memory_order_relaxed);
+ }
+
+ template <typename T>
+ static inline void io_uring_smp_store_release(T *p, T v)
+ {
+ std::atomic_store_explicit(reinterpret_cast<std::atomic<T> *>(p), v,
+ std::memory_order_release);
+ }
+
+ template <typename T>
+ static inline T io_uring_smp_load_acquire(const T *p)
+ {
+ return std::atomic_load_explicit(
+ reinterpret_cast<const std::atomic<T> *>(p),
+ std::memory_order_acquire);
+ }
+ #else
+ #include <stdatomic.h>
+
+ #define IO_URING_WRITE_ONCE(var, val) \
+ atomic_store_explicit((_Atomic typeof(var) *)&(var), \
+ (val), memory_order_relaxed)
+ #define IO_URING_READ_ONCE(var) \
+ atomic_load_explicit((_Atomic typeof(var) *)&(var), \
+ memory_order_relaxed)
+
+ #define io_uring_smp_store_release(p, v) \
+ atomic_store_explicit((_Atomic typeof(*(p)) *)(p), (v), \
+ memory_order_release)
+ #define io_uring_smp_load_acquire(p) \
+ atomic_load_explicit((_Atomic typeof(*(p)) *)(p), \
+ memory_order_acquire)
+ #endif
+
+ #endif /* defined(LIBURING_BARRIER_H) */
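
The RELEASE/ACQUIRE rules quoted in the header comment are exactly what barrier.h encodes: liburing publishes the SQ tail with a store-release and reads the kernel-shared indices with a load-acquire, so no locks are needed on the rings. Below is a minimal, self-contained sketch of the same pattern in plain C11 `<stdatomic.h>` (illustration only, not the liburing macros themselves; `slot` and `tail` are made-up names standing in for an SQE slot and `sq->ktail`):

```c
/* Sketch: release/acquire publication, mirroring io_uring_smp_store_release /
 * io_uring_smp_load_acquire. Build: cc -std=c11 -pthread demo.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static int slot;               /* payload, written before publication */
static _Atomic unsigned tail;  /* publication index, like sq->ktail */

static void *producer(void *arg)
{
    (void)arg;
    slot = 42;                                             /* fill the "sqe" */
    /* release: the write to slot becomes visible before the new tail is seen */
    atomic_store_explicit(&tail, 1, memory_order_release);
    return NULL;
}

static void *consumer(void *arg)
{
    (void)arg;
    /* acquire: once tail == 1 is observed, the write to slot is visible too */
    while (atomic_load_explicit(&tail, memory_order_acquire) == 0)
        ;
    printf("consumed %d\n", slot);
    return NULL;
}

int main(void)
{
    pthread_t p, c;
    pthread_create(&c, NULL, consumer, NULL);
    pthread_create(&p, NULL, producer, NULL);
    pthread_join(p, NULL);
    pthread_join(c, NULL);
    return 0;
}
```
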
--- /dev/null
+++ b/data/ext/liburing/liburing/compat.h
@@ -0,0 +1,15 @@
+ /* SPDX-License-Identifier: MIT */
+ #ifndef LIBURING_COMPAT_H
+ #define LIBURING_COMPAT_H
+
+ #include <linux/time_types.h>
+
+ #include <inttypes.h>
+
+ struct open_how {
+ uint64_t flags;
+ uint64_t mode;
+ uint64_t resolve;
+ };
+
+ #endif
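
compat.h only has to supply `struct open_how` on systems whose headers predate openat2(2); it is the argument block consumed by IORING_OP_OPENAT2. A hedged sketch of how an application might fill it through liburing's `io_uring_prep_openat2()` helper (this assumes a liburing build that exposes that helper and a kernel new enough for IORING_OP_OPENAT2; the path `/etc/hostname` is just an example):

```c
/* Sketch: open a file via IORING_OP_OPENAT2 using struct open_how.
 * Build: cc demo.c -luring */
#include <liburing.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    struct io_uring ring;
    if (io_uring_queue_init(8, &ring, 0) < 0)
        return 1;

    struct open_how how;
    memset(&how, 0, sizeof(how));
    how.flags = O_RDONLY | O_CLOEXEC;  /* same O_* flags as open(2) */
    how.resolve = 0;                   /* RESOLVE_* restrictions would go here */

    struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
    io_uring_prep_openat2(sqe, AT_FDCWD, "/etc/hostname", &how);
    io_uring_submit(&ring);

    struct io_uring_cqe *cqe;
    if (io_uring_wait_cqe(&ring, &cqe) == 0) {
        printf("openat2 result: %d\n", cqe->res); /* fd on success, -errno on failure */
        io_uring_cqe_seen(&ring, cqe);
    }

    io_uring_queue_exit(&ring);
    return 0;
}
```
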
--- /dev/null
+++ b/data/ext/liburing/liburing/io_uring.h
@@ -0,0 +1,343 @@
+ /* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */
+ /*
+ * Header file for the io_uring interface.
+ *
+ * Copyright (C) 2019 Jens Axboe
+ * Copyright (C) 2019 Christoph Hellwig
+ */
+ #ifndef LINUX_IO_URING_H
+ #define LINUX_IO_URING_H
+
+ #include <linux/fs.h>
+ #include <linux/types.h>
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ /*
+ * IO submission data structure (Submission Queue Entry)
+ */
+ struct io_uring_sqe {
+ __u8 opcode; /* type of operation for this sqe */
+ __u8 flags; /* IOSQE_ flags */
+ __u16 ioprio; /* ioprio for the request */
+ __s32 fd; /* file descriptor to do IO on */
+ union {
+ __u64 off; /* offset into file */
+ __u64 addr2;
+ };
+ union {
+ __u64 addr; /* pointer to buffer or iovecs */
+ __u64 splice_off_in;
+ };
+ __u32 len; /* buffer size or number of iovecs */
+ union {
+ __kernel_rwf_t rw_flags;
+ __u32 fsync_flags;
+ __u16 poll_events; /* compatibility */
+ __u32 poll32_events; /* word-reversed for BE */
+ __u32 sync_range_flags;
+ __u32 msg_flags;
+ __u32 timeout_flags;
+ __u32 accept_flags;
+ __u32 cancel_flags;
+ __u32 open_flags;
+ __u32 statx_flags;
+ __u32 fadvise_advice;
+ __u32 splice_flags;
+ };
+ __u64 user_data; /* data to be passed back at completion time */
+ union {
+ struct {
+ /* pack this to avoid bogus arm OABI complaints */
+ union {
+ /* index into fixed buffers, if used */
+ __u16 buf_index;
+ /* for grouped buffer selection */
+ __u16 buf_group;
+ } __attribute__((packed));
+ /* personality to use, if used */
+ __u16 personality;
+ __s32 splice_fd_in;
+ };
+ __u64 __pad2[3];
+ };
+ };
+
+ enum {
+ IOSQE_FIXED_FILE_BIT,
+ IOSQE_IO_DRAIN_BIT,
+ IOSQE_IO_LINK_BIT,
+ IOSQE_IO_HARDLINK_BIT,
+ IOSQE_ASYNC_BIT,
+ IOSQE_BUFFER_SELECT_BIT,
+ };
+
+ /*
+ * sqe->flags
+ */
+ /* use fixed fileset */
+ #define IOSQE_FIXED_FILE (1U << IOSQE_FIXED_FILE_BIT)
+ /* issue after inflight IO */
+ #define IOSQE_IO_DRAIN (1U << IOSQE_IO_DRAIN_BIT)
+ /* links next sqe */
+ #define IOSQE_IO_LINK (1U << IOSQE_IO_LINK_BIT)
+ /* like LINK, but stronger */
+ #define IOSQE_IO_HARDLINK (1U << IOSQE_IO_HARDLINK_BIT)
+ /* always go async */
+ #define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT)
+ /* select buffer from sqe->buf_group */
+ #define IOSQE_BUFFER_SELECT (1U << IOSQE_BUFFER_SELECT_BIT)
+
+ /*
+ * io_uring_setup() flags
+ */
+ #define IORING_SETUP_IOPOLL (1U << 0) /* io_context is polled */
+ #define IORING_SETUP_SQPOLL (1U << 1) /* SQ poll thread */
+ #define IORING_SETUP_SQ_AFF (1U << 2) /* sq_thread_cpu is valid */
+ #define IORING_SETUP_CQSIZE (1U << 3) /* app defines CQ size */
+ #define IORING_SETUP_CLAMP (1U << 4) /* clamp SQ/CQ ring sizes */
+ #define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */
+ #define IORING_SETUP_R_DISABLED (1U << 6) /* start with ring disabled */
+
+ enum {
+ IORING_OP_NOP,
+ IORING_OP_READV,
+ IORING_OP_WRITEV,
+ IORING_OP_FSYNC,
+ IORING_OP_READ_FIXED,
+ IORING_OP_WRITE_FIXED,
+ IORING_OP_POLL_ADD,
+ IORING_OP_POLL_REMOVE,
+ IORING_OP_SYNC_FILE_RANGE,
+ IORING_OP_SENDMSG,
+ IORING_OP_RECVMSG,
+ IORING_OP_TIMEOUT,
+ IORING_OP_TIMEOUT_REMOVE,
+ IORING_OP_ACCEPT,
+ IORING_OP_ASYNC_CANCEL,
+ IORING_OP_LINK_TIMEOUT,
+ IORING_OP_CONNECT,
+ IORING_OP_FALLOCATE,
+ IORING_OP_OPENAT,
+ IORING_OP_CLOSE,
+ IORING_OP_FILES_UPDATE,
+ IORING_OP_STATX,
+ IORING_OP_READ,
+ IORING_OP_WRITE,
+ IORING_OP_FADVISE,
+ IORING_OP_MADVISE,
+ IORING_OP_SEND,
+ IORING_OP_RECV,
+ IORING_OP_OPENAT2,
+ IORING_OP_EPOLL_CTL,
+ IORING_OP_SPLICE,
+ IORING_OP_PROVIDE_BUFFERS,
+ IORING_OP_REMOVE_BUFFERS,
+ IORING_OP_TEE,
+ IORING_OP_SHUTDOWN,
+
+ /* this goes last, obviously */
+ IORING_OP_LAST,
+ };
+
+ /*
+ * sqe->fsync_flags
+ */
+ #define IORING_FSYNC_DATASYNC (1U << 0)
+
+ /*
+ * sqe->timeout_flags
+ */
+ #define IORING_TIMEOUT_ABS (1U << 0)
+
+ /*
+ * sqe->splice_flags
+ * extends splice(2) flags
+ */
+ #define SPLICE_F_FD_IN_FIXED (1U << 31) /* the last bit of __u32 */
+
+ /*
+ * IO completion data structure (Completion Queue Entry)
+ */
+ struct io_uring_cqe {
+ __u64 user_data; /* sqe->data submission passed back */
+ __s32 res; /* result code for this event */
+ __u32 flags;
+ };
+
+ /*
+ * cqe->flags
+ *
+ * IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID
+ */
+ #define IORING_CQE_F_BUFFER (1U << 0)
+
+ enum {
+ IORING_CQE_BUFFER_SHIFT = 16,
+ };
+
+ /*
+ * Magic offsets for the application to mmap the data it needs
+ */
+ #define IORING_OFF_SQ_RING 0ULL
+ #define IORING_OFF_CQ_RING 0x8000000ULL
+ #define IORING_OFF_SQES 0x10000000ULL
+
+ /*
+ * Filled with the offset for mmap(2)
+ */
+ struct io_sqring_offsets {
+ __u32 head;
+ __u32 tail;
+ __u32 ring_mask;
+ __u32 ring_entries;
+ __u32 flags;
+ __u32 dropped;
+ __u32 array;
+ __u32 resv1;
+ __u64 resv2;
+ };
+
+ /*
+ * sq_ring->flags
+ */
+ #define IORING_SQ_NEED_WAKEUP (1U << 0) /* needs io_uring_enter wakeup */
+ #define IORING_SQ_CQ_OVERFLOW (1U << 1) /* CQ ring is overflown */
+
+ struct io_cqring_offsets {
+ __u32 head;
+ __u32 tail;
+ __u32 ring_mask;
+ __u32 ring_entries;
+ __u32 overflow;
+ __u32 cqes;
+ __u32 flags;
+ __u32 resv1;
+ __u64 resv2;
+ };
+
+ /*
+ * cq_ring->flags
+ */
+
+ /* disable eventfd notifications */
+ #define IORING_CQ_EVENTFD_DISABLED (1U << 0)
+
+ /*
+ * io_uring_enter(2) flags
+ */
+ #define IORING_ENTER_GETEVENTS (1U << 0)
+ #define IORING_ENTER_SQ_WAKEUP (1U << 1)
+ #define IORING_ENTER_SQ_WAIT (1U << 2)
+
+ /*
+ * Passed in for io_uring_setup(2). Copied back with updated info on success
+ */
+ struct io_uring_params {
+ __u32 sq_entries;
+ __u32 cq_entries;
+ __u32 flags;
+ __u32 sq_thread_cpu;
+ __u32 sq_thread_idle;
+ __u32 features;
+ __u32 wq_fd;
+ __u32 resv[3];
+ struct io_sqring_offsets sq_off;
+ struct io_cqring_offsets cq_off;
+ };
+
+ /*
+ * io_uring_params->features flags
+ */
+ #define IORING_FEAT_SINGLE_MMAP (1U << 0)
+ #define IORING_FEAT_NODROP (1U << 1)
+ #define IORING_FEAT_SUBMIT_STABLE (1U << 2)
+ #define IORING_FEAT_RW_CUR_POS (1U << 3)
+ #define IORING_FEAT_CUR_PERSONALITY (1U << 4)
+ #define IORING_FEAT_FAST_POLL (1U << 5)
+ #define IORING_FEAT_POLL_32BITS (1U << 6)
+ #define IORING_FEAT_SQPOLL_NONFIXED (1U << 7)
+
+ /*
+ * io_uring_register(2) opcodes and arguments
+ */
+ enum {
+ IORING_REGISTER_BUFFERS = 0,
+ IORING_UNREGISTER_BUFFERS = 1,
+ IORING_REGISTER_FILES = 2,
+ IORING_UNREGISTER_FILES = 3,
+ IORING_REGISTER_EVENTFD = 4,
+ IORING_UNREGISTER_EVENTFD = 5,
+ IORING_REGISTER_FILES_UPDATE = 6,
+ IORING_REGISTER_EVENTFD_ASYNC = 7,
+ IORING_REGISTER_PROBE = 8,
+ IORING_REGISTER_PERSONALITY = 9,
+ IORING_UNREGISTER_PERSONALITY = 10,
+ IORING_REGISTER_RESTRICTIONS = 11,
+ IORING_REGISTER_ENABLE_RINGS = 12,
+
+ /* this goes last */
+ IORING_REGISTER_LAST
+ };
+
+ struct io_uring_files_update {
+ __u32 offset;
+ __u32 resv;
+ __aligned_u64 /* __s32 * */ fds;
+ };
+
+ #define IO_URING_OP_SUPPORTED (1U << 0)
+
+ struct io_uring_probe_op {
+ __u8 op;
+ __u8 resv;
+ __u16 flags; /* IO_URING_OP_* flags */
+ __u32 resv2;
+ };
+
+ struct io_uring_probe {
+ __u8 last_op; /* last opcode supported */
+ __u8 ops_len; /* length of ops[] array below */
+ __u16 resv;
+ __u32 resv2[3];
+ struct io_uring_probe_op ops[0];
+ };
+
+ struct io_uring_restriction {
+ __u16 opcode;
+ union {
+ __u8 register_op; /* IORING_RESTRICTION_REGISTER_OP */
+ __u8 sqe_op; /* IORING_RESTRICTION_SQE_OP */
+ __u8 sqe_flags; /* IORING_RESTRICTION_SQE_FLAGS_* */
+ };
+ __u8 resv;
+ __u32 resv2[3];
+ };
+
+ /*
+ * io_uring_restriction->opcode values
+ */
+ enum {
+ /* Allow an io_uring_register(2) opcode */
+ IORING_RESTRICTION_REGISTER_OP = 0,
+
+ /* Allow an sqe opcode */
+ IORING_RESTRICTION_SQE_OP = 1,
+
+ /* Allow sqe flags */
+ IORING_RESTRICTION_SQE_FLAGS_ALLOWED = 2,
+
+ /* Require sqe flags (these flags must be set on each submission) */
+ IORING_RESTRICTION_SQE_FLAGS_REQUIRED = 3,
+
+ IORING_RESTRICTION_LAST
+ };
+
+
+ #ifdef __cplusplus
+ }
+ #endif
+
+ #endif
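
Everything an application needs to drive this raw ABI is the fd returned by io_uring_setup(2), the magic mmap offsets, and the io_sqring_offsets/io_cqring_offsets blocks the kernel fills in. A minimal sketch of that setup path, assuming __NR_io_uring_setup is available via `<sys/syscall.h>`; it stops after mapping and inspecting the SQ ring (liburing's setup.c does the full version of this):

```c
/* Sketch: create a ring with the raw ABI and mmap the SQ ring. */
#include <linux/io_uring.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

static int sys_io_uring_setup(unsigned entries, struct io_uring_params *p)
{
    return (int) syscall(__NR_io_uring_setup, entries, p);
}

int main(void)
{
    struct io_uring_params p;
    memset(&p, 0, sizeof(p));  /* flags could request IORING_SETUP_SQPOLL etc. */

    int ring_fd = sys_io_uring_setup(8, &p);
    if (ring_fd < 0) {
        perror("io_uring_setup");
        return 1;
    }

    /* The kernel filled p.sq_off; map the SQ ring at its magic offset. */
    size_t sq_sz = p.sq_off.array + p.sq_entries * sizeof(__u32);
    void *sq_ring = mmap(NULL, sq_sz, PROT_READ | PROT_WRITE, MAP_SHARED,
                         ring_fd, IORING_OFF_SQ_RING);
    if (sq_ring == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    /* head/tail live at the offsets reported back in io_uring_params. */
    unsigned *sq_head = (unsigned *)((char *)sq_ring + p.sq_off.head);
    unsigned *sq_tail = (unsigned *)((char *)sq_ring + p.sq_off.tail);
    printf("sq head=%u tail=%u, features=0x%x (single mmap: %s)\n",
           *sq_head, *sq_tail, p.features,
           (p.features & IORING_FEAT_SINGLE_MMAP) ? "yes" : "no");

    munmap(sq_ring, sq_sz);
    close(ring_fd);
    return 0;
}
```
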
--- /dev/null
+++ b/data/ext/liburing/queue.c
@@ -0,0 +1,333 @@
+ /* SPDX-License-Identifier: MIT */
+ #include <sys/types.h>
+ #include <sys/stat.h>
+ #include <sys/mman.h>
+ #include <unistd.h>
+ #include <errno.h>
+ #include <string.h>
+ #include <stdbool.h>
+
+ #include "liburing/compat.h"
+ #include "liburing/io_uring.h"
+ #include "liburing.h"
+ #include "liburing/barrier.h"
+
+ #include "syscall.h"
+
+ /*
+ * Returns true if we're not using SQ thread (thus nobody submits but us)
+ * or if IORING_SQ_NEED_WAKEUP is set, so submit thread must be explicitly
+ * awakened. For the latter case, we set the thread wakeup flag.
+ */
+ static inline bool sq_ring_needs_enter(struct io_uring *ring, unsigned *flags)
+ {
+ if (!(ring->flags & IORING_SETUP_SQPOLL))
+ return true;
+ if (IO_URING_READ_ONCE(*ring->sq.kflags) & IORING_SQ_NEED_WAKEUP) {
+ *flags |= IORING_ENTER_SQ_WAKEUP;
+ return true;
+ }
+
+ return false;
+ }
+
+ static inline bool cq_ring_needs_flush(struct io_uring *ring)
+ {
+ return IO_URING_READ_ONCE(*ring->sq.kflags) & IORING_SQ_CQ_OVERFLOW;
+ }
+
+ static int __io_uring_peek_cqe(struct io_uring *ring,
+ struct io_uring_cqe **cqe_ptr)
+ {
+ struct io_uring_cqe *cqe;
+ unsigned head;
+ int err = 0;
+
+ do {
+ io_uring_for_each_cqe(ring, head, cqe)
+ break;
+ if (cqe) {
+ if (cqe->user_data == LIBURING_UDATA_TIMEOUT) {
+ if (cqe->res < 0)
+ err = cqe->res;
+ io_uring_cq_advance(ring, 1);
+ if (!err)
+ continue;
+ cqe = NULL;
+ }
+ }
+ break;
+ } while (1);
+
+ *cqe_ptr = cqe;
+ return err;
+ }
+
+ int __io_uring_get_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr,
+ unsigned submit, unsigned wait_nr, sigset_t *sigmask)
+ {
+ struct io_uring_cqe *cqe = NULL;
+ const int to_wait = wait_nr;
+ int ret = 0, err;
+
+ do {
+ bool cq_overflow_flush = false;
+ unsigned flags = 0;
+
+ err = __io_uring_peek_cqe(ring, &cqe);
+ if (err)
+ break;
+ if (!cqe && !to_wait && !submit) {
+ if (!cq_ring_needs_flush(ring)) {
+ err = -EAGAIN;
+ break;
+ }
+ cq_overflow_flush = true;
+ }
+ if (wait_nr && cqe)
+ wait_nr--;
+ if (wait_nr || cq_overflow_flush)
+ flags = IORING_ENTER_GETEVENTS;
+ if (submit)
+ sq_ring_needs_enter(ring, &flags);
+ if (wait_nr || submit || cq_overflow_flush)
+ ret = __sys_io_uring_enter(ring->ring_fd, submit,
+ wait_nr, flags, sigmask);
+ if (ret < 0) {
+ err = -errno;
+ } else if (ret == (int)submit) {
+ submit = 0;
+ /*
+ * When SETUP_IOPOLL is set, __sys_io_uring enter()
+ * must be called to reap new completions but the call
+ * won't be made if both wait_nr and submit are zero
+ * so preserve wait_nr.
+ */
+ if (!(ring->flags & IORING_SETUP_IOPOLL))
+ wait_nr = 0;
+ } else {
+ submit -= ret;
+ }
+ if (cqe)
+ break;
+ } while (!err);
+
+ *cqe_ptr = cqe;
+ return err;
+ }
+
+ /*
+ * Fill in an array of IO completions up to count, if any are available.
+ * Returns the amount of IO completions filled.
+ */
+ unsigned io_uring_peek_batch_cqe(struct io_uring *ring,
+ struct io_uring_cqe **cqes, unsigned count)
+ {
+ unsigned ready;
+ bool overflow_checked = false;
+
+ again:
+ ready = io_uring_cq_ready(ring);
+ if (ready) {
+ unsigned head = *ring->cq.khead;
+ unsigned mask = *ring->cq.kring_mask;
+ unsigned last;
+ int i = 0;
+
+ count = count > ready ? ready : count;
+ last = head + count;
+ for (;head != last; head++, i++)
+ cqes[i] = &ring->cq.cqes[head & mask];
+
+ return count;
+ }
+
+ if (overflow_checked)
+ goto done;
+
+ if (cq_ring_needs_flush(ring)) {
+ __sys_io_uring_enter(ring->ring_fd, 0, 0,
+ IORING_ENTER_GETEVENTS, NULL);
+ overflow_checked = true;
+ goto again;
+ }
+
+ done:
+ return 0;
+ }
+
+ /*
+ * Sync internal state with kernel ring state on the SQ side. Returns the
+ * number of pending items in the SQ ring, for the shared ring.
+ */
+ int __io_uring_flush_sq(struct io_uring *ring)
+ {
+ struct io_uring_sq *sq = &ring->sq;
+ const unsigned mask = *sq->kring_mask;
+ unsigned ktail, to_submit;
+
+ if (sq->sqe_head == sq->sqe_tail) {
+ ktail = *sq->ktail;
+ goto out;
+ }
+
+ /*
+ * Fill in sqes that we have queued up, adding them to the kernel ring
+ */
+ ktail = *sq->ktail;
+ to_submit = sq->sqe_tail - sq->sqe_head;
+ while (to_submit--) {
+ sq->array[ktail & mask] = sq->sqe_head & mask;
+ ktail++;
+ sq->sqe_head++;
+ }
+
+ /*
+ * Ensure that the kernel sees the SQE updates before it sees the tail
+ * update.
+ */
+ io_uring_smp_store_release(sq->ktail, ktail);
+ out:
+ return ktail - *sq->khead;
+ }
+
+ /*
+ * Like io_uring_wait_cqe(), except it accepts a timeout value as well. Note
+ * that an sqe is used internally to handle the timeout. Applications using
+ * this function must never set sqe->user_data to LIBURING_UDATA_TIMEOUT!
+ *
+ * If 'ts' is specified, the application need not call io_uring_submit() before
+ * calling this function, as we will do that on its behalf. From this it also
+ * follows that this function isn't safe to use for applications that split SQ
+ * and CQ handling between two threads and expect that to work without
+ * synchronization, as this function manipulates both the SQ and CQ side.
+ */
+ int io_uring_wait_cqes(struct io_uring *ring, struct io_uring_cqe **cqe_ptr,
+ unsigned wait_nr, struct __kernel_timespec *ts,
+ sigset_t *sigmask)
+ {
+ unsigned to_submit = 0;
+
+ if (ts) {
+ struct io_uring_sqe *sqe;
+ int ret;
+
+ /*
+ * If the SQ ring is full, we may need to submit IO first
+ */
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ ret = io_uring_submit(ring);
+ if (ret < 0)
+ return ret;
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe)
+ return -EAGAIN;
+ }
+ io_uring_prep_timeout(sqe, ts, wait_nr, 0);
+ sqe->user_data = LIBURING_UDATA_TIMEOUT;
+ to_submit = __io_uring_flush_sq(ring);
+ }
+
+ return __io_uring_get_cqe(ring, cqe_ptr, to_submit, wait_nr, sigmask);
+ }
+
+ /*
+ * See io_uring_wait_cqes() - this function is the same, it just always uses
+ * '1' as the wait_nr.
+ */
+ int io_uring_wait_cqe_timeout(struct io_uring *ring,
+ struct io_uring_cqe **cqe_ptr,
+ struct __kernel_timespec *ts)
+ {
+ return io_uring_wait_cqes(ring, cqe_ptr, 1, ts, NULL);
+ }
+
+ /*
+ * Submit sqes acquired from io_uring_get_sqe() to the kernel.
+ *
+ * Returns number of sqes submitted
+ */
+ static int __io_uring_submit(struct io_uring *ring, unsigned submitted,
+ unsigned wait_nr)
+ {
+ unsigned flags;
+ int ret;
+
+ flags = 0;
+ if (sq_ring_needs_enter(ring, &flags) || wait_nr) {
+ if (wait_nr || (ring->flags & IORING_SETUP_IOPOLL))
+ flags |= IORING_ENTER_GETEVENTS;
+
+ ret = __sys_io_uring_enter(ring->ring_fd, submitted, wait_nr,
+ flags, NULL);
+ if (ret < 0)
+ return -errno;
+ } else
+ ret = submitted;
+
+ return ret;
+ }
+
+ static int __io_uring_submit_and_wait(struct io_uring *ring, unsigned wait_nr)
+ {
+ return __io_uring_submit(ring, __io_uring_flush_sq(ring), wait_nr);
+ }
+
+ /*
+ * Submit sqes acquired from io_uring_get_sqe() to the kernel.
+ *
+ * Returns number of sqes submitted
+ */
+ int io_uring_submit(struct io_uring *ring)
+ {
+ return __io_uring_submit_and_wait(ring, 0);
+ }
+
+ /*
+ * Like io_uring_submit(), but allows waiting for events as well.
+ *
+ * Returns number of sqes submitted
+ */
+ int io_uring_submit_and_wait(struct io_uring *ring, unsigned wait_nr)
+ {
+ return __io_uring_submit_and_wait(ring, wait_nr);
+ }
+
+ static inline struct io_uring_sqe *
+ __io_uring_get_sqe(struct io_uring_sq *sq, unsigned int __head)
+ {
+ unsigned int __next = (sq)->sqe_tail + 1;
+ struct io_uring_sqe *__sqe = NULL;
+
+ if (__next - __head <= *(sq)->kring_entries) {
+ __sqe = &(sq)->sqes[(sq)->sqe_tail & *(sq)->kring_mask];
+ (sq)->sqe_tail = __next;
+ }
+ return __sqe;
+ }
+
+ /*
+ * Return an sqe to fill. Application must later call io_uring_submit()
+ * when it's ready to tell the kernel about it. The caller may call this
+ * function multiple times before calling io_uring_submit().
+ *
+ * Returns a vacant sqe, or NULL if we're full.
+ */
+ struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring)
+ {
+ struct io_uring_sq *sq = &ring->sq;
+
+ return __io_uring_get_sqe(sq, io_uring_smp_load_acquire(sq->khead));
+ }
+
+ int __io_uring_sqring_wait(struct io_uring *ring)
+ {
+ int ret;
+
+ ret = __sys_io_uring_enter(ring->ring_fd, 0, 0, IORING_ENTER_SQ_WAIT,
+ NULL);
+ if (ret < 0)
+ ret = -errno;
+ return ret;
+ }
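
queue.c is the submission/completion half of liburing's public API: io_uring_get_sqe() hands out ring slots, io_uring_submit() flushes them through __io_uring_flush_sq() and io_uring_enter(2), and io_uring_wait_cqe_timeout() plants the internal LIBURING_UDATA_TIMEOUT sqe. A hedged usage sketch of that API, a NOP round trip with a one-second timeout (assumes liburing is installed and linked with -luring):

```c
/* Sketch: exercise the queue.c entry points with a NOP request.
 * Build: cc demo.c -luring */
#include <liburing.h>
#include <stdio.h>

int main(void)
{
    struct io_uring ring;
    int ret = io_uring_queue_init(8, &ring, 0);   /* setup.c */
    if (ret < 0) {
        fprintf(stderr, "queue_init: %d\n", ret);
        return 1;
    }

    struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
    if (!sqe) {                        /* NULL only if the SQ ring is full */
        io_uring_queue_exit(&ring);
        return 1;
    }
    io_uring_prep_nop(sqe);
    sqe->user_data = 0x1234;           /* anything except LIBURING_UDATA_TIMEOUT */

    ret = io_uring_submit(&ring);      /* __io_uring_flush_sq + io_uring_enter */
    printf("submitted %d sqe(s)\n", ret);

    struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
    struct io_uring_cqe *cqe;
    ret = io_uring_wait_cqe_timeout(&ring, &cqe, &ts);
    if (ret == 0) {
        printf("cqe: user_data=%llu res=%d\n",
               (unsigned long long) cqe->user_data, cqe->res);
        io_uring_cqe_seen(&ring, cqe); /* advance the CQ head */
    } else {
        fprintf(stderr, "wait_cqe_timeout: %d\n", ret);
    }

    io_uring_queue_exit(&ring);
    return 0;
}
```
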