libev_scheduler 0.1 → 0.2

@@ -1,7 +1,7 @@
1
1
  /*
2
2
  * libev native API header
3
3
  *
4
- * Copyright (c) 2007-2019 Marc Alexander Lehmann <libev@schmorp.de>
4
+ * Copyright (c) 2007-2020 Marc Alexander Lehmann <libev@schmorp.de>
5
5
  * All rights reserved.
6
6
  *
7
7
  * Redistribution and use in source and binary forms, with or without modifica-
@@ -151,7 +151,10 @@ EV_CPP(extern "C" {)
151
151
 
152
152
  /*****************************************************************************/
153
153
 
154
- typedef double ev_tstamp;
154
+ #ifndef EV_TSTAMP_T
155
+ # define EV_TSTAMP_T double
156
+ #endif
157
+ typedef EV_TSTAMP_T ev_tstamp;
155
158
 
156
159
  #include <string.h> /* for memmove */
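The new EV_TSTAMP_T hook makes the timestamp type overridable at build time; by default ev_tstamp remains a double. A minimal sketch of handling timestamps through the public API, assuming the default double representation (compile and link against libev):

    #include <stdio.h>
    #include <ev.h>   /* ev_tstamp is EV_TSTAMP_T, which defaults to double */

    int main (void)
    {
      ev_tstamp now = ev_time ();            /* current wall-clock time */
      printf ("now = %.6f\n", (double)now);  /* cast in case EV_TSTAMP_T was overridden */
      return 0;
    }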
157
160
 
@@ -212,7 +215,7 @@ struct ev_loop;
212
215
  /*****************************************************************************/
213
216
 
214
217
  #define EV_VERSION_MAJOR 4
215
- #define EV_VERSION_MINOR 27
218
+ #define EV_VERSION_MINOR 33
216
219
 
217
220
  /* eventmask, revents, events... */
218
221
  enum {
@@ -389,14 +392,12 @@ typedef struct ev_stat
389
392
  } ev_stat;
390
393
  #endif
391
394
 
392
- #if EV_IDLE_ENABLE
393
395
  /* invoked when nothing else needs to be done, keeps the process from blocking */
394
396
  /* revent EV_IDLE */
395
397
  typedef struct ev_idle
396
398
  {
397
399
  EV_WATCHER (ev_idle)
398
400
  } ev_idle;
399
- #endif
400
401
 
401
402
  /* invoked for each run of the mainloop, just before the blocking call */
402
403
  /* you can still change events in any way you like */
@@ -413,23 +414,19 @@ typedef struct ev_check
413
414
  EV_WATCHER (ev_check)
414
415
  } ev_check;
415
416
 
416
- #if EV_FORK_ENABLE
417
417
  /* the callback gets invoked before check in the child process when a fork was detected */
418
418
  /* revent EV_FORK */
419
419
  typedef struct ev_fork
420
420
  {
421
421
  EV_WATCHER (ev_fork)
422
422
  } ev_fork;
423
- #endif
424
423
 
425
- #if EV_CLEANUP_ENABLE
426
424
  /* is invoked just before the loop gets destroyed */
427
425
  /* revent EV_CLEANUP */
428
426
  typedef struct ev_cleanup
429
427
  {
430
428
  EV_WATCHER (ev_cleanup)
431
429
  } ev_cleanup;
432
- #endif
433
430
 
434
431
  #if EV_EMBED_ENABLE
435
432
  /* used to embed an event loop inside another */
@@ -439,16 +436,18 @@ typedef struct ev_embed
439
436
  EV_WATCHER (ev_embed)
440
437
 
441
438
  struct ev_loop *other; /* ro */
439
+ #undef EV_IO_ENABLE
440
+ #define EV_IO_ENABLE 1
442
441
  ev_io io; /* private */
442
+ #undef EV_PREPARE_ENABLE
443
+ #define EV_PREPARE_ENABLE 1
443
444
  ev_prepare prepare; /* private */
444
445
  ev_check check; /* unused */
445
446
  ev_timer timer; /* unused */
446
447
  ev_periodic periodic; /* unused */
447
448
  ev_idle idle; /* unused */
448
449
  ev_fork fork; /* private */
449
- #if EV_CLEANUP_ENABLE
450
450
  ev_cleanup cleanup; /* unused */
451
- #endif
452
451
  } ev_embed;
453
452
  #endif
454
453
 
@@ -501,17 +500,18 @@ union ev_any_watcher
501
500
  /* flag bits for ev_default_loop and ev_loop_new */
502
501
  enum {
503
502
  /* the default */
504
- EVFLAG_AUTO = 0x00000000U, /* not quite a mask */
503
+ EVFLAG_AUTO = 0x00000000U, /* not quite a mask */
505
504
  /* flag bits */
506
- EVFLAG_NOENV = 0x01000000U, /* do NOT consult environment */
507
- EVFLAG_FORKCHECK = 0x02000000U, /* check for a fork in each iteration */
505
+ EVFLAG_NOENV = 0x01000000U, /* do NOT consult environment */
506
+ EVFLAG_FORKCHECK = 0x02000000U, /* check for a fork in each iteration */
508
507
  /* debugging/feature disable */
509
- EVFLAG_NOINOTIFY = 0x00100000U, /* do not attempt to use inotify */
508
+ EVFLAG_NOINOTIFY = 0x00100000U, /* do not attempt to use inotify */
510
509
  #if EV_COMPAT3
511
- EVFLAG_NOSIGFD = 0, /* compatibility to pre-3.9 */
510
+ EVFLAG_NOSIGFD = 0, /* compatibility to pre-3.9 */
512
511
  #endif
513
- EVFLAG_SIGNALFD = 0x00200000U, /* attempt to use signalfd */
514
- EVFLAG_NOSIGMASK = 0x00400000U /* avoid modifying the signal mask */
512
+ EVFLAG_SIGNALFD = 0x00200000U, /* attempt to use signalfd */
513
+ EVFLAG_NOSIGMASK = 0x00400000U, /* avoid modifying the signal mask */
514
+ EVFLAG_NOTIMERFD = 0x00800000U /* avoid creating a timerfd */
515
515
  };
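Libev 4.33 also adds the EVFLAG_NOTIMERFD bit shown above. A small, hedged example of passing loop flags (the particular combination is only illustrative):

    #include <ev.h>

    int main (void)
    {
      /* do not consult $LIBEV_FLAGS and avoid creating a timerfd (new in 4.33) */
      struct ev_loop *loop = ev_default_loop (EVFLAG_NOENV | EVFLAG_NOTIMERFD);

      return loop ? 0 : 1;   /* 0 if a backend could be initialised */
    }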
516
516
 
517
517
  /* method bits to be ored together */
@@ -522,8 +522,9 @@ enum {
522
522
  EVBACKEND_KQUEUE = 0x00000008U, /* bsd, broken on osx */
523
523
  EVBACKEND_DEVPOLL = 0x00000010U, /* solaris 8 */ /* NYI */
524
524
  EVBACKEND_PORT = 0x00000020U, /* solaris 10 */
525
- EVBACKEND_LINUXAIO = 0x00000040U, /* Linuix AIO */
526
- EVBACKEND_ALL = 0x0000007FU, /* all known backends */
525
+ EVBACKEND_LINUXAIO = 0x00000040U, /* linux AIO, 4.19+ */
526
+ EVBACKEND_IOURING = 0x00000080U, /* linux io_uring, 5.1+ */
527
+ EVBACKEND_ALL = 0x000000FFU, /* all known backends */
527
528
  EVBACKEND_MASK = 0x0000FFFFU /* all future backends */
528
529
  };
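With EVBACKEND_IOURING exposed, a caller can request the io_uring backend explicitly; this sketch falls back to the automatic choice on kernels older than 5.1 or builds without io_uring support (error handling kept minimal):

    #include <stdio.h>
    #include <ev.h>

    int main (void)
    {
      struct ev_loop *loop = ev_loop_new (EVBACKEND_IOURING);

      if (!loop)                          /* kernel too old or backend compiled out */
        loop = ev_loop_new (EVFLAG_AUTO);

      printf ("backend in use: 0x%x\n", ev_backend (loop));
      ev_loop_destroy (loop);
      return 0;
    }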
529
530
 
@@ -655,6 +656,8 @@ EV_API_DECL void ev_unref (EV_P) EV_NOEXCEPT;
655
656
  */
656
657
  EV_API_DECL void ev_once (EV_P_ int fd, int events, ev_tstamp timeout, void (*cb)(int revents, void *arg), void *arg) EV_NOEXCEPT;
657
658
 
659
+ EV_API_DECL void ev_invoke_pending (EV_P); /* invoke all pending watchers */
660
+
658
661
  # if EV_FEATURE_API
659
662
  EV_API_DECL unsigned int ev_iteration (EV_P) EV_NOEXCEPT; /* number of loop iterations */
660
663
  EV_API_DECL unsigned int ev_depth (EV_P) EV_NOEXCEPT; /* #ev_loop enters - #ev_loop leaves */
@@ -672,7 +675,6 @@ EV_API_DECL void ev_set_invoke_pending_cb (EV_P_ ev_loop_callback invoke_pending
672
675
  EV_API_DECL void ev_set_loop_release_cb (EV_P_ void (*release)(EV_P) EV_NOEXCEPT, void (*acquire)(EV_P) EV_NOEXCEPT) EV_NOEXCEPT;
673
676
 
674
677
  EV_API_DECL unsigned int ev_pending_count (EV_P) EV_NOEXCEPT; /* number of pending events, if any */
675
- EV_API_DECL void ev_invoke_pending (EV_P); /* invoke all pending watchers */
676
678
 
677
679
  /*
678
680
  * stop/start the timer handling.
@@ -692,6 +694,7 @@ EV_API_DECL void ev_resume (EV_P) EV_NOEXCEPT;
692
694
  ev_set_cb ((ev), cb_); \
693
695
  } while (0)
694
696
 
697
+ #define ev_io_modify(ev,events_) do { (ev)->events = (ev)->events & EV__IOFDSET | (events_); } while (0)
695
698
  #define ev_io_set(ev,fd_,events_) do { (ev)->fd = (fd_); (ev)->events = (events_) | EV__IOFDSET; } while (0)
696
699
  #define ev_timer_set(ev,after_,repeat_) do { ((ev_watcher_time *)(ev))->at = (after_); (ev)->repeat = (repeat_); } while (0)
697
700
  #define ev_periodic_set(ev,ofs_,ival_,rcb_) do { (ev)->offset = (ofs_); (ev)->interval = (ival_); (ev)->reschedule_cb = (rcb_); } while (0)
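The new ev_io_modify macro changes only the event mask of an already-initialised ev_io watcher, keeping its fd. A hedged usage sketch, assuming `loop` is a running loop and `io_w` an initialised ev_io watcher that is stopped while it is modified:

    /* switch from read-only to read+write interest without touching the fd */
    ev_io_stop (loop, &io_w);
    ev_io_modify (&io_w, EV_READ | EV_WRITE);
    ev_io_start (loop, &io_w);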
@@ -737,6 +740,7 @@ EV_API_DECL void ev_resume (EV_P) EV_NOEXCEPT;
737
740
  #define ev_periodic_at(ev) (+((ev_watcher_time *)(ev))->at)
738
741
 
739
742
  #ifndef ev_set_cb
743
+ /* memmove is used here to avoid strict aliasing violations, and hopefully is optimized out by any reasonable compiler */
740
744
  # define ev_set_cb(ev,cb_) (ev_cb_ (ev) = (cb_), memmove (&((ev_watcher *)(ev))->cb, &ev_cb_ (ev), sizeof (ev_cb_ (ev))))
741
745
  #endif
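The comment added above refers to a general idiom: copying bytes with memmove/memcpy instead of pointer-casting sidesteps strict-aliasing undefined behaviour, and compilers normally elide the call. A standalone illustration, not specific to libev:

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    int main (void)
    {
      float f = 1.5f;
      uint32_t bits;

      memmove (&bits, &f, sizeof bits);   /* well-defined, unlike *(uint32_t *)&f */
      printf ("bit pattern: 0x%08x\n", bits);
      return 0;
    }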
742
746
 
@@ -853,4 +857,3 @@ EV_API_DECL void ev_async_send (EV_P_ ev_async *w) EV_NOEXCEPT;
853
857
  EV_CPP(})
854
858
 
855
859
  #endif
856
-
@@ -93,10 +93,10 @@ epoll_modify (EV_P_ int fd, int oev, int nev)
93
93
  ev.events = (nev & EV_READ ? EPOLLIN : 0)
94
94
  | (nev & EV_WRITE ? EPOLLOUT : 0);
95
95
 
96
- if (expect_true (!epoll_ctl (backend_fd, oev && oldmask != nev ? EPOLL_CTL_MOD : EPOLL_CTL_ADD, fd, &ev)))
96
+ if (ecb_expect_true (!epoll_ctl (backend_fd, oev && oldmask != nev ? EPOLL_CTL_MOD : EPOLL_CTL_ADD, fd, &ev)))
97
97
  return;
98
98
 
99
- if (expect_true (errno == ENOENT))
99
+ if (ecb_expect_true (errno == ENOENT))
100
100
  {
101
101
  /* if ENOENT then the fd went away, so try to do the right thing */
102
102
  if (!nev)
@@ -105,7 +105,7 @@ epoll_modify (EV_P_ int fd, int oev, int nev)
105
105
  if (!epoll_ctl (backend_fd, EPOLL_CTL_ADD, fd, &ev))
106
106
  return;
107
107
  }
108
- else if (expect_true (errno == EEXIST))
108
+ else if (ecb_expect_true (errno == EEXIST))
109
109
  {
110
110
  /* EEXIST means we ignored a previous DEL, but the fd is still active */
111
111
  /* if the kernel mask is the same as the new mask, we assume it hasn't changed */
@@ -115,7 +115,7 @@ epoll_modify (EV_P_ int fd, int oev, int nev)
115
115
  if (!epoll_ctl (backend_fd, EPOLL_CTL_MOD, fd, &ev))
116
116
  return;
117
117
  }
118
- else if (expect_true (errno == EPERM))
118
+ else if (ecb_expect_true (errno == EPERM))
119
119
  {
120
120
  /* EPERM means the fd is always ready, but epoll is too snobbish */
121
121
  /* to handle it, unlike select or poll. */
@@ -146,16 +146,16 @@ epoll_poll (EV_P_ ev_tstamp timeout)
146
146
  int i;
147
147
  int eventcnt;
148
148
 
149
- if (expect_false (epoll_epermcnt))
150
- timeout = 0.;
149
+ if (ecb_expect_false (epoll_epermcnt))
150
+ timeout = EV_TS_CONST (0.);
151
151
 
152
152
  /* epoll wait times cannot be larger than (LONG_MAX - 999UL) / HZ msecs, which is below */
153
153
  /* the default libev max wait time, however. */
154
154
  EV_RELEASE_CB;
155
- eventcnt = epoll_wait (backend_fd, epoll_events, epoll_eventmax, timeout * 1e3);
155
+ eventcnt = epoll_wait (backend_fd, epoll_events, epoll_eventmax, EV_TS_TO_MSEC (timeout));
156
156
  EV_ACQUIRE_CB;
157
157
 
158
- if (expect_false (eventcnt < 0))
158
+ if (ecb_expect_false (eventcnt < 0))
159
159
  {
160
160
  if (errno != EINTR)
161
161
  ev_syserr ("(libev) epoll_wait");
@@ -178,14 +178,14 @@ epoll_poll (EV_P_ ev_tstamp timeout)
178
178
  * other spurious notifications will be found by epoll_ctl, below
179
179
  * we assume that fd is always in range, as we never shrink the anfds array
180
180
  */
181
- if (expect_false ((uint32_t)anfds [fd].egen != (uint32_t)(ev->data.u64 >> 32)))
181
+ if (ecb_expect_false ((uint32_t)anfds [fd].egen != (uint32_t)(ev->data.u64 >> 32)))
182
182
  {
183
183
  /* recreate kernel state */
184
184
  postfork |= 2;
185
185
  continue;
186
186
  }
187
187
 
188
- if (expect_false (got & ~want))
188
+ if (ecb_expect_false (got & ~want))
189
189
  {
190
190
  anfds [fd].emask = want;
191
191
 
@@ -197,6 +197,8 @@ epoll_poll (EV_P_ ev_tstamp timeout)
197
197
  * above with the gencounter check (== our fd is not the event fd), and
198
198
  * partially here, when epoll_ctl returns an error (== a child has the fd
199
199
  * but we closed it).
200
+ * note: for events such as POLLHUP, where we can't know whether it refers
201
+ * to EV_READ or EV_WRITE, we might issue redundant EPOLL_CTL_MOD calls.
200
202
  */
201
203
  ev->events = (want & EV_READ ? EPOLLIN : 0)
202
204
  | (want & EV_WRITE ? EPOLLOUT : 0);
@@ -214,7 +216,7 @@ epoll_poll (EV_P_ ev_tstamp timeout)
214
216
  }
215
217
 
216
218
  /* if the receive array was full, increase its size */
217
- if (expect_false (eventcnt == epoll_eventmax))
219
+ if (ecb_expect_false (eventcnt == epoll_eventmax))
218
220
  {
219
221
  ev_free (epoll_events);
220
222
  epoll_eventmax = array_nextsize (sizeof (struct epoll_event), epoll_eventmax, epoll_eventmax + 1);
@@ -264,7 +266,7 @@ epoll_init (EV_P_ int flags)
264
266
  if ((backend_fd = epoll_epoll_create ()) < 0)
265
267
  return 0;
266
268
 
267
- backend_mintime = 1e-3; /* epoll does sometimes return early, this is just to avoid the worst */
269
+ backend_mintime = EV_TS_CONST (1e-3); /* epoll does sometimes return early, this is just to avoid the worst */
268
270
  backend_modify = epoll_modify;
269
271
  backend_poll = epoll_poll;
270
272
 
@@ -282,8 +284,8 @@ epoll_destroy (EV_P)
282
284
  array_free (epoll_eperm, EMPTY);
283
285
  }
284
286
 
285
- inline_size
286
- void
287
+ ecb_cold
288
+ static void
287
289
  epoll_fork (EV_P)
288
290
  {
289
291
  close (backend_fd);
@@ -0,0 +1,694 @@
1
+ /*
2
+ * libev linux io_uring fd activity backend
3
+ *
4
+ * Copyright (c) 2019-2020 Marc Alexander Lehmann <libev@schmorp.de>
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without modifica-
8
+ * tion, are permitted provided that the following conditions are met:
9
+ *
10
+ * 1. Redistributions of source code must retain the above copyright notice,
11
+ * this list of conditions and the following disclaimer.
12
+ *
13
+ * 2. Redistributions in binary form must reproduce the above copyright
14
+ * notice, this list of conditions and the following disclaimer in the
15
+ * documentation and/or other materials provided with the distribution.
16
+ *
17
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
18
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
19
+ * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
20
+ * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
21
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
23
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
24
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH-
25
+ * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
26
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
27
+ *
28
+ * Alternatively, the contents of this file may be used under the terms of
29
+ * the GNU General Public License ("GPL") version 2 or any later version,
30
+ * in which case the provisions of the GPL are applicable instead of
31
+ * the above. If you wish to allow the use of your version of this file
32
+ * only under the terms of the GPL and not to allow others to use your
33
+ * version of this file under the BSD license, indicate your decision
34
+ * by deleting the provisions above and replace them with the notice
35
+ * and other provisions required by the GPL. If you do not delete the
36
+ * provisions above, a recipient may use your version of this file under
37
+ * either the BSD or the GPL.
38
+ */
39
+
40
+ /*
41
+ * general notes about linux io_uring:
42
+ *
43
+ * a) it's the best interface I have seen so far. on linux.
44
+ * b) best is not necessarily very good.
45
+ * c) it's better than the aio mess, doesn't suffer from the fork problems
46
+ * of linux aio or epoll and so on and so on. and you could do event stuff
47
+ * without any syscalls. what's not to like?
48
+ * d) ok, it's vastly more complex, but that's ok, really.
49
+ * e) why two mmaps instead of one? one would be more space-efficient,
50
+ * and I can't see what benefit two would have (other than being
51
+ * somehow resizable/relocatable, but that's apparently not possible).
52
+ * f) hmm, it's practically undebuggable (gdb can't access the memory, and
53
+ * the bizarre way structure offsets are communicated makes it hard to
54
+ * just print the ring buffer heads, even *iff* the memory were visible
55
+ * in gdb. but then, that's also ok, really.
56
+ * g) well, you cannot specify a timeout when waiting for events. no,
57
+ * seriously, the interface doesn't support a timeout. never seen _that_
58
+ * before. sure, you can use a timerfd, but that's another syscall
59
+ * you could have avoided. overall, this bizarre omission smells
60
+ * like a µ-optimisation by the io_uring author for his personal
61
+ * applications, to the detriment of everybody else who just wants
62
+ * an event loop. but, umm, ok, if that's all, it could be worse.
63
+ * (from what I gather from the author Jens Axboe, it simply didn't
64
+ * occur to him, and he made good on it by adding an unlimited number
65
+ * of timeouts later :).
66
+ * h) initially there was a hardcoded limit of 4096 outstanding events.
67
+ * later versions not only bump this to 32k, but also can handle
68
+ * an unlimited amount of events, so this only affects the batch size.
69
+ * i) unlike linux aio, you *can* register more than the limit
70
+ * of fd events. while early versions of io_uring signalled an overflow
71
+ * and you ended up getting wet. 5.5+ does not do this anymore.
72
+ * j) but, oh my! it had exactly the same bugs as the linux aio backend,
73
+ * where some undocumented poll combinations just fail. fortunately,
74
+ * after finally reaching the author, he was more than willing to fix
75
+ * this probably in 5.6+.
76
+ * k) overall, the *API* itself is, I dare to say, not a total trainwreck.
77
+ * once the bugs are fixed (probably in 5.6+), it will be without
78
+ * competition.
79
+ */
80
+
81
+ /* TODO: use internal TIMEOUT */
82
+ /* TODO: take advantage of single mmap, NODROP etc. */
83
+ /* TODO: resize cq/sq size independently */
84
+
85
+ #include <sys/timerfd.h>
86
+ #include <sys/mman.h>
87
+ #include <poll.h>
88
+ #include <stdint.h>
89
+
90
+ #define IOURING_INIT_ENTRIES 32
91
+
92
+ /*****************************************************************************/
93
+ /* syscall wrapdadoop - this section has the raw api/abi definitions */
94
+
95
+ #include <linux/fs.h>
96
+ #include <linux/types.h>
97
+
98
+ /* mostly directly taken from the kernel or documentation */
99
+
100
+ struct io_uring_sqe
101
+ {
102
+ __u8 opcode;
103
+ __u8 flags;
104
+ __u16 ioprio;
105
+ __s32 fd;
106
+ union {
107
+ __u64 off;
108
+ __u64 addr2;
109
+ };
110
+ __u64 addr;
111
+ __u32 len;
112
+ union {
113
+ __kernel_rwf_t rw_flags;
114
+ __u32 fsync_flags;
115
+ __u16 poll_events;
116
+ __u32 sync_range_flags;
117
+ __u32 msg_flags;
118
+ __u32 timeout_flags;
119
+ __u32 accept_flags;
120
+ __u32 cancel_flags;
121
+ __u32 open_flags;
122
+ __u32 statx_flags;
123
+ };
124
+ __u64 user_data;
125
+ union {
126
+ __u16 buf_index;
127
+ __u64 __pad2[3];
128
+ };
129
+ };
130
+
131
+ struct io_uring_cqe
132
+ {
133
+ __u64 user_data;
134
+ __s32 res;
135
+ __u32 flags;
136
+ };
137
+
138
+ struct io_sqring_offsets
139
+ {
140
+ __u32 head;
141
+ __u32 tail;
142
+ __u32 ring_mask;
143
+ __u32 ring_entries;
144
+ __u32 flags;
145
+ __u32 dropped;
146
+ __u32 array;
147
+ __u32 resv1;
148
+ __u64 resv2;
149
+ };
150
+
151
+ struct io_cqring_offsets
152
+ {
153
+ __u32 head;
154
+ __u32 tail;
155
+ __u32 ring_mask;
156
+ __u32 ring_entries;
157
+ __u32 overflow;
158
+ __u32 cqes;
159
+ __u64 resv[2];
160
+ };
161
+
162
+ struct io_uring_params
163
+ {
164
+ __u32 sq_entries;
165
+ __u32 cq_entries;
166
+ __u32 flags;
167
+ __u32 sq_thread_cpu;
168
+ __u32 sq_thread_idle;
169
+ __u32 features;
170
+ __u32 resv[4];
171
+ struct io_sqring_offsets sq_off;
172
+ struct io_cqring_offsets cq_off;
173
+ };
174
+
175
+ #define IORING_SETUP_CQSIZE 0x00000008
176
+
177
+ #define IORING_OP_POLL_ADD 6
178
+ #define IORING_OP_POLL_REMOVE 7
179
+ #define IORING_OP_TIMEOUT 11
180
+ #define IORING_OP_TIMEOUT_REMOVE 12
181
+
182
+ /* relative or absolute, reference clock is CLOCK_MONOTONIC */
183
+ struct iouring_kernel_timespec
184
+ {
185
+ int64_t tv_sec;
186
+ long long tv_nsec;
187
+ };
188
+
189
+ #define IORING_TIMEOUT_ABS 0x00000001
190
+
191
+ #define IORING_ENTER_GETEVENTS 0x01
192
+
193
+ #define IORING_OFF_SQ_RING 0x00000000ULL
194
+ #define IORING_OFF_CQ_RING 0x08000000ULL
195
+ #define IORING_OFF_SQES 0x10000000ULL
196
+
197
+ #define IORING_FEAT_SINGLE_MMAP 0x00000001
198
+ #define IORING_FEAT_NODROP 0x00000002
199
+ #define IORING_FEAT_SUBMIT_STABLE 0x00000004
200
+
201
+ inline_size
202
+ int
203
+ evsys_io_uring_setup (unsigned entries, struct io_uring_params *params)
204
+ {
205
+ return ev_syscall2 (SYS_io_uring_setup, entries, params);
206
+ }
207
+
208
+ inline_size
209
+ int
210
+ evsys_io_uring_enter (int fd, unsigned to_submit, unsigned min_complete, unsigned flags, const sigset_t *sig, size_t sigsz)
211
+ {
212
+ return ev_syscall6 (SYS_io_uring_enter, fd, to_submit, min_complete, flags, sig, sigsz);
213
+ }
214
+
215
+ /*****************************************************************************/
216
+ /* actual backend implementation */
217
+
218
+ /* we hope that volatile will make the compiler access these variables only once */
219
+ #define EV_SQ_VAR(name) *(volatile unsigned *)((char *)iouring_sq_ring + iouring_sq_ ## name)
220
+ #define EV_CQ_VAR(name) *(volatile unsigned *)((char *)iouring_cq_ring + iouring_cq_ ## name)
221
+
222
+ /* the index array */
223
+ #define EV_SQ_ARRAY ((unsigned *)((char *)iouring_sq_ring + iouring_sq_array))
224
+
225
+ /* the submit/completion queue entries */
226
+ #define EV_SQES ((struct io_uring_sqe *) iouring_sqes)
227
+ #define EV_CQES ((struct io_uring_cqe *)((char *)iouring_cq_ring + iouring_cq_cqes))
228
+
229
+ inline_speed
230
+ int
231
+ iouring_enter (EV_P_ ev_tstamp timeout)
232
+ {
233
+ int res;
234
+
235
+ EV_RELEASE_CB;
236
+
237
+ res = evsys_io_uring_enter (iouring_fd, iouring_to_submit, 1,
238
+ timeout > EV_TS_CONST (0.) ? IORING_ENTER_GETEVENTS : 0, 0, 0);
239
+
240
+ assert (("libev: io_uring_enter did not consume all sqes", (res < 0 || res == iouring_to_submit)));
241
+
242
+ iouring_to_submit = 0;
243
+
244
+ EV_ACQUIRE_CB;
245
+
246
+ return res;
247
+ }
248
+
249
+ /* TODO: can we move things around so we don't need this forward-reference? */
250
+ static void
251
+ iouring_poll (EV_P_ ev_tstamp timeout);
252
+
253
+ static
254
+ struct io_uring_sqe *
255
+ iouring_sqe_get (EV_P)
256
+ {
257
+ unsigned tail;
258
+
259
+ for (;;)
260
+ {
261
+ tail = EV_SQ_VAR (tail);
262
+
263
+ if (ecb_expect_true (tail + 1 - EV_SQ_VAR (head) <= EV_SQ_VAR (ring_entries)))
264
+ break; /* what's the problem, we have free sqes */
265
+
266
+ /* queue full, need to flush and possibly handle some events */
267
+
268
+ #if EV_FEATURE_CODE
269
+ /* first we ask the kernel nicely, most often this frees up some sqes */
270
+ int res = iouring_enter (EV_A_ EV_TS_CONST (0.));
271
+
272
+ ECB_MEMORY_FENCE_ACQUIRE; /* better safe than sorry */
273
+
274
+ if (res >= 0)
275
+ continue; /* yes, it worked, try again */
276
+ #endif
277
+
278
+ /* some problem, possibly EBUSY - do the full poll and let it handle any issues */
279
+
280
+ iouring_poll (EV_A_ EV_TS_CONST (0.));
281
+ /* iouring_poll should have done ECB_MEMORY_FENCE_ACQUIRE for us */
282
+ }
283
+
284
+ /*assert (("libev: io_uring queue full after flush", tail + 1 - EV_SQ_VAR (head) <= EV_SQ_VAR (ring_entries)));*/
285
+
286
+ return EV_SQES + (tail & EV_SQ_VAR (ring_mask));
287
+ }
288
+
289
+ inline_size
290
+ struct io_uring_sqe *
291
+ iouring_sqe_submit (EV_P_ struct io_uring_sqe *sqe)
292
+ {
293
+ unsigned idx = sqe - EV_SQES;
294
+
295
+ EV_SQ_ARRAY [idx] = idx;
296
+ ECB_MEMORY_FENCE_RELEASE;
297
+ ++EV_SQ_VAR (tail);
298
+ /*ECB_MEMORY_FENCE_RELEASE; /* for the time being we assume this is not needed */
299
+ ++iouring_to_submit;
300
+ }
301
+
302
+ /*****************************************************************************/
303
+
304
+ /* when the timerfd expires we simply note the fact,
305
+ * as the purpose of the timerfd is to wake us up, nothing else.
306
+ * the next iteration should re-set it.
307
+ */
308
+ static void
309
+ iouring_tfd_cb (EV_P_ struct ev_io *w, int revents)
310
+ {
311
+ iouring_tfd_to = EV_TSTAMP_HUGE;
312
+ }
313
+
314
+ /* called for full and partial cleanup */
315
+ ecb_cold
316
+ static int
317
+ iouring_internal_destroy (EV_P)
318
+ {
319
+ close (iouring_tfd);
320
+ close (iouring_fd);
321
+
322
+ if (iouring_sq_ring != MAP_FAILED) munmap (iouring_sq_ring, iouring_sq_ring_size);
323
+ if (iouring_cq_ring != MAP_FAILED) munmap (iouring_cq_ring, iouring_cq_ring_size);
324
+ if (iouring_sqes != MAP_FAILED) munmap (iouring_sqes , iouring_sqes_size );
325
+
326
+ if (ev_is_active (&iouring_tfd_w))
327
+ {
328
+ ev_ref (EV_A);
329
+ ev_io_stop (EV_A_ &iouring_tfd_w);
330
+ }
331
+ }
332
+
333
+ ecb_cold
334
+ static int
335
+ iouring_internal_init (EV_P)
336
+ {
337
+ struct io_uring_params params = { 0 };
338
+
339
+ iouring_to_submit = 0;
340
+
341
+ iouring_tfd = -1;
342
+ iouring_sq_ring = MAP_FAILED;
343
+ iouring_cq_ring = MAP_FAILED;
344
+ iouring_sqes = MAP_FAILED;
345
+
346
+ if (!have_monotonic) /* cannot really happen, but what if11 */
347
+ return -1;
348
+
349
+ for (;;)
350
+ {
351
+ iouring_fd = evsys_io_uring_setup (iouring_entries, &params);
352
+
353
+ if (iouring_fd >= 0)
354
+ break; /* yippie */
355
+
356
+ if (errno != EINVAL)
357
+ return -1; /* we failed */
358
+
359
+ #if TODO
360
+ if ((~params.features) & (IORING_FEAT_NODROP | IORING_FEAT_SINGLE_MMAP | IORING_FEAT_SUBMIT_STABLE))
361
+ return -1; /* we require the above features */
362
+ #endif
363
+
364
+ /* EINVAL: lots of possible reasons, but maybe
365
+ * it is because we hit the unqueryable hardcoded size limit
366
+ */
367
+
368
+ /* we hit the limit already, give up */
369
+ if (iouring_max_entries)
370
+ return -1;
371
+
372
+ /* first time we hit EINVAL? assume we hit the limit, so go back and retry */
373
+ iouring_entries >>= 1;
374
+ iouring_max_entries = iouring_entries;
375
+ }
376
+
377
+ iouring_sq_ring_size = params.sq_off.array + params.sq_entries * sizeof (unsigned);
378
+ iouring_cq_ring_size = params.cq_off.cqes + params.cq_entries * sizeof (struct io_uring_cqe);
379
+ iouring_sqes_size = params.sq_entries * sizeof (struct io_uring_sqe);
380
+
381
+ iouring_sq_ring = mmap (0, iouring_sq_ring_size, PROT_READ | PROT_WRITE,
382
+ MAP_SHARED | MAP_POPULATE, iouring_fd, IORING_OFF_SQ_RING);
383
+ iouring_cq_ring = mmap (0, iouring_cq_ring_size, PROT_READ | PROT_WRITE,
384
+ MAP_SHARED | MAP_POPULATE, iouring_fd, IORING_OFF_CQ_RING);
385
+ iouring_sqes = mmap (0, iouring_sqes_size, PROT_READ | PROT_WRITE,
386
+ MAP_SHARED | MAP_POPULATE, iouring_fd, IORING_OFF_SQES);
387
+
388
+ if (iouring_sq_ring == MAP_FAILED || iouring_cq_ring == MAP_FAILED || iouring_sqes == MAP_FAILED)
389
+ return -1;
390
+
391
+ iouring_sq_head = params.sq_off.head;
392
+ iouring_sq_tail = params.sq_off.tail;
393
+ iouring_sq_ring_mask = params.sq_off.ring_mask;
394
+ iouring_sq_ring_entries = params.sq_off.ring_entries;
395
+ iouring_sq_flags = params.sq_off.flags;
396
+ iouring_sq_dropped = params.sq_off.dropped;
397
+ iouring_sq_array = params.sq_off.array;
398
+
399
+ iouring_cq_head = params.cq_off.head;
400
+ iouring_cq_tail = params.cq_off.tail;
401
+ iouring_cq_ring_mask = params.cq_off.ring_mask;
402
+ iouring_cq_ring_entries = params.cq_off.ring_entries;
403
+ iouring_cq_overflow = params.cq_off.overflow;
404
+ iouring_cq_cqes = params.cq_off.cqes;
405
+
406
+ iouring_tfd = timerfd_create (CLOCK_MONOTONIC, TFD_CLOEXEC);
407
+
408
+ if (iouring_tfd < 0)
409
+ return iouring_tfd;
410
+
411
+ iouring_tfd_to = EV_TSTAMP_HUGE;
412
+
413
+ return 0;
414
+ }
415
+
416
+ ecb_cold
417
+ static void
418
+ iouring_fork (EV_P)
419
+ {
420
+ iouring_internal_destroy (EV_A);
421
+
422
+ while (iouring_internal_init (EV_A) < 0)
423
+ ev_syserr ("(libev) io_uring_setup");
424
+
425
+ fd_rearm_all (EV_A);
426
+
427
+ ev_io_stop (EV_A_ &iouring_tfd_w);
428
+ ev_io_set (EV_A_ &iouring_tfd_w, iouring_tfd, EV_READ);
429
+ ev_io_start (EV_A_ &iouring_tfd_w);
430
+ }
431
+
432
+ /*****************************************************************************/
433
+
434
+ static void
435
+ iouring_modify (EV_P_ int fd, int oev, int nev)
436
+ {
437
+ if (oev)
438
+ {
439
+ /* we assume the sqe's are all "properly" initialised */
440
+ struct io_uring_sqe *sqe = iouring_sqe_get (EV_A);
441
+ sqe->opcode = IORING_OP_POLL_REMOVE;
442
+ sqe->fd = fd;
443
+ /* Jens Axboe notified me that user_data is not what is documented, but is
444
+ * some kind of unique ID that has to match, otherwise the request cannot
445
+ * be removed. Since we don't *really* have that, we pass in the old
446
+ * generation counter - if that fails, too bad, it will hopefully be removed
447
+ * at close time and then be ignored. */
448
+ sqe->addr = (uint32_t)fd | ((__u64)(uint32_t)anfds [fd].egen << 32);
449
+ sqe->user_data = (uint64_t)-1;
450
+ iouring_sqe_submit (EV_A_ sqe);
451
+
452
+ /* increment generation counter to avoid handling old events */
453
+ ++anfds [fd].egen;
454
+ }
455
+
456
+ if (nev)
457
+ {
458
+ struct io_uring_sqe *sqe = iouring_sqe_get (EV_A);
459
+ sqe->opcode = IORING_OP_POLL_ADD;
460
+ sqe->fd = fd;
461
+ sqe->addr = 0;
462
+ sqe->user_data = (uint32_t)fd | ((__u64)(uint32_t)anfds [fd].egen << 32);
463
+ sqe->poll_events =
464
+ (nev & EV_READ ? POLLIN : 0)
465
+ | (nev & EV_WRITE ? POLLOUT : 0);
466
+ iouring_sqe_submit (EV_A_ sqe);
467
+ }
468
+ }
469
+
470
+ inline_size
471
+ void
472
+ iouring_tfd_update (EV_P_ ev_tstamp timeout)
473
+ {
474
+ ev_tstamp tfd_to = mn_now + timeout;
475
+
476
+ /* we assume there will be many iterations per timer change, so
477
+ * we only re-set the timerfd when we have to because its expiry
478
+ * is too late.
479
+ */
480
+ if (ecb_expect_false (tfd_to < iouring_tfd_to))
481
+ {
482
+ struct itimerspec its;
483
+
484
+ iouring_tfd_to = tfd_to;
485
+ EV_TS_SET (its.it_interval, 0.);
486
+ EV_TS_SET (its.it_value, tfd_to);
487
+
488
+ if (timerfd_settime (iouring_tfd, TFD_TIMER_ABSTIME, &its, 0) < 0)
489
+ assert (("libev: iouring timerfd_settime failed", 0));
490
+ }
491
+ }
492
+
493
+ inline_size
494
+ void
495
+ iouring_process_cqe (EV_P_ struct io_uring_cqe *cqe)
496
+ {
497
+ int fd = cqe->user_data & 0xffffffffU;
498
+ uint32_t gen = cqe->user_data >> 32;
499
+ int res = cqe->res;
500
+
501
+ /* user_data -1 is a remove that we are not atm. interested in */
502
+ if (cqe->user_data == (uint64_t)-1)
503
+ return;
504
+
505
+ assert (("libev: io_uring fd must be in-bounds", fd >= 0 && fd < anfdmax));
506
+
507
+ /* documentation lies, of course. the result value is NOT like
508
+ * normal syscalls, but like linux raw syscalls, i.e. negative
509
+ * error numbers. fortunate, as otherwise there would be no way
510
+ * to get error codes at all. still, why not document this?
511
+ */
512
+
513
+ /* ignore event if generation doesn't match */
514
+ /* other than skipping removal events, */
515
+ /* this should actually be very rare */
516
+ if (ecb_expect_false (gen != (uint32_t)anfds [fd].egen))
517
+ return;
518
+
519
+ if (ecb_expect_false (res < 0))
520
+ {
521
+ /*TODO: EINVAL handling (was something failed with this fd)*/
522
+
523
+ if (res == -EBADF)
524
+ {
525
+ assert (("libev: event loop rejected bad fd", res != -EBADF));
526
+ fd_kill (EV_A_ fd);
527
+ }
528
+ else
529
+ {
530
+ errno = -res;
531
+ ev_syserr ("(libev) IORING_OP_POLL_ADD");
532
+ }
533
+
534
+ return;
535
+ }
536
+
537
+ /* feed events, we do not expect or handle POLLNVAL */
538
+ fd_event (
539
+ EV_A_
540
+ fd,
541
+ (res & (POLLOUT | POLLERR | POLLHUP) ? EV_WRITE : 0)
542
+ | (res & (POLLIN | POLLERR | POLLHUP) ? EV_READ : 0)
543
+ );
544
+
545
+ /* io_uring is oneshot, so we need to re-arm the fd next iteration */
546
+ /* this also means we usually have to do at least one syscall per iteration */
547
+ anfds [fd].events = 0;
548
+ fd_change (EV_A_ fd, EV_ANFD_REIFY);
549
+ }
550
+
551
+ /* called when the event queue overflows */
552
+ ecb_cold
553
+ static void
554
+ iouring_overflow (EV_P)
555
+ {
556
+ /* we have two options, resize the queue (by tearing down
557
+ * everything and recreating it), or living with it
558
+ * and polling.
559
+ * we implement this by resizing the queue, and, if that fails,
560
+ * we just recreate the state on every failure, which
561
+ * kind of is a very inefficient poll.
562
+ * one danger is, due to the bias toward lower fds,
563
+ * we will only really get events for those, so
564
+ * maybe we need a poll() fallback, after all.
565
+ */
566
+ /*EV_CQ_VAR (overflow) = 0;*/ /* need to do this if we keep the state and poll manually */
567
+
568
+ fd_rearm_all (EV_A);
569
+
570
+ /* we double the size until we hit the hard-to-probe maximum */
571
+ if (!iouring_max_entries)
572
+ {
573
+ iouring_entries <<= 1;
574
+ iouring_fork (EV_A);
575
+ }
576
+ else
577
+ {
578
+ /* we hit the kernel limit, we should fall back to something else.
579
+ * we can either poll() a few times and hope for the best,
580
+ * poll always, or switch to epoll.
581
+ * TODO: is this necessary with newer kernels?
582
+ */
583
+
584
+ iouring_internal_destroy (EV_A);
585
+
586
+ /* this should make it so that on return, we don't call any uring functions */
587
+ iouring_to_submit = 0;
588
+
589
+ for (;;)
590
+ {
591
+ backend = epoll_init (EV_A_ 0);
592
+
593
+ if (backend)
594
+ break;
595
+
596
+ ev_syserr ("(libev) iouring switch to epoll");
597
+ }
598
+ }
599
+ }
600
+
601
+ /* handle any events in the completion queue, return true if there were any */
602
+ static int
603
+ iouring_handle_cq (EV_P)
604
+ {
605
+ unsigned head, tail, mask;
606
+
607
+ head = EV_CQ_VAR (head);
608
+ ECB_MEMORY_FENCE_ACQUIRE;
609
+ tail = EV_CQ_VAR (tail);
610
+
611
+ if (head == tail)
612
+ return 0;
613
+
614
+ /* it can only overflow if we have events, yes, yes? */
615
+ if (ecb_expect_false (EV_CQ_VAR (overflow)))
616
+ {
617
+ iouring_overflow (EV_A);
618
+ return 1;
619
+ }
620
+
621
+ mask = EV_CQ_VAR (ring_mask);
622
+
623
+ do
624
+ iouring_process_cqe (EV_A_ &EV_CQES [head++ & mask]);
625
+ while (head != tail);
626
+
627
+ EV_CQ_VAR (head) = head;
628
+ ECB_MEMORY_FENCE_RELEASE;
629
+
630
+ return 1;
631
+ }
632
+
633
+ static void
634
+ iouring_poll (EV_P_ ev_tstamp timeout)
635
+ {
636
+ /* if we have events, no need for extra syscalls, but we might have to queue events */
637
+ /* we also clear the timeout if there are outstanding fdchanges */
638
+ /* the latter should only happen if both the sq and cq are full, most likely */
639
+ /* because we have a lot of event sources that immediately complete */
640
+ /* TODO: fdchangecnt is always 0 because fd_reify does not have two buffers yet */
641
+ if (iouring_handle_cq (EV_A) || fdchangecnt)
642
+ timeout = EV_TS_CONST (0.);
643
+ else
644
+ /* no events, so maybe wait for some */
645
+ iouring_tfd_update (EV_A_ timeout);
646
+
647
+ /* only enter the kernel if we have something to submit, or we need to wait */
648
+ if (timeout || iouring_to_submit)
649
+ {
650
+ int res = iouring_enter (EV_A_ timeout);
651
+
652
+ if (ecb_expect_false (res < 0))
653
+ if (errno == EINTR)
654
+ /* ignore */;
655
+ else if (errno == EBUSY)
656
+ /* cq full, cannot submit - should be rare because we flush the cq first, so simply ignore */;
657
+ else
658
+ ev_syserr ("(libev) iouring setup");
659
+ else
660
+ iouring_handle_cq (EV_A);
661
+ }
662
+ }
663
+
664
+ inline_size
665
+ int
666
+ iouring_init (EV_P_ int flags)
667
+ {
668
+ iouring_entries = IOURING_INIT_ENTRIES;
669
+ iouring_max_entries = 0;
670
+
671
+ if (iouring_internal_init (EV_A) < 0)
672
+ {
673
+ iouring_internal_destroy (EV_A);
674
+ return 0;
675
+ }
676
+
677
+ ev_io_init (&iouring_tfd_w, iouring_tfd_cb, iouring_tfd, EV_READ);
678
+ ev_set_priority (&iouring_tfd_w, EV_MINPRI);
679
+ ev_io_start (EV_A_ &iouring_tfd_w);
680
+ ev_unref (EV_A); /* watcher should not keep loop alive */
681
+
682
+ backend_modify = iouring_modify;
683
+ backend_poll = iouring_poll;
684
+
685
+ return EVBACKEND_IOURING;
686
+ }
687
+
688
+ inline_size
689
+ void
690
+ iouring_destroy (EV_P)
691
+ {
692
+ iouring_internal_destroy (EV_A);
693
+ }
694
+