nio4r 2.4.0 → 2.5.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,7 +1,7 @@
1
1
  /*
2
2
  * libev native API header
3
3
  *
4
- * Copyright (c) 2007-2018 Marc Alexander Lehmann <libev@schmorp.de>
4
+ * Copyright (c) 2007-2019 Marc Alexander Lehmann <libev@schmorp.de>
5
5
  * All rights reserved.
6
6
  *
7
7
  * Redistribution and use in source and binary forms, with or without modifica-
@@ -212,7 +212,7 @@ struct ev_loop;
212
212
  /*****************************************************************************/
213
213
 
214
214
  #define EV_VERSION_MAJOR 4
215
- #define EV_VERSION_MINOR 25
215
+ #define EV_VERSION_MINOR 27
216
216
 
217
217
  /* eventmask, revents, events... */
218
218
  enum {
@@ -516,14 +516,15 @@ enum {
516
516
 
517
517
  /* method bits to be ored together */
518
518
  enum {
519
- EVBACKEND_SELECT = 0x00000001U, /* available just about anywhere */
520
- EVBACKEND_POLL = 0x00000002U, /* !win, !aix, broken on osx */
521
- EVBACKEND_EPOLL = 0x00000004U, /* linux */
522
- EVBACKEND_KQUEUE = 0x00000008U, /* bsd, broken on osx */
523
- EVBACKEND_DEVPOLL = 0x00000010U, /* solaris 8 */ /* NYI */
524
- EVBACKEND_PORT = 0x00000020U, /* solaris 10 */
525
- EVBACKEND_ALL = 0x0000003FU, /* all known backends */
526
- EVBACKEND_MASK = 0x0000FFFFU /* all future backends */
519
+ EVBACKEND_SELECT = 0x00000001U, /* available just about anywhere */
520
+ EVBACKEND_POLL = 0x00000002U, /* !win, !aix, broken on osx */
521
+ EVBACKEND_EPOLL = 0x00000004U, /* linux */
522
+ EVBACKEND_KQUEUE = 0x00000008U, /* bsd, broken on osx */
523
+ EVBACKEND_DEVPOLL = 0x00000010U, /* solaris 8 */ /* NYI */
524
+ EVBACKEND_PORT = 0x00000020U, /* solaris 10 */
525
+ EVBACKEND_LINUXAIO = 0x00000040U, /* Linux AIO */
526
+ EVBACKEND_ALL = 0x0000007FU, /* all known backends */
527
+ EVBACKEND_MASK = 0x0000FFFFU /* all future backends */
527
528
  };
528
529
 
529
530
  #if EV_PROTOTYPES
@@ -1,7 +1,7 @@
1
1
  /*
2
2
  * libev epoll fd activity backend
3
3
  *
4
- * Copyright (c) 2007,2008,2009,2010,2011 Marc Alexander Lehmann <libev@schmorp.de>
4
+ * Copyright (c) 2007,2008,2009,2010,2011,2016,2017,2019 Marc Alexander Lehmann <libev@schmorp.de>
5
5
  * All rights reserved.
6
6
  *
7
7
  * Redistribution and use in source and binary forms, with or without modifica-
@@ -124,12 +124,14 @@ epoll_modify (EV_P_ int fd, int oev, int nev)
124
124
  /* add fd to epoll_eperms, if not already inside */
125
125
  if (!(oldmask & EV_EMASK_EPERM))
126
126
  {
127
- array_needsize (int, epoll_eperms, epoll_epermmax, epoll_epermcnt + 1, EMPTY2);
127
+ array_needsize (int, epoll_eperms, epoll_epermmax, epoll_epermcnt + 1, array_needsize_noinit);
128
128
  epoll_eperms [epoll_epermcnt++] = fd;
129
129
  }
130
130
 
131
131
  return;
132
132
  }
133
+ else
134
+ assert (("libev: I/O watcher with invalid fd found in epoll_ctl", errno != EBADF && errno != ELOOP && errno != EINVAL));
133
135
 
134
136
  fd_kill (EV_A_ fd);
135
137
 
@@ -235,21 +237,32 @@ epoll_poll (EV_P_ ev_tstamp timeout)
235
237
  }
236
238
  }
237
239
 
238
- inline_size
239
- int
240
- epoll_init (EV_P_ int flags)
240
+ static int
241
+ epoll_epoll_create (void)
241
242
  {
243
+ int fd;
244
+
242
245
  #if defined EPOLL_CLOEXEC && !defined __ANDROID__
243
- backend_fd = epoll_create1 (EPOLL_CLOEXEC);
246
+ fd = epoll_create1 (EPOLL_CLOEXEC);
244
247
 
245
- if (backend_fd < 0 && (errno == EINVAL || errno == ENOSYS))
248
+ if (fd < 0 && (errno == EINVAL || errno == ENOSYS))
246
249
  #endif
247
- backend_fd = epoll_create (256);
250
+ {
251
+ fd = epoll_create (256);
248
252
 
249
- if (backend_fd < 0)
250
- return 0;
253
+ if (fd >= 0)
254
+ fcntl (fd, F_SETFD, FD_CLOEXEC);
255
+ }
251
256
 
252
- fcntl (backend_fd, F_SETFD, FD_CLOEXEC);
257
+ return fd;
258
+ }
259
+
260
+ inline_size
261
+ int
262
+ epoll_init (EV_P_ int flags)
263
+ {
264
+ if ((backend_fd = epoll_epoll_create ()) < 0)
265
+ return 0;
253
266
 
254
267
  backend_mintime = 1e-3; /* epoll does sometimes return early, this is just to avoid the worst */
255
268
  backend_modify = epoll_modify;
@@ -275,11 +288,9 @@ epoll_fork (EV_P)
275
288
  {
276
289
  close (backend_fd);
277
290
 
278
- while ((backend_fd = epoll_create (256)) < 0)
291
+ while ((backend_fd = epoll_epoll_create ()) < 0)
279
292
  ev_syserr ("(libev) epoll_create");
280
293
 
281
- fcntl (backend_fd, F_SETFD, FD_CLOEXEC);
282
-
283
294
  fd_rearm_all (EV_A);
284
295
  }
285
296
 
@@ -1,7 +1,7 @@
1
1
  /*
2
2
  * libev kqueue backend
3
3
  *
4
- * Copyright (c) 2007,2008,2009,2010,2011,2012,2013 Marc Alexander Lehmann <libev@schmorp.de>
4
+ * Copyright (c) 2007,2008,2009,2010,2011,2012,2013,2016,2019 Marc Alexander Lehmann <libev@schmorp.de>
5
5
  * All rights reserved.
6
6
  *
7
7
  * Redistribution and use in source and binary forms, with or without modifica-
@@ -48,7 +48,7 @@ void
48
48
  kqueue_change (EV_P_ int fd, int filter, int flags, int fflags)
49
49
  {
50
50
  ++kqueue_changecnt;
51
- array_needsize (struct kevent, kqueue_changes, kqueue_changemax, kqueue_changecnt, EMPTY2);
51
+ array_needsize (struct kevent, kqueue_changes, kqueue_changemax, kqueue_changecnt, array_needsize_noinit);
52
52
 
53
53
  EV_SET (&kqueue_changes [kqueue_changecnt - 1], fd, filter, flags, fflags, 0, 0);
54
54
  }
@@ -106,7 +106,7 @@ kqueue_poll (EV_P_ ev_tstamp timeout)
106
106
  if (expect_false (res < 0))
107
107
  {
108
108
  if (errno != EINTR)
109
- ev_syserr ("(libev) kevent");
109
+ ev_syserr ("(libev) kqueue kevent");
110
110
 
111
111
  return;
112
112
  }
@@ -129,10 +129,16 @@ kqueue_poll (EV_P_ ev_tstamp timeout)
129
129
  if (fd_valid (fd))
130
130
  kqueue_modify (EV_A_ fd, 0, anfds [fd].events);
131
131
  else
132
- fd_kill (EV_A_ fd);
132
+ {
133
+ assert (("libev: kqueue found invalid fd", 0));
134
+ fd_kill (EV_A_ fd);
135
+ }
133
136
  }
134
137
  else /* on all other errors, we error out on the fd */
135
- fd_kill (EV_A_ fd);
138
+ {
139
+ assert (("libev: kqueue found invalid fd", 0));
140
+ fd_kill (EV_A_ fd);
141
+ }
136
142
  }
137
143
  }
138
144
  else
@@ -0,0 +1,642 @@
1
+ /*
2
+ * libev linux aio fd activity backend
3
+ *
4
+ * Copyright (c) 2019 Marc Alexander Lehmann <libev@schmorp.de>
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without modifica-
8
+ * tion, are permitted provided that the following conditions are met:
9
+ *
10
+ * 1. Redistributions of source code must retain the above copyright notice,
11
+ * this list of conditions and the following disclaimer.
12
+ *
13
+ * 2. Redistributions in binary form must reproduce the above copyright
14
+ * notice, this list of conditions and the following disclaimer in the
15
+ * documentation and/or other materials provided with the distribution.
16
+ *
17
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
18
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
19
+ * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
20
+ * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
21
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
23
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
24
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH-
25
+ * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
26
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
27
+ *
28
+ * Alternatively, the contents of this file may be used under the terms of
29
+ * the GNU General Public License ("GPL") version 2 or any later version,
30
+ * in which case the provisions of the GPL are applicable instead of
31
+ * the above. If you wish to allow the use of your version of this file
32
+ * only under the terms of the GPL and not to allow others to use your
33
+ * version of this file under the BSD license, indicate your decision
34
+ * by deleting the provisions above and replace them with the notice
35
+ * and other provisions required by the GPL. If you do not delete the
36
+ * provisions above, a recipient may use your version of this file under
37
+ * either the BSD or the GPL.
38
+ */
39
+
40
+ /*
41
+ * general notes about linux aio:
42
+ *
43
+ * a) at first, the linux aio IOCB_CMD_POLL functionality introduced in
44
+ * 4.18 looks too good to be true: both watchers and events can be
45
+ * batched, and events can even be handled in userspace using
46
+ * a ring buffer shared with the kernel. watchers can be canceled
47
+ * regardless of whether the fd has been closed. no problems with fork.
48
+ * ok, the ring buffer is 200% undocumented (there isn't even a
49
+ * header file), but otherwise, it's pure bliss!
50
+ * b) ok, watchers are one-shot, so you have to re-arm active ones
51
+ * on every iteration. so much for syscall-less event handling,
52
+ * but at least these re-arms can be batched, no big deal, right?
53
+ * c) well, linux as usual: the documentation lies to you: io_submit
54
+ * sometimes returns EINVAL because the kernel doesn't feel like
55
+ * handling your poll mask - ttys can be polled for POLLOUT,
56
+ * POLLOUT|POLLIN, but polling for POLLIN fails. just great,
57
+ * so we have to fall back to something else (hello, epoll),
58
+ * but at least the fallback can be slow, because these are
59
+ * exceptional cases, right?
60
+ * d) hmm, you have to tell the kernel the maximum number of watchers
61
+ * you want to queue when initialising the aio context. but of
62
+ * course the real limit is magically calculated in the kernel, and
63
+ * is often higher than we asked for. so we just have to destroy
64
+ * the aio context and re-create it a bit larger if we hit the limit.
65
+ * (starts to remind you of epoll? well, it's a bit more deterministic
66
+ * and less gambling, but still ugly as hell).
67
+ * e) that's when you find out you can also hit an arbitrary system-wide
68
+ * limit. or the kernel simply doesn't want to handle your watchers.
69
+ * what the fuck do we do then? you guessed it, in the middle
70
+ * of event handling we have to switch to 100% epoll polling. and
71
+ * that better is as fast as normal epoll polling, so you practically
72
+ * have to use the normal epoll backend with all its quirks.
73
+ * f) end result of this train wreck: it inherits all the disadvantages
74
+ * from epoll, while adding a number on its own. why even bother to use
75
+ * it? because if conditions are right and your fds are supported and you
76
+ * don't hit a limit, this backend is actually faster, doesn't gamble with
77
+ * your fds, batches watchers and events and doesn't require costly state
78
+ * recreates. well, until it does.
79
+ * g) all of this makes this backend use almost twice as much code as epoll.
80
+ * which in turn uses twice as much code as poll. and that's not counting
81
+ * the fact that this backend also depends on the epoll backend, making
82
+ * it three times as much code as poll, or kqueue.
83
+ * h) bleah. why can't linux just do kqueue. sure kqueue is ugly, but by now
84
+ * it's clear that whatever linux comes up with is far, far, far worse.
85
+ */
86
+
87
+ #include <sys/time.h> /* actually linux/time.h, but we must assume they are compatible */
88
+ #include <poll.h>
89
+ #include <linux/aio_abi.h>
90
+
91
+ /*****************************************************************************/
92
+ /* syscall wrapdadoop - this section has the raw api/abi definitions */
93
+
94
+ #include <sys/syscall.h> /* no glibc wrappers */
95
+
96
+ /* aio_abi.h is not versioned in any way, so we cannot test for its existence */
97
+ #define IOCB_CMD_POLL 5
98
+
99
+ /* taken from linux/fs/aio.c. yup, that's a .c file.
100
+ * not only is this totally undocumented, not even the source code
101
+ * can tell you what the future semantics of compat_features and
102
+ * incompat_features are, or what header_length actually is for.
103
+ */
104
/* value linuxaio_ringbuf_valid expects to find in aio_ring.magic */
#define AIO_RING_MAGIC                  0xa10a10a1
/* the only incompat_features value this backend accepts */
#define EV_AIO_RING_INCOMPAT_FEATURES   0

/*
 * header of the aio event ring. this backend overlays it on the aio
 * context value and reads completion events straight out of io_events.
 */
struct aio_ring
{
  unsigned id;    /* kernel internal index number */
  unsigned nr;    /* number of io_events */
  unsigned head;  /* Written to by userland or by kernel. */
  unsigned tail;

  unsigned magic;
  unsigned compat_features;
  unsigned incompat_features;
  unsigned header_length;  /* size of aio_ring */

  /* C99 flexible array member instead of the GNU zero-length array,
   * sizeof (struct aio_ring) and the member offset stay the same */
  struct io_event io_events[];
};
120
+
121
+ /*
122
+ * define some syscall wrappers for common architectures
123
+ * this is mostly for nice looks during debugging, not performance.
124
+ * our syscalls return < 0, not == -1, on error. which is good
125
+ * enough for linux aio.
126
+ * TODO: arm is also common nowadays, maybe even mips and x86
127
+ * TODO: after implementing this, it suddenly looks like overkill, but it's hard to remove...
128
+ */
129
#if __GNUC__ && __linux && ECB_AMD64 && !defined __OPTIMIZE_SIZE__
  /* the costly errno access probably kills this for size optimisation */

  /*
   * ev_syscall (nr, narg, arg1..arg5): issue syscall nr through the amd64
   * "syscall" instruction. only the first narg arguments are loaded, into
   * rdi, rsi, rdx, r10 and r8 respectively; the other argument registers
   * are handed over unset. the expression evaluates to the value returned
   * in rax, and errno is unconditionally set to the negated result.
   */
  #define ev_syscall(nr,narg,arg1,arg2,arg3,arg4,arg5)                          \
    ({                                                                          \
      long sys_res;                                                             \
      register unsigned long reg_a5 __asm__ ("r8" );                            \
      register unsigned long reg_a4 __asm__ ("r10");                            \
      register unsigned long reg_a3 __asm__ ("rdx");                            \
      register unsigned long reg_a2 __asm__ ("rsi");                            \
      register unsigned long reg_a1 __asm__ ("rdi");                            \
      if (narg > 4) reg_a5 = (unsigned long)(arg5);                             \
      if (narg > 3) reg_a4 = (unsigned long)(arg4);                             \
      if (narg > 2) reg_a3 = (unsigned long)(arg3);                             \
      if (narg > 1) reg_a2 = (unsigned long)(arg2);                             \
      if (narg > 0) reg_a1 = (unsigned long)(arg1);                             \
      __asm__ __volatile__ (                                                    \
        "syscall\n\t"                                                           \
        : "=a" (sys_res)                                                        \
        : "0" (nr), "r" (reg_a1), "r" (reg_a2), "r" (reg_a3), "r" (reg_a4), "r" (reg_a5) \
        : "cc", "r11", "cx", "memory");                                         \
      errno = -sys_res;                                                         \
      sys_res;                                                                  \
    })

#endif
155
+
156
/*
 * fixed-arity front ends, one per argument count.
 * when the inline-asm ev_syscall macro is available they forward to it,
 * passing the number of real arguments and padding unused slots with 0.
 * otherwise they call syscall () directly.
 *
 * fixes: ev_syscall0 was missing its closing parenthesis, and ev_syscall4
 * passed an argument count of 3, so arg4 was never loaded.
 */
#ifdef ev_syscall
  #define ev_syscall0(nr)                          ev_syscall (nr, 0,    0,    0,    0,    0,    0)
  #define ev_syscall1(nr,arg1)                     ev_syscall (nr, 1, arg1,    0,    0,    0,    0)
  #define ev_syscall2(nr,arg1,arg2)                ev_syscall (nr, 2, arg1, arg2,    0,    0,    0)
  #define ev_syscall3(nr,arg1,arg2,arg3)           ev_syscall (nr, 3, arg1, arg2, arg3,    0,    0)
  #define ev_syscall4(nr,arg1,arg2,arg3,arg4)      ev_syscall (nr, 4, arg1, arg2, arg3, arg4,    0)
  #define ev_syscall5(nr,arg1,arg2,arg3,arg4,arg5) ev_syscall (nr, 5, arg1, arg2, arg3, arg4, arg5)
#else
  #define ev_syscall0(nr)                          syscall (nr)
  #define ev_syscall1(nr,arg1)                     syscall (nr, arg1)
  #define ev_syscall2(nr,arg1,arg2)                syscall (nr, arg1, arg2)
  #define ev_syscall3(nr,arg1,arg2,arg3)           syscall (nr, arg1, arg2, arg3)
  #define ev_syscall4(nr,arg1,arg2,arg3,arg4)      syscall (nr, arg1, arg2, arg3, arg4)
  #define ev_syscall5(nr,arg1,arg2,arg3,arg4,arg5) syscall (nr, arg1, arg2, arg3, arg4, arg5)
#endif
171
+
172
+ inline_size
173
+ int
174
+ evsys_io_setup (unsigned nr_events, aio_context_t *ctx_idp)
175
+ {
176
+ return ev_syscall2 (SYS_io_setup, nr_events, ctx_idp);
177
+ }
178
+
179
+ inline_size
180
+ int
181
+ evsys_io_destroy (aio_context_t ctx_id)
182
+ {
183
+ return ev_syscall1 (SYS_io_destroy, ctx_id);
184
+ }
185
+
186
+ inline_size
187
+ int
188
+ evsys_io_submit (aio_context_t ctx_id, long nr, struct iocb *cbp[])
189
+ {
190
+ return ev_syscall3 (SYS_io_submit, ctx_id, nr, cbp);
191
+ }
192
+
193
+ inline_size
194
+ int
195
+ evsys_io_cancel (aio_context_t ctx_id, struct iocb *cbp, struct io_event *result)
196
+ {
197
+ return ev_syscall3 (SYS_io_cancel, ctx_id, cbp, result);
198
+ }
199
+
200
+ inline_size
201
+ int
202
+ evsys_io_getevents (aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout)
203
+ {
204
+ return ev_syscall5 (SYS_io_getevents, ctx_id, min_nr, nr, events, timeout);
205
+ }
206
+
207
+ /*****************************************************************************/
208
+ /* actual backend implementation */
209
+
210
+ ecb_cold
211
+ static int
212
+ linuxaio_nr_events (EV_P)
213
+ {
214
+ /* we start with 16 iocbs and incraese from there
215
+ * that's tiny, but the kernel has a rather low system-wide
216
+ * limit that can be reached quickly, so let's be parsimonious
217
+ * with this resource.
218
+ * Rest assured, the kernel generously rounds up small and big numbers
219
+ * in different ways (but doesn't seem to charge you for it).
220
+ * The 15 here is because the kernel usually has a power of two as aio-max-nr,
221
+ * and this helps to take advantage of that limit.
222
+ */
223
+
224
+ /* we try to fill 4kB pages exactly.
225
+ * the ring buffer header is 32 bytes, every io event is 32 bytes.
226
+ * the kernel takes the io requests number, doubles it, adds 2
227
+ * and adds the ring buffer.
228
+ * the way we use this is by starting low, and then roughly doubling the
229
+ * size each time we hit a limit.
230
+ */
231
+
232
+ int requests = 15 << linuxaio_iteration;
233
+ int one_page = (4096
234
+ / sizeof (struct io_event) ) / 2; /* how many fit into one page */
235
+ int first_page = ((4096 - sizeof (struct aio_ring))
236
+ / sizeof (struct io_event) - 2) / 2; /* how many fit into the first page */
237
+
238
+ /* if everything fits into one page, use count exactly */
239
+ if (requests > first_page)
240
+ /* otherwise, round down to full pages and add the first page */
241
+ requests = requests / one_page * one_page + first_page;
242
+
243
+ return requests;
244
+ }
245
+
246
/*
 * we use our own wrapper structure in case we ever want to do something
 * "clever". for now it holds nothing but the kernel iocb itself.
 */
typedef struct aniocb
{
  struct iocb io; /* the request as handed to io_submit/io_cancel */
} *ANIOCBP;
252
+
253
+ inline_size
254
+ void
255
+ linuxaio_array_needsize_iocbp (ANIOCBP *base, int offset, int count)
256
+ {
257
+ while (count--)
258
+ {
259
+ /* TODO: quite the overhead to allocate every iocb separately, maybe use our own allocator? */
260
+ ANIOCBP iocb = (ANIOCBP)ev_malloc (sizeof (*iocb));
261
+
262
+ /* full zero initialise is probably not required at the moment, but
263
+ * this is not well documented, so we better do it.
264
+ */
265
+ memset (iocb, 0, sizeof (*iocb));
266
+
267
+ iocb->io.aio_lio_opcode = IOCB_CMD_POLL;
268
+ iocb->io.aio_data = offset;
269
+ iocb->io.aio_fildes = offset;
270
+
271
+ base [offset++] = iocb;
272
+ }
273
+ }
274
+
275
+ ecb_cold
276
+ static void
277
+ linuxaio_free_iocbp (EV_P)
278
+ {
279
+ while (linuxaio_iocbpmax--)
280
+ ev_free (linuxaio_iocbps [linuxaio_iocbpmax]);
281
+
282
+ linuxaio_iocbpmax = 0; /* next resize will completely reallocate the array, at some overhead */
283
+ }
284
+
285
+ static void
286
+ linuxaio_modify (EV_P_ int fd, int oev, int nev)
287
+ {
288
+ array_needsize (ANIOCBP, linuxaio_iocbps, linuxaio_iocbpmax, fd + 1, linuxaio_array_needsize_iocbp);
289
+ ANIOCBP iocb = linuxaio_iocbps [fd];
290
+
291
+ if (iocb->io.aio_reqprio < 0)
292
+ {
293
+ /* we handed this fd over to epoll, so undo this first */
294
+ /* we do it manually because the optimisations on epoll_modify won't do us any good */
295
+ epoll_ctl (backend_fd, EPOLL_CTL_DEL, fd, 0);
296
+ anfds [fd].emask = 0;
297
+ iocb->io.aio_reqprio = 0;
298
+ }
299
+
300
+ if (iocb->io.aio_buf)
301
+ {
302
+ evsys_io_cancel (linuxaio_ctx, &iocb->io, (struct io_event *)0);
303
+ /* on relevant kernels, io_cancel fails with EINPROGRES if everything is fine */
304
+ assert (("libev: linuxaio unexpected io_cancel failed", errno == EINPROGRESS));
305
+ }
306
+
307
+ if (nev)
308
+ {
309
+ iocb->io.aio_buf =
310
+ (nev & EV_READ ? POLLIN : 0)
311
+ | (nev & EV_WRITE ? POLLOUT : 0);
312
+
313
+ /* queue iocb up for io_submit */
314
+ /* this assumes we only ever get one call per fd per loop iteration */
315
+ ++linuxaio_submitcnt;
316
+ array_needsize (struct iocb *, linuxaio_submits, linuxaio_submitmax, linuxaio_submitcnt, array_needsize_noinit);
317
+ linuxaio_submits [linuxaio_submitcnt - 1] = &iocb->io;
318
+ }
319
+ }
320
+
321
+ static void
322
+ linuxaio_epoll_cb (EV_P_ struct ev_io *w, int revents)
323
+ {
324
+ epoll_poll (EV_A_ 0);
325
+ }
326
+
327
+ inline_speed
328
+ void
329
+ linuxaio_fd_rearm (EV_P_ int fd)
330
+ {
331
+ anfds [fd].events = 0;
332
+ linuxaio_iocbps [fd]->io.aio_buf = 0;
333
+ fd_change (EV_A_ fd, EV_ANFD_REIFY);
334
+ }
335
+
336
+ static void
337
+ linuxaio_parse_events (EV_P_ struct io_event *ev, int nr)
338
+ {
339
+ while (nr)
340
+ {
341
+ int fd = ev->data;
342
+ int res = ev->res;
343
+
344
+ assert (("libev: iocb fd must be in-bounds", fd >= 0 && fd < anfdmax));
345
+
346
+ /* feed events, we do not expect or handle POLLNVAL */
347
+ fd_event (
348
+ EV_A_
349
+ fd,
350
+ (res & (POLLOUT | POLLERR | POLLHUP) ? EV_WRITE : 0)
351
+ | (res & (POLLIN | POLLERR | POLLHUP) ? EV_READ : 0)
352
+ );
353
+
354
+ /* linux aio is oneshot: rearm fd. TODO: this does more work than strictly needed */
355
+ linuxaio_fd_rearm (EV_A_ fd);
356
+
357
+ --nr;
358
+ ++ev;
359
+ }
360
+ }
361
+
362
+ /* get any events from ring buffer, return true if any were handled */
363
+ static int
364
+ linuxaio_get_events_from_ring (EV_P)
365
+ {
366
+ struct aio_ring *ring = (struct aio_ring *)linuxaio_ctx;
367
+
368
+ /* the kernel reads and writes both of these variables, */
369
+ /* as a C extension, we assume that volatile use here */
370
+ /* both makes reads atomic and once-only */
371
+ unsigned head = *(volatile unsigned *)&ring->head;
372
+ unsigned tail = *(volatile unsigned *)&ring->tail;
373
+
374
+ if (head == tail)
375
+ return 0;
376
+
377
+ /* make sure the events up to tail are visible */
378
+ ECB_MEMORY_FENCE_ACQUIRE;
379
+
380
+ /* parse all available events, but only once, to avoid starvation */
381
+ if (tail > head) /* normal case around */
382
+ linuxaio_parse_events (EV_A_ ring->io_events + head, tail - head);
383
+ else /* wrapped around */
384
+ {
385
+ linuxaio_parse_events (EV_A_ ring->io_events + head, ring->nr - head);
386
+ linuxaio_parse_events (EV_A_ ring->io_events, tail);
387
+ }
388
+
389
+ ECB_MEMORY_FENCE_RELEASE;
390
+ /* as an extension to C, we hope that the volatile will make this atomic and once-only */
391
+ *(volatile unsigned *)&ring->head = tail;
392
+
393
+ return 1;
394
+ }
395
+
396
+ inline_size
397
+ int
398
+ linuxaio_ringbuf_valid (EV_P)
399
+ {
400
+ struct aio_ring *ring = (struct aio_ring *)linuxaio_ctx;
401
+
402
+ return expect_true (ring->magic == AIO_RING_MAGIC)
403
+ && ring->incompat_features == EV_AIO_RING_INCOMPAT_FEATURES
404
+ && ring->header_length == sizeof (struct aio_ring); /* TODO: or use it to find io_event[0]? */
405
+ }
406
+
407
+ /* read at least one event from kernel, or timeout */
408
+ inline_size
409
+ void
410
+ linuxaio_get_events (EV_P_ ev_tstamp timeout)
411
+ {
412
+ struct timespec ts;
413
+ struct io_event ioev[8]; /* 256 octet stack space */
414
+ int want = 1; /* how many events to request */
415
+ int ringbuf_valid = linuxaio_ringbuf_valid (EV_A);
416
+
417
+ if (expect_true (ringbuf_valid))
418
+ {
419
+ /* if the ring buffer has any events, we don't wait or call the kernel at all */
420
+ if (linuxaio_get_events_from_ring (EV_A))
421
+ return;
422
+
423
+ /* if the ring buffer is empty, and we don't have a timeout, then don't call the kernel */
424
+ if (!timeout)
425
+ return;
426
+ }
427
+ else
428
+ /* no ringbuffer, request slightly larger batch */
429
+ want = sizeof (ioev) / sizeof (ioev [0]);
430
+
431
+ /* no events, so wait for some
432
+ * for fairness reasons, we do this in a loop, to fetch all events
433
+ */
434
+ for (;;)
435
+ {
436
+ int res;
437
+
438
+ EV_RELEASE_CB;
439
+
440
+ ts.tv_sec = (long)timeout;
441
+ ts.tv_nsec = (long)((timeout - ts.tv_sec) * 1e9);
442
+
443
+ res = evsys_io_getevents (linuxaio_ctx, 1, want, ioev, &ts);
444
+
445
+ EV_ACQUIRE_CB;
446
+
447
+ if (res < 0)
448
+ if (errno == EINTR)
449
+ /* ignored, retry */;
450
+ else
451
+ ev_syserr ("(libev) linuxaio io_getevents");
452
+ else if (res)
453
+ {
454
+ /* at least one event available, handle them */
455
+ linuxaio_parse_events (EV_A_ ioev, res);
456
+
457
+ if (expect_true (ringbuf_valid))
458
+ {
459
+ /* if we have a ring buffer, handle any remaining events in it */
460
+ linuxaio_get_events_from_ring (EV_A);
461
+
462
+ /* at this point, we should have handled all outstanding events */
463
+ break;
464
+ }
465
+ else if (res < want)
466
+ /* otherwise, if there were fewere events than we wanted, we assume there are no more */
467
+ break;
468
+ }
469
+ else
470
+ break; /* no events from the kernel, we are done */
471
+
472
+ timeout = 0; /* only wait in the first iteration */
473
+ }
474
+ }
475
+
476
+ inline_size
477
+ int
478
+ linuxaio_io_setup (EV_P)
479
+ {
480
+ linuxaio_ctx = 0;
481
+ return evsys_io_setup (linuxaio_nr_events (EV_A), &linuxaio_ctx);
482
+ }
483
+
484
+ static void
485
+ linuxaio_poll (EV_P_ ev_tstamp timeout)
486
+ {
487
+ int submitted;
488
+
489
+ /* first phase: submit new iocbs */
490
+
491
+ /* io_submit might return less than the requested number of iocbs */
492
+ /* this is, afaics, only because of errors, but we go by the book and use a loop, */
493
+ /* which allows us to pinpoint the erroneous iocb */
494
+ for (submitted = 0; submitted < linuxaio_submitcnt; )
495
+ {
496
+ int res = evsys_io_submit (linuxaio_ctx, linuxaio_submitcnt - submitted, linuxaio_submits + submitted);
497
+
498
+ if (expect_false (res < 0))
499
+ if (errno == EINVAL)
500
+ {
501
+ /* This happens for unsupported fds, officially, but in my testing,
502
+ * also randomly happens for supported fds. We fall back to good old
503
+ * poll() here, under the assumption that this is a very rare case.
504
+ * See https://lore.kernel.org/patchwork/patch/1047453/ to see
505
+ * discussion about such a case (ttys) where polling for POLLIN
506
+ * fails but POLLIN|POLLOUT works.
507
+ */
508
+ struct iocb *iocb = linuxaio_submits [submitted];
509
+ epoll_modify (EV_A_ iocb->aio_fildes, 0, anfds [iocb->aio_fildes].events);
510
+ iocb->aio_reqprio = -1; /* mark iocb as epoll */
511
+
512
+ res = 1; /* skip this iocb - another iocb, another chance */
513
+ }
514
+ else if (errno == EAGAIN)
515
+ {
516
+ /* This happens when the ring buffer is full, or some other shit we
517
+ * don't know and isn't documented. Most likely because we have too
518
+ * many requests and linux aio can't be assed to handle them.
519
+ * In this case, we try to allocate a larger ring buffer, freeing
520
+ * ours first. This might fail, in which case we have to fall back to 100%
521
+ * epoll.
522
+ * God, how I hate linux not getting its act together. Ever.
523
+ */
524
+ evsys_io_destroy (linuxaio_ctx);
525
+ linuxaio_submitcnt = 0;
526
+
527
+ /* rearm all fds with active iocbs */
528
+ {
529
+ int fd;
530
+ for (fd = 0; fd < linuxaio_iocbpmax; ++fd)
531
+ if (linuxaio_iocbps [fd]->io.aio_buf)
532
+ linuxaio_fd_rearm (EV_A_ fd);
533
+ }
534
+
535
+ ++linuxaio_iteration;
536
+ if (linuxaio_io_setup (EV_A) < 0)
537
+ {
538
+ /* to bad, we can't get a new aio context, go 100% epoll */
539
+ linuxaio_free_iocbp (EV_A);
540
+ ev_io_stop (EV_A_ &linuxaio_epoll_w);
541
+ ev_ref (EV_A);
542
+ linuxaio_ctx = 0;
543
+ backend_modify = epoll_modify;
544
+ backend_poll = epoll_poll;
545
+ }
546
+
547
+ timeout = 0;
548
+ /* it's easiest to handle this mess in another iteration */
549
+ return;
550
+ }
551
+ else if (errno == EBADF)
552
+ {
553
+ assert (("libev: event loop rejected bad fd", errno != EBADF));
554
+ fd_kill (EV_A_ linuxaio_submits [submitted]->aio_fildes);
555
+
556
+ res = 1; /* skip this iocb */
557
+ }
558
+ else
559
+ ev_syserr ("(libev) linuxaio io_submit");
560
+
561
+ submitted += res;
562
+ }
563
+
564
+ linuxaio_submitcnt = 0;
565
+
566
+ /* second phase: fetch and parse events */
567
+
568
+ linuxaio_get_events (EV_A_ timeout);
569
+ }
570
+
571
+ inline_size
572
+ int
573
+ linuxaio_init (EV_P_ int flags)
574
+ {
575
+ /* would be great to have a nice test for IOCB_CMD_POLL instead */
576
+ /* also: test some semi-common fd types, such as files and ttys in recommended_backends */
577
+ /* 4.18 introduced IOCB_CMD_POLL, 4.19 made epoll work, and we need that */
578
+ if (ev_linux_version () < 0x041300)
579
+ return 0;
580
+
581
+ if (!epoll_init (EV_A_ 0))
582
+ return 0;
583
+
584
+ linuxaio_iteration = 0;
585
+
586
+ if (linuxaio_io_setup (EV_A) < 0)
587
+ {
588
+ epoll_destroy (EV_A);
589
+ return 0;
590
+ }
591
+
592
+ ev_io_init (EV_A_ &linuxaio_epoll_w, linuxaio_epoll_cb, backend_fd, EV_READ);
593
+ ev_set_priority (&linuxaio_epoll_w, EV_MAXPRI);
594
+ ev_io_start (EV_A_ &linuxaio_epoll_w);
595
+ ev_unref (EV_A); /* watcher should not keep loop alive */
596
+
597
+ backend_modify = linuxaio_modify;
598
+ backend_poll = linuxaio_poll;
599
+
600
+ linuxaio_iocbpmax = 0;
601
+ linuxaio_iocbps = 0;
602
+
603
+ linuxaio_submits = 0;
604
+ linuxaio_submitmax = 0;
605
+ linuxaio_submitcnt = 0;
606
+
607
+ return EVBACKEND_LINUXAIO;
608
+ }
609
+
610
+ inline_size
611
+ void
612
+ linuxaio_destroy (EV_P)
613
+ {
614
+ epoll_destroy (EV_A);
615
+ linuxaio_free_iocbp (EV_A);
616
+ evsys_io_destroy (linuxaio_ctx); /* fails in child, aio context is destroyed */
617
+ }
618
+
619
+ inline_size
620
+ void
621
+ linuxaio_fork (EV_P)
622
+ {
623
+ /* this frees all iocbs, which is very heavy-handed */
624
+ linuxaio_destroy (EV_A);
625
+ linuxaio_submitcnt = 0; /* all pointers were invalidated */
626
+
627
+ linuxaio_iteration = 0; /* we start over in the child */
628
+
629
+ while (linuxaio_io_setup (EV_A) < 0)
630
+ ev_syserr ("(libev) linuxaio io_setup");
631
+
632
+ /* forking epoll should also effectively unregister all fds from the backend */
633
+ epoll_fork (EV_A);
634
+
635
+ ev_io_stop (EV_A_ &linuxaio_epoll_w);
636
+ ev_io_set (EV_A_ &linuxaio_epoll_w, backend_fd, EV_READ);
637
+ ev_io_start (EV_A_ &linuxaio_epoll_w);
638
+
639
+ /* epoll_fork already did this. hopefully */
640
+ /*fd_rearm_all (EV_A);*/
641
+ }
642
+