polyphony 0.85 → 0.86

Sign up to get free protection for your applications and to get access to all the features.
Files changed (230) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/Gemfile.lock +1 -1
  4. data/ext/polyphony/io_extensions.c +2 -3
  5. data/lib/polyphony/version.rb +1 -1
  6. data/polyphony.gemspec +1 -1
  7. data/test/test_backend.rb +1 -1
  8. data/test/test_signal.rb +3 -3
  9. data/vendor/liburing/.github/pull_request_template.md +86 -0
  10. data/vendor/liburing/.github/workflows/build.yml +85 -0
  11. data/vendor/liburing/.github/workflows/shellcheck.yml +20 -0
  12. data/vendor/liburing/.gitignore +149 -0
  13. data/vendor/liburing/COPYING +502 -0
  14. data/vendor/liburing/COPYING.GPL +339 -0
  15. data/vendor/liburing/LICENSE +7 -0
  16. data/vendor/liburing/Makefile +82 -0
  17. data/vendor/liburing/Makefile.common +5 -0
  18. data/vendor/liburing/Makefile.quiet +11 -0
  19. data/vendor/liburing/README +46 -0
  20. data/vendor/liburing/configure +486 -0
  21. data/vendor/liburing/debian/README.Debian +7 -0
  22. data/vendor/liburing/debian/changelog +27 -0
  23. data/vendor/liburing/debian/compat +1 -0
  24. data/vendor/liburing/debian/control +48 -0
  25. data/vendor/liburing/debian/copyright +49 -0
  26. data/vendor/liburing/debian/liburing-dev.install +4 -0
  27. data/vendor/liburing/debian/liburing-dev.manpages +6 -0
  28. data/vendor/liburing/debian/liburing1-udeb.install +1 -0
  29. data/vendor/liburing/debian/liburing1.install +1 -0
  30. data/vendor/liburing/debian/liburing1.symbols +32 -0
  31. data/vendor/liburing/debian/patches/series +1 -0
  32. data/vendor/liburing/debian/rules +81 -0
  33. data/vendor/liburing/debian/source/format +1 -0
  34. data/vendor/liburing/debian/source/local-options +2 -0
  35. data/vendor/liburing/debian/source/options +1 -0
  36. data/vendor/liburing/debian/watch +3 -0
  37. data/vendor/liburing/examples/Makefile +38 -0
  38. data/vendor/liburing/examples/io_uring-cp.c +282 -0
  39. data/vendor/liburing/examples/io_uring-test.c +112 -0
  40. data/vendor/liburing/examples/link-cp.c +193 -0
  41. data/vendor/liburing/examples/ucontext-cp.c +273 -0
  42. data/vendor/liburing/liburing.pc.in +12 -0
  43. data/vendor/liburing/liburing.spec +66 -0
  44. data/vendor/liburing/make-debs.sh +53 -0
  45. data/vendor/liburing/man/io_uring.7 +754 -0
  46. data/vendor/liburing/man/io_uring_cq_advance.3 +35 -0
  47. data/vendor/liburing/man/io_uring_cq_ready.3 +25 -0
  48. data/vendor/liburing/man/io_uring_cqe_get_data.3 +34 -0
  49. data/vendor/liburing/man/io_uring_cqe_seen.3 +32 -0
  50. data/vendor/liburing/man/io_uring_enter.2 +1483 -0
  51. data/vendor/liburing/man/io_uring_free_probe.3 +24 -0
  52. data/vendor/liburing/man/io_uring_get_probe.3 +29 -0
  53. data/vendor/liburing/man/io_uring_get_sqe.3 +38 -0
  54. data/vendor/liburing/man/io_uring_opcode_supported.3 +29 -0
  55. data/vendor/liburing/man/io_uring_prep_msg_ring.3 +58 -0
  56. data/vendor/liburing/man/io_uring_prep_read.3 +50 -0
  57. data/vendor/liburing/man/io_uring_prep_read_fixed.3 +54 -0
  58. data/vendor/liburing/man/io_uring_prep_readv.3 +51 -0
  59. data/vendor/liburing/man/io_uring_prep_readv2.3 +79 -0
  60. data/vendor/liburing/man/io_uring_prep_write.3 +50 -0
  61. data/vendor/liburing/man/io_uring_prep_write_fixed.3 +54 -0
  62. data/vendor/liburing/man/io_uring_prep_writev.3 +51 -0
  63. data/vendor/liburing/man/io_uring_prep_writev2.3 +78 -0
  64. data/vendor/liburing/man/io_uring_queue_exit.3 +27 -0
  65. data/vendor/liburing/man/io_uring_queue_init.3 +44 -0
  66. data/vendor/liburing/man/io_uring_register.2 +688 -0
  67. data/vendor/liburing/man/io_uring_register_buffers.3 +41 -0
  68. data/vendor/liburing/man/io_uring_register_files.3 +35 -0
  69. data/vendor/liburing/man/io_uring_setup.2 +534 -0
  70. data/vendor/liburing/man/io_uring_sq_ready.3 +25 -0
  71. data/vendor/liburing/man/io_uring_sq_space_left.3 +25 -0
  72. data/vendor/liburing/man/io_uring_sqe_set_data.3 +30 -0
  73. data/vendor/liburing/man/io_uring_sqe_set_flags.3 +60 -0
  74. data/vendor/liburing/man/io_uring_sqring_wait.3 +30 -0
  75. data/vendor/liburing/man/io_uring_submit.3 +29 -0
  76. data/vendor/liburing/man/io_uring_submit_and_wait.3 +34 -0
  77. data/vendor/liburing/man/io_uring_submit_and_wait_timeout.3 +49 -0
  78. data/vendor/liburing/man/io_uring_unregister_buffers.3 +26 -0
  79. data/vendor/liburing/man/io_uring_unregister_files.3 +26 -0
  80. data/vendor/liburing/man/io_uring_wait_cqe.3 +33 -0
  81. data/vendor/liburing/man/io_uring_wait_cqe_nr.3 +36 -0
  82. data/vendor/liburing/man/io_uring_wait_cqe_timeout.3 +39 -0
  83. data/vendor/liburing/man/io_uring_wait_cqes.3 +46 -0
  84. data/vendor/liburing/src/Makefile +89 -0
  85. data/vendor/liburing/src/arch/aarch64/syscall.h +95 -0
  86. data/vendor/liburing/src/arch/generic/lib.h +21 -0
  87. data/vendor/liburing/src/arch/generic/syscall.h +87 -0
  88. data/vendor/liburing/src/arch/syscall-defs.h +67 -0
  89. data/vendor/liburing/src/arch/x86/lib.h +32 -0
  90. data/vendor/liburing/src/arch/x86/syscall.h +160 -0
  91. data/vendor/liburing/src/include/liburing/barrier.h +81 -0
  92. data/vendor/liburing/src/include/liburing/io_uring.h +442 -0
  93. data/vendor/liburing/src/include/liburing.h +921 -0
  94. data/vendor/liburing/src/int_flags.h +8 -0
  95. data/vendor/liburing/src/lib.h +57 -0
  96. data/vendor/liburing/src/liburing.map +53 -0
  97. data/vendor/liburing/src/nolibc.c +48 -0
  98. data/vendor/liburing/src/queue.c +403 -0
  99. data/vendor/liburing/src/register.c +293 -0
  100. data/vendor/liburing/src/setup.c +332 -0
  101. data/vendor/liburing/src/syscall.c +47 -0
  102. data/vendor/liburing/src/syscall.h +103 -0
  103. data/vendor/liburing/test/232c93d07b74-test.c +306 -0
  104. data/vendor/liburing/test/35fa71a030ca-test.c +329 -0
  105. data/vendor/liburing/test/500f9fbadef8-test.c +89 -0
  106. data/vendor/liburing/test/7ad0e4b2f83c-test.c +93 -0
  107. data/vendor/liburing/test/8a9973408177-test.c +106 -0
  108. data/vendor/liburing/test/917257daa0fe-test.c +53 -0
  109. data/vendor/liburing/test/Makefile +244 -0
  110. data/vendor/liburing/test/a0908ae19763-test.c +58 -0
  111. data/vendor/liburing/test/a4c0b3decb33-test.c +180 -0
  112. data/vendor/liburing/test/accept-link.c +254 -0
  113. data/vendor/liburing/test/accept-reuse.c +164 -0
  114. data/vendor/liburing/test/accept-test.c +79 -0
  115. data/vendor/liburing/test/accept.c +477 -0
  116. data/vendor/liburing/test/across-fork.c +283 -0
  117. data/vendor/liburing/test/b19062a56726-test.c +53 -0
  118. data/vendor/liburing/test/b5837bd5311d-test.c +77 -0
  119. data/vendor/liburing/test/ce593a6c480a-test.c +136 -0
  120. data/vendor/liburing/test/close-opath.c +122 -0
  121. data/vendor/liburing/test/config +10 -0
  122. data/vendor/liburing/test/connect.c +398 -0
  123. data/vendor/liburing/test/cq-full.c +96 -0
  124. data/vendor/liburing/test/cq-overflow.c +294 -0
  125. data/vendor/liburing/test/cq-peek-batch.c +102 -0
  126. data/vendor/liburing/test/cq-ready.c +94 -0
  127. data/vendor/liburing/test/cq-size.c +64 -0
  128. data/vendor/liburing/test/d4ae271dfaae-test.c +96 -0
  129. data/vendor/liburing/test/d77a67ed5f27-test.c +65 -0
  130. data/vendor/liburing/test/defer.c +307 -0
  131. data/vendor/liburing/test/double-poll-crash.c +185 -0
  132. data/vendor/liburing/test/drop-submit.c +92 -0
  133. data/vendor/liburing/test/eeed8b54e0df-test.c +114 -0
  134. data/vendor/liburing/test/empty-eownerdead.c +45 -0
  135. data/vendor/liburing/test/eventfd-disable.c +151 -0
  136. data/vendor/liburing/test/eventfd-reg.c +76 -0
  137. data/vendor/liburing/test/eventfd-ring.c +97 -0
  138. data/vendor/liburing/test/eventfd.c +112 -0
  139. data/vendor/liburing/test/exec-target.c +6 -0
  140. data/vendor/liburing/test/exit-no-cleanup.c +117 -0
  141. data/vendor/liburing/test/fadvise.c +202 -0
  142. data/vendor/liburing/test/fallocate.c +249 -0
  143. data/vendor/liburing/test/fc2a85cb02ef-test.c +131 -0
  144. data/vendor/liburing/test/file-register.c +858 -0
  145. data/vendor/liburing/test/file-update.c +173 -0
  146. data/vendor/liburing/test/file-verify.c +629 -0
  147. data/vendor/liburing/test/files-exit-hang-poll.c +128 -0
  148. data/vendor/liburing/test/files-exit-hang-timeout.c +134 -0
  149. data/vendor/liburing/test/fixed-link.c +90 -0
  150. data/vendor/liburing/test/fpos.c +252 -0
  151. data/vendor/liburing/test/fsync.c +224 -0
  152. data/vendor/liburing/test/hardlink.c +136 -0
  153. data/vendor/liburing/test/helpers.c +135 -0
  154. data/vendor/liburing/test/helpers.h +67 -0
  155. data/vendor/liburing/test/io-cancel.c +550 -0
  156. data/vendor/liburing/test/io_uring_enter.c +296 -0
  157. data/vendor/liburing/test/io_uring_register.c +676 -0
  158. data/vendor/liburing/test/io_uring_setup.c +192 -0
  159. data/vendor/liburing/test/iopoll.c +372 -0
  160. data/vendor/liburing/test/lfs-openat-write.c +119 -0
  161. data/vendor/liburing/test/lfs-openat.c +275 -0
  162. data/vendor/liburing/test/link-timeout.c +1107 -0
  163. data/vendor/liburing/test/link.c +496 -0
  164. data/vendor/liburing/test/link_drain.c +229 -0
  165. data/vendor/liburing/test/madvise.c +195 -0
  166. data/vendor/liburing/test/mkdir.c +108 -0
  167. data/vendor/liburing/test/msg-ring.c +234 -0
  168. data/vendor/liburing/test/multicqes_drain.c +387 -0
  169. data/vendor/liburing/test/nop-all-sizes.c +99 -0
  170. data/vendor/liburing/test/nop.c +115 -0
  171. data/vendor/liburing/test/open-close.c +261 -0
  172. data/vendor/liburing/test/openat2.c +308 -0
  173. data/vendor/liburing/test/personality.c +204 -0
  174. data/vendor/liburing/test/pipe-eof.c +83 -0
  175. data/vendor/liburing/test/pipe-reuse.c +105 -0
  176. data/vendor/liburing/test/poll-cancel-ton.c +135 -0
  177. data/vendor/liburing/test/poll-cancel.c +228 -0
  178. data/vendor/liburing/test/poll-link.c +230 -0
  179. data/vendor/liburing/test/poll-many.c +208 -0
  180. data/vendor/liburing/test/poll-mshot-update.c +273 -0
  181. data/vendor/liburing/test/poll-ring.c +48 -0
  182. data/vendor/liburing/test/poll-v-poll.c +353 -0
  183. data/vendor/liburing/test/poll.c +109 -0
  184. data/vendor/liburing/test/pollfree.c +426 -0
  185. data/vendor/liburing/test/probe.c +135 -0
  186. data/vendor/liburing/test/read-write.c +876 -0
  187. data/vendor/liburing/test/register-restrictions.c +633 -0
  188. data/vendor/liburing/test/rename.c +135 -0
  189. data/vendor/liburing/test/ring-leak.c +173 -0
  190. data/vendor/liburing/test/ring-leak2.c +249 -0
  191. data/vendor/liburing/test/rsrc_tags.c +449 -0
  192. data/vendor/liburing/test/runtests-loop.sh +16 -0
  193. data/vendor/liburing/test/runtests.sh +170 -0
  194. data/vendor/liburing/test/rw_merge_test.c +97 -0
  195. data/vendor/liburing/test/self.c +91 -0
  196. data/vendor/liburing/test/send_recv.c +286 -0
  197. data/vendor/liburing/test/send_recvmsg.c +345 -0
  198. data/vendor/liburing/test/sendmsg_fs_cve.c +200 -0
  199. data/vendor/liburing/test/shared-wq.c +84 -0
  200. data/vendor/liburing/test/short-read.c +75 -0
  201. data/vendor/liburing/test/shutdown.c +165 -0
  202. data/vendor/liburing/test/sigfd-deadlock.c +74 -0
  203. data/vendor/liburing/test/skip-cqe.c +429 -0
  204. data/vendor/liburing/test/socket-rw-eagain.c +158 -0
  205. data/vendor/liburing/test/socket-rw-offset.c +157 -0
  206. data/vendor/liburing/test/socket-rw.c +145 -0
  207. data/vendor/liburing/test/splice.c +512 -0
  208. data/vendor/liburing/test/sq-full-cpp.cc +45 -0
  209. data/vendor/liburing/test/sq-full.c +45 -0
  210. data/vendor/liburing/test/sq-poll-dup.c +204 -0
  211. data/vendor/liburing/test/sq-poll-kthread.c +169 -0
  212. data/vendor/liburing/test/sq-poll-share.c +137 -0
  213. data/vendor/liburing/test/sq-space_left.c +159 -0
  214. data/vendor/liburing/test/sqpoll-cancel-hang.c +157 -0
  215. data/vendor/liburing/test/sqpoll-disable-exit.c +196 -0
  216. data/vendor/liburing/test/sqpoll-exit-hang.c +78 -0
  217. data/vendor/liburing/test/sqpoll-sleep.c +69 -0
  218. data/vendor/liburing/test/statx.c +172 -0
  219. data/vendor/liburing/test/stdout.c +232 -0
  220. data/vendor/liburing/test/submit-link-fail.c +154 -0
  221. data/vendor/liburing/test/submit-reuse.c +239 -0
  222. data/vendor/liburing/test/symlink.c +116 -0
  223. data/vendor/liburing/test/teardowns.c +58 -0
  224. data/vendor/liburing/test/thread-exit.c +143 -0
  225. data/vendor/liburing/test/timeout-new.c +252 -0
  226. data/vendor/liburing/test/timeout-overflow.c +204 -0
  227. data/vendor/liburing/test/timeout.c +1523 -0
  228. data/vendor/liburing/test/unlink.c +112 -0
  229. data/vendor/liburing/test/wakeup-hang.c +162 -0
  230. metadata +223 -2
@@ -0,0 +1,754 @@
1
+ .\" Copyright (C) 2020 Shuveb Hussain <shuveb@gmail.com>
2
+ .\" SPDX-License-Identifier: LGPL-2.0-or-later
3
+ .\"
4
+
5
+ .TH IO_URING 7 2020-07-26 "Linux" "Linux Programmer's Manual"
6
+ .SH NAME
7
+ io_uring \- Asynchronous I/O facility
8
+ .SH SYNOPSIS
9
+ .nf
10
+ .B "#include <linux/io_uring.h>"
11
+ .fi
12
+ .PP
13
+ .SH DESCRIPTION
14
+ .PP
15
+ .B io_uring
16
+ is a Linux-specific API for asynchronous I/O.
17
+ It allows the user to submit one or more I/O requests,
18
+ which are processed asynchronously without blocking the calling process.
19
+ .B io_uring
20
+ gets its name from ring buffers which are shared between user space and
21
+ kernel space. This arrangement allows for efficient I/O,
22
+ while avoiding the overhead of copying buffers between them,
23
+ where possible.
24
+ This interface makes
25
+ .B io_uring
26
+ different from other UNIX I/O APIs,
27
+ wherein,
28
+ rather than just communicate between kernel and user space with system calls,
29
+ ring buffers are used as the main mode of communication.
30
+ This arrangement has various performance benefits which are discussed in a
31
+ separate section below.
32
+ This man page uses the terms shared buffers, shared ring buffers and
33
+ queues interchangeably.
34
+ .PP
35
+ The general programming model you need to follow for
36
+ .B io_uring
37
+ is outlined below
38
+ .IP \(bu
39
+ Set up shared buffers with
40
+ .BR io_uring_setup (2)
41
+ and
42
+ .BR mmap (2),
43
+ mapping into user space shared buffers for the submission queue (SQ) and the
44
+ completion queue (CQ).
45
+ You place I/O requests you want to make on the SQ,
46
+ while the kernel places the results of those operations on the CQ.
47
+ .IP \(bu
48
+ For every I/O request you need to make (like to read a file, write a file,
49
+ accept a socket connection, etc), you create a submission queue entry,
50
+ or SQE,
51
+ describe the I/O operation you need to get done and add it to the tail of
52
+ the submission queue (SQ).
53
+ Each I/O operation is,
54
+ in essence,
55
+ the equivalent of a system call you would have made otherwise,
56
+ if you were not using
57
+ .BR io_uring .
58
+ You can add more than one SQE to the queue depending on the number of
59
+ operations you want to request.
60
+ .IP \(bu
61
+ After you add one or more SQEs,
62
+ you need to call
63
+ .BR io_uring_enter (2)
64
+ to tell the kernel to dequeue your I/O requests off the SQ and begin
65
+ processing them.
66
+ .IP \(bu
67
+ For each SQE you submit,
68
+ once it is done processing the request,
69
+ the kernel places a completion queue event or CQE at the tail of the
70
+ completion queue or CQ.
71
+ The kernel places exactly one matching CQE in the CQ for every SQE you
72
+ submit on the SQ.
73
+ After you retrieve a CQE,
74
+ minimally,
75
+ you might be interested in checking the
76
+ .I res
77
+ field of the CQE structure,
78
+ which corresponds to the return value of the system
79
+ call's equivalent,
80
+ had you used it directly without using
81
+ .BR io_uring .
82
+ For instance,
83
+ a read operation under
84
+ .BR io_uring ,
85
+ started with the
86
+ .BR IORING_OP_READ
87
+ operation, issues the equivalent of the
88
+ .BR read (2)
89
+ system call. In practice, it mixes the semantics of
90
+ .BR pread (2)
91
+ and
92
+ .BR preadv2 (2)
93
+ in that it takes an explicit offset, and supports using -1 for the offset to
94
+ indicate that the current file position should be used instead of passing in
95
+ an explicit offset. See the opcode documentation for more details. Given that
96
+ io_uring is an async interface,
97
+ .I errno
98
+ is never used for passing back error information. Instead,
99
+ .I res
100
+ will contain what the equivalent system call would have returned in case
101
+ of success, and in case of error
102
+ .I res
103
+ will contain
104
+ .I -errno .
105
+ For example, if the normal read system call would have returned -1 and set
106
+ .I errno
107
+ to
108
+ .B EINVAL ,
109
+ then
110
+ .I res
111
+ would contain
112
+ .B -EINVAL .
113
+ If the normal system call would have returned a read size of 1024, then
114
+ .I res
115
+ would contain 1024.
116
+ .IP \(bu
117
+ Optionally,
118
+ .BR io_uring_enter (2)
119
+ can also wait for a specified number of requests to be processed by the kernel
120
+ before it returns.
121
+ If you specified a certain number of completions to wait for,
122
+ the kernel would have placed at least that many CQEs on the CQ,
123
+ which you can then readily read,
124
+ right after the return from
125
+ .BR io_uring_enter (2).
126
+ .IP \(bu
127
+ It is important to remember that I/O requests submitted to the kernel can
128
+ complete in any order.
129
+ It is not necessary for the kernel to process one request after another,
130
+ in the order you placed them.
131
+ Given that the interface is a ring,
132
+ the requests are attempted in order,
133
+ however that doesn't imply any sort of ordering on their completion.
134
+ When more than one request is in flight,
135
+ it is not possible to determine which one will complete first.
136
+ When you dequeue CQEs off the CQ,
137
+ you should always check which submitted request it corresponds to.
138
+ The most common method for doing so is utilizing the
139
+ .I user_data
140
+ field in the request, which is passed back on the completion side.
141
+ .PP
142
+ Adding to and reading from the queues:
143
+ .IP \(bu
144
+ You add SQEs to the tail of the SQ.
145
+ The kernel reads SQEs off the head of the queue.
146
+ .IP \(bu
147
+ The kernel adds CQEs to the tail of the CQ.
148
+ You read CQEs off the head of the queue.
149
+ .SS Submission queue polling
150
+ One of the goals of
151
+ .B io_uring
152
+ is to provide a means for efficient I/O.
153
+ To this end,
154
+ .B io_uring
155
+ supports a polling mode that lets you avoid the call to
156
+ .BR io_uring_enter (2),
157
+ which you use to inform the kernel that you have queued SQEs on to the SQ.
158
+ With SQ Polling,
159
+ .B io_uring
160
+ starts a kernel thread that polls the submission queue for any I/O
161
+ requests you submit by adding SQEs.
162
+ With SQ Polling enabled,
163
+ there is no need for you to call
164
+ .BR io_uring_enter (2),
165
+ letting you avoid the overhead of system calls.
166
+ A designated kernel thread dequeues SQEs off the SQ as you add them and
167
+ dispatches them for asynchronous processing.
168
+ .SS Setting up io_uring
169
+ .PP
170
+ The main steps in setting up
171
+ .B io_uring
172
+ consist of mapping in the shared buffers with
173
+ .BR mmap (2)
174
+ calls.
175
+ In the example program included in this man page,
176
+ the function
177
+ .BR app_setup_uring ()
178
+ sets up
179
+ .B io_uring
180
+ with a QUEUE_DEPTH deep submission queue.
181
+ Pay attention to the 2
182
+ .BR mmap (2)
183
+ calls that set up the shared submission and completion queues.
184
+ If your kernel is older than version 5.4,
185
+ three
186
+ .BR mmap (2)
187
+ calls are required.
188
+ .PP
189
+ .SS Submitting I/O requests
190
+ The process of submitting a request consists of describing the I/O
191
+ operation you need to get done using an
192
+ .B io_uring_sqe
193
+ structure instance.
194
+ These details describe the equivalent system call and its parameters.
195
+ Because the range of I/O operations Linux supports are very varied and the
196
+ .B io_uring_sqe
197
+ structure needs to be able to describe them,
198
+ it has several fields,
199
+ some packed into unions for space efficiency.
200
+ Here is a simplified version of struct
201
+ .B io_uring_sqe
202
+ with some of the most often used fields:
203
+ .PP
204
+ .in +4n
205
+ .EX
206
+ struct io_uring_sqe {
207
+ __u8 opcode; /* type of operation for this sqe */
208
+ __s32 fd; /* file descriptor to do IO on */
209
+ __u64 off; /* offset into file */
210
+ __u64 addr; /* pointer to buffer or iovecs */
211
+ __u32 len; /* buffer size or number of iovecs */
212
+ __u64 user_data; /* data to be passed back at completion time */
213
+ __u8 flags; /* IOSQE_ flags */
214
+ ...
215
+ };
216
+ .EE
217
+ .in
218
+
219
+ Here is struct
220
+ .B io_uring_sqe
221
+ in full:
222
+
223
+ .in +4n
224
+ .EX
225
+ struct io_uring_sqe {
226
+ __u8 opcode; /* type of operation for this sqe */
227
+ __u8 flags; /* IOSQE_ flags */
228
+ __u16 ioprio; /* ioprio for the request */
229
+ __s32 fd; /* file descriptor to do IO on */
230
+ union {
231
+ __u64 off; /* offset into file */
232
+ __u64 addr2;
233
+ };
234
+ union {
235
+ __u64 addr; /* pointer to buffer or iovecs */
236
+ __u64 splice_off_in;
237
+ };
238
+ __u32 len; /* buffer size or number of iovecs */
239
+ union {
240
+ __kernel_rwf_t rw_flags;
241
+ __u32 fsync_flags;
242
+ __u16 poll_events; /* compatibility */
243
+ __u32 poll32_events; /* word-reversed for BE */
244
+ __u32 sync_range_flags;
245
+ __u32 msg_flags;
246
+ __u32 timeout_flags;
247
+ __u32 accept_flags;
248
+ __u32 cancel_flags;
249
+ __u32 open_flags;
250
+ __u32 statx_flags;
251
+ __u32 fadvise_advice;
252
+ __u32 splice_flags;
253
+ };
254
+ __u64 user_data; /* data to be passed back at completion time */
255
+ union {
256
+ struct {
257
+ /* pack this to avoid bogus arm OABI complaints */
258
+ union {
259
+ /* index into fixed buffers, if used */
260
+ __u16 buf_index;
261
+ /* for grouped buffer selection */
262
+ __u16 buf_group;
263
+ } __attribute__((packed));
264
+ /* personality to use, if used */
265
+ __u16 personality;
266
+ __s32 splice_fd_in;
267
+ };
268
+ __u64 __pad2[3];
269
+ };
270
+ };
271
+ .EE
272
+ .in
273
+ .PP
274
+ To submit an I/O request to
275
+ .BR io_uring ,
276
+ you need to acquire a submission queue entry (SQE) from the submission
277
+ queue (SQ),
278
+ fill it up with details of the operation you want to submit and call
279
+ .BR io_uring_enter (2).
280
+ If you want to avoid calling
281
+ .BR io_uring_enter (2),
282
+ you have the option of setting up Submission Queue Polling.
283
+ .PP
284
+ SQEs are added to the tail of the submission queue.
285
+ The kernel picks up SQEs off the head of the SQ.
286
+ The general algorithm to get the next available SQE and update the tail is
287
+ as follows.
288
+ .PP
289
+ .in +4n
290
+ .EX
291
+ struct io_uring_sqe *sqe;
292
+ unsigned tail, index;
293
+ tail = *sqring->tail;
294
+ index = tail & (*sqring->ring_mask);
295
+ sqe = &sqring->sqes[index];
296
+ /* fill up details about this I/O request */
297
+ describe_io(sqe);
298
+ /* fill the sqe index into the SQ ring array */
299
+ sqring->array[index] = index;
300
+ tail++;
301
+ atomic_store_release(sqring->tail, tail);
302
+ .EE
303
+ .in
304
+ .PP
305
+ To get the index of an entry,
306
+ the application must mask the current tail index with the size mask of the
307
+ ring.
308
+ This holds true for both SQs and CQs.
309
+ Once the SQE is acquired,
310
+ the necessary fields are filled in,
311
+ describing the request.
312
+ While the CQ ring directly indexes the shared array of CQEs,
313
+ the submission side has an indirection array between them.
314
+ The submission side ring buffer is an index into this array,
315
+ which in turn contains the index into the SQEs.
316
+ .PP
317
+ The following code snippet demonstrates how a read operation,
318
+ an equivalent of a
319
+ .BR preadv2 (2)
320
+ system call is described by filling up an SQE with the necessary
321
+ parameters.
322
+ .PP
323
+ .in +4n
324
+ .EX
325
+ struct iovec iovecs[16];
326
+ ...
327
+ sqe->opcode = IORING_OP_READV;
328
+ sqe->fd = fd;
329
+ sqe->addr = (unsigned long) iovecs;
330
+ sqe->len = 16;
331
+ sqe->off = offset;
332
+ sqe->flags = 0;
333
+ .EE
334
+ .in
335
+ .TP
336
+ .B Memory ordering
337
+ Modern compilers and CPUs freely reorder reads and writes without
338
+ affecting the program's outcome to optimize performance.
339
+ Some aspects of this need to be kept in mind on SMP systems since
340
+ .B io_uring
341
+ involves buffers shared between kernel and user space.
342
+ These buffers are both visible and modifiable from kernel and user space.
343
+ As heads and tails belonging to these shared buffers are updated by kernel
344
+ and user space,
345
+ changes need to be coherently visible on either side,
346
+ irrespective of whether a CPU switch took place after the kernel-user mode
347
+ switch happened.
348
+ We use memory barriers to enforce this coherency.
349
+ Being significantly large subjects on their own,
350
+ memory barriers are out of scope for further discussion on this man page.
351
+ .TP
352
+ .B Letting the kernel know about I/O submissions
353
+ Once you place one or more SQEs on to the SQ,
354
+ you need to let the kernel know that you've done so.
355
+ You can do this by calling the
356
+ .BR io_uring_enter (2)
357
+ system call.
358
+ This system call is also capable of waiting for a specified count of
359
+ events to complete.
360
+ This way,
361
+ you can be sure to find completion events in the completion queue without
362
+ having to poll it for events later.
363
+ .SS Reading completion events
364
+ Similar to the submission queue (SQ),
365
+ the completion queue (CQ) is a shared buffer between the kernel and user
366
+ space.
367
+ Whereas you placed submission queue entries on the tail of the SQ and the
368
+ kernel read off the head,
369
+ when it comes to the CQ,
370
+ the kernel places completion queue events or CQEs on the tail of the CQ and
371
+ you read off its head.
372
+ .PP
373
+ Submission is flexible (and thus a bit more complicated) since it needs to
374
+ be able to encode different types of system calls that take various
375
+ parameters.
376
+ Completion,
377
+ on the other hand is simpler since we're looking only for a return value
378
+ back from the kernel.
379
+ This is easily understood by looking at the completion queue event
380
+ structure,
381
+ struct
382
+ .BR io_uring_cqe :
383
+ .PP
384
+ .in +4n
385
+ .EX
386
+ struct io_uring_cqe {
387
+ __u64 user_data; /* sqe->data submission passed back */
388
+ __s32 res; /* result code for this event */
389
+ __u32 flags;
390
+ };
391
+ .EE
392
+ .in
393
+ .PP
394
+ Here,
395
+ .I user_data
396
+ is custom data that is passed unchanged from submission to completion.
397
+ That is,
398
+ from SQEs to CQEs.
399
+ This field can be used to set context,
400
+ uniquely identifying submissions that got completed.
401
+ Given that I/O requests can complete in any order,
402
+ this field can be used to correlate a submission with a completion.
403
+ .I res
404
+ is the result from the system call that was performed as part of the
405
+ submission;
406
+ its return value.
407
+ The
408
+ .I flags
409
+ field could carry request-specific metadata in the future,
410
+ but is currently unused.
411
+ .PP
412
+ The general sequence to read completion events off the completion queue is
413
+ as follows:
414
+ .PP
415
+ .in +4n
416
+ .EX
417
+ unsigned head;
418
+ head = *cqring->head;
419
+ if (head != atomic_load_acquire(cqring->tail)) {
420
+ struct io_uring_cqe *cqe;
421
+ unsigned index;
422
+ index = head & (cqring->mask);
423
+ cqe = &cqring->cqes[index];
424
+ /* process completed CQE */
425
+ process_cqe(cqe);
426
+ /* CQE consumption complete */
427
+ head++;
428
+ }
429
+ atomic_store_release(cqring->head, head);
430
+ .EE
431
+ .in
432
+ .PP
433
+ It helps to be reminded that the kernel adds CQEs to the tail of the CQ,
434
+ while you need to dequeue them off the head.
435
+ To get the index of an entry at the head,
436
+ the application must mask the current head index with the size mask of the
437
+ ring.
438
+ Once the CQE has been consumed or processed,
439
+ the head needs to be updated to reflect the consumption of the CQE.
440
+ Attention should be paid to the read and write barriers to ensure
441
+ successful read and update of the head.
442
+ .SS io_uring performance
443
+ Because of the shared ring buffers between kernel and user space,
444
+ .B io_uring
445
+ can be a zero-copy system.
446
+ Copying buffers to and from becomes necessary when system calls that
447
+ transfer data between kernel and user space are involved.
448
+ But since the bulk of the communication in
449
+ .B io_uring
450
+ is via buffers shared between the kernel and user space,
451
+ this huge performance overhead is completely avoided.
452
+ .PP
453
+ While system calls may not seem like a significant overhead,
454
+ in high performance applications,
455
+ making a lot of them will begin to matter.
456
+ While workarounds the operating system has in place to deal with Spectre
457
+ and Meltdown are ideally best done away with,
458
+ unfortunately,
459
+ some of these workarounds are around the system call interface,
460
+ making system calls not as cheap as before on affected hardware.
461
+ While newer hardware should not need these workarounds,
462
+ hardware with these vulnerabilities can be expected to be in the wild for a
463
+ long time.
464
+ While using synchronous programming interfaces or even when using
465
+ asynchronous programming interfaces under Linux,
466
+ there is at least one system call involved in the submission of each
467
+ request.
468
+ In
469
+ .BR io_uring ,
470
+ on the other hand,
471
+ you can batch several requests in one go,
472
+ simply by queueing up multiple SQEs,
473
+ each describing an I/O operation you want and make a single call to
474
+ .BR io_uring_enter (2).
475
+ This is possible due to
476
+ .BR io_uring 's
477
+ shared buffers based design.
478
+ .PP
479
+ While this batching in itself can avoid the overhead associated with
480
+ potentially multiple and frequent system calls,
481
+ you can reduce even this overhead further with Submission Queue Polling,
482
+ by having the kernel poll and pick up your SQEs for processing as you add
483
+ them to the submission queue. This avoids the
484
+ .BR io_uring_enter (2)
485
+ call you need to make to tell the kernel to pick SQEs up.
486
+ For high-performance applications,
487
+ this means even lower system call overhead.
488
+ .SH CONFORMING TO
489
+ .B io_uring
490
+ is Linux-specific.
491
+ .SH EXAMPLES
492
+ The following example uses
493
+ .B io_uring
494
+ to copy stdin to stdout.
495
+ Using shell redirection,
496
+ you should be able to copy files with this example.
497
+ Because it uses a queue depth of only one,
498
+ this example processes I/O requests one after the other.
499
+ It is purposefully kept this way to aid understanding.
500
+ In real-world scenarios however,
501
+ you'll want to have a larger queue depth to parallelize I/O request
502
+ processing so as to gain the kind of performance benefits
503
+ .B io_uring
504
+ provides with its asynchronous processing of requests.
505
+ .PP
506
+ .EX
507
+ #include <stdio.h>
508
+ #include <stdlib.h>
509
+ #include <sys/stat.h>
510
+ #include <sys/ioctl.h>
511
+ #include <sys/syscall.h>
512
+ #include <sys/mman.h>
513
+ #include <sys/uio.h>
514
+ #include <linux/fs.h>
515
+ #include <fcntl.h>
516
+ #include <unistd.h>
517
+ #include <string.h>
518
+ #include <stdatomic.h>
519
+
520
+ #include <linux/io_uring.h>
521
+
522
+ #define QUEUE_DEPTH 1
523
+ #define BLOCK_SZ 1024
524
+
525
+ /* Macros for barriers needed by io_uring */
526
+ #define io_uring_smp_store_release(p, v) \\
527
+ atomic_store_explicit((_Atomic typeof(*(p)) *)(p), (v), \\
528
+ memory_order_release)
529
+ #define io_uring_smp_load_acquire(p) \\
530
+ atomic_load_explicit((_Atomic typeof(*(p)) *)(p), \\
531
+ memory_order_acquire)
532
+
533
+ int ring_fd;
534
+ unsigned *sring_tail, *sring_mask, *sring_array,
535
+ *cring_head, *cring_tail, *cring_mask;
536
+ struct io_uring_sqe *sqes;
537
+ struct io_uring_cqe *cqes;
538
+ char buff[BLOCK_SZ];
539
+ off_t offset;
540
+
541
+ /*
542
+ * System call wrappers provided since glibc does not yet
543
+ * provide wrappers for io_uring system calls.
544
+ * */
545
+
546
+ int io_uring_setup(unsigned entries, struct io_uring_params *p)
547
+ {
548
+ return (int) syscall(__NR_io_uring_setup, entries, p);
549
+ }
550
+
551
+ int io_uring_enter(int ring_fd, unsigned int to_submit,
552
+ unsigned int min_complete, unsigned int flags)
553
+ {
554
+ return (int) syscall(__NR_io_uring_enter, ring_fd, to_submit, min_complete,
555
+ flags, NULL, 0);
556
+ }
557
+
558
+ int app_setup_uring(void) {
559
+ struct io_uring_params p;
560
+ void *sq_ptr, *cq_ptr;
561
+
562
+ /* See io_uring_setup(2) for io_uring_params.flags you can set */
563
+ memset(&p, 0, sizeof(p));
564
+ ring_fd = io_uring_setup(QUEUE_DEPTH, &p);
565
+ if (ring_fd < 0) {
566
+ perror("io_uring_setup");
567
+ return 1;
568
+ }
569
+
570
+ /*
571
+ * io_uring communication happens via 2 shared kernel-user space ring
572
+ * buffers, which can be jointly mapped with a single mmap() call in
573
+ * kernels >= 5.4.
574
+ */
575
+
576
+ int sring_sz = p.sq_off.array + p.sq_entries * sizeof(unsigned);
577
+ int cring_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);
578
+
579
+ /* Rather than check for kernel version, the recommended way is to
580
+ * check the features field of the io_uring_params structure, which is a
581
+ * bitmask. If IORING_FEAT_SINGLE_MMAP is set, we can do away with the
582
+ * second mmap() call to map in the completion ring separately.
583
+ */
584
+ if (p.features & IORING_FEAT_SINGLE_MMAP) {
585
+ if (cring_sz > sring_sz)
586
+ sring_sz = cring_sz;
587
+ cring_sz = sring_sz;
588
+ }
589
+
590
+ /* Map in the submission and completion queue ring buffers.
591
+ * Kernels < 5.4 only map in the submission queue, though.
592
+ */
593
+ sq_ptr = mmap(0, sring_sz, PROT_READ | PROT_WRITE,
594
+ MAP_SHARED | MAP_POPULATE,
595
+ ring_fd, IORING_OFF_SQ_RING);
596
+ if (sq_ptr == MAP_FAILED) {
597
+ perror("mmap");
598
+ return 1;
599
+ }
600
+
601
+ if (p.features & IORING_FEAT_SINGLE_MMAP) {
602
+ cq_ptr = sq_ptr;
603
+ } else {
604
+ /* Map in the completion queue ring buffer in older kernels separately */
605
+ cq_ptr = mmap(0, cring_sz, PROT_READ | PROT_WRITE,
606
+ MAP_SHARED | MAP_POPULATE,
607
+ ring_fd, IORING_OFF_CQ_RING);
608
+ if (cq_ptr == MAP_FAILED) {
609
+ perror("mmap");
610
+ return 1;
611
+ }
612
+ }
613
+ /* Save useful fields for later easy reference */
614
+ sring_tail = sq_ptr + p.sq_off.tail;
615
+ sring_mask = sq_ptr + p.sq_off.ring_mask;
616
+ sring_array = sq_ptr + p.sq_off.array;
617
+
618
+ /* Map in the submission queue entries array */
619
+ sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe),
620
+ PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
621
+ ring_fd, IORING_OFF_SQES);
622
+ if (sqes == MAP_FAILED) {
623
+ perror("mmap");
624
+ return 1;
625
+ }
626
+
627
+ /* Save useful fields for later easy reference */
628
+ cring_head = cq_ptr + p.cq_off.head;
629
+ cring_tail = cq_ptr + p.cq_off.tail;
630
+ cring_mask = cq_ptr + p.cq_off.ring_mask;
631
+ cqes = cq_ptr + p.cq_off.cqes;
632
+
633
+ return 0;
634
+ }
635
+
636
+ /*
637
+ * Read from completion queue.
638
+ * In this function, we read completion events from the completion queue.
639
+ * We dequeue the CQE, update the head and return the result of the operation.
640
+ * */
641
+
642
+ int read_from_cq() {
643
+ struct io_uring_cqe *cqe;
644
+ unsigned head;
645
+
646
+ /* Read barrier */
647
+ head = io_uring_smp_load_acquire(cring_head);
648
+ /*
649
+ * Remember, this is a ring buffer. If head == tail, it means that the
650
+ * buffer is empty.
651
+ * */
652
+ if (head == *cring_tail)
653
+ return -1;
654
+
655
+ /* Get the entry */
656
+ cqe = &cqes[head & (*cring_mask)];
657
+ if (cqe->res < 0)
658
+ fprintf(stderr, "Error: %s\\n", strerror(abs(cqe->res)));
659
+
660
+ head++;
661
+
662
+ /* Write barrier so that updates to the head are made visible */
663
+ io_uring_smp_store_release(cring_head, head);
664
+
665
+ return cqe->res;
666
+ }
667
+
668
+ /*
669
+ * Submit a read or a write request to the submission queue.
670
+ * */
671
+
672
+ int submit_to_sq(int fd, int op) {
673
+ unsigned index, tail;
674
+
675
+ /* Add our submission queue entry to the tail of the SQE ring buffer */
676
+ tail = *sring_tail;
677
+ index = tail & *sring_mask;
678
+ struct io_uring_sqe *sqe = &sqes[index];
679
+ /* Fill in the parameters required for the read or write operation */
680
+ sqe->opcode = op;
681
+ sqe->fd = fd;
682
+ sqe->addr = (unsigned long) buff;
683
+ if (op == IORING_OP_READ) {
684
+ memset(buff, 0, sizeof(buff));
685
+ sqe->len = BLOCK_SZ;
686
+ }
687
+ else {
688
+ sqe->len = strlen(buff);
689
+ }
690
+ sqe->off = offset;
691
+
692
+ sring_array[index] = index;
693
+ tail++;
694
+
695
+ /* Update the tail */
696
+ io_uring_smp_store_release(sring_tail, tail);
697
+
698
+ /*
699
+ * Tell the kernel we have submitted events with the io_uring_enter() system
700
+ * call. We also pass in the IORING_ENTER_GETEVENTS flag which causes the
701
+ * io_uring_enter() call to wait until min_complete (the 3rd param) events
702
+ * complete.
703
+ * */
704
+ int ret = io_uring_enter(ring_fd, 1,1,
705
+ IORING_ENTER_GETEVENTS);
706
+ if(ret < 0) {
707
+ perror("io_uring_enter");
708
+ return -1;
709
+ }
710
+
711
+ return ret;
712
+ }
713
+
714
+ int main(int argc, char *argv[]) {
715
+ int res;
716
+
717
+ /* Setup io_uring for use */
718
+ if(app_setup_uring()) {
719
+ fprintf(stderr, "Unable to setup uring!\\n");
720
+ return 1;
721
+ }
722
+
723
+ /*
724
+ * A while loop that reads from stdin and writes to stdout.
725
+ * Breaks on EOF.
726
+ */
727
+ while (1) {
728
+ /* Initiate read from stdin and wait for it to complete */
729
+ submit_to_sq(STDIN_FILENO, IORING_OP_READ);
730
+ /* Read completion queue entry */
731
+ res = read_from_cq();
732
+ if (res > 0) {
733
+ /* Read successful. Write to stdout. */
734
+ submit_to_sq(STDOUT_FILENO, IORING_OP_WRITE);
735
+ read_from_cq();
736
+ } else if (res == 0) {
737
+ /* reached EOF */
738
+ break;
739
+ }
740
+ else if (res < 0) {
741
+ /* Error reading file */
742
+ fprintf(stderr, "Error: %s\\n", strerror(abs(res)));
743
+ break;
744
+ }
745
+ offset += res;
746
+ }
747
+
748
+ return 0;
749
+ }
750
+ .EE
751
+ .SH SEE ALSO
752
+ .BR io_uring_enter (2)
753
+ .BR io_uring_register (2)
754
+ .BR io_uring_setup (2)