uringmachine 0.22.0 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -2
- data/TODO.md +24 -138
- data/benchmark/README.md +69 -103
- data/benchmark/bm_io_pipe.rb +14 -0
- data/benchmark/chart_all.png +0 -0
- data/benchmark/common.rb +9 -0
- data/benchmark/read_each.rb +83 -0
- data/benchmark/send.rb +31 -36
- data/ext/um/extconf.rb +7 -1
- data/ext/um/um.c +131 -8
- data/ext/um/um.h +8 -0
- data/ext/um/um_class.c +34 -0
- data/ext/um/um_const.c +0 -2
- data/ext/um/um_op.c +20 -2
- data/ext/um/um_utils.c +27 -0
- data/grant-2025/journal.md +2 -2
- data/grant-2025/tasks.md +8 -13
- data/lib/uringmachine/version.rb +1 -1
- data/test/helper.rb +5 -4
- data/test/test_fiber_scheduler.rb +1 -17
- data/test/test_um.rb +299 -62
- data/vendor/liburing/configure +4 -2
- data/vendor/liburing/src/Makefile +1 -0
- data/vendor/liburing/test/min-timeout-wait.c +57 -2
- data/vendor/liburing/test/min-timeout.c +22 -0
- metadata +4 -3
- data/benchmark/chart.png +0 -0
checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b185b9cafdee3930061ed7101a12ccb500a8f131a9715a6a1268b22507ec2d85
+  data.tar.gz: c2d0fe4aced8f2340b2cdd29cf9540b7075198be56a0e695f58b9fbab5fae65c
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 75b0fe0a71242d728cbe1901457e41a11255c1922eca6501e3fa4286a7bd89a01ad7e5625baa24721ad63efc16da24a4b4d8ffeb85114be8fbe910066a0b033e
+  data.tar.gz: 64afeb65cc42c5b5c3af30dc361f2917a6341fd38e645114c48972bb6aa6617f496be6de040dfec6eefa102ae1d933cd726549890a9d57369f465be3b6df613f
data/CHANGELOG.md CHANGED

@@ -1,3 +1,16 @@
+
+# 0.23.0 2025-12-16
+
+- Add `UM#accept_into_queue`, fix `#accept_each` to throw on error
+- Use Set instead of Hash for holding pending fibers
+- Add `UM#writev`, `UM#sendv` methods
+- Allocate um_op and um_op_result in batches of 256
+- Remove `SIGCLD` const
+
+# 0.22.1 2025-12-11
+
+- Comment out SIGCLD constant
+
 # 0.22.0 2025-12-10
 
 - Fix use of `um_yield` in statx, multishot ops
@@ -8,6 +21,7 @@
 - More tests and benchmarks
 - Add `UM#await_fibers` for awaiting fibers
 - Add `UM.socketpair` for creating a socket pair
+- Fix segfault caused by waiting fibers not being marked
 - Fiber scheduler:
   - Use fiber's mailbox for processing blocking operations
   - Add `#io_close`, `#yield` hooks, remove `#process_fork` hook
@@ -22,8 +36,8 @@
 - Add debug logging for key io_uring interactions
 - Add UM#mark and DEBUG_MARK for debugging specific UM instances
 - Short-circuit zero-length writes
-- Add optional file_offset argument to #read, #write. Add optional len and
-  file_offset arguments to #write_async
+- Add optional file_offset argument to #read, #write. Add optional len and
+  file_offset arguments to #write_async
 - Add support for specifying SQPOLL mode and SQ idle timeout in `UM#initialize`
 - Add support for specifying number of SQ entries in `UM#initialize`
 - Implement global worker pool for blocking operations in fiber scheduler
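Of the new methods, `UM#writev` and `UM#sendv` are the most broadly useful. A minimal sketch of what a vectored write might look like, based on the call shapes given in the TODO notes further down this diff; the exact signatures are not confirmed by this diff, and `UM#read`'s argument order follows its use in the benchmark code below:

```ruby
require 'uringmachine'

machine = UM.new
r, w = IO.pipe

# writev submits all three buffers in a single vectored write
# operation, instead of one write per buffer.
machine.writev(w.fileno, "foo", "bar", "baz")

# Per the TODO notes, an optional file offset may be passed last:
#   machine.writev(fd, buf1, buf2, buf3, 0)
# and for sockets there is UM#sendv, which for now takes no flags:
#   machine.sendv(fd, buf1, buf2, buf3)

buf = +''
machine.read(r.fileno, buf, 9)
p buf #=> "foobarbaz"
```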
data/TODO.md CHANGED

@@ -1,148 +1,28 @@
 ## immediate
 
-##
-
-- use CPU time (CLOCK_THREAD_CPUTIME_ID)
-- measure:
-  - time each fiber is waiting
-  - time each fiber is running
-  - time machine is waiting (for CQEs)
-  - time machine is running fibers from the runqueue
-- can be turned on/off at any time
-- no performance impact when off
-
-How can this be implemented:
-
-- `um_get_time_cpu()` function for reading CPU time (CLOCK_THREAD_CPUTIME_ID) as
-  double.
-- add to `struct um`:
-
-```c
-struct um {
-  ...
-  int profiling_mode;
-  double total_time_run;
-  double total_time_wait;
-  double last_cpu_time;
-}
-```
+## buffer rings - automatic management
 
-
-
+```ruby
+# completely hands off
+machine.read_each(fd) { |str| ... }
 
-
-
-
-machine->last_cpu_time = um_get_time_cpu();
-```
+# what if we want to get IO::Buffer?
+machine.read_each(fd, io_buffer: true) { |iobuff, len| ... }
+```
 
-
-- before processing CQEs:
-
-```c
-// before
-double cpu_time0;
-VALUE fiber;
-int profiling_mode = machine->profiling_mode;
-if (profiling_mode) {
-  fiber = rb_fiber_current();
-  cpu_time0 = um_get_time_cpu();
-  double elapsed = cpu_time0 - machine->last_cpu_time;
-  um_update_fiber_time_run(fiber, cpu_time0, elapsed);
-  machine->total_time_run += elapsed;
-}
-process_cqes(...)
-// after
-if (profiling_mode) {
-  double cpu_time1 = um_get_time_cpu();
-  double elapsed = cpu_time1 - cpu_time0;
-  um_update_fiber_last_time(fiber, cpu_time1);
-  machine->total_time_wait += elapsed;
-  machine->last_cpu_time = cpu_time1;
-}
-```
-
-- when doing switching, in `um_process_runqueue_op`:
-
-```c
-// before
-double cpu_time;
-VALUE cur_fiber;
-VALUE next_fiber = get_next_fiber(...);
-int profiling_mode = machine->profiling_mode;
-if (profiling_mode) {
-  cur_fiber = rb_fiber_current();
-  cpu_time = um_get_time_cpu();
-  double elapsed = cpu_time - machine->last_cpu_time;
-  um_update_fiber_time_run(cur_fiber, cpu_time, elapsed);
-  machine->total_time_run += elapsed;
-  um_update_fiber_time_wait(next_fiber, cpu_time);
-  machine->last_cpu_time = cpu_time;
-}
-do_fiber_transfer(...)
-```
-
-- updating fiber time instance vars:
-
-```c
-inline void um_update_fiber_time_run(VALUE fiber, double stamp, double elapsed) {
-  // VALUE fiber_stamp = rb_ivar_get(fiber, ID_time_last_cpu);
-  VALUE fiber_total_run = rb_ivar_get(fiber, ID_time_total_run);
-  double total = NIL_P(fiber_total_run) ?
-    elapsed : NUM2DBL(fiber_total_run) + elapsed;
-  rb_ivar_set(fiber, ID_time_total_run, DBL2NUM(total));
-  rb_ivar_set(fiber, ID_time_last_cpu, DBL2NUM(stamp));
-}
-
-inline void um_update_fiber_time_wait(VALUE fiber, double stamp) {
-  VALUE fiber_last_stamp = rb_ivar_get(fiber, ID_time_last_cpu);
-  if (likely(!NIL_P(fiber_last_stamp))) {
-    double last_stamp = NUM2DBL(fiber_last_stamp);
-    double elapsed = stamp - last_stamp;
-    VALUE fiber_total_wait = rb_ivar_get(fiber, ID_time_total_wait);
-    double total = NIL_P(fiber_total_wait) ?
-      elapsed : NUM2DBL(fiber_total_wait) + elapsed;
-    rb_ivar_set(fiber, ID_time_total_wait, DBL2NUM(total));
-  }
-  else
-    rb_ivar_set(fiber, ID_time_total_wait, DBL2NUM(0.0));
-  rb_ivar_set(fiber, ID_time_last_cpu, DBL2NUM(stamp));
-}
-```
-
-## Metrics API
-
-- machine metrics: `UM#metrics` - returns a hash containing metrics:
+## write/send multiple buffers at once
 
-```
-{
-  size:,                  # SQ size (entries)
-  total_ops:,             # total ops submitted
-  total_fiber_switches:,  # total fiber switches
-  total_cqe_waits:,       # total number of CQE waits
-  ops_pending:,           # number of pending ops
-  ops_unsubmitted:,       # number of unsubmitted
-  ops_runqueue:,          # number of ops in runqueue
-  ops_free:,              # number of ops in freelist
-  ops_transient:,         # number of ops in transient list
-  hwm_pending:,           # high water mark - pending ops
-  hwm_unsubmitted:,       # high water mark - unsubmitted ops
-  hwm_runqueue:,          # high water mark - runqueue depth
-  hwm_free:,              # high water mark - ops in free list
-  hwm_transient:,         # high water mark - ops in transient list
-  # when profiling is active
-  time_total_run:,        # total CPU time running
-  time_total_wait:,       # total CPU time waiting for CQEs
-}
-```
+This is done as vectored IO:
 
-
-
-- transient list size
-- free list size
-- Those will be done in um_op.c (in linked list management code)
+```ruby
+machine.writev(fd, buf1, buf2, buf3)
 
-
+# with optional file offset:
+machine.writev(fd, buf1, buf2, buf3, 0)
+
+# for the moment it won't take flags
+machine.sendv(fd, buf1, buf2, buf3)
+```
 
 ## useful concurrency tools
 
@@ -152,13 +32,19 @@ How can this be implemented:
 debouncer = UM.debounce { }
 ```
 
+
+
+## polyvalent select
+
+- select on multiple queues (ala Go)
+- select on mixture of queues and fds
+
 ## ops
 
 - [ ] multishot timeout
 - [v] machine.periodically(interval) { ... }
 - [ ] machine.prep_timeout_multishot(interval)
 
-- writev
 - splice / - tee
 - sendto
 - recvfrom
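The "polyvalent select" entry in the TODO above is only a design note; no such API exists in this release. Purely as a hypothetical illustration of the Go-style semantics it describes (the `machine.select` method, its arguments, and its return convention are all invented here):

```ruby
# Hypothetical API - not part of uringmachine 0.23.0.
# Block until one of the given queues has an item, then return the
# queue that fired together with the item pulled from it.
queue, item = machine.select(jobs_queue, control_queue)
case queue
when jobs_queue    then process_job(item)
when control_queue then process_control(item)
end
```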
data/benchmark/README.md CHANGED

@@ -4,25 +4,26 @@ The following benchmarks measure the performance of UringMachine against stock
 Ruby in a variety of scenarios. For each scenario, we compare three different
 implementations:
 
--
+- `Threads`: thread-based concurrency using the stock Ruby I/O and
   synchronization classes.
 
--
-
-  Ruby I/O and synchronization classes.
+- `ThreadPool`: thread pool consisting of 10 worker threads, receiving jobs
+  through a common queue.
 
--
-
+- `Async epoll`: fiber-based concurrency with
+  [Async](https://github.com/socketry/async) fiber scheduler, using an epoll
+  selector.
 
--
-
+- `Async uring`: fiber-based concurrency with Async fiber scheduler, using a
+  uring selector.
 
--
-  polling](https://unixism.net/loti/tutorial/sq_poll.html).
+- `UM FS`: fiber-based concurrency with UringMachine fiber scheduler.
 
-
+- `UM`: fiber-based concurrency using the UringMachine low-level API.
 
-
+<img src="./chart_all.png">
+
+## Observations
 
 - We see the stark difference between thread-based and fiber-based concurrency.
   For I/O-bound workloads, there's really no contest - and that's exactly why
@@ -34,28 +35,37 @@ implementations:
   C-extension.
 
 - The UringMachine low-level API is faster to use in most cases, and its
-  performance advantage grows with the level of concurrency.
-
-
-  depending on the context. It remains to be seen how it affects performance in
-  real-world situations.
+  performance advantage grows with the level of concurrency. Interestingly, when
+  performing CPU-bound work, it seems slightly slower. This should be
+  investigated.
 
 - The [pg](https://github.com/ged/ruby-pg) gem supports the use of fiber
   schedulers, and there too we see a marked performance advantage to using
   fibers instead of threads.
 
+According to these benchmarks, for I/O-bound scenarios the different fiber-based
+implementations present an average speedup as follows:
+
+| implementation | average factor |
+|----------------|----------------|
+| Async epoll    | x2.36          |
+| Async uring    | x2.42          |
+| UM FS          | x2.85          |
+| UM             | x6.20          |
+
 ## 1. I/O - Pipe
 
 50 groups, where in each group we create a pipe with a pair of threads/fibers
 writing/reading 1KB of data to the pipe.
 
 ```
-C=50x2
-Threads
-
-
-
-UM
+C=50x2        user       system     total      real
+Threads       2.105002   2.671980   4.776982  ( 4.272842)
+ThreadPool    4.818014  10.740555  15.558569  ( 7.070236)
+Async epoll   1.118937   0.254803   1.373740  ( 1.374298)
+Async uring   1.363248   0.270063   1.633311  ( 1.633696)
+UM FS         0.746332   0.183006   0.929338  ( 0.929619)
+UM            0.237816   0.328352   0.566168  ( 0.566265)
 ```
 
 ## 2. I/O - Socketpair
 
@@ -64,12 +74,13 @@ UM sqpoll 0.217577 0.634414 0.851991 ( 0.593531)
 pair of threads/fibers writing/reading 1KB of data to the sockets.
 
 ```
-
-Threads
-
-
-
-UM
+C=50x2        user       system     total      real
+Threads       2.068122   3.247781   5.315903  ( 4.295488)
+ThreadPool    2.283882   3.461607   5.745489  ( 4.650422)
+Async epoll   0.381400   0.846445   1.227845  ( 1.227983)
+Async uring   0.472526   0.821467   1.293993  ( 1.294166)
+UM FS         0.443023   0.734334   1.177357  ( 1.177576)
+UM            0.116995   0.675997   0.792992  ( 0.793183)
 ```
 
 ## 3. Mutex - CPU-bound
 
@@ -78,12 +89,12 @@ UM sqpoll 0.220933 1.021997 1.242930 ( 0.976198)
 threads/fibers locking the mutex and performing a Regexp match.
 
 ```
-
-Threads
-Async
-
-UM
-UM
+C=20x10       user       system     total      real
+Threads       5.174998   0.024885   5.199883  ( 5.193211)
+Async epoll   5.309793   0.000949   5.310742  ( 5.311217)
+Async uring   5.341404   0.004860   5.346264  ( 5.346963)
+UM FS         5.363719   0.001976   5.365695  ( 5.366254)
+UM            5.351073   0.005986   5.357059  ( 5.357602)
 ```
 
 ## 4. Mutex - I/O-bound
 
@@ -93,81 +104,36 @@ start 10 worker threads/fibers locking the mutex and writing 1KB chunks to the
 file.
 
 ```
-
-Threads
-Async
-
-UM
-UM
-
-N=5        user       system     total      real
-Threads    0.214296   0.384078   0.598374  ( 0.467425)
-Async FS   0.085820   0.158782   0.244602  ( 0.139766)
-UM FS      0.064279   0.147278   0.211557  ( 0.117488)
-UM pure    0.036478   0.182950   0.219428  ( 0.119745)
-UM sqpoll  0.036929   0.347573   0.384502  ( 0.160814)
-
-N=10       user       system     total      real
-Threads    0.435688   0.752219   1.187907  ( 0.924561)
-Async FS   0.126573   0.303704   0.430277  ( 0.234900)
-UM FS      0.128427   0.215204   0.343631  ( 0.184074)
-UM pure    0.065522   0.359659   0.425181  ( 0.192385)
-UM sqpoll  0.076810   0.477429   0.554239  ( 0.210087)
-
-N=20       user       system     total      real
-Threads    0.830763   1.585299   2.416062  ( 1.868194)
-Async FS   0.291823   0.644043   0.935866  ( 0.507887)
-UM FS      0.226202   0.460401   0.686603  ( 0.362879)
-UM pure    0.120524   0.616274   0.736798  ( 0.332182)
-UM sqpoll  0.177150   0.849890   1.027040  ( 0.284069)
-
-N=50       user       system     total      real
-Threads    2.124048   4.182537   6.306585  ( 4.878387)
-Async FS   0.897134   1.268629   2.165763  ( 1.254624)
-UM FS      0.733193   0.971821   1.705014  ( 0.933749)
-UM pure    0.226431   1.504441   1.730872  ( 0.760731)
-UM sqpoll  0.557310   2.107389   2.664699  ( 0.783992)
-
-N=100      user       system     total      real
-Threads    4.420832   8.628756  13.049588  ( 10.264590)
-Async FS   2.557661   2.532998   5.090659  ( 3.179336)
-UM FS      2.262136   1.912055   4.174191  ( 2.523789)
-UM pure    0.633897   2.793998   3.427895  ( 1.612989)
-UM sqpoll  1.119460   4.193703   5.313163  ( 1.525968)
+C=50x10       user       system     total      real
+Threads       2.042649   3.441547   5.484196  ( 4.328783)
+Async epoll   0.810375   0.744084   1.554459  ( 1.554726)
+Async uring   0.854985   1.129260   1.984245  ( 1.140749)
+UM FS         0.686329   0.872376   1.558705  ( 0.845214)
+UM            0.250370   1.323227   1.573597  ( 0.720928)
 ```
 
-## 5.
+## 5. Postgres client
 
-
-threads/fibers that push items to the queue, and 10 consumer threads/fibers that
-pull items from the queue.
+C concurrent threads/fibers, each thread issuing a SELECT query to a PG database.
 
 ```
-
-Threads
-Async
-
-UM
-UM sqpoll  2.044662   2.460344   4.505006  ( 2.261502)
+C=50          user       system     total      real
+Threads       4.304292   1.358116   5.662408  ( 4.795725)
+Async epoll   2.890160   0.432836   3.322996  ( 3.334350)
+Async uring   2.818439   0.433896   3.252335  ( 3.252799)
+UM FS         2.819371   0.443182   3.262553  ( 3.264606)
 ```
+## 6. Queue
 
-
-
-
+20 concurrent groups, where in each group we create a queue, start 5 producer
+threads/fibers that push items to the queue, and 10 consumer threads/fibers that
+pull items from the queue.
 
 ```
-C=10
-Threads
-Async
-
-
-
-Threads    1.652901   0.714299   2.367200  ( 2.014781)
-Async FS   1.136826   0.212991   1.349817  ( 1.350544)
-UM FS      1.084873   0.205865   1.290738  ( 1.291865)
-
-C=50       user       system     total      real
-Threads    4.410604   1.804900   6.215504  ( 5.253016)
-Async FS   2.918522   0.507981   3.426503  ( 3.427966)
-UM FS      2.789549   0.537269   3.326818  ( 3.329802)
+C=20x(5+10)   user       system     total      real
+Threads       4.880983   0.207451   5.088434  ( 5.071019)
+Async epoll   4.107208   0.006519   4.113727  ( 4.114227)
+Async uring   4.206283   0.028974   4.235257  ( 4.235705)
+UM FS         4.082394   0.001719   4.084113  ( 4.084522)
+UM            4.099893   0.323569   4.423462  ( 4.424089)
 ```
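To give a feel for what the `UM` rows measure, here is a minimal sketch of the low-level pipe pattern, pieced together from calls that appear in the benchmark sources below (`spin`, `read`, `write`, `await_fibers`); the iteration count is illustrative, and the exact method signatures should be checked against the gem:

```ruby
require 'uringmachine'

machine = UM.new
r, w = IO.pipe
data = '*' * 1024 # 1KB chunks, as in the benchmark
fibers = []

# writer fiber: pushes 1KB chunks into the pipe
fibers << machine.spin do
  10.times { machine.write(w.fileno, data, data.bytesize) }
end

# reader fiber: pulls the chunks back out
fibers << machine.spin do
  buf = +''
  10.times { machine.read(r.fileno, buf, data.bytesize) }
end

machine.await_fibers(fibers)
```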
data/benchmark/bm_io_pipe.rb CHANGED

@@ -38,6 +38,20 @@ class UMBenchmark
     end
   end
 
+  def do_baseline
+    GROUPS.times do
+      r, w = IO.pipe
+      r.sync = true
+      w.sync = true
+      ITERATIONS.times {
+        w.write(DATA)
+        r.read(SIZE)
+      }
+      r.close
+      w.close
+    end
+  end
+
   def do_scheduler(scheduler, ios)
     GROUPS.times do
       r, w = IO.pipe
data/benchmark/chart_all.png CHANGED

Binary file (no diff shown)
data/benchmark/common.rb CHANGED

@@ -54,6 +54,7 @@ class UMBenchmark
   end
 
   @@benchmarks = {
+    baseline: [:baseline, "No Concurrency"],
     threads: [:threads, "Threads"],
     thread_pool: [:thread_pool, "ThreadPool"],
     async_uring: [:scheduler, "Async uring"],
@@ -69,6 +70,10 @@ class UMBenchmark
     end
   end
 
+  def run_baseline
+    do_baseline
+  end
+
   def run_threads
     threads = []
     ios = []
@@ -118,6 +123,8 @@ class UMBenchmark
     fds = []
     do_um(machine, fibers, fds)
     machine.await_fibers(fibers)
+    puts "UM:"
+    p machine.metrics
     fds.each { machine.close(it) }
   end
 
@@ -128,6 +135,8 @@ class UMBenchmark
     do_um(machine, fibers, fds)
     machine.await_fibers(fibers)
     fds.each { machine.close_async(it) }
+    puts "UM sqpoll:"
+    p machine.metrics
     machine.snooze
   end
 end
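The `machine.metrics` calls added above print a hash of counters. The 0.23.0 key set is not shown in this diff; the removed TODO notes (see the data/TODO.md section earlier) drafted keys such as `total_ops`, `total_fiber_switches` and the `hwm_*` high-water marks, so inspecting it might look roughly like this (key names follow that draft and should be verified against the actual output):

```ruby
machine = UM.new
metrics = machine.metrics

# Key names below follow the draft in the removed TODO notes -
# verify against what 0.23.0 actually returns.
puts "ops submitted:  #{metrics[:total_ops]}"
puts "fiber switches: #{metrics[:total_fiber_switches]}"
puts "runqueue HWM:   #{metrics[:hwm_runqueue]}"
```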
data/benchmark/read_each.rb ADDED

@@ -0,0 +1,83 @@
+# frozen_string_literal: true
+
+require 'bundler/inline'
+
+gemfile do
+  source 'https://rubygems.org'
+  gem 'uringmachine', path: '..'
+  gem 'benchmark-ips'
+end
+
+require 'benchmark/ips'
+require 'uringmachine'
+
+@machine = UM.new
+
+make_socket_pair = -> do
+  port = 10000 + rand(30000)
+  server_fd = @machine.socket(UM::AF_INET, UM::SOCK_STREAM, 0, 0)
+  @machine.setsockopt(server_fd, UM::SOL_SOCKET, UM::SO_REUSEADDR, true)
+  @machine.bind(server_fd, '127.0.0.1', port)
+  @machine.listen(server_fd, UM::SOMAXCONN)
+
+  client_conn_fd = @machine.socket(UM::AF_INET, UM::SOCK_STREAM, 0, 0)
+  @machine.connect(client_conn_fd, '127.0.0.1', port)
+
+  server_conn_fd = @machine.accept(server_fd)
+
+  @machine.close(server_fd)
+  [client_conn_fd, server_conn_fd]
+end
+
+@client_fd, @server_fd = make_socket_pair.()
+
+@read_buf = +''
+@read_fiber = @machine.spin do
+  while true
+    @machine.read(@client_fd, @read_buf, 65536, 0)
+  end
+end
+
+STR_COUNT = ARGV[0]&.to_i || 3
+STR_SIZE = ARGV[1]&.to_i || 100
+
+@parts = ['*' * STR_SIZE] * STR_COUNT
+
+@server_io = IO.new(@server_fd)
+@server_io.sync = true
+def io_write
+  @server_io.write(*@parts)
+  @machine.snooze
+end
+
+def um_write
+  str = @parts.join
+  len = str.bytesize
+
+  while len > 0
+    ret = @machine.write(@server_fd, str, len)
+    len -= ret
+    str = str[ret..-1] if len > 0
+  end
+end
+
+def um_send
+  str = @parts.join
+  @machine.send(@server_fd, str, str.bytesize, UM::MSG_WAITALL)
+end
+
+@bgid = @machine.setup_buffer_ring(0, 8)
+def um_send_bundle
+  @machine.send_bundle(@server_fd, @bgid, @parts)
+end
+
+p(STR_COUNT:, STR_SIZE:)
+
+Benchmark.ips do |x|
+  x.report('IO#write') { io_write }
+  x.report('UM#write') { um_write }
+  x.report('UM#send') { um_send }
+  x.report('UM#send_bundle') { um_send_bundle }
+
+  x.compare!(order: :baseline)
+end
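Note on running the new benchmark: the script takes the string count and string size from ARGV (defaulting to 3 and 100), so an invocation like `ruby read_each.rb 8 1024`, run from the benchmark directory, would exercise 8 buffers of 1KB each.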