iodine 0.2.17 → 0.3.0


Files changed (55)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/README.md +36 -3
  4. data/bin/config.ru +23 -2
  5. data/bin/http-hello +1 -1
  6. data/bin/ws-shootout +5 -0
  7. data/ext/iodine/defer.c +468 -0
  8. data/ext/iodine/defer.h +105 -0
  9. data/ext/iodine/evio.c +263 -0
  10. data/ext/iodine/evio.h +133 -0
  11. data/ext/iodine/extconf.rb +2 -1
  12. data/ext/iodine/facil.c +958 -0
  13. data/ext/iodine/facil.h +423 -0
  14. data/ext/iodine/http.c +90 -0
  15. data/ext/iodine/http.h +50 -12
  16. data/ext/iodine/http1.c +200 -267
  17. data/ext/iodine/http1.h +17 -26
  18. data/ext/iodine/http1_request.c +81 -0
  19. data/ext/iodine/http1_request.h +58 -0
  20. data/ext/iodine/http1_response.c +403 -0
  21. data/ext/iodine/http1_response.h +90 -0
  22. data/ext/iodine/http1_simple_parser.c +124 -108
  23. data/ext/iodine/http1_simple_parser.h +8 -3
  24. data/ext/iodine/http_request.c +104 -0
  25. data/ext/iodine/http_request.h +58 -102
  26. data/ext/iodine/http_response.c +212 -208
  27. data/ext/iodine/http_response.h +89 -252
  28. data/ext/iodine/iodine_core.c +57 -46
  29. data/ext/iodine/iodine_core.h +3 -1
  30. data/ext/iodine/iodine_http.c +105 -81
  31. data/ext/iodine/iodine_websocket.c +17 -13
  32. data/ext/iodine/iodine_websocket.h +1 -0
  33. data/ext/iodine/rb-call.c +9 -7
  34. data/ext/iodine/{rb-libasync.h → rb-defer.c} +57 -49
  35. data/ext/iodine/rb-rack-io.c +12 -6
  36. data/ext/iodine/rb-rack-io.h +1 -1
  37. data/ext/iodine/rb-registry.c +5 -2
  38. data/ext/iodine/sock.c +1159 -0
  39. data/ext/iodine/{libsock.h → sock.h} +138 -142
  40. data/ext/iodine/spnlock.inc +77 -0
  41. data/ext/iodine/websockets.c +101 -112
  42. data/ext/iodine/websockets.h +38 -19
  43. data/iodine.gemspec +3 -3
  44. data/lib/iodine/version.rb +1 -1
  45. data/lib/rack/handler/iodine.rb +6 -6
  46. metadata +23 -19
  47. data/ext/iodine/http_response_http1.h +0 -382
  48. data/ext/iodine/libasync.c +0 -570
  49. data/ext/iodine/libasync.h +0 -122
  50. data/ext/iodine/libreact.c +0 -350
  51. data/ext/iodine/libreact.h +0 -244
  52. data/ext/iodine/libserver.c +0 -957
  53. data/ext/iodine/libserver.h +0 -481
  54. data/ext/iodine/libsock.c +0 -1025
  55. data/ext/iodine/spnlock.h +0 -243
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: de923f79329ea2c150786d10000b43b6378612e3
-  data.tar.gz: 97749ca95384e2e5b0487c4bfda183690440257c
+  metadata.gz: ca71a751e0fec40a40f656630455dabd2c8b7fce
+  data.tar.gz: d80446503accae4cca189bdd7526571f8fb5c5a9
 SHA512:
-  metadata.gz: 40b2d878c483d077421eab48b4be2b02f6c3bb1ebf32cd43c9fc6007fdae110828df6b6d02867d54bf6ae524e3762dd8d9f95277712038a64e1727f00e3a1153
-  data.tar.gz: 289d7f82bd5f2a67621ada01dfc56143ba62c5dbeafcd0b4c5a544a9ba7c992b2c4dff5322b7b0fe21e656f3b9913716d8b523d0c708325028c54fc2a2cfa0bf
+  metadata.gz: c4fa45b4c91496c28cf85136b5ca15479d4ef6f615a29a688fe279d7258fae33ab5c7c4c48cf13676ef1b337c4d4e8efb5a2c9bf43d86bf8fe7ea8c628f5ee00
+  data.tar.gz: f3a404f09c1eb2a1cb0d18130ed06001e3831f0483c4b1d0c0a90d98dae863f896cc6ef7e4418c71b6cbf32455b341a245b5294688a86b036f7a7f9e6e3eba11
data/CHANGELOG.md CHANGED
@@ -8,6 +8,12 @@ Please notice that this change log contains changes for upcoming releases as wel
 
 ***
 
+#### Change log v.0.3.0
+
+**`facil.io` C Core Update**: The C library core that drives Iodine, [`facil.io`](http://facil.io), was updated to version 0.4.0, and Iodine follows closely on the heels of this update. The transition was easy enough and the API remains unchanged... but because the performance gain was so big and because it's a new code base, we opted to bump the minor release version.
+
+***
+
 #### Change log v.0.2.17
 
 **Performance**: Enhanced performance for single threaded / blocking applications by adding a dedicated IO thread. This is related to issue #14.
data/README.md CHANGED
@@ -31,7 +31,7 @@ Iodine includes a light and fast HTTP and Websocket server written in C that was
 Using the Iodine server is easy, simply add Iodine as a gem to your Rack application:
 
 ```ruby
-gem 'iodine', '>=0.2'
+gem 'iodine', '>=0.3'
 ```
 
 Iodine will calculate, when possible, a good enough default concurrency model for fast applications... this might not fit your application if you use database access or other blocking calls.
@@ -225,15 +225,48 @@ Iodine::Rack.run My_Broadcast
 
 Of course, if you still want to use Rack's `hijack` API, Iodine will support you - but be aware that you will need to implement your own reactor and thread pool for any sockets you hijack, as well as a socket buffer for non-blocking `write` operations (why do that when you can write a protocol object and have the main reactor manage the socket?).
 
+### Performance oriented design - but safety first
+
+Iodine is an evented server, similar in its architecture to `nginx` and `puma`. It's different from the simple "thread-per-client" design that is often taught when we begin to learn about network programming.
+
+By leveraging `epoll` (on Linux) and `kqueue` (on BSD), iodine can listen to multiple network events on multiple sockets using a single thread.
+
+All these events go into a task queue, together with the application events and any user generated tasks, such as ones scheduled by [`Iodine.run`](http://www.rubydoc.info/github/boazsegev/iodine/Iodine#run-class_method).
+
+In pseudo-code, this might look like this:
+
+```ruby
+QUEUE = Queue.new
+
+def server_cycle
+  QUEUE << get_next_32_socket_events # these events schedule the proper user code to run
+  QUEUE << [server]
+end
+
+def run_server
+  while ((event = QUEUE.pop))
+    event.shift.call(*event)
+  end
+end
+```
+
+In pure Ruby (without using C extensions or Java), it's possible to do the same by using `select`... and although `select` has some issues, it works well for smaller concurrency levels.
+
+The server events are fairly fast and fragmented (longer code is fragmented across multiple events), so one thread is enough to run the server, including its static file service and everything else... but single threaded mode should probably be avoided.
+
+The thread pool is there to help with slow user code. It's very common that the application's code will run slower and require external resources (i.e., databases, a pub/sub service, etc.). This slow code could "starve" the server (which is patiently waiting to run its tasks on the same thread) - which is why a thread pool is often necessary.
+
+The slower your application code, the more threads you will need to keep the server running smoothly.
+
 
 ### How does it compare to other servers?
 
-Personally, after looking around, the only comparable servers are Puma and Passenger (the open source version), which Iodine significantly outperformed on my tests.
+Personally, after looking around, the only comparable servers are Puma and Passenger, which Iodine significantly outperformed on my tests (I didn't test Passenger's enterprise version).
 
 Since the HTTP and Websocket parsers are written in C (with no RegExp), they're fairly fast.
 
 Also, Iodine's core and parsers are running outside of Ruby's global lock, meaning that they enjoy true concurrency before entering the Ruby layer (your application) - this offers Iodine a big advantage over other Ruby servers.
 
-Another assumption Iodine makes is that it is behind a load balancer / proxy (which is the normal way Ruby applications are deployed) - this allows Iodine to disregard header validity checks (we're not checking for invalid characters) which speeds up the parsing process even further.
+Another assumption Iodine makes is that it is behind a load balancer / proxy (which is the normal way Ruby applications are deployed) - this allows Iodine to disregard header validity checks (we're not checking for invalid characters) and focus its resources on other security and performance concerns.
 
 I recommend benchmarking the performance for yourself using `wrk` or `ab`:
 
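The README's remark about doing the same in pure Ruby with `select` can be made concrete with a small, self-contained sketch. This is illustrative only and not Iodine's implementation: one thread multiplexes readable IOs with `IO.select`, then queues the matching handlers as tasks before running them, mirroring the event/task-queue split in the pseudo-code above. `MiniReactor` and its method names are invented for this example.

```ruby
# Illustrative sketch (not Iodine's actual code): a single thread uses
# IO.select to watch many IO objects, queueing handlers as "tasks" the
# way the README's pseudo-code queues socket events.
class MiniReactor
  def initialize
    @handlers = {} # IO => handler called when the IO becomes readable
    @queue = []    # the task queue; single-threaded, so a plain Array
    @running = false
  end

  def watch(io, &block)
    @handlers[io] = block
  end

  def stop
    @running = false
  end

  def run
    @running = true
    while @running
      ready, = IO.select(@handlers.keys, nil, nil, 0.1)
      next unless ready
      # schedule first, run second - mirrors the event/task-queue split
      ready.each { |io| @queue << [@handlers[io], io] }
      until @queue.empty?
        handler, io = @queue.shift
        handler.call(io)
      end
    end
  end
end

# usage: watch one end of a pipe and deliver a single message
r, w = IO.pipe
reactor = MiniReactor.new
received = nil
reactor.watch(r) do |io|
  received = io.read_nonblock(1024)
  reactor.stop
end
w.write('hello')
reactor.run
received # => "hello"
```

As the README notes, this works well at small concurrency levels; `select` scans every watched descriptor on each call, which is what `epoll`/`kqueue` avoid.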
data/bin/config.ru CHANGED
@@ -8,7 +8,8 @@ require 'rack/lint'
 # This value (app) sets which of the different applications will run.
 #
 # Valid values are "hello", "slow" (debugs env values), "simple"
-app = 'big'
+app = 'hello'
+
 # This is a simple Hello World Rack application, for benchmarking.
 HELLO_RESPONSE = [200, { 'Content-Type'.freeze => 'text/html'.freeze,
                          'Content-Length'.freeze => '16'.freeze }.freeze,
@@ -19,7 +20,6 @@ hello = proc do |_env|
 end
 
 slow = proc do |env|
-  out = "ENV:\n<br/>\n#{env.to_a.map { |h| "#{h[0]}: #{h[1]}" } .join "\n<br/>\n"}\n<br/>\n"
   request = Rack::Request.new(env)
   # Benchmark.bm do |bm|
   #   bm.report('Reading from env Hash to a string X 1000') { 1000.times { out = "ENV:\r\n#{env.to_a.map { |h| "#{h[0]}: #{h[1]}" } .join "\n"}\n" } }
@@ -27,7 +27,10 @@ slow = proc do |env|
   # end
   if request.path_info == '/source'.freeze
     [200, { 'X-Sendfile' => File.expand_path(__FILE__) }, []]
+  elsif request.path_info == '/file'.freeze
+    [200, { 'X-Header' => 'This was a Rack::Sendfile response' }, File.open(__FILE__)]
   else
+    out = "ENV:\n<br/>\n#{env.to_a.map { |h| "#{h[0]}: #{h[1]}" } .join "\n<br/>\n"}\n<br/>\n"
     out += "\n<br/>\nRequest Path: #{request.path_info}\n<br/>\nParams:\n<br/>\n#{request.params.to_a.map { |h| "#{h[0]}: #{h[1]}" } .join "\n<br/>\n"}\n<br/>\n" unless request.params.empty?
     [200, { 'Content-Type'.freeze => 'text/html'.freeze,
             'Content-Length'.freeze => out.length.to_s },
@@ -49,11 +52,17 @@ simple = proc do |env|
 end
 
 logo_png = nil
+
 big = proc do |_env|
   logo_png ||= IO.binread '../logo.png'
   [200, { 'Content-Length'.freeze => logo_png.length.to_s , 'Content-Type'.freeze => 'image/png'.freeze}, [logo_png]]
 end
 
+bigX = proc do |_env|
+  logo_png ||= IO.binread '../logo.png'
+  [200, { 'Content-Length'.freeze => logo_png.length.to_s , 'Content-Type'.freeze => 'image/png'.freeze, 'X-Sendfile'.freeze => '../logo.png'.freeze}, [logo_png]]
+end
+
 case app
 when 'simple'
   use Rack::Sendfile
@@ -62,6 +71,8 @@ when 'hello'
   run hello
 when 'big'
   run big
+when 'bigX'
+  run bigX
 when 'slow'
   use Rack::Lint
   run slow
@@ -71,3 +82,13 @@ end
 
 # ab -n 1000000 -c 2000 -k http://127.0.0.1:3000/
 # wrk -c400 -d5 -t12 http://localhost:3000/
+#
+# def cycle
+#   puts `wrk -c4000 -d5 -t12 http://localhost:3000/`
+#   sleep(2)
+#   puts `wrk -c4000 -d5 -t12 http://localhost:3000/source`
+#   sleep(3)
+#   puts `wrk -c200 -d5 -t12 http://localhost:3000/file`
+#   true
+# end
+# sleep(10) while cycle
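For context on the `X-Sendfile` branches above: the app only sets a header and returns an empty body, and `Rack::Sendfile` (or a server that understands the header) is expected to stream the named file. Since a Rack app is just a callable returning `[status, headers, body]`, the pattern can be sketched and exercised without any middleware. This is an illustrative sketch; `sendfile_app` is a hypothetical name, not part of the gem.

```ruby
# Minimal Rack-style callable showing the X-Sendfile pattern used in
# config.ru: on "/source" it returns an empty body plus an X-Sendfile
# header naming the file to deliver; otherwise it serves inline text.
sendfile_app = lambda do |env|
  if env['PATH_INFO'] == '/source'
    # delegate file delivery to the front-end via the header
    [200, { 'X-Sendfile' => File.expand_path(__FILE__) }, []]
  else
    body = 'hello world!'
    [200, { 'Content-Type' => 'text/plain',
            'Content-Length' => body.bytesize.to_s }, [body]]
  end
end

# usage: call it like a server would
status, headers, body = sendfile_app.call('PATH_INFO' => '/source')
```

A server that does not strip `X-Sendfile` would leak the header to clients, which is why the middleware (or Iodine's own support) sits in front of the app.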
data/bin/http-hello CHANGED
@@ -13,7 +13,7 @@ require 'rack'
 
 # create the server object and setup any settings we might need.
 Iodine::Rack
-Iodine.threads ||= 16
+Iodine.threads ||= 1
 Iodine.processes ||= 1 # 4
 Iodine::Rack.public = '~/Documents/Scratch'
 count = 2
data/bin/ws-shootout CHANGED
@@ -33,6 +33,8 @@ class ShootoutApp
     msg = {type: 'broadcast', payload: payload}.to_json
     # Iodine::Websocket.each {|ws| ws.write msg}
     Iodine::Websocket.each_write(msg) # {|ws| true }
+    # each_write(msg) # {|ws| true }
+    # write msg
     write({type: "broadcastResult", payload: payload}.to_json)
   end
 end
@@ -46,6 +48,9 @@ Iodine::Rack.public = nil
 Iodine::Rack.app = ShootoutApp
 Iodine.start
 
+# websocket-bench broadcast ws://127.0.0.1:3000/ --concurrent 10 --sample-size 100 --step-size 1000 --limit-percentile 95 --limit-rtt 250ms --initial-clients 1000
+
+#
 # server.on_http= Proc.new do |env|
 #   # [200, {"Content-Length".freeze => "12".freeze}, ["Hello World!".freeze]];
 #   if env["HTTP_UPGRADE".freeze] =~ /websocket/i.freeze
data/ext/iodine/defer.c ADDED
@@ -0,0 +1,468 @@
+/*
+Copyright: Boaz Segev, 2016-2017
+License: MIT
+
+Feel free to copy, use and enjoy according to the license provided.
+*/
+#include "spnlock.inc"
+
+#include "defer.h"
+
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+/* *****************************************************************************
+Compile time settings
+***************************************************************************** */
+
+#ifndef DEFER_QUEUE_BUFFER
+#define DEFER_QUEUE_BUFFER 4096
+#endif
+#ifndef DEFER_THROTTLE
+#define DEFER_THROTTLE 8388608UL
+#endif
+
+/* *****************************************************************************
+Data Structures
+***************************************************************************** */
+
+typedef struct {
+  void (*func)(void *, void *);
+  void *arg1;
+  void *arg2;
+} task_s;
+
+typedef struct task_node_s {
+  task_s task;
+  struct task_node_s *next;
+} task_node_s;
+
+static task_node_s tasks_buffer[DEFER_QUEUE_BUFFER];
+
+static struct {
+  task_node_s *first;
+  task_node_s **last;
+  task_node_s *pool;
+  spn_lock_i lock;
+  unsigned char initialized;
+} deferred = {.first = NULL,
+              .last = &deferred.first,
+              .pool = NULL,
+              .lock = 0,
+              .initialized = 0};
+
+/* *****************************************************************************
+API
+***************************************************************************** */
+
+/** Defer an execution of a function for later. */
+int defer(void (*func)(void *, void *), void *arg1, void *arg2) {
+  if (!func)
+    goto call_error;
+  task_node_s *task;
+  spn_lock(&deferred.lock);
+  if (deferred.pool) {
+    task = deferred.pool;
+    deferred.pool = deferred.pool->next;
+  } else if (deferred.initialized) {
+    task = malloc(sizeof(task_node_s));
+    if (!task)
+      goto error;
+  } else
+    goto initialize;
+schedule:
+  *deferred.last = task;
+  deferred.last = &task->next;
+  task->task.func = func;
+  task->task.arg1 = arg1;
+  task->task.arg2 = arg2;
+  task->next = NULL;
+  spn_unlock(&deferred.lock);
+  return 0;
+error:
+  spn_unlock(&deferred.lock);
+  perror("ERROR CRITICAL: defer can't allocate task");
+  exit(9);
+call_error:
+  return -1;
+initialize:
+  deferred.initialized = 1;
+  task = tasks_buffer;
+  deferred.pool = tasks_buffer + 1;
+  for (size_t i = 1; i < (DEFER_QUEUE_BUFFER - 1); i++) {
+    tasks_buffer[i].next = &tasks_buffer[i + 1];
+  }
+  tasks_buffer[DEFER_QUEUE_BUFFER - 1].next = NULL;
+  goto schedule;
+}
+
+/** Performs all deferred functions until the queue had been depleted. */
+void defer_perform(void) {
+  task_node_s *tmp;
+  task_s task;
+restart:
+  spn_lock(&deferred.lock);
+  tmp = deferred.first;
+  if (tmp) {
+    deferred.first = tmp->next;
+    if (!deferred.first)
+      deferred.last = &deferred.first;
+    task = tmp->task;
+    if (tmp >= tasks_buffer && tmp < tasks_buffer + DEFER_QUEUE_BUFFER) {
+      tmp->next = deferred.pool;
+      deferred.pool = tmp;
+    } else {
+      free(tmp);
+    }
+    spn_unlock(&deferred.lock);
+    task.func(task.arg1, task.arg2);
+    goto restart;
+  } else
+    spn_unlock(&deferred.lock);
+}
+
+/** returns true if there are deferred functions waiting for execution. */
+int defer_has_queue(void) { return deferred.first != NULL; }
+
+/* *****************************************************************************
+Thread Pool Support
+***************************************************************************** */
+
+#if defined(__unix__) || defined(__APPLE__) || defined(__linux__) ||           \
+    defined(DEBUG)
+#include <pthread.h>
+
+#pragma weak defer_new_thread
+void *defer_new_thread(void *(*thread_func)(void *), pool_pt arg) {
+  pthread_t *thread = malloc(sizeof(*thread));
+  if (thread == NULL || pthread_create(thread, NULL, thread_func, (void *)arg))
+    goto error;
+  return thread;
+error:
+  free(thread);
+  return NULL;
+}
+
+#pragma weak defer_join_thread
+int defer_join_thread(void *p_thr) {
+  if (!p_thr)
+    return -1;
+  pthread_join(*(pthread_t *)p_thr, NULL);
+  free(p_thr);
+  return 0;
+}
+
+#else /* No pthreads... BYO thread implementation. */
+
+#pragma weak defer_new_thread
+void *defer_new_thread(void *(*thread_func)(void *), void *arg) {
+  (void)thread_func;
+  (void)arg;
+  return NULL;
+}
+#pragma weak defer_join_thread
+int defer_join_thread(void *p_thr) {
+  (void)p_thr;
+  return -1;
+}
+
+#endif /* DEBUG || pthread default */
+
+struct defer_pool {
+  unsigned int flag;
+  unsigned int count;
+  void *threads[];
+};
+
+static void *defer_worker_thread(void *pool) {
+  signal(SIGPIPE, SIG_IGN);
+  size_t throttle = (((pool_pt)pool)->count & 127) * DEFER_THROTTLE;
+  do {
+    throttle_thread(throttle);
+    defer_perform();
+  } while (((pool_pt)pool)->flag);
+  return NULL;
+}
+
+void defer_pool_stop(pool_pt pool) { pool->flag = 0; }
+
+int defer_pool_is_active(pool_pt pool) { return pool->flag; }
+
+void defer_pool_wait(pool_pt pool) {
+  while (pool->count) {
+    pool->count--;
+    defer_join_thread(pool->threads[pool->count]);
+  }
+}
+
+static inline pool_pt defer_pool_initialize(unsigned int thread_count,
+                                            pool_pt pool) {
+  pool->flag = 1;
+  pool->count = 0;
+  while (pool->count < thread_count &&
+         (pool->threads[pool->count] =
+              defer_new_thread(defer_worker_thread, pool)))
+    pool->count++;
+  if (pool->count == thread_count) {
+    return pool;
+  }
+  defer_pool_stop(pool);
+  return NULL;
+}
+
+pool_pt defer_pool_start(unsigned int thread_count) {
+  if (thread_count == 0)
+    return NULL;
+  pool_pt pool = malloc(sizeof(*pool) + (thread_count * sizeof(void *)));
+  if (!pool)
+    return NULL;
+  return defer_pool_initialize(thread_count, pool);
+}
+
+/* *****************************************************************************
+Child Process support (`fork`)
+***************************************************************************** */
+
+static pool_pt forked_pool;
+
+static void sig_int_handler(int sig) {
+  if (sig != SIGINT)
+    return;
+  if (!forked_pool)
+    return;
+  defer_pool_stop(forked_pool);
+}
+
+/* *
+Zombie Reaping
+With thanks to Dr Graham D Shaw.
+http://www.microhowto.info/howto/reap_zombie_processes_using_a_sigchld_handler.html
+*/
+
+void reap_child_handler(int sig) {
+  (void)(sig);
+  int old_errno = errno;
+  while (waitpid(-1, NULL, WNOHANG) > 0)
+    ;
+  errno = old_errno;
+}
+
+inline static void reap_children(void) {
+  struct sigaction sa;
+  sa.sa_handler = reap_child_handler;
+  sigemptyset(&sa.sa_mask);
+  sa.sa_flags = SA_RESTART | SA_NOCLDSTOP;
+  if (sigaction(SIGCHLD, &sa, 0) == -1) {
+    perror("Child reaping initialization failed");
+    exit(1);
+  }
+}
+
+/**
+ * Forks the process, starts up a thread pool and waits for all tasks to run.
+ * All existing tasks will run in all processes (multiple times).
+ *
+ * Returns 0 on success, -1 on error and a positive number if this is a child
+ * process that was forked.
+ */
+int defer_perform_in_fork(unsigned int process_count,
+                          unsigned int thread_count) {
+  struct sigaction act, old, old_term, old_pipe;
+  pid_t *pids = NULL;
+  int ret = 0;
+  unsigned int pids_count;
+
+  act.sa_handler = sig_int_handler;
+  sigemptyset(&act.sa_mask);
+  act.sa_flags = SA_RESTART | SA_NOCLDSTOP;
+
+  if (sigaction(SIGINT, &act, &old)) {
+    perror("couldn't set signal handler");
+    goto finish;
+  };
+  if (sigaction(SIGTERM, &act, &old_term)) {
+    perror("couldn't set signal handler");
+    goto finish;
+  };
+  act.sa_handler = SIG_IGN;
+  if (sigaction(SIGPIPE, &act, &old_pipe)) {
+    perror("couldn't set signal handler");
+    goto finish;
+  };
+  reap_children();
+
+  if (!process_count)
+    process_count = 1;
+  --process_count;
+  pids = calloc(process_count, sizeof(*pids));
+  if (process_count && !pids)
+    goto finish;
+  for (pids_count = 0; pids_count < process_count; pids_count++) {
+    if (!(pids[pids_count] = fork())) {
+      forked_pool = defer_pool_start(thread_count);
+      defer_pool_wait(forked_pool);
+      defer_perform();
+      defer_perform();
+      return 1;
+    }
+    if (pids[pids_count] == -1) {
+      ret = -1;
+      goto finish;
+    }
+  }
+  pids_count++;
+  forked_pool = defer_pool_start(thread_count);
+  defer_pool_wait(forked_pool);
+  forked_pool = NULL;
+  defer_perform();
+finish:
+  if (pids) {
+    for (size_t j = 0; j < pids_count; j++) {
+      kill(pids[j], SIGINT);
+    }
+    for (size_t j = 0; j < pids_count; j++) {
+      waitpid(pids[j], NULL, 0);
+    }
+    free(pids);
+  }
+  sigaction(SIGINT, &old, &act);
+  sigaction(SIGTERM, &old_term, &act);
+  sigaction(SIGPIPE, &old_pipe, &act); /* restore the SIGPIPE handler saved above */
+  return ret;
+}
+
+/** Returns TRUE (1) if the forked thread pool hadn't been signaled to finish
+ * up. */
+int defer_fork_is_active(void) { return forked_pool && forked_pool->flag; }
+
+/* *****************************************************************************
+Test
+***************************************************************************** */
+#ifdef DEBUG
+
+#include <stdio.h>
+
+#include <pthread.h>
+#define DEFER_TEST_THREAD_COUNT 128
+
+static spn_lock_i i_lock = 0;
+static size_t i_count = 0;
+
+static void sample_task(void *unused, void *unused2) {
+  (void)(unused);
+  (void)(unused2);
+  spn_lock(&i_lock);
+  i_count++;
+  spn_unlock(&i_lock);
+}
+
+static void sched_sample_task(void *unused, void *unused2) {
+  (void)(unused);
+  (void)(unused2);
+  for (size_t i = 0; i < 1024; i++) {
+    defer(sample_task, NULL, NULL);
+  }
+}
+
+static void thrd_sched(void *unused, void *unused2) {
+  for (size_t i = 0; i < (1024 / DEFER_TEST_THREAD_COUNT); i++) {
+    sched_sample_task(unused, unused2);
+  }
+}
+
+static void text_task_text(void *unused, void *unused2) {
+  (void)(unused);
+  (void)(unused2);
+  spn_lock(&i_lock);
+  fprintf(stderr, "this text should print before defer_perform returns\n");
+  spn_unlock(&i_lock);
+}
+
+static void text_task(void *a1, void *a2) {
+  static const struct timespec tm = {.tv_sec = 2};
+  nanosleep(&tm, NULL);
+  defer(text_task_text, a1, a2);
+}
+
+static void pid_task(void *arg, void *unused2) {
+  (void)(unused2);
+  fprintf(stderr, "* %d pid is going to sleep... (%s)\n", getpid(),
+          arg ? (char *)arg : "unknown");
+}
+
+void defer_test(void) {
+  time_t start, end;
+  fprintf(stderr, "Starting defer testing\n");
+
+  spn_lock(&i_lock);
+  i_count = 0;
+  spn_unlock(&i_lock);
+  start = clock();
+  for (size_t i = 0; i < 1024; i++) {
+    defer(sched_sample_task, NULL, NULL);
+  }
+  defer_perform();
+  end = clock();
+  fprintf(stderr, "Defer single thread: %lu cycles with i_count = %lu\n",
+          end - start, i_count);
+
+  spn_lock(&i_lock);
+  i_count = 0;
+  spn_unlock(&i_lock);
+  start = clock();
+  pool_pt pool = defer_pool_start(DEFER_TEST_THREAD_COUNT);
+  if (pool) {
+    for (size_t i = 0; i < DEFER_TEST_THREAD_COUNT; i++) {
+      defer(thrd_sched, NULL, NULL);
+    }
+    // defer((void (*)(void *))defer_pool_stop, pool);
+    defer_pool_stop(pool);
+    defer_pool_wait(pool);
+    end = clock();
+    fprintf(stderr,
+            "Defer multi-thread (%d threads): %lu cycles with i_count = %lu\n",
+            DEFER_TEST_THREAD_COUNT, end - start, i_count);
+  } else
+    fprintf(stderr, "Defer multi-thread: FAILED!\n");
+
+  spn_lock(&i_lock);
+  i_count = 0;
+  spn_unlock(&i_lock);
+  start = clock();
+  for (size_t i = 0; i < 1024; i++) {
+    defer(sched_sample_task, NULL, NULL);
+  }
+  defer_perform();
+  end = clock();
+  fprintf(stderr, "Defer single thread (2): %lu cycles with i_count = %lu\n",
+          end - start, i_count);
+
+  fprintf(stderr, "calling defer_perform.\n");
+  defer(text_task, NULL, NULL);
+  defer_perform();
+  fprintf(stderr, "defer_perform returned. i_count = %lu\n", i_count);
+  size_t pool_count = 0;
+  task_node_s *pos = deferred.pool;
+  while (pos) {
+    pool_count++;
+    pos = pos->next;
+  }
+  fprintf(stderr, "defer pool count %lu/%d (%s)\n", pool_count,
+          DEFER_QUEUE_BUFFER,
+          pool_count == DEFER_QUEUE_BUFFER ? "pass" : "FAILED");
+  fprintf(stderr, "press ^C to finish PID test\n");
+  defer(pid_task, "pid test", NULL);
+  if (defer_perform_in_fork(4, 64) > 0) {
+    fprintf(stderr, "* %d finished\n", getpid());
+    exit(0);
+  };
+  fprintf(stderr,
+          " === Defer pool memory footprint %lu X %d = %lu bytes ===\n",
+          sizeof(task_node_s), DEFER_QUEUE_BUFFER, sizeof(tasks_buffer));
+}
+
+#endif
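In Ruby terms, the contract `defer.c` implements - schedule two-argument callbacks in FIFO order, then drain the queue until empty, where running tasks may schedule further tasks - might be sketched like this. This is an illustrative translation only, not the library's API: the real code is lock-protected C with a preallocated node pool, and the `Defer` module name here is invented for the example.

```ruby
# Illustrative Ruby translation of the task queue in defer.c: tasks are
# stored as [callable, arg1, arg2] and drained in FIFO order.
module Defer
  @queue = []
  @lock = Mutex.new

  # Schedule a callback; mirrors defer()'s 0 / -1 return convention.
  def self.defer(func, arg1 = nil, arg2 = nil)
    return -1 unless func
    @lock.synchronize { @queue << [func, arg1, arg2] }
    0
  end

  # Drain until empty; tasks scheduled while draining also run,
  # just as defer_perform() loops back to "restart".
  def self.perform
    loop do
      task = @lock.synchronize { @queue.shift }
      break unless task
      func, a1, a2 = task
      func.call(a1, a2)
    end
  end

  # Mirrors defer_has_queue().
  def self.queue?
    !@queue.empty?
  end
end

# usage: the second task schedules a third, which still runs in this drain
count = 0
Defer.defer(->(a, _b) { count += a }, 5)
Defer.defer(->(_a, _b) { Defer.defer(->(x, _y) { count += x }, 2) })
Defer.perform
count # => 7
```

The C version avoids per-task allocation by recycling nodes through the static `tasks_buffer` pool, which is the main thing this sketch leaves out.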