RubyGems - kino - Versions diffs - 0.1.0 - Mend

kino 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

checksums.yaml +7 -0
data/.yardopts +14 -0
data/CHANGELOG.md +54 -0
data/Cargo.lock +993 -0
data/Cargo.toml +15 -0
data/LICENSE.txt +21 -0
data/README.md +384 -0
data/doc/README.md +6 -0
data/doc/architecture.md +161 -0
data/doc/benchmarks.md +321 -0
data/doc/rails-on-ractors.md +50 -0
data/doc/why-kino.md +91 -0
data/exe/kino +26 -0
data/ext/kino/Cargo.toml +49 -0
data/ext/kino/build.rs +5 -0
data/ext/kino/extconf.rb +6 -0
data/ext/kino/src/env_strings.rs +318 -0
data/ext/kino/src/gvl.rs +103 -0
data/ext/kino/src/lib.rs +90 -0
data/ext/kino/src/logsink.rs +155 -0
data/ext/kino/src/queue.rs +207 -0
data/ext/kino/src/registry.rs +268 -0
data/ext/kino/src/request.rs +432 -0
data/ext/kino/src/response.rs +214 -0
data/ext/kino/src/server.rs +621 -0
data/ext/kino/src/style.rs +87 -0
data/ext/kino/src/test_support.rs +82 -0
data/ext/kino/src/timer.rs +57 -0
data/ext/kino/src/tls.rs +96 -0
data/lib/kino/check.rb +199 -0
data/lib/kino/cli.rb +254 -0
data/lib/kino/configuration.rb +190 -0
data/lib/kino/errors_stream.rb +25 -0
data/lib/kino/input.rb +77 -0
data/lib/kino/logger.rb +56 -0
data/lib/kino/null_input.rb +37 -0
data/lib/kino/ractor_supervisor.rb +103 -0
data/lib/kino/server.rb +271 -0
data/lib/kino/stream.rb +61 -0
data/lib/kino/templates/kino.rb.tt +141 -0
data/lib/kino/version.rb +6 -0
data/lib/kino/worker.rb +124 -0
data/lib/kino.rb +53 -0
data/sig/kino.rbs +178 -0
metadata +219 -0

data/ext/kino/src/queue.rs ADDED Viewed

@@ -0,0 +1,207 @@
+//! The worker-side bridge: batched request intake and the fused
+//! respond-and-take call. This is where FFI crossings per request went
+//! from three in early designs (take, env, respond) to amortized ~one.
+//!
+//! Blocking discipline (everywhere in this crate): bounded `recv_timeout`
+//! ticks + an AtomicBool interrupt flag. No flume::Selector: it loses
+//! wakeups under churn, observed as workers going permanently deaf to a
+//! non-empty queue after ~100k requests.
+use std::cell::RefCell;
+use std::sync::atomic::Ordering;
+use std::sync::Arc;
+use std::time::Duration;
+use magnus::prelude::*;
+use magnus::{Error, RArray, RHash, RString, Ruby};
+use crate::gvl;
+use crate::registry::{self, BoxedCtx, ServerInner, WorkerSlot};
+use crate::request::Request;
+/// Upper bound on a single blocking wait. The `recv_timeout` calls in this
+/// module sleep for at most one tick before handing control back to
+/// `gvl::interruptible`.
+pub const TICK: Duration = Duration::from_millis(50);
+/// Outcome of one take attempt: `Some(ctx)` is a request to serve.
+/// None = shutdown (queue closed or interrupted); the caller can't tell
+/// the difference and doesn't need to.
+type Taken = Option<BoxedCtx>;
+/// Wait for the next request, releasing the GVL while parked so the wait
+/// stays interruptible. Slots that own a lane are delegated to `lane_take`.
+///
+/// Returns `None` when the shared queue is disconnected or the wait was
+/// interrupted.
+///
+/// There is deliberately no busy-poll ahead of the park. The futex wake per
+/// request is a real cost (~20% of cycles at saturation, per perf), but a
+/// measured 20µs spin made throughput WORSE on oversubscribed cores:
+/// spinning workers take exactly the CPU the tokio threads need.
+fn block_take(server: &ServerInner, slot: &Arc<WorkerSlot>) -> Taken {
+    if slot.lane_rx.is_some() {
+        return lane_take(server, slot);
+    }
+    let shared = &server.req_rx;
+    // Fast path, the common case under load: something is already queued.
+    // `try_recv` never blocks, so the GVL release/reacquire pair (two
+    // scheduler round-trips per request) is avoided altogether.
+    match shared.try_recv() {
+        Ok(ctx) => return Some(ctx),
+        Err(flume::TryRecvError::Disconnected) => return None,
+        Err(flume::TryRecvError::Empty) => {}
+    }
+    // Slow path: bounded waits of one TICK each. `Some(..)` settles the
+    // wait (request or disconnect); `None` reports an idle tick.
+    let wait_one_tick = || match shared.recv_timeout(TICK) {
+        Ok(ctx) => Some(Some(ctx)),
+        Err(flume::RecvTimeoutError::Disconnected) => Some(None),
+        Err(flume::RecvTimeoutError::Timeout) => None,
+    };
+    gvl::interruptible(&slot.interrupted, wait_one_tick).flatten()
+}
+/// Lane-mode take: own lane first (no wake needed while the dispatcher
+/// keeps feeding an awake lane), then steal from siblings, then park on
+/// the own lane with the parked flag raised so the dispatcher avoids it.
+///
+/// Returns `Some(ctx)` for a request from the own lane or from a sibling's
+/// lane, and `None` when the own lane is disconnected or the interruptible
+/// wait ends without a request.
+///
+/// Panics if `slot` has no lane receiver; `block_take` only routes here
+/// after checking `slot.lane_rx.is_some()`.
+fn lane_take(server: &ServerInner, slot: &Arc<WorkerSlot>) -> Taken {
+    let lane_rx = slot.lane_rx.as_ref().expect("lane_take without lane");
+    // One non-blocking sweep over every other slot's lane, in slot order;
+    // the first request found wins. The `slots` read lock is held for the
+    // duration of the sweep.
+    let steal = || -> Option<BoxedCtx> {
+        let slots = server.slots.read();
+        for other in slots.iter() {
+            // Skip our own slot: identity comparison on the Arc pointer.
+            if Arc::ptr_eq(other, slot) {
+                continue;
+            }
+            if let Some(rx) = other.lane_rx.as_ref() {
+                if let Ok(ctx) = rx.try_recv() {
+                    return Some(ctx);
+                }
+            }
+        }
+        None
+    };
+    // Hot path, GVL still held: own lane, then a steal sweep.
+    match lane_rx.try_recv() {
+        Ok(ctx) => return Some(ctx),
+        Err(flume::TryRecvError::Disconnected) => return None,
+        Err(flume::TryRecvError::Empty) => {}
+    }
+    if let Some(ctx) = steal() {
+        return Some(ctx);
+    }
+    // Park. The flag-then-recheck order closes the race with a dispatcher
+    // that read parked=false just before we set it: anything it sent lands
+    // in the lane, and recv_timeout checks the queue before sleeping.
+    slot.parked.store(true, Ordering::SeqCst);
+    // Closure convention: `Some(..)` settles the wait (request or
+    // disconnect); `None` reports a tick with nothing to take.
+    // NOTE(review): presumably `gvl::interruptible` re-checks the flag and
+    // calls the closure again on `None` — confirm in gvl.rs.
+    let taken = gvl::interruptible(&slot.interrupted, || {
+        match lane_rx.recv_timeout(TICK) {
+            Ok(ctx) => Some(Some(ctx)),
+            // Periodic steal so a backlog behind a slow sibling can't
+            // outlive a tick.
+            Err(flume::RecvTimeoutError::Timeout) => steal().map(Some),
+            Err(flume::RecvTimeoutError::Disconnected) => Some(None),
+        }
+    })
+    .flatten();
+    // Lower the flag however the wait ended, before returning to the caller.
+    slot.parked.store(false, Ordering::SeqCst);
+    taken
+}
+/// Turn a dequeued ctx into the env Hash handed to Ruby. The Ruby request
+/// handle rides inside that same Hash under the frozen "kino.request" key,
+/// so one Hash carries everything and no per-request pair array is built.
+///
+/// The request's responder is recorded in the slot's in-flight list. The
+/// handle is created inside the calling ractor, so its ownership is
+/// correct by construction.
+///
+/// Errors from building the env or storing the handle are propagated.
+fn admit(
+    ruby: &Ruby,
+    server: &ServerInner,
+    slot: &Arc<WorkerSlot>,
+    mut ctx: BoxedCtx,
+) -> Result<RHash, Error> {
+    server.served.fetch_add(1, Ordering::Relaxed);
+    let responder = Arc::downgrade(&ctx.responder);
+    slot.current.lock().push(responder);
+    // Give the request a handle on its slot, so that blocked body
+    // reads/writes can be interrupted just like the queue pop.
+    ctx.slot = Some(Arc::clone(slot));
+    let env_hash = crate::request::build_env(ruby, &ctx)?;
+    // The ctx moves out of its Box and into the wrapped Ruby object.
+    let handle = ruby.obj_wrap(Request(RefCell::new(*ctx)));
+    let request_key = ruby.get_inner(crate::env_strings::get().kino_request);
+    env_hash.aset(request_key, handle.as_value())?;
+    Ok(env_hash)
+}
+/// What a successful checkout yields: the resolved server, the worker's
+/// slot, and the first request taken.
+type Checkout = (Arc<ServerInner>, Arc<WorkerSlot>, BoxedCtx);
+/// Resolve the (server id, worker id) pair, reset the slot's per-batch
+/// state and block for one request.
+///
+/// `Ok(None)` is the shutdown signal: either the server is no longer
+/// registered or the take came back empty. An unknown worker id is an
+/// error.
+fn checkout(ruby: &Ruby, server_id: u64, worker_id: usize) -> Result<Option<Checkout>, Error> {
+    let server = match registry::try_get(server_id) {
+        Some(server) => server,
+        // Server already torn down: report a clean shutdown, not an error.
+        None => return Ok(None),
+    };
+    let slot = server.slot(ruby, worker_id)?;
+    // A worker only comes back once its previous batch is fully answered,
+    // so the in-flight list can be emptied here.
+    slot.current.lock().clear();
+    slot.interrupted.store(false, Ordering::SeqCst);
+    let taken = block_take(&server, &slot);
+    Ok(taken.map(|ctx| (server, slot, ctx)))
+}
+/// Batch-of-one hot path: take a single request without allocating any
+/// array. Yields the request's env Hash (the request handle sits inside it
+/// under "kino.request"), or nil on shutdown.
+pub fn take_one(ruby: &Ruby, server_id: u64, worker_id: usize) -> Result<Option<RHash>, Error> {
+    let Some((server, slot, ctx)) = checkout(ruby, server_id, worker_id)? else {
+        return Ok(None);
+    };
+    admit(ruby, &server, &slot, ctx).map(Some)
+}
+/// Take at most `max` requests. Only the first take blocks; the remainder
+/// are drained without blocking, so a batch grows past one only when the
+/// queue is already deep. At least one request is taken even when `max`
+/// is 0.
+///
+/// Returns nil on shutdown; otherwise an Array of env Hashes.
+///
+/// NOTE(review): the drain reads only the shared `req_rx`, while lane-mode
+/// slots block on their own lane — confirm whether lane-mode batches are
+/// meant to stay at one request.
+pub fn take_batch(
+    ruby: &Ruby,
+    server_id: u64,
+    worker_id: usize,
+    max: usize,
+) -> Result<Option<RArray>, Error> {
+    let Some((server, slot, first)) = checkout(ruby, server_id, worker_id)? else {
+        return Ok(None);
+    };
+    let batch = ruby.ary_new_capa(max.max(1));
+    batch.push(admit(ruby, &server, &slot, first)?)?;
+    // `admitted` counts requests already in the batch; stop at `max` or as
+    // soon as the queue has nothing ready (empty or disconnected).
+    let mut admitted = 1;
+    while admitted < max {
+        let Ok(ctx) = server.req_rx.try_recv() else {
+            break;
+        };
+        batch.push(admit(ruby, &server, &slot, ctx)?)?;
+        admitted += 1;
+    }
+    Ok(Some(batch))
+}
+/// Fused hot path: send the complete response for `request` in one shot,
+/// then take the next request straight away. With a warm loop this is one
+/// FFI crossing per request.
+///
+/// If responding fails, the error is returned and no request is taken.
+pub fn respond_and_take_one(
+    ruby: &Ruby,
+    request: &Request,
+    server_id: u64,
+    worker_id: usize,
+    status: u16,
+    headers: RHash,
+    body: RString,
+) -> Result<Option<RHash>, Error> {
+    crate::request::respond_simple(ruby, request, status, headers, body)?;
+    let next = take_one(ruby, server_id, worker_id)?;
+    Ok(next)
+}
+/// Fused call, batch flavour: answer `request` in one shot, then take up
+/// to `max` further requests via `take_batch`.
+///
+/// If responding fails, the error is returned and nothing is taken.
+// The argument list is the response triple plus the take parameters, all
+// passed flat; hence the lint exemption.
+#[allow(clippy::too_many_arguments)]
+pub fn respond_and_take(
+    ruby: &Ruby,
+    request: &Request,
+    server_id: u64,
+    worker_id: usize,
+    max: usize,
+    status: u16,
+    headers: RHash,
+    body: RString,
+) -> Result<Option<RArray>, Error> {
+    crate::request::respond_simple(ruby, request, status, headers, body)?;
+    let next_batch = take_batch(ruby, server_id, worker_id, max)?;
+    Ok(next_batch)
+}

data/ext/kino/src/registry.rs ADDED Viewed

@@ -0,0 +1,268 @@
+//! Global server registry. Ruby never holds a pointer to native state:
+//! workers receive plain integers (server id, worker id), both
+//! Ractor-shareable, and every native call resolves them here. This is what
+//! keeps TypedData objects from ever crossing a ractor boundary.
+use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
+use std::sync::{Arc, OnceLock, Weak};
+use parking_lot::{Mutex, RwLock};
+use crate::request::RequestCtx;
+use crate::response::Responder;
+/// Requests travel through channels boxed: one heap allocation at accept
+/// time instead of moving ~300 bytes by value through every channel hop.
+/// This is the element type of the shared request queue and of every lane.
+pub type BoxedCtx = Box<RequestCtx>;
+/// Probed on every take; keys are our own ids, so ahash over SipHash.
+/// Module-private alias: within this file `HashMap` means the ahash-backed
+/// map, not the std default.
+type HashMap<K, V> = std::collections::HashMap<K, V, ahash::RandomState>;
+/// One per `Kino::Server`. Owns the tokio runtime, the request queue and the
+/// worker slots.
+pub struct ServerInner {
+    /// Registry key: `insert` files the server under this id.
+    pub id: u64,
+    /// Senders' side of the request queue. `close_queue` takes it; once all
+    /// clones drop, blocked workers see Disconnected and exit their loops.
+    pub req_tx: Mutex<Option<flume::Sender<BoxedCtx>>>,
+    /// Receiving side of the request queue; its length is what `queued`
+    /// reports for the shared queue.
+    pub req_rx: flume::Receiver<BoxedCtx>,
+    /// Signals the accept loop to stop. Watch channel: `true` = draining.
+    pub shutdown_tx: tokio::sync::watch::Sender<bool>,
+    /// Runtime is kept so we can shut it down explicitly; in an Option so
+    /// `shutdown_runtime` can take ownership out of the Arc.
+    pub runtime: Mutex<Option<tokio::runtime::Runtime>>,
+    /// Worker slots in registration order; a worker id is an index into
+    /// this Vec (see `register_worker` / `slot`).
+    pub slots: RwLock<Vec<Arc<WorkerSlot>>>,
+    /// NOTE(review): not touched in this file; presumably the number of
+    /// requests currently being processed — confirm where it is updated.
+    pub in_flight: AtomicUsize,
+    /// Requests handed to Ruby workers (admitted), and requests rejected
+    /// with a 503 (queue full / draining). Relaxed: stats-only counters.
+    pub served: AtomicU64,
+    pub rejected: AtomicU64,
+    /// NOTE(review): not read in this file; presumably how long a request
+    /// may wait for queue room, in ms — confirm at the dispatch site.
+    pub queue_timeout_ms: u64,
+    /// 0 = no request timeout; otherwise the response head must arrive
+    /// within this many ms or the client gets a 504.
+    pub request_timeout_ms: u64,
+    /// NOTE(review): presumably counts requests that hit
+    /// `request_timeout_ms` — confirm where it is incremented.
+    pub timeouts: AtomicU64,
+    /// NOTE(review): presumably true when the listener terminates TLS —
+    /// confirm against the server setup code.
+    pub https: bool,
+    /// Native access log sink (None unless log_requests is on).
+    pub access_log: Option<crate::logsink::Sink>,
+    /// Lane-dispatch mode: per-worker queues, awake-preferring dispatch.
+    pub lanes: bool,
+    /// Round-robin cursor for lane dispatch.
+    pub lane_cursor: AtomicUsize,
+}
+/// One per worker *thread* (slot count = workers × threads). The interrupt
+/// flag is the UBF target and the shutdown kick: blocking natives poll it
+/// between bounded waits (flume::Selector proved to lose wakeups under
+/// churn, so no select-style blocking anywhere). The in-flight list lets
+/// the supervisor 500 every request a dead ractor was holding; workers
+/// take requests in small batches, so there can be several.
+pub struct WorkerSlot {
+    /// Raised to make a blocked native call give up its wait.
+    pub interrupted: std::sync::atomic::AtomicBool,
+    /// Weak handles to the responders of the requests this worker holds.
+    /// The SmallVec keeps up to 8 entries inline before spilling to the heap.
+    pub current: Mutex<smallvec::SmallVec<[Weak<Responder>; 8]>>,
+    /// Lane mode only: this worker's private queue and its parked flag.
+    /// The dispatcher prefers awake (non-parked) lanes so a hot worker
+    /// keeps taking without ever paying the futex wake.
+    pub lane_tx: Mutex<Option<flume::Sender<BoxedCtx>>>,
+    /// `None` outside lane mode; `WorkerSlot::new` creates both lane halves
+    /// together or not at all.
+    pub lane_rx: Option<flume::Receiver<BoxedCtx>>,
+    /// Starts out `false` (see `WorkerSlot::new`).
+    pub parked: std::sync::atomic::AtomicBool,
+}
+/// Per-lane depth cap: small, so a slow handler can only ever delay this
+/// many queued neighbors (work stealing rescues them anyway).
+pub const LANE_DEPTH: usize = 4;
+impl WorkerSlot {
+    /// Build a fresh slot: not interrupted, not parked, nothing in flight.
+    /// With `lanes` set the slot also gets a private queue bounded at
+    /// `LANE_DEPTH`; without it both lane halves stay `None`.
+    fn new(lanes: bool) -> Self {
+        use std::sync::atomic::AtomicBool;
+
+        let lane = lanes.then(|| flume::bounded(LANE_DEPTH));
+        let (lane_tx, lane_rx) = match lane {
+            Some((tx, rx)) => (Some(tx), Some(rx)),
+            None => (None, None),
+        };
+        Self {
+            interrupted: AtomicBool::new(false),
+            current: Mutex::new(smallvec::SmallVec::new()),
+            lane_tx: Mutex::new(lane_tx),
+            lane_rx,
+            parked: AtomicBool::new(false),
+        }
+    }
+}
+/// Process-wide table of live servers, keyed by server id. Created empty
+/// on first access.
+static REGISTRY: OnceLock<RwLock<HashMap<u64, Arc<ServerInner>>>> = OnceLock::new();
+/// Source of server ids; the first id handed out is 1.
+static NEXT_SERVER_ID: AtomicU64 = AtomicU64::new(1);
+/// Access the global table, initializing it on the first call.
+fn registry() -> &'static RwLock<HashMap<u64, Arc<ServerInner>>> {
+    REGISTRY.get_or_init(RwLock::default)
+}
+/// Hand out a fresh server id. Each call returns the counter's current
+/// value and advances it by one.
+pub fn next_server_id() -> u64 {
+    let id = NEXT_SERVER_ID.fetch_add(1, Ordering::Relaxed);
+    id
+}
+/// File `server` in the global registry under its own id. An entry already
+/// stored under that id is replaced and dropped.
+pub fn insert(server: Arc<ServerInner>) {
+    let id = server.id;
+    let mut servers = registry().write();
+    servers.insert(id, server);
+}
+/// Take the server registered under `id` out of the registry, returning it
+/// if there was one.
+pub fn remove(id: u64) -> Option<Arc<ServerInner>> {
+    let mut servers = registry().write();
+    servers.remove(&id)
+}
+/// Strict lookup: the server registered under `id`, or an ArgumentError
+/// ("unknown server <id>") when there is none.
+pub fn get(ruby: &magnus::Ruby, id: u64) -> Result<Arc<ServerInner>, magnus::Error> {
+    let found = registry().read().get(&id).cloned();
+    match found {
+        Some(server) => Ok(server),
+        None => Err(magnus::Error::new(
+            ruby.exception_arg_error(),
+            format!("unknown server {id}"),
+        )),
+    }
+}
+/// Lenient lookup used on lifecycle paths. A worker that wakes up after
+/// teardown has to read "server gone" as a clean shutdown signal rather
+/// than an exception, so a missing id is just `None`.
+pub fn try_get(id: u64) -> Option<Arc<ServerInner>> {
+    let servers = registry().read();
+    servers.get(&id).map(Arc::clone)
+}
+impl ServerInner {
+    /// Queue depth of each lane, in slot order. `None` when lane dispatch
+    /// is off.
+    pub fn lane_depths(&self) -> Option<Vec<usize>> {
+        self.lanes.then(|| {
+            self.slots
+                .read()
+                .iter()
+                .filter_map(|slot| slot.lane_rx.as_ref())
+                .map(|rx| rx.len())
+                .collect()
+        })
+    }
+    /// Total number of waiting requests: the shared queue plus every open
+    /// lane.
+    pub fn queued(&self) -> usize {
+        let shared = self.req_rx.len();
+        let in_lanes: usize = match self.lane_depths() {
+            Some(depths) => depths.iter().sum(),
+            None => 0,
+        };
+        shared + in_lanes
+    }
+    /// Append a new worker slot and return its index, which serves as the
+    /// worker id.
+    pub fn register_worker(&self) -> usize {
+        let mut slots = self.slots.write();
+        let worker_id = slots.len();
+        slots.push(Arc::new(WorkerSlot::new(self.lanes)));
+        worker_id
+    }
+    /// The slot registered as `worker_id`, or an ArgumentError
+    /// ("unknown worker <id>") when the index is out of range.
+    pub fn slot(
+        &self,
+        ruby: &magnus::Ruby,
+        worker_id: usize,
+    ) -> Result<Arc<WorkerSlot>, magnus::Error> {
+        match self.slots.read().get(worker_id) {
+            Some(found) => Ok(Arc::clone(found)),
+            None => Err(magnus::Error::new(
+                ruby.exception_arg_error(),
+                format!("unknown worker {worker_id}"),
+            )),
+        }
+    }
+}
+/// Test fixture: a ServerInner that has no runtime and is not entered in
+/// the registry, for pure-Rust tests of queue accounting and dispatch.
+/// Every fixture draws a unique id, so tests that do insert into the
+/// global registry can run in parallel.
+///
+/// `lanes` switches lane dispatch on; `queue_depth` bounds the shared
+/// request queue.
+#[cfg(test)]
+pub fn test_server(lanes: bool, queue_depth: usize) -> Arc<ServerInner> {
+    let (queue_tx, queue_rx) = flume::bounded(queue_depth);
+    // Only the sending half of the watch channel is kept on the server.
+    let (shutdown_tx, _unused_rx) = tokio::sync::watch::channel(false);
+    let server = ServerInner {
+        id: next_server_id(),
+        req_tx: Mutex::new(Some(queue_tx)),
+        req_rx: queue_rx,
+        shutdown_tx,
+        runtime: Mutex::new(None),
+        slots: RwLock::new(Vec::new()),
+        in_flight: AtomicUsize::new(0),
+        served: AtomicU64::new(0),
+        rejected: AtomicU64::new(0),
+        queue_timeout_ms: 10,
+        request_timeout_ms: 0,
+        timeouts: AtomicU64::new(0),
+        https: false,
+        access_log: None,
+        lanes,
+        lane_cursor: AtomicUsize::new(0),
+    };
+    Arc::new(server)
+}
+// Unit tests for slot registration, queue accounting and the registry
+// lifecycle; all of them run against `test_server` fixtures.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::request::test_ctx;
+    // Worker ids are the slot indices, handed out in registration order.
+    #[test]
+    fn worker_registration_hands_out_sequential_slot_ids() {
+        let server = test_server(false, 4);
+        assert_eq!(server.register_worker(), 0);
+        assert_eq!(server.register_worker(), 1);
+        assert_eq!(server.slots.read().len(), 2);
+        // Shared-queue mode creates no lane channels.
+        assert!(server.slots.read()[0].lane_rx.is_none());
+    }
+    // In lane mode each slot owns a queue bounded at LANE_DEPTH, with the
+    // sender half still in place.
+    #[test]
+    fn lane_mode_slots_get_bounded_lanes() {
+        let server = test_server(true, 4);
+        server.register_worker();
+        let slots = server.slots.read();
+        let lane_rx = slots[0].lane_rx.as_ref().expect("lane created");
+        assert_eq!(lane_rx.capacity(), Some(LANE_DEPTH));
+        assert!(slots[0].lane_tx.lock().is_some());
+    }
+    // Without lanes, `queued` is exactly the shared queue's length and
+    // `lane_depths` reports nothing.
+    #[test]
+    fn queued_counts_the_shared_queue() {
+        let server = test_server(false, 4);
+        assert_eq!(server.queued(), 0);
+        let tx = server.req_tx.lock().clone().expect("queue open");
+        tx.send(test_ctx()).expect("queue has room");
+        tx.send(test_ctx()).expect("queue has room");
+        assert_eq!(server.queued(), 2);
+        assert!(server.lane_depths().is_none());
+    }
+    // With lanes, a request sitting in one lane shows up both in the
+    // per-slot depths and in the `queued` total.
+    #[test]
+    fn queued_includes_lanes_and_lane_depths_reports_per_slot() {
+        let server = test_server(true, 4);
+        server.register_worker();
+        server.register_worker();
+        let slots = server.slots.read();
+        let lane0 = slots[0].lane_tx.lock().clone().expect("lane open");
+        lane0.send(test_ctx()).expect("lane has room");
+        // Release the read guard before calling methods that take the
+        // `slots` lock themselves.
+        drop(slots);
+        assert_eq!(server.lane_depths(), Some(vec![1, 0]));
+        assert_eq!(server.queued(), 1);
+    }
+    // A fixture is unregistered until inserted; after removal both lookup
+    // and a second removal come back empty.
+    #[test]
+    fn registry_lifecycle_insert_lookup_remove() {
+        let server = test_server(false, 1);
+        let id = server.id;
+        assert!(try_get(id).is_none());
+        insert(server);
+        assert!(try_get(id).is_some());
+        let removed = remove(id).expect("was registered");
+        assert_eq!(removed.id, id);
+        // Late wakers see "gone" as a clean shutdown signal, not a panic.
+        assert!(try_get(id).is_none());
+        assert!(remove(id).is_none());
+    }
+    // Two consecutive draws from the id counter never coincide.
+    #[test]
+    fn server_ids_are_unique() {
+        let a = next_server_id();
+        let b = next_server_id();
+        assert_ne!(a, b);
+    }
+}