honker 0.1.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3116 @@
1
+ //! Shared Rust core for the honker bindings.
2
+ //!
3
+ //! This crate is NOT intended for direct use. It's the plain-Rust
4
+ //! foundation that three binding crates depend on:
5
+ //!
6
+ //! * `honker` — PyO3 Python extension
7
+ //! * `honker-extension` — SQLite loadable extension (cdylib)
8
+ //! * `honker-node` — napi-rs Node.js binding
9
+ //!
10
+ //! Moving this code here once avoids the three-copies-of-the-same-SQL
11
+ //! problem every binding would otherwise suffer. Behavioral drift
12
+ //! between the three bindings was a real risk — one would get a new
13
+ //! PRAGMA, one wouldn't, and silent inconsistencies would surface only
14
+ //! when a Python process and a Node process tried to share a `.db`
15
+ //! file.
16
+ //!
17
+ //! What's here:
18
+ //!
19
+ //! - [`open_conn`] — open a SQLite connection with the library's
20
+ //! PRAGMA defaults (WAL, synchronous=NORMAL, 32MB cache, etc.).
21
+ //! - [`attach_notify`] — create `_honker_notifications` and
22
+ //! register the `notify(channel, payload)` SQL scalar function.
23
+ //! - [`Writer`] — single-connection write slot with blocking
24
+ //! acquire, non-blocking try_acquire, and release.
25
+ //! - [`Readers`] — bounded pool of reader connections that open
26
+ //! lazily up to a max.
27
+ //! - [`UpdateWatcher`] — 1 ms PRAGMA-polling thread that fires a
28
+ //! callback on every database commit. Uses `PRAGMA data_version`
29
+ //! for precise change detection, with a periodic stat identity check
30
+ //! to detect file replacement. Bindings wrap this to surface wake
31
+ //! events to their language's async primitive.
32
+ //!
33
+ //! Anything language-specific — PyO3 classes, napi classes, SQLite
34
+ //! entry-point symbols, row-materialization into Python dicts or JS
35
+ //! objects — stays in the respective binding crate.
36
+
37
+ pub mod cron;
38
+ mod honker_ops;
39
+ #[cfg(feature = "kernel-watcher")]
40
+ mod kernel_watcher;
41
+ #[cfg(feature = "shm-fast-path")]
42
+ mod shm_watcher;
43
+
44
+ pub use honker_ops::attach_honker_functions;
45
+
46
+ use parking_lot::{Condvar, Mutex};
47
+ use rusqlite::functions::FunctionFlags;
48
+ use rusqlite::{Connection, OpenFlags, ffi};
49
+ use std::collections::HashMap;
50
+ use std::path::{Path, PathBuf};
51
+ use std::sync::Arc;
52
+ use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
53
+ use std::sync::mpsc::{SyncSender, TrySendError};
54
+ use std::time::{Duration, Instant};
55
+
56
+ // ---------------------------------------------------------------------
57
+ // Watcher backend configuration
58
+ // ---------------------------------------------------------------------
59
+
60
/// Selects the mechanism that drives the update-detection loop.
///
/// [`WatcherBackend::Polling`] is the default: a 1 ms `PRAGMA data_version`
/// loop, proven correct across all platforms. Every other backend is
/// **experimental** and must first prove equivalence to the polling path
/// before being relied on for correctness.
#[derive(Debug, Clone, Default)]
pub enum WatcherBackend {
    /// Default: 1 ms `PRAGMA data_version` polling loop.
    #[default]
    Polling,
    /// OS kernel filesystem notifications (experimental).
    ///
    /// Fires `on_change()` on every non-Access filesystem event in the
    /// db's parent directory plus per-file events on `-wal`/`-shm`.
    /// Spurious wakes are possible (consumers re-read state and dedupe).
    /// Missed wakes are possible if the OS drops events; the consumer's
    /// `idle_poll_s` is the only backstop. Setup failures log and
    /// silently disable the watcher; there is no fall-back to polling.
    #[cfg(feature = "kernel-watcher")]
    KernelWatch,
    /// mmap `-shm` WAL index fast path (experimental).
    ///
    /// Reads `iChange` (offset 8 in the WAL index header) at 100 µs
    /// cadence and fires `on_change()` when it advances. WAL mode only.
    /// Trusts the on-disk shm layout (verified via the equivalence
    /// test at build time). If the layout changes or the `-shm` file
    /// is recreated mid-flight, wakes silently stop until restart.
    #[cfg(feature = "shm-fast-path")]
    ShmFastPath,
}

/// Configuration passed to [`UpdateWatcher::spawn_with_config`] and
/// [`SharedUpdateWatcher::new_with_config`].
#[derive(Debug, Clone, Default)]
pub struct WatcherConfig {
    /// Backend used to detect database updates.
    pub backend: WatcherBackend,
}

impl WatcherBackend {
    /// Translate a binding-level backend name into a [`WatcherBackend`].
    ///
    /// Lives in the shared core so every binding accepts the same
    /// aliases. Requesting a backend that was not compiled in is an
    /// error: callers must not silently substitute polling after an
    /// explicit opt-in.
    ///
    /// Accepted: `None` / `"polling"` / `"poll"`,
    /// `"kernel"` / `"kernel-watcher"`, `"shm"` / `"shm-fast-path"`.
    ///
    /// # Errors
    ///
    /// Returns a human-readable message when the name is unknown or the
    /// matching Cargo feature is disabled.
    pub fn parse(name: Option<&str>) -> Result<Self, String> {
        // An absent name means "use the default", i.e. polling.
        match name.unwrap_or("polling") {
            "polling" | "poll" => Ok(Self::Polling),
            "kernel" | "kernel-watcher" => Self::kernel_if_compiled(),
            "shm" | "shm-fast-path" => Self::shm_if_compiled(),
            other => Err(format!(
                "unknown watcher backend {other:?}; valid: None, 'polling', 'kernel', 'shm'"
            )),
        }
    }

    /// Kernel backend when the `kernel-watcher` feature is compiled in.
    #[cfg(feature = "kernel-watcher")]
    fn kernel_if_compiled() -> Result<Self, String> {
        Ok(Self::KernelWatch)
    }

    /// Error explaining that the `kernel-watcher` feature is missing.
    #[cfg(not(feature = "kernel-watcher"))]
    fn kernel_if_compiled() -> Result<Self, String> {
        Err("watcher backend 'kernel' requires the kernel-watcher Cargo feature".to_string())
    }

    /// Shm backend when the `shm-fast-path` feature is compiled in.
    #[cfg(feature = "shm-fast-path")]
    fn shm_if_compiled() -> Result<Self, String> {
        Ok(Self::ShmFastPath)
    }

    /// Error explaining that the `shm-fast-path` feature is missing.
    #[cfg(not(feature = "shm-fast-path"))]
    fn shm_if_compiled() -> Result<Self, String> {
        Err("watcher backend 'shm' requires the shm-fast-path Cargo feature".to_string())
    }

    /// Check that this backend can actually initialize for `db_path`.
    ///
    /// Bindings call this at `honker.open()` time so a backend that
    /// cannot run errors loudly instead of silently producing no wakes.
    /// Polling always succeeds; the optional backends delegate to their
    /// module's `probe`.
    ///
    /// # Errors
    ///
    /// Returns a human-readable reason on failure.
    pub fn probe(&self, db_path: &Path) -> Result<(), String> {
        match self {
            Self::Polling => {
                // Keeps `db_path` "used" when no optional backend is compiled in.
                let _ = db_path;
                Ok(())
            }
            #[cfg(feature = "kernel-watcher")]
            Self::KernelWatch => kernel_watcher::probe(db_path),
            #[cfg(feature = "shm-fast-path")]
            Self::ShmFastPath => shm_watcher::probe(db_path),
        }
    }
}
159
+
160
/// Error type returned by the fallible functions in this crate.
#[derive(thiserror::Error, Debug)]
pub enum Error {
    /// A failure reported by SQLite / rusqlite. `#[from]` lets `?`
    /// convert a `rusqlite::Error` into this variant automatically.
    #[error("Database error: {0}")]
    Sqlite(#[from] rusqlite::Error),
}
165
+
166
+ // ---------------------------------------------------------------------
167
+ // PRAGMAs
168
+ // ---------------------------------------------------------------------
169
+
170
/// Default PRAGMA block applied on every connection open. Rationale:
///
/// * `journal_mode=WAL`        — concurrent readers with one writer.
/// * `synchronous=NORMAL`      — fsync WAL at checkpoint, not every
///   commit. Safe against app crashes; OS crashes may lose the last
///   few un-checkpointed transactions.
/// * `busy_timeout=5000`       — wait up to 5s for the writer lock
///   before returning SQLITE_BUSY.
/// * `foreign_keys=ON`         — enforce FK constraints (off by
///   default in SQLite, a real footgun).
/// * `cache_size=-32000`       — 32MB page cache (default was 2MB).
/// * `temp_store=MEMORY`       — temp B-trees in RAM, not disk.
/// * `wal_autocheckpoint=10000`— auto-checkpoint once the WAL reaches
///   10k pages. Reduces checkpoint (and therefore fsync) frequency
///   10× vs the default of 1k.
pub const DEFAULT_PRAGMAS: &str = "PRAGMA journal_mode = WAL;
    PRAGMA synchronous = NORMAL;
    PRAGMA busy_timeout = 5000;
    PRAGMA foreign_keys = ON;
    PRAGMA cache_size = -32000;
    PRAGMA temp_store = MEMORY;
    PRAGMA wal_autocheckpoint = 10000;";

/// Apply the library's default PRAGMAs ([`DEFAULT_PRAGMAS`]) to an
/// already-open connection by executing them as a single batch.
/// Idempotent.
///
/// # Errors
///
/// Returns the `rusqlite` error if executing the batch fails.
pub fn apply_default_pragmas(conn: &Connection) -> rusqlite::Result<()> {
    conn.execute_batch(DEFAULT_PRAGMAS)
}
197
+
198
+ // ---------------------------------------------------------------------
199
+ // notify() SQL function + notifications schema
200
+ // ---------------------------------------------------------------------
201
+
202
+ /// Install the `_honker_notifications` table and the
203
+ /// `notify(channel, payload)` SQL scalar function on `conn`. Idempotent.
204
+ ///
205
+ /// `notify()` is the public cross-process primitive. Callers do:
206
+ ///
207
+ /// ```sql
208
+ /// BEGIN IMMEDIATE;
209
+ /// INSERT INTO orders ...;
210
+ /// SELECT notify('orders', '{"id":42}');
211
+ /// COMMIT;
212
+ /// ```
213
+ ///
214
+ /// The scalar function returns the INSERTed row id. Listeners watch
215
+ /// database updates and SELECT new rows by channel.
216
+ ///
217
+ /// Pruning is NOT done here. Callers invoke
218
+ /// `Database.prune_notifications(older_than_s, max_keep)` when they want
219
+ /// to trim the table. No magic timer.
220
+ pub fn attach_notify(conn: &Connection) -> Result<(), Error> {
221
+ conn.execute_batch(
222
+ "CREATE TABLE IF NOT EXISTS _honker_notifications (
223
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
224
+ channel TEXT NOT NULL,
225
+ payload TEXT NOT NULL,
226
+ created_at INTEGER NOT NULL DEFAULT (unixepoch())
227
+ );
228
+ CREATE INDEX IF NOT EXISTS _honker_notifications_recent
229
+ ON _honker_notifications(channel, id);",
230
+ )?;
231
+
232
+ conn.create_scalar_function("notify", 2, FunctionFlags::SQLITE_UTF8, |ctx| {
233
+ let channel: String = ctx.get(0)?;
234
+ let payload: String = ctx.get(1)?;
235
+ let db = unsafe { ctx.get_connection() }?;
236
+ let mut ins = db.prepare_cached(
237
+ "INSERT INTO _honker_notifications (channel, payload) VALUES (?1, ?2)",
238
+ )?;
239
+ let id = ins.insert(rusqlite::params![channel, payload])?;
240
+ Ok(id)
241
+ })?;
242
+
243
+ Ok(())
244
+ }
245
+
246
+ // ---------------------------------------------------------------------
247
+ // honker queue schema
248
+ // ---------------------------------------------------------------------
249
+
250
/// Canonical DDL for the honker queue schema. Shared source of truth
/// so the Python binding's `Queue._init_schema`, the SQLite loadable
/// extension's `honker_bootstrap()`, and any future binding can't drift.
///
/// Schema:
///
/// * `_honker_live` — pending + processing jobs. Partial index
///   `_honker_live_claim` restricts to those two states so dead-row
///   history never slows down the claim hot path. Two further partial
///   indexes cover pending rows by `run_at` and processing rows by
///   `claim_expires_at`.
/// * `_honker_dead` — terminal rows (retry-exhausted or explicitly
///   failed). Never scanned by the claim path; retention policy is
///   the user's problem.
/// * `_honker_locks` — one row per lock `name`, with an `owner` and
///   an `expires_at`.
/// * `_honker_rate_limits` — a `count` per `(name, window_start)`.
/// * `_honker_scheduler_tasks` — one row per task `name`: target
///   queue, cron expression, payload, `next_fire_at`, `enabled` flag.
/// * `_honker_results` — an optional `value` per `job_id`, with an
///   optional `expires_at`.
/// * `_honker_stream` / `_honker_stream_consumers` — rows keyed by an
///   autoincrement `offset` and indexed by `(topic, offset)`, plus a
///   stored `offset` per `(name, topic)` consumer.
///
/// Idempotent (`CREATE TABLE IF NOT EXISTS` / `CREATE INDEX IF NOT
/// EXISTS`). Views and schema-version cleanup live in the language
/// binding, not here — they're caller-specific.
pub const BOOTSTRAP_HONKER_SQL: &str = "
  CREATE TABLE IF NOT EXISTS _honker_live (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    queue TEXT NOT NULL,
    payload TEXT NOT NULL,
    state TEXT NOT NULL DEFAULT 'pending',
    priority INTEGER NOT NULL DEFAULT 0,
    run_at INTEGER NOT NULL DEFAULT (unixepoch()),
    worker_id TEXT,
    claim_expires_at INTEGER,
    attempts INTEGER NOT NULL DEFAULT 0,
    max_attempts INTEGER NOT NULL DEFAULT 3,
    created_at INTEGER NOT NULL DEFAULT (unixepoch()),
    expires_at INTEGER
  );
  CREATE INDEX IF NOT EXISTS _honker_live_claim
    ON _honker_live(queue, priority DESC, run_at, id)
    WHERE state IN ('pending', 'processing');
  CREATE INDEX IF NOT EXISTS _honker_live_pending_deadline
    ON _honker_live(queue, run_at)
    WHERE state = 'pending';
  CREATE INDEX IF NOT EXISTS _honker_live_processing_deadline
    ON _honker_live(queue, claim_expires_at)
    WHERE state = 'processing';
  CREATE TABLE IF NOT EXISTS _honker_dead (
    id INTEGER PRIMARY KEY,
    queue TEXT NOT NULL,
    payload TEXT NOT NULL,
    priority INTEGER NOT NULL DEFAULT 0,
    run_at INTEGER NOT NULL DEFAULT 0,
    attempts INTEGER NOT NULL DEFAULT 0,
    max_attempts INTEGER NOT NULL DEFAULT 0,
    last_error TEXT,
    created_at INTEGER NOT NULL DEFAULT (unixepoch()),
    died_at INTEGER NOT NULL DEFAULT (unixepoch())
  );
  CREATE TABLE IF NOT EXISTS _honker_locks (
    name TEXT PRIMARY KEY,
    owner TEXT NOT NULL,
    expires_at INTEGER NOT NULL
  );
  CREATE TABLE IF NOT EXISTS _honker_rate_limits (
    name TEXT NOT NULL,
    window_start INTEGER NOT NULL,
    count INTEGER NOT NULL DEFAULT 0,
    PRIMARY KEY (name, window_start)
  );
  CREATE TABLE IF NOT EXISTS _honker_scheduler_tasks (
    name TEXT PRIMARY KEY,
    queue TEXT NOT NULL,
    cron_expr TEXT NOT NULL,
    payload TEXT NOT NULL,
    priority INTEGER NOT NULL DEFAULT 0,
    expires_s INTEGER,
    next_fire_at INTEGER NOT NULL,
    enabled INTEGER NOT NULL DEFAULT 1
  );
  CREATE TABLE IF NOT EXISTS _honker_results (
    job_id INTEGER PRIMARY KEY,
    value TEXT,
    created_at INTEGER NOT NULL DEFAULT (unixepoch()),
    expires_at INTEGER
  );
  CREATE TABLE IF NOT EXISTS _honker_stream (
    offset INTEGER PRIMARY KEY AUTOINCREMENT,
    topic TEXT NOT NULL,
    key TEXT,
    payload TEXT NOT NULL,
    created_at INTEGER NOT NULL DEFAULT (unixepoch())
  );
  CREATE INDEX IF NOT EXISTS _honker_stream_topic
    ON _honker_stream(topic, offset);
  CREATE TABLE IF NOT EXISTS _honker_stream_consumers (
    name TEXT NOT NULL,
    topic TEXT NOT NULL,
    offset INTEGER NOT NULL DEFAULT 0,
    PRIMARY KEY (name, topic)
  );
";
345
+
346
+ /// Install the honker queue schema on `conn`. Idempotent. See
347
+ /// [`BOOTSTRAP_HONKER_SQL`] for the DDL and rationale.
348
+ ///
349
+ /// Works in any journal mode. WAL mode is still the recommended
350
+ /// default (concurrent readers, one writer, efficient fsync), but
351
+ /// callers who know what they're doing can run honker tables on a
352
+ /// DELETE-journal database. Cross-process wake is their responsibility.
353
+ pub fn bootstrap_honker_schema(conn: &Connection) -> Result<(), Error> {
354
+ conn.execute_batch(BOOTSTRAP_HONKER_SQL)?;
355
+ // Migration: pre-Mantle databases lack `enabled` on
356
+ // _honker_scheduler_tasks. ADD COLUMN if absent.
357
+ //
358
+ // Race: two processes bootstrapping concurrently could both see
359
+ // "missing" and both attempt the ALTER. SQLite serializes writes
360
+ // file-wide, so they don't actually run at once — the second
361
+ // ALTER errors with "duplicate column" because by then the first
362
+ // has committed. Swallow that specific error; bubble anything else.
363
+ let has_enabled: bool = {
364
+ let mut stmt = conn.prepare(
365
+ "SELECT 1 FROM pragma_table_info('_honker_scheduler_tasks') WHERE name='enabled'",
366
+ )?;
367
+ stmt.query_row([], |_| Ok(true)).unwrap_or(false)
368
+ };
369
+ if !has_enabled {
370
+ match conn.execute(
371
+ "ALTER TABLE _honker_scheduler_tasks ADD COLUMN enabled INTEGER NOT NULL DEFAULT 1",
372
+ [],
373
+ ) {
374
+ Ok(_) => {}
375
+ Err(e) if e.to_string().to_lowercase().contains("duplicate column") => {
376
+ // Lost the race; the other process added it. Fine.
377
+ }
378
+ Err(e) => return Err(e.into()),
379
+ }
380
+ }
381
+ Ok(())
382
+ }
383
+
384
+ // ---------------------------------------------------------------------
385
+ // Opening connections
386
+ // ---------------------------------------------------------------------
387
+
388
+ /// Open a SQLite connection at `path` with the library's PRAGMA
389
+ /// defaults. If `install_notify` is true, also attach the notifications
390
+ /// table + `notify()` SQL function. Readers don't need it; only the
391
+ /// writer connection does.
392
+ pub fn open_conn(path: &str, install_notify: bool) -> Result<Connection, Error> {
393
+ let conn = Connection::open_with_flags(
394
+ path,
395
+ OpenFlags::SQLITE_OPEN_READ_WRITE
396
+ | OpenFlags::SQLITE_OPEN_CREATE
397
+ | OpenFlags::SQLITE_OPEN_URI,
398
+ )?;
399
+ apply_default_pragmas(&conn)?;
400
+ if install_notify {
401
+ attach_notify(&conn)?;
402
+ }
403
+ Ok(conn)
404
+ }
405
+
406
+ // ---------------------------------------------------------------------
407
+ // Writer slot
408
+ // ---------------------------------------------------------------------
409
+
410
+ /// Single-connection write slot. Writers serialize through one
411
+ /// rusqlite `Connection` because WAL mode allows only one writer at a
412
+ /// time anyway; doing it in user space avoids busy-timeout retries.
413
+ ///
414
+ /// Provides explicit [`close`](Self::close) so bindings can release the
415
+ /// underlying SQLite handle independent of `Arc<Writer>` reference
416
+ /// count. Without this, an outstanding `Arc<Writer>` clone (kept alive
417
+ /// by a not-yet-GC'd Transaction object on the JS/Python side) would
418
+ /// keep the connection open and the `.db` file locked. On Windows that
419
+ /// blocks `rmdir`/`unlink` of the temp directory until GC runs; on
420
+ /// Linux/macOS the unlink succeeds but the file descriptor leaks.
421
+ pub struct Writer {
422
+ slot: Mutex<Option<Connection>>,
423
+ available: Condvar,
424
+ closed: AtomicBool,
425
+ }
426
+
427
+ impl Writer {
428
+ pub fn new(conn: Connection) -> Self {
429
+ Self {
430
+ slot: Mutex::new(Some(conn)),
431
+ available: Condvar::new(),
432
+ closed: AtomicBool::new(false),
433
+ }
434
+ }
435
+
436
+ /// Blocking acquire. Waits on a condvar if the slot is held.
437
+ /// Returns `None` if the writer has been [closed](Self::close).
438
+ pub fn acquire(&self) -> Option<Connection> {
439
+ let mut guard = self.slot.lock();
440
+ loop {
441
+ if self.closed.load(Ordering::Acquire) {
442
+ return None;
443
+ }
444
+ if let Some(c) = guard.take() {
445
+ return Some(c);
446
+ }
447
+ self.available.wait(&mut guard);
448
+ }
449
+ }
450
+
451
+ /// Non-blocking. Returns `Some(conn)` if the slot was immediately
452
+ /// free, else `None`. Bindings use this for a fast path that
453
+ /// avoids GIL release (Python) or async thread-hops (Node) when
454
+ /// the slot is uncontended. Also returns `None` if closed.
455
+ pub fn try_acquire(&self) -> Option<Connection> {
456
+ if self.closed.load(Ordering::Acquire) {
457
+ return None;
458
+ }
459
+ self.slot.lock().take()
460
+ }
461
+
462
+ /// Return a connection to the slot. After [close](Self::close), the
463
+ /// connection is dropped instead of being returned to the pool.
464
+ pub fn release(&self, conn: Connection) {
465
+ if self.closed.load(Ordering::Acquire) {
466
+ // Drop conn instead of returning it to a closed pool.
467
+ return;
468
+ }
469
+ let mut guard = self.slot.lock();
470
+ *guard = Some(conn);
471
+ self.available.notify_one();
472
+ }
473
+
474
+ /// Drop the underlying connection and refuse further acquisitions.
475
+ /// Idempotent. Wakes any blocked `acquire()` callers; they observe
476
+ /// the closed flag and return `None`.
477
+ ///
478
+ /// If a transaction is currently holding the connection (i.e. the
479
+ /// slot is empty), it stays out — the transaction's eventual
480
+ /// `release` will see `closed == true` and drop the connection
481
+ /// itself. So the file handle is released either way; what
482
+ /// matters is that no further writes happen after `close`.
483
+ pub fn close(&self) {
484
+ self.closed.store(true, Ordering::Release);
485
+ let mut guard = self.slot.lock();
486
+ guard.take(); // drops the connection if the slot is occupied
487
+ self.available.notify_all();
488
+ }
489
+ }
490
+
491
+ // ---------------------------------------------------------------------
492
+ // Reader pool
493
+ // ---------------------------------------------------------------------
494
+
495
+ /// Bounded pool of reader connections. Readers are cheap (one file
496
+ /// descriptor + a page cache) and WAL mode allows any number to run
497
+ /// concurrently with the writer.
498
+ ///
499
+ /// Provides explicit [`close`](Self::close) for the same reason as
500
+ /// [`Writer::close`] — see that doc.
501
+ pub struct Readers {
502
+ pool: Mutex<Vec<Connection>>,
503
+ outstanding: Mutex<usize>,
504
+ available: Condvar,
505
+ path: String,
506
+ max: usize,
507
+ closed: AtomicBool,
508
+ }
509
+
510
+ impl Readers {
511
+ pub fn new(path: String, max: usize) -> Self {
512
+ Self {
513
+ pool: Mutex::new(Vec::new()),
514
+ outstanding: Mutex::new(0),
515
+ available: Condvar::new(),
516
+ path,
517
+ max: max.max(1),
518
+ closed: AtomicBool::new(false),
519
+ }
520
+ }
521
+
522
+ /// Acquire a reader. Pops a pooled one if available; otherwise
523
+ /// opens a new connection up to `max`. Above `max`, waits on the
524
+ /// condvar. After [`close`](Self::close), returns
525
+ /// `Err(rusqlite::Error::ExecuteReturnedResults)` as a sentinel —
526
+ /// bindings should map this to "Database is closed".
527
+ pub fn acquire(&self) -> Result<Connection, Error> {
528
+ loop {
529
+ if self.closed.load(Ordering::Acquire) {
530
+ return Err(closed_err());
531
+ }
532
+ let mut pool = self.pool.lock();
533
+ if let Some(c) = pool.pop() {
534
+ return Ok(c);
535
+ }
536
+ let mut out = self.outstanding.lock();
537
+ if *out < self.max {
538
+ *out += 1;
539
+ drop(out);
540
+ drop(pool);
541
+ let conn = open_conn(&self.path, false)?;
542
+ // Re-check: if close() raced us, drop the brand-new
543
+ // connection instead of handing it out.
544
+ if self.closed.load(Ordering::Acquire) {
545
+ drop(conn);
546
+ return Err(closed_err());
547
+ }
548
+ return Ok(conn);
549
+ }
550
+ drop(out);
551
+ self.available.wait(&mut pool);
552
+ }
553
+ }
554
+
555
+ /// Return a connection to the pool. After [close](Self::close), the
556
+ /// connection is dropped instead of pooled.
557
+ pub fn release(&self, conn: Connection) {
558
+ if self.closed.load(Ordering::Acquire) {
559
+ return;
560
+ }
561
+ let mut pool = self.pool.lock();
562
+ pool.push(conn);
563
+ self.available.notify_one();
564
+ }
565
+
566
+ /// Drop all pooled connections and refuse further acquisitions.
567
+ /// Idempotent. Wakes any blocked `acquire()` callers; they observe
568
+ /// the closed flag and return the closed sentinel.
569
+ pub fn close(&self) {
570
+ self.closed.store(true, Ordering::Release);
571
+ self.pool.lock().clear(); // drops pooled connections
572
+ self.available.notify_all();
573
+ }
574
+ }
575
+
576
+ /// Sentinel error for "pool closed". Bindings can match the inner
577
+ /// `rusqlite::Error::SqliteFailure` with code `SQLITE_MISUSE` and
578
+ /// message containing "Database is closed" to surface a clean error
579
+ /// to user code. `SQLITE_MISUSE` is appropriate here — calling
580
+ /// acquire on a closed pool is a misuse of the API.
581
+ fn closed_err() -> Error {
582
+ Error::Sqlite(rusqlite::Error::SqliteFailure(
583
+ rusqlite::ffi::Error::new(rusqlite::ffi::SQLITE_MISUSE),
584
+ Some("Database is closed".to_string()),
585
+ ))
586
+ }
587
+
588
+ // ---------------------------------------------------------------------
589
+ // Database file watcher
590
+ // ---------------------------------------------------------------------
591
+
592
+ /// Platform-specific file identity: `(dev, ino)` on Unix,
593
+ /// `(volume_serial, file_index)` on Windows. Used to detect when the
594
+ /// database file has been replaced underneath us (atomic rename,
595
+ /// litestream restore, volume remount).
596
+ ///
597
+ /// Uses the `file-id` crate on unix and windows for stable Rust
598
+ /// support without nightly features. Falls back to `(0, 0)` on other
599
+ /// targets (WASI, Redox, illumos, etc.) — same behavior as the
600
+ /// pre-`file-id` `#[cfg(not(any(unix, windows)))]` branch. On those
601
+ /// targets the dead-man's switch is a no-op (every `stat_identity`
602
+ /// returns `(0, 0)` so the equality check never trips); replacement
603
+ /// detection is disabled but the watcher still functions. Nobody is
604
+ /// known to deploy honker there today.
605
+ #[cfg(any(unix, windows))]
606
+ pub(crate) fn stat_identity(path: &Path) -> std::io::Result<(u64, u64)> {
607
+ let id = file_id::get_file_id(path)?;
608
+ match id {
609
+ file_id::FileId::Inode {
610
+ device_id,
611
+ inode_number,
612
+ } => Ok((device_id, inode_number)),
613
+ file_id::FileId::LowRes {
614
+ volume_serial_number,
615
+ file_index,
616
+ } => Ok((volume_serial_number as u64, file_index)),
617
+ file_id::FileId::HighRes {
618
+ volume_serial_number,
619
+ file_id,
620
+ } => Ok(fold_high_res(volume_serial_number, file_id)),
621
+ }
622
+ }
623
+
624
/// Squash a 128-bit ReFS / `FILE_ID_INFO` `file_id` into 64 bits so the
/// identity fits the `(u64, u64)` shape returned by [`stat_identity`].
/// The volume serial number is passed through unchanged.
///
/// The two 64-bit halves are XORed together. NTFS leaves the upper
/// half at 0, so there the result is exactly the lower 64 bits, the
/// same as plain truncation. ReFS can populate both halves; XOR-folding
/// lets both contribute.
///
/// For the "did this file get atomically renamed?" check in
/// `UpdateWatcher`, truncation and XOR-fold are equally good: ReFS
/// file_ids change wholesale on rename, so the lower 64 bits change
/// too. The practical collision probability matches truncation
/// (~2⁻⁶⁴) and is acceptable for this use.
#[cfg(any(unix, windows))]
fn fold_high_res(volume_serial_number: u64, file_id: u128) -> (u64, u64) {
    let upper_half = (file_id >> 64) as u64;
    // `as u64` on a u128 keeps the low 64 bits.
    let lower_half = file_id as u64;
    (volume_serial_number, upper_half ^ lower_half)
}
643
+
644
/// Fallback for targets that are neither unix nor windows: ignores the
/// path and always reports the identity `(0, 0)`, so any comparison of
/// two identities on these targets finds them equal.
#[cfg(not(any(unix, windows)))]
pub(crate) fn stat_identity(_path: &Path) -> std::io::Result<(u64, u64)> {
    Ok((0, 0))
}
648
+
649
+ /// Read the pager's `data_version` counter via `PRAGMA data_version`.
650
+ /// Returns a monotonic u32 incremented on every commit by any
651
+ /// connection (and on checkpoint). Empirically verified to detect
652
+ /// cross-connection database updates on all SQLite versions tested.
653
+ /// Cost: ~3.5 µs/call = ~3.5 ms/sec at 1 kHz.
654
+ pub(crate) fn poll_data_version(conn: &Connection) -> Result<u32, String> {
655
+ conn.pragma_query_value(None, "data_version", |row| row.get(0))
656
+ .map_err(|e| e.to_string())
657
+ }
658
+
659
+ /// Returns true if `e` is a transient lock conflict (SQLITE_BUSY /
660
+ /// SQLITE_LOCKED). On non-WAL journal modes the writer holds an
661
+ /// exclusive lock during commit, so the watcher's PRAGMA frequently
662
+ /// races into one of these. Treat as "try again next tick", not as a
663
+ /// connection failure — dropping and re-opening would silently re-
664
+ /// baseline `last_version` and skip pending wakes.
665
+ fn is_transient_lock_error(e: &rusqlite::Error) -> bool {
666
+ matches!(
667
+ e,
668
+ rusqlite::Error::SqliteFailure(
669
+ ffi::Error {
670
+ code: ffi::ErrorCode::DatabaseBusy | ffi::ErrorCode::DatabaseLocked,
671
+ ..
672
+ },
673
+ _,
674
+ )
675
+ )
676
+ }
677
+
678
+ /// Polling loop body shared by [`UpdateWatcher`] (polling backend) and
679
+ /// the fallback path inside [`kernel_watcher`] / [`shm_watcher`].
680
+ ///
681
+ /// Three-layer defensive architecture:
682
+ ///
683
+ /// 1. **Fast path (every 1 ms):** `PRAGMA data_version`. Compare the
684
+ /// integer to last seen value. Notify on change. (~3.5 µs/call.)
685
+ /// 2. **Error recovery (every 1 ms on failure):** If the query fails,
686
+ /// reconnect the SQLite connection and force one wake.
687
+ /// 3. **Identity check (about every 100 ms):** `stat(db_path)` to compare
688
+ /// `(dev, ino)`. If the file was replaced, panic with a clear
689
+ /// message — continuing would silently watch stale data.
690
+ pub(crate) fn run_poll_loop<F>(
691
+ db_path: PathBuf,
692
+ on_change: F,
693
+ stop: Arc<AtomicBool>,
694
+ ready: std::sync::mpsc::SyncSender<()>,
695
+ ) where
696
+ F: Fn(),
697
+ {
698
+ let mut conn = match Connection::open_with_flags(
699
+ &db_path,
700
+ OpenFlags::SQLITE_OPEN_READ_WRITE | OpenFlags::SQLITE_OPEN_NO_MUTEX,
701
+ ) {
702
+ Ok(c) => Some(c),
703
+ Err(e) => {
704
+ eprintln!("honker: failed to open watcher connection: {e}");
705
+ None
706
+ }
707
+ };
708
+ let mut last_version = conn
709
+ .as_ref()
710
+ .and_then(|c| poll_data_version(c).ok())
711
+ .unwrap_or(0);
712
+ let initial_identity = match stat_identity(&db_path) {
713
+ Ok(id) => id,
714
+ Err(e) => {
715
+ eprintln!("honker: failed to stat database for identity check: {e}");
716
+ (0, 0)
717
+ }
718
+ };
719
+ // Wall-clock cadence: tick counting drifts on Windows where 1 ms
720
+ // sleeps round up to ~15 ms.
721
+ let mut next_identity_check = Instant::now() + UPDATE_WATCHER_IDENTITY_INTERVAL;
722
+ // Baseline captured; signal the spawner that it's safe to return.
723
+ let _ = ready.send(());
724
+ drop(ready);
725
+
726
+ while !stop.load(Ordering::Acquire) {
727
+ std::thread::sleep(Duration::from_millis(1));
728
+
729
+ // Path 1: PRAGMA data_version (fast path)
730
+ if let Some(ref c) = conn {
731
+ match c.pragma_query_value(None, "data_version", |row| row.get::<_, u32>(0)) {
732
+ Ok(version) => {
733
+ if version != last_version {
734
+ last_version = version;
735
+ on_change();
736
+ }
737
+ }
738
+ Err(e) if is_transient_lock_error(&e) => {
739
+ // Writer holds the db lock (mid-commit on a
740
+ // non-WAL journal mode). Don't drop the connection
741
+ // — that would silently re-baseline last_version
742
+ // and skip pending wakes. Just retry next tick.
743
+ }
744
+ Err(e) => {
745
+ eprintln!("honker: data_version poll failed: {e}");
746
+ conn = None;
747
+ on_change(); // conservative wake
748
+ }
749
+ }
750
+ } else {
751
+ // Path 2: reconnect after transient failure
752
+ match Connection::open_with_flags(
753
+ &db_path,
754
+ OpenFlags::SQLITE_OPEN_READ_WRITE | OpenFlags::SQLITE_OPEN_NO_MUTEX,
755
+ ) {
756
+ Ok(c) => {
757
+ last_version = poll_data_version(&c).unwrap_or(0);
758
+ conn = Some(c);
759
+ }
760
+ Err(e) => {
761
+ eprintln!("honker: reconnect failed: {e}");
762
+ }
763
+ }
764
+ }
765
+
766
+ // Path 3: dead-man's switch — panic if db inode changed
767
+ // (atomic rename, litestream restore, volume remount, NFS).
768
+ // Effectively a no-op on Windows: the kernel rejects
769
+ // rename-over-open files.
770
+ let now = Instant::now();
771
+ if now >= next_identity_check {
772
+ next_identity_check = now + UPDATE_WATCHER_IDENTITY_INTERVAL;
773
+ match stat_identity(&db_path) {
774
+ Ok(current) => {
775
+ if current != initial_identity {
776
+ panic!(
777
+ "honker: database file replaced: \
778
+ expected (dev={}, ino={}), \
779
+ found (dev={}, ino={}) at {:?}. \
780
+ The watcher cannot recover; \
781
+ close the Database and reopen with honker.open().",
782
+ initial_identity.0, initial_identity.1, current.0, current.1, db_path
783
+ );
784
+ }
785
+ }
786
+ Err(e) => {
787
+ eprintln!("honker: stat identity check failed: {e}");
788
+ conn = None;
789
+ on_change();
790
+ }
791
+ }
792
+ }
793
+ }
794
+ }
795
+
796
/// Background thread that polls a SQLite database file for changes.
/// Dispatches to the backend selected in [`WatcherConfig`].
/// See [`run_poll_loop`] for the default polling backend's architecture.
pub struct UpdateWatcher {
    /// Stop flag shared with the watcher thread; [`stop`](Self::stop)
    /// stores `true` into it.
    stop: Arc<AtomicBool>,
    /// Join handle of the spawned thread. Taken (left as `None`) by
    /// [`join`](Self::join).
    handle: Option<std::thread::JoinHandle<()>>,
}
803
+
804
/// Spacing between two file-identity checks in the polling loop: after each
/// `stat_identity` check the loop schedules the next one this far in the
/// future.
const UPDATE_WATCHER_IDENTITY_INTERVAL: Duration = Duration::from_millis(100);
805
+
806
+ impl UpdateWatcher {
807
+ /// Spawn a watcher thread on `db_path` using the default polling
808
+ /// backend. `on_change` is called once per observed commit. The
809
+ /// thread runs until [`UpdateWatcher`] is dropped or
810
+ /// [`stop`](Self::stop) is called.
811
+ pub fn spawn<F>(db_path: PathBuf, on_change: F) -> Self
812
+ where
813
+ F: Fn() + Send + 'static,
814
+ {
815
+ Self::spawn_with_config(db_path, on_change, WatcherConfig::default())
816
+ }
817
+
818
+ /// Like [`spawn`](Self::spawn) but with an explicit watcher backend.
819
+ /// The optional `KernelWatch` and `ShmFastPath` backends are
820
+ /// experimental — see [`WatcherBackend`] for the safety contracts.
821
+ pub fn spawn_with_config<F>(db_path: PathBuf, on_change: F, config: WatcherConfig) -> Self
822
+ where
823
+ F: Fn() + Send + 'static,
824
+ {
825
+ let stop = Arc::new(AtomicBool::new(false));
826
+ let stop_t = stop.clone();
827
+ // The thread signals `ready` once it has captured its baseline
828
+ // (initial inode for the dead-man's switch, initial iChange for
829
+ // shm, etc.). spawn_with_config blocks on `ready` so the caller
830
+ // can do anything that mutates the file (rename, write) right
831
+ // after spawn without racing the baseline capture. If the
832
+ // thread fails to init, the sender drops and recv() returns
833
+ // Err — we still return so the caller can use the (no-op)
834
+ // watcher; the eprintln from the backend explains the failure.
835
+ let (ready_tx, ready_rx) = std::sync::mpsc::sync_channel::<()>(1);
836
+ let handle = std::thread::Builder::new()
837
+ .name("honker-update-poll".into())
838
+ .spawn(move || match config.backend {
839
+ WatcherBackend::Polling => run_poll_loop(db_path, on_change, stop_t, ready_tx),
840
+ #[cfg(feature = "kernel-watcher")]
841
+ WatcherBackend::KernelWatch => {
842
+ kernel_watcher::run_kernel_watch_loop(db_path, on_change, stop_t, ready_tx);
843
+ }
844
+ #[cfg(feature = "shm-fast-path")]
845
+ WatcherBackend::ShmFastPath => {
846
+ shm_watcher::run_shm_fast_path_loop(db_path, on_change, stop_t, ready_tx);
847
+ }
848
+ })
849
+ .expect("spawn update-poll thread");
850
+ let _ = ready_rx.recv();
851
+ Self {
852
+ stop,
853
+ handle: Some(handle),
854
+ }
855
+ }
856
+
857
+ /// Request the watcher thread to stop. Idempotent. Dropping the
858
+ /// `UpdateWatcher` also stops the thread.
859
+ pub fn stop(&self) {
860
+ self.stop.store(true, Ordering::Release);
861
+ }
862
+
863
+ /// Stop the watcher and wait for the thread to exit. Returns the
864
+ /// thread's result — `Ok(())` on clean shutdown, `Err(payload)`
865
+ /// if the thread panicked (e.g. the dead-man's switch detected
866
+ /// file replacement). Consumes `self` so the watcher can't be
867
+ /// used after joining.
868
+ pub fn join(mut self) -> std::thread::Result<()> {
869
+ self.stop();
870
+ match self.handle.take() {
871
+ Some(h) => h.join(),
872
+ None => Ok(()),
873
+ }
874
+ }
875
+ }
876
+
877
impl Drop for UpdateWatcher {
    // Dropping only raises the stop flag; it does not join the thread.
    // Callers that need to wait for the thread to exit use
    // `UpdateWatcher::join` instead.
    fn drop(&mut self) {
        self.stop();
    }
}
882
+
883
+ // ---------------------------------------------------------------------
884
+ // Shared update watcher (one thread per Database, N subscribers)
885
+ // ---------------------------------------------------------------------
886
+
887
+ /// Shared database-file watcher: one PRAGMA-poll thread per database
888
+ /// path, N subscribers. Each [`subscribe`](Self::subscribe) returns a
889
+ /// fresh `Receiver<()>` that sees a tick on every observed commit.
890
+ ///
891
+ /// Previously every call to `db.update_events()` spawned its own
892
+ /// update watcher thread, so N listeners in one process meant N threads
893
+ /// hammering `stat(2)` on the same file at 1 ms cadence. A web
894
+ /// process with 100 active SSE subscribers was doing ~200k stat
895
+ /// syscalls/sec against one file. Now a single shared thread
896
+ /// services all subscribers — 1 ms cadence, same kernel cost
897
+ /// regardless of N.
898
+ ///
899
+ /// Subscriber channels are bounded; on overflow, additional ticks for
900
+ /// that subscriber are dropped. Wakes are idempotent signals — the
901
+ /// consumer re-reads state from SQLite on each wake — so dropping
902
+ /// during backpressure is safe. A disconnected subscriber (receiver
903
+ /// dropped) gets pruned on the next wake via `TrySendError::Disconnected`.
904
+ /// Lives in the watcher thread's closure. Closure drops on clean exit
905
+ /// or panic; this Drop clears every subscriber's sender so their next
906
+ /// `recv()` returns Err. Without it, a panicking watcher leaves
907
+ /// subscribers blocking forever.
908
+ struct WatcherDeathGuard {
909
+ senders: Arc<Mutex<HashMap<u64, SyncSender<()>>>>,
910
+ }
911
+
912
+ impl Drop for WatcherDeathGuard {
913
+ fn drop(&mut self) {
914
+ self.senders.lock().clear();
915
+ }
916
+ }
917
+
918
+ pub struct SharedUpdateWatcher {
919
+ /// Hold the underlying poll thread alive. Dropping or
920
+ /// [`close`](Self::close)ing this stops it. Wrapped in
921
+ /// `Mutex<Option<...>>` so `close()` can take the watcher out and
922
+ /// `join()` it synchronously — required to release the watcher's
923
+ /// read-only `Connection` before a `db.close()` consumer tries to
924
+ /// `unlink` the database file (Windows: `EBUSY` until every
925
+ /// handle is dropped).
926
+ watcher: Mutex<Option<UpdateWatcher>>,
927
+ /// Shared with the watcher closure so it can fan out to every
928
+ /// subscriber and prune disconnected ones opportunistically.
929
+ senders: Arc<Mutex<HashMap<u64, SyncSender<()>>>>,
930
+ next_id: AtomicU64,
931
+ }
932
+
933
+ impl SharedUpdateWatcher {
934
+ /// Spawn the shared poll thread for `db_path` using the default
935
+ /// polling backend.
936
+ pub fn new(db_path: PathBuf) -> Self {
937
+ Self::new_with_config(db_path, WatcherConfig::default())
938
+ }
939
+
940
+ /// Like [`new`](Self::new) but with an explicit watcher backend.
941
+ pub fn new_with_config(db_path: PathBuf, config: WatcherConfig) -> Self {
942
+ let senders: Arc<Mutex<HashMap<u64, SyncSender<()>>>> =
943
+ Arc::new(Mutex::new(HashMap::new()));
944
+ let senders_t = senders.clone();
945
+ // Watcher thread exits → closure drops → this guard drops →
946
+ // every subscriber's sender is cleared. Their next `recv()`
947
+ // returns Err instead of blocking forever. Subscribers learn
948
+ // the watcher died programmatically, not via stderr.
949
+ let death_guard = WatcherDeathGuard {
950
+ senders: senders.clone(),
951
+ };
952
+ let watcher = UpdateWatcher::spawn_with_config(
953
+ db_path,
954
+ move || {
955
+ let _ = &death_guard;
956
+ let mut list = senders_t.lock();
957
+ list.retain(|_id, s| match s.try_send(()) {
958
+ Ok(()) | Err(TrySendError::Full(_)) => true,
959
+ Err(TrySendError::Disconnected(_)) => false,
960
+ });
961
+ },
962
+ config,
963
+ );
964
+ Self {
965
+ watcher: Mutex::new(Some(watcher)),
966
+ senders,
967
+ next_id: AtomicU64::new(0),
968
+ }
969
+ }
970
+
971
+ /// Subscribe. Returns a subscriber id and a [`Receiver<()>`] that
972
+ /// sees one tick per observed database update. Callers MUST
973
+ /// [`unsubscribe`](Self::unsubscribe) the returned id when done —
974
+ /// otherwise the sender stays in the map and a bridge thread
975
+ /// blocking on `recv()` will never see a disconnect.
976
+ ///
977
+ /// Channel capacity is 1: bursts coalesce into one wake per drain
978
+ /// cycle. Wakes are "go re-read state" signals — the consumer's
979
+ /// SQL query reads current state regardless of how many wakes
980
+ /// were dropped, so dropped redundant wakes never cost data, only
981
+ /// signal redundancy. The kernel-watcher backend in particular
982
+ /// fires one event per filesystem write (multiple per commit);
983
+ /// without coalescing, consumers would run N redundant queries
984
+ /// per commit burst. With cap=1 they run ~1.
985
+ pub fn subscribe(&self) -> (u64, std::sync::mpsc::Receiver<()>) {
986
+ let id = self.next_id.fetch_add(1, Ordering::Relaxed);
987
+ let (tx, rx) = std::sync::mpsc::sync_channel(1);
988
+ self.senders.lock().insert(id, tx);
989
+ (id, rx)
990
+ }
991
+
992
+ /// Remove a subscriber. The corresponding receiver sees
993
+ /// `Err(RecvError)` on its next blocking `recv()`, letting a
994
+ /// bridge thread exit cleanly.
995
+ pub fn unsubscribe(&self, id: u64) {
996
+ self.senders.lock().remove(&id);
997
+ }
998
+
999
+ /// Current subscriber count. Test/introspection helper.
1000
+ pub fn subscriber_count(&self) -> usize {
1001
+ self.senders.lock().len()
1002
+ }
1003
+
1004
+ /// Disconnect all subscribers and synchronously join the poll
1005
+ /// thread. The thread owns the watcher's read-only `Connection`;
1006
+ /// joining drops that connection and releases the file handle.
1007
+ /// Idempotent — safe to call more than once.
1008
+ pub fn close(&self) -> std::thread::Result<()> {
1009
+ self.senders.lock().clear();
1010
+ match self.watcher.lock().take() {
1011
+ Some(watcher) => watcher.join(),
1012
+ None => Ok(()),
1013
+ }
1014
+ }
1015
+ }
1016
+
1017
+ impl Drop for SharedUpdateWatcher {
1018
+ fn drop(&mut self) {
1019
+ // Best-effort: signal stop. We don't synchronously join here
1020
+ // because Drop runs from arbitrary contexts (including async
1021
+ // executors) where blocking on a thread join is unsafe.
1022
+ // Bindings that need a synchronous release should call
1023
+ // `close()` explicitly.
1024
+ self.senders.lock().clear();
1025
+ if let Some(watcher) = self.watcher.get_mut().take() {
1026
+ // Dropping UpdateWatcher signals stop; the thread exits
1027
+ // shortly after and its Connection drops then.
1028
+ drop(watcher);
1029
+ }
1030
+ }
1031
+ }
1032
+
1033
+ // ---------------------------------------------------------------------
1034
+ // Tests
1035
+ // ---------------------------------------------------------------------
1036
+
1037
+ #[cfg(test)]
1038
+ mod tests {
1039
+ use super::*;
1040
+ use rusqlite::Connection;
1041
+ use std::collections::HashSet;
1042
+ use std::sync::atomic::{AtomicUsize, Ordering};
1043
+ use std::sync::{Arc, Barrier};
1044
+
1045
/// Test helper: opens a fresh in-memory SQLite connection, panicking if
/// the open fails.
fn mem() -> Connection {
    Connection::open_in_memory().unwrap()
}
1048
+
1049
/// Test helper: builds a database path in the system temp directory
/// named after `name`, the process id and the calling thread's id, then
/// removes any stale file at that path along with its `-wal` and `-shm`
/// sidecars. Removal errors (e.g. the file does not exist) are ignored.
fn temp_db(name: &str) -> PathBuf {
    let pid = std::process::id();
    let tid = std::thread::current().id();
    let db = std::env::temp_dir().join(format!("honker-{name}-{pid}-{tid:?}.db"));

    let _ = std::fs::remove_file(&db);
    for sidecar in ["wal", "shm"] {
        let _ = std::fs::remove_file(format!("{}-{sidecar}", db.display()));
    }
    db
}
1060
+
1061
/// Test helper: opens `path` through `open_conn`, registers the honker
/// SQL functions on the connection and runs `honker_bootstrap()` once.
/// Panics if the path is not valid UTF-8 or if any step fails.
fn open_core_test_conn(path: &Path) -> Connection {
    // NOTE(review): the meaning of the `true` flag is defined by
    // `open_conn`, which is outside this chunk — confirm there.
    let conn = open_conn(path.to_str().unwrap(), true).unwrap();
    attach_honker_functions(&conn).unwrap();
    conn.query_row("SELECT honker_bootstrap()", [], |_| Ok(()))
        .unwrap();
    conn
}
1068
+
1069
/// Stress test: 4 producer threads and 6 worker threads, each with its
/// own connection to one database file, run concurrently. Every
/// producer enqueues 75 jobs on the `pressure` queue and, per job, also
/// publishes a stream event and sends a notification. Workers claim and
/// ack jobs in batches until the queue stays empty. The test then checks
/// that every job was processed exactly once and that the table row
/// counts and `PRAGMA integrity_check` are as expected.
#[test]
fn core_sql_functions_survive_concurrent_queue_stream_notify_pressure() {
    let path = temp_db("core-pressure");
    let producer_count = 4usize;
    let jobs_per_producer = 75usize;
    let worker_count = 6usize;
    let total_jobs = producer_count * jobs_per_producer;

    // Precondition: the helper-opened connection reports WAL journal mode.
    {
        let conn = open_core_test_conn(&path);
        let mode: String = conn
            .pragma_query_value(None, "journal_mode", |r| r.get(0))
            .unwrap();
        assert_eq!(mode.to_ascii_uppercase(), "WAL");
    }

    // All producers and workers rendezvous on `start` after opening their
    // connections, so the real work begins at the same time everywhere.
    let start = Arc::new(Barrier::new(producer_count + worker_count));
    // Count of producers that have finished enqueueing.
    let producers_done = Arc::new(AtomicUsize::new(0));
    // (job id, logical key) of every job a worker has acked.
    let processed = Arc::new(Mutex::new(Vec::<(i64, String)>::new()));
    let mut handles = Vec::new();

    for producer in 0..producer_count {
        let path = path.clone();
        let start = start.clone();
        let producers_done = producers_done.clone();
        handles.push(std::thread::spawn(move || {
            let conn = open_core_test_conn(&path);
            start.wait();
            for seq in 0..jobs_per_producer {
                // The key is unique per (producer, seq) and is embedded in the
                // payload so workers can report which logical job they saw.
                let key = format!("p{producer}-{seq:03}");
                let payload = format!(r#"{{"producer":{producer},"seq":{seq},"key":"{key}"}}"#);
                // NOTE(review): ?2 cycles through 0..=6; presumably a
                // priority — confirm against honker_enqueue's signature.
                conn.query_row(
                    "SELECT honker_enqueue('pressure', ?1, NULL, NULL, ?2, 3, NULL)",
                    rusqlite::params![payload, (seq % 7) as i64],
                    |r| r.get::<_, i64>(0),
                )
                .unwrap();
                conn.query_row(
                    "SELECT honker_stream_publish('pressure-events', ?1, ?2)",
                    rusqlite::params![key, payload],
                    |r| r.get::<_, i64>(0),
                )
                .unwrap();
                conn.query_row(
                    "SELECT notify('pressure-note', ?1)",
                    rusqlite::params![format!(r#"{{"key":"{key}"}}"#)],
                    |r| r.get::<_, i64>(0),
                )
                .unwrap();
                // Short pause on every 11th job to vary the interleaving.
                if seq % 11 == 0 {
                    std::thread::sleep(Duration::from_millis(1));
                }
            }
            producers_done.fetch_add(1, Ordering::Release);
        }));
    }

    for worker in 0..worker_count {
        let path = path.clone();
        let start = start.clone();
        let producers_done = producers_done.clone();
        let processed = processed.clone();
        handles.push(std::thread::spawn(move || {
            let conn = open_core_test_conn(&path);
            start.wait();
            let worker_id = format!("core-worker-{worker}");
            // Hard cap so a stuck worker fails the test instead of hanging it.
            let deadline = Instant::now() + Duration::from_secs(20);
            // When this worker first saw an empty queue after all producers
            // had finished; reset whenever a batch is claimed.
            let mut idle_since: Option<Instant> = None;
            loop {
                assert!(Instant::now() < deadline, "{worker_id} timed out");
                // NOTE(review): 7 and 30 are presumably batch size and a
                // claim timeout — confirm against honker_claim_batch.
                let rows_json: String = conn
                    .query_row(
                        "SELECT honker_claim_batch('pressure', ?1, 7, 30)",
                        rusqlite::params![worker_id],
                        |r| r.get(0),
                    )
                    .unwrap();
                // Unpack the claimed-rows JSON array into (id, key) pairs;
                // the key is read out of each row's payload.
                let mut stmt = conn
                    .prepare(
                        "SELECT
                            json_extract(value, '$.id'),
                            json_extract(json_extract(value, '$.payload'), '$.key')
                         FROM json_each(?1)",
                    )
                    .unwrap();
                let claimed = stmt
                    .query_map(rusqlite::params![rows_json], |r| {
                        Ok((r.get::<_, i64>(0)?, r.get::<_, String>(1)?))
                    })
                    .unwrap()
                    .collect::<Result<Vec<_>, _>>()
                    .unwrap();

                if claimed.is_empty() {
                    // Exit only once every producer is done AND the queue has
                    // looked empty for at least 500 ms since it was first
                    // seen empty; otherwise back off briefly and retry.
                    if producers_done.load(Ordering::Acquire) == producer_count {
                        match idle_since {
                            Some(t) if t.elapsed() >= Duration::from_millis(500) => break,
                            Some(_) => {}
                            None => idle_since = Some(Instant::now()),
                        }
                    }
                    std::thread::sleep(Duration::from_millis(5));
                    continue;
                }

                idle_since = None;
                // Build a JSON array of the claimed ids for the batch ack.
                let ids_json = format!(
                    "[{}]",
                    claimed
                        .iter()
                        .map(|(id, _)| id.to_string())
                        .collect::<Vec<_>>()
                        .join(",")
                );
                let acked: i64 = conn
                    .query_row(
                        "SELECT honker_ack_batch(?1, ?2)",
                        rusqlite::params![ids_json, worker_id],
                        |r| r.get(0),
                    )
                    .unwrap();
                // Every claimed job must be acked by the worker that claimed it.
                assert_eq!(acked as usize, claimed.len());
                processed.lock().extend(claimed);
                std::thread::sleep(Duration::from_millis(2));
            }
        }));
    }

    // A panic in any producer or worker surfaces here and fails the test.
    for handle in handles {
        handle.join().unwrap();
    }

    // Exactly-once processing: right total, no duplicate ids, no duplicate
    // keys, and every expected key present.
    let processed = processed.lock();
    assert_eq!(processed.len(), total_jobs);
    let unique_ids: HashSet<i64> = processed.iter().map(|(id, _)| *id).collect();
    assert_eq!(unique_ids.len(), total_jobs, "job id claimed twice");
    let unique_keys: HashSet<String> = processed.iter().map(|(_, k)| k.clone()).collect();
    assert_eq!(unique_keys.len(), total_jobs, "logical key claimed twice");
    for producer in 0..producer_count {
        for seq in 0..jobs_per_producer {
            assert!(
                unique_keys.contains(&format!("p{producer}-{seq:03}")),
                "missing key p{producer}-{seq:03}"
            );
        }
    }
    drop(processed);

    // Final database state, read through a fresh connection.
    let conn = open_core_test_conn(&path);
    let live: i64 = conn
        .query_row(
            "SELECT COUNT(*) FROM _honker_live WHERE queue='pressure'",
            [],
            |r| r.get(0),
        )
        .unwrap();
    let dead: i64 = conn
        .query_row(
            "SELECT COUNT(*) FROM _honker_dead WHERE queue='pressure'",
            [],
            |r| r.get(0),
        )
        .unwrap();
    let stream_rows: i64 = conn
        .query_row(
            "SELECT COUNT(*) FROM _honker_stream WHERE topic='pressure-events'",
            [],
            |r| r.get(0),
        )
        .unwrap();
    let notes: i64 = conn
        .query_row(
            "SELECT COUNT(*) FROM _honker_notifications WHERE channel='pressure-note'",
            [],
            |r| r.get(0),
        )
        .unwrap();
    let enqueue_wakes: i64 = conn
        .query_row(
            "SELECT COUNT(*) FROM _honker_notifications WHERE channel='honker:pressure'",
            [],
            |r| r.get(0),
        )
        .unwrap();
    let integrity: String = conn
        .query_row("PRAGMA integrity_check", [], |r| r.get(0))
        .unwrap();
    // No jobs left live or dead; one stream row, one explicit notification
    // and one 'honker:pressure' notification row per job.
    assert_eq!(live, 0);
    assert_eq!(dead, 0);
    assert_eq!(stream_rows as usize, total_jobs);
    assert_eq!(notes as usize, total_jobs);
    assert_eq!(enqueue_wakes as usize, total_jobs);
    assert_eq!(integrity, "ok");

    // Close the connection before deleting the database and its sidecars.
    drop(conn);
    let _ = std::fs::remove_file(&path);
    let _ = std::fs::remove_file(format!("{}-wal", path.display()));
    let _ = std::fs::remove_file(format!("{}-shm", path.display()));
}
1268
+
1269
/// A `notify()` call inside a committed transaction leaves exactly one
/// row for its channel in `_honker_notifications`.
#[test]
fn notify_inserts_row() {
    let db = mem();
    attach_notify(&db).unwrap();

    db.execute_batch("BEGIN IMMEDIATE;").unwrap();
    db.query_row("SELECT notify('orders', 'new')", [], |_| Ok(()))
        .unwrap();
    db.execute_batch("COMMIT;").unwrap();

    let count_sql = "SELECT COUNT(*) FROM _honker_notifications WHERE channel='orders'";
    let stored: i64 = db.query_row(count_sql, [], |row| row.get(0)).unwrap();
    assert_eq!(stored, 1);
}
1287
+
1288
/// A `notify()` call inside a rolled-back transaction leaves no row in
/// `_honker_notifications`.
#[test]
fn rollback_drops_notification() {
    let db = mem();
    attach_notify(&db).unwrap();

    db.execute_batch("BEGIN IMMEDIATE;").unwrap();
    db.query_row("SELECT notify('x', 'y')", [], |_| Ok(()))
        .unwrap();
    db.execute_batch("ROLLBACK;").unwrap();

    let count_sql = "SELECT COUNT(*) FROM _honker_notifications";
    let remaining: i64 = db.query_row(count_sql, [], |row| row.get(0)).unwrap();
    assert_eq!(remaining, 0);
}
1304
+
1305
/// While the writer connection is checked out, `try_acquire` yields
/// nothing; once it is released, `try_acquire` succeeds again.
#[test]
fn writer_try_acquire_returns_none_when_held() {
    let writer = Writer::new(Connection::open_in_memory().unwrap());

    let held = writer.acquire().expect("acquire on fresh Writer");
    let second_attempt = writer.try_acquire();
    assert!(second_attempt.is_none());

    writer.release(held);
    let after_release = writer.try_acquire();
    assert!(after_release.is_some());
}
1313
+
1314
/// Closing a `Writer` whose connection is sitting idle in the slot makes
/// both `acquire` and `try_acquire` return `None` afterwards.
#[test]
fn writer_close_drops_idle_connection() {
    // Freshly built: the slot holds the connection (Some(conn)).
    let writer = Writer::new(Connection::open_in_memory().unwrap());

    writer.close();

    // The slot was occupied at close time, yet nothing can be acquired
    // any more — the connection was dropped by close().
    let blocking = writer.acquire();
    assert!(blocking.is_none());
    let non_blocking = writer.try_acquire();
    assert!(non_blocking.is_none());
}
1324
+
1325
/// Closing a `Writer` while its connection is checked out: releasing
/// the connection afterwards does not make it acquirable again.
#[test]
fn writer_close_drops_returned_connection() {
    let writer = Writer::new(Connection::open_in_memory().unwrap());
    let checked_out = writer.acquire().expect("acquire on fresh Writer");

    // Close while a transaction is "holding" the connection.
    writer.close();

    // Handing the connection back after close drops it rather than
    // returning it to the slot, so there is still nothing to acquire.
    writer.release(checked_out);
    let after_release = writer.try_acquire();
    assert!(after_release.is_none());
}
1336
+
1337
/// After `Readers::close`, `acquire` fails with a SQLite failure whose
/// message contains "Database is closed".
#[test]
fn readers_close_returns_closed_err() {
    let db_file =
        std::env::temp_dir().join(format!("honker-readers-close-{}", std::process::id()));
    let _ = std::fs::remove_file(&db_file);

    // The file has to exist for open_conn to succeed, so create it
    // with a throwaway connection that is closed again immediately.
    {
        let seed = Connection::open(&db_file).unwrap();
        seed.execute_batch("PRAGMA journal_mode=WAL;").unwrap();
    }

    let pool = Readers::new(db_file.to_string_lossy().into_owned(), 4);

    // One acquire/release round trip so the pool has actually opened
    // a connection before it is closed.
    let first = pool.acquire().expect("first acquire");
    pool.release(first);
    pool.close();

    // Any acquire after close must yield the closed sentinel.
    let after_close = pool.acquire();
    if let Err(Error::Sqlite(rusqlite::Error::SqliteFailure(_, Some(msg)))) = &after_close {
        assert!(msg.contains("Database is closed"));
    } else {
        panic!("expected closed err, got {after_close:?}");
    }

    let _ = std::fs::remove_file(&db_file);
}
1361
+
1362
/// One shared watcher with 50 subscribers: after a handful of commits
/// from a separate connection, every subscriber must have received at
/// least one tick.
#[test]
fn shared_update_watcher_fans_out_to_many_subscribers() {
    let tmp = std::env::temp_dir().join(format!("honker-shared-test-{}", std::process::id()));
    let _ = std::fs::remove_file(&tmp);
    // Create a real SQLite database in WAL mode so the watcher can
    // open a connection and poll data_version.
    {
        let conn = Connection::open(&tmp).unwrap();
        conn.execute_batch("PRAGMA journal_mode = WAL;").unwrap();
    }

    let shared = SharedUpdateWatcher::new(tmp.clone());
    let subs: Vec<(u64, std::sync::mpsc::Receiver<()>)> =
        (0..50).map(|_| shared.subscribe()).collect();

    // Open a separate writer connection to trigger commits.
    let writer = Connection::open(&tmp).unwrap();
    writer
        .execute(
            "CREATE TABLE IF NOT EXISTS _test_trigger(id INTEGER PRIMARY KEY)",
            [],
        )
        .unwrap();
    for i in 0..5 {
        std::thread::sleep(Duration::from_millis(5));
        writer
            .execute("INSERT INTO _test_trigger(id) VALUES (?)", [i])
            .unwrap();
    }

    // Wait for a tick per subscriber with a generous timeout rather than
    // sleeping a fixed 50 ms and draining: under parallel test load the
    // watcher thread may not have fanned out yet. A subscriber whose tick
    // is already buffered returns immediately.
    for (i, (_id, rx)) in subs.iter().enumerate() {
        assert!(
            rx.recv_timeout(Duration::from_secs(2)).is_ok(),
            "subscriber {} saw no ticks",
            i
        );
    }

    // Release every handle on the database before deleting it: close()
    // joins the watcher thread (dropping its connection), and the writer
    // is dropped explicitly. Otherwise the delete fails on Windows and
    // sidecar files can be left behind.
    drop(writer);
    shared.close().expect("watcher thread exits cleanly");
    let _ = std::fs::remove_file(&tmp);
    let _ = std::fs::remove_file(format!("{}-wal", tmp.display()));
    let _ = std::fs::remove_file(format!("{}-shm", tmp.display()));
}
1403
+
1404
/// `unsubscribe` removes the subscriber from the map and disconnects
/// its receiver.
#[test]
fn shared_update_watcher_explicit_unsubscribe_disconnects_receiver() {
    let db_file = std::env::temp_dir().join(format!("honker-unsub-test-{}", std::process::id()));
    let _ = std::fs::remove_file(&db_file);
    {
        let seed = Connection::open(&db_file).unwrap();
        seed.execute_batch("PRAGMA journal_mode = WAL;").unwrap();
    }

    let shared = SharedUpdateWatcher::new(db_file.clone());

    let (sub_id, ticks) = shared.subscribe();
    let before = shared.subscriber_count();
    assert_eq!(before, 1);

    shared.unsubscribe(sub_id);
    let after = shared.subscriber_count();
    assert_eq!(after, 0);

    // With its sender gone, a blocking recv must fail. This is the
    // contract that lets a bridge thread exit cleanly when its
    // UpdateEvents drops.
    let outcome = ticks.recv();
    assert!(outcome.is_err());

    let _ = std::fs::remove_file(&db_file);
}
1426
+
1427
/// Subscribers must learn that the watcher thread died — not just
/// stop receiving wakes silently. We force the watcher to panic via
/// the dead-man's switch (file replacement) and assert that an
/// already-subscribed receiver reports a disconnect. Without
/// WatcherDeathGuard the channel never disconnects and the test fails
/// once its deadline passes.
#[test]
#[cfg(unix)]
fn shared_update_watcher_signals_subscribers_on_watcher_death() {
    let tmp = std::env::temp_dir().join(format!(
        "honker-death-signal-{}-{}",
        std::process::id(),
        std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap()
            .subsec_nanos()
    ));
    let _ = std::fs::remove_file(&tmp);
    {
        let conn = Connection::open(&tmp).unwrap();
        conn.execute_batch("PRAGMA journal_mode = WAL;").unwrap();
    }

    let shared = SharedUpdateWatcher::new(tmp.clone());
    let (_id, rx) = shared.subscribe();

    // Let the watcher snapshot the initial inode.
    std::thread::sleep(Duration::from_millis(200));

    // Replace the file with a different inode. Triggers the
    // dead-man's switch on the next 100 ms tick.
    let other = std::env::temp_dir().join(format!(
        "honker-death-other-{}-{}",
        std::process::id(),
        std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap()
            .subsec_nanos()
    ));
    let _ = std::fs::remove_file(&other);
    std::fs::File::create(&other).unwrap();
    std::fs::rename(&other, &tmp).unwrap();

    // Within ~150 ms the watcher's identity check fires and panics;
    // WatcherDeathGuard's Drop clears senders and the channel
    // disconnects. Give the watcher up to 2 s to die and the guard to
    // fire, polling with a short blocking recv.
    let deadline = std::time::Instant::now() + Duration::from_secs(2);
    loop {
        match rx.recv_timeout(Duration::from_millis(100)) {
            Err(std::sync::mpsc::RecvTimeoutError::Disconnected) => break,
            // A tick or a timeout both mean the sender is still
            // registered ("alive but no disconnect yet"); keep waiting.
            Ok(()) | Err(std::sync::mpsc::RecvTimeoutError::Timeout) => {}
        }
        if std::time::Instant::now() > deadline {
            panic!(
                "watcher died but subscriber's channel never disconnected — \
                 WatcherDeathGuard didn't fire?"
            );
        }
    }

    // Reap the watcher thread before cleanup. It panicked on purpose,
    // so the Err payload from close() is expected and ignored.
    let _ = shared.close();

    let _ = std::fs::remove_file(&tmp);
    let _ = std::fs::remove_file(format!("{}-wal", tmp.display()));
    let _ = std::fs::remove_file(format!("{}-shm", tmp.display()));
}
1497
+
1498
/// Subscribers whose receivers were dropped get pruned from the map
/// once the watcher tries to send them a tick.
#[test]
fn shared_update_watcher_prunes_subscribers_when_receiver_dropped() {
    let db_file = std::env::temp_dir().join(format!("honker-prune-test-{}", std::process::id()));
    let _ = std::fs::remove_file(&db_file);
    {
        let seed = Connection::open(&db_file).unwrap();
        seed.execute_batch("PRAGMA journal_mode = WAL;").unwrap();
    }

    let shared = SharedUpdateWatcher::new(db_file.clone());
    {
        // Ten subscriptions whose receivers die at the end of this scope.
        let _subs: Vec<_> = (0..10).map(|_| shared.subscribe()).collect();
        assert_eq!(shared.subscriber_count(), 10);
    }

    // Commits make the watcher attempt a send on each dropped
    // receiver, which is what prunes them.
    let writer = Connection::open(&db_file).unwrap();
    writer
        .execute(
            "CREATE TABLE IF NOT EXISTS _test_prune(id INTEGER PRIMARY KEY)",
            [],
        )
        .unwrap();

    // Poll for pruning rather than sleeping a fixed time: the 1 ms
    // poll thread has to notice the commit AND attempt the sends AND
    // prune, and under parallel test load 30 ms is not enough.
    let deadline = std::time::Instant::now() + Duration::from_secs(2);
    loop {
        if shared.subscriber_count() == 0 || std::time::Instant::now() >= deadline {
            break;
        }
        std::thread::sleep(Duration::from_millis(5));
        writer
            .execute("INSERT INTO _test_prune(id) VALUES (random())", [])
            .unwrap();
    }
    assert_eq!(shared.subscriber_count(), 0);

    let _ = std::fs::remove_file(&db_file);
}
1537
+
1538
/// `poll_data_version` on one connection rises after another
/// connection commits and stays put after that connection rolls back.
#[test]
fn data_version_detects_commits_and_ignores_rollbacks() {
    let db_file = std::env::temp_dir().join(format!("honker-dv-test-{}", std::process::id()));
    let _ = std::fs::remove_file(&db_file);

    // PRAGMA data_version detects changes from OTHER connections, so
    // the observer and the writer are separate connections.
    let observer = Connection::open(&db_file).unwrap();
    observer
        .execute_batch("PRAGMA journal_mode = WAL;")
        .unwrap();
    let writer = Connection::open(&db_file).unwrap();

    let baseline = poll_data_version(&observer).unwrap();

    // A commit on the writer is visible to the observer as an increase.
    writer.execute("CREATE TABLE t(x INTEGER)", []).unwrap();
    let after_commit = poll_data_version(&observer).unwrap();
    assert!(
        after_commit > baseline,
        "commit should increment data_version"
    );

    // A rolled-back transaction leaves the value untouched.
    writer.execute_batch("BEGIN IMMEDIATE;").unwrap();
    writer.execute("INSERT INTO t VALUES (1)", []).unwrap();
    writer.execute_batch("ROLLBACK;").unwrap();
    let after_rollback = poll_data_version(&observer).unwrap();
    assert_eq!(
        after_rollback, after_commit,
        "rollback should not increment data_version"
    );

    let _ = std::fs::remove_file(&db_file);
}
1563
+
1564
/// `poll_data_version` keeps rising across a commit, a truncating WAL
/// checkpoint, and a commit made after the checkpoint.
#[test]
fn data_version_survives_wal_checkpoint() {
    let db_file = std::env::temp_dir().join(format!("honker-dv-ckpt-test-{}", std::process::id()));
    let _ = std::fs::remove_file(&db_file);

    // Observer connection: watches for changes made by the writer.
    let observer = Connection::open(&db_file).unwrap();
    observer
        .execute_batch("PRAGMA journal_mode = WAL;")
        .unwrap();
    let baseline = poll_data_version(&observer).unwrap();

    // Writer connection: makes the changes.
    let writer = Connection::open(&db_file).unwrap();
    writer.execute("CREATE TABLE t(x INTEGER)", []).unwrap();
    let after_commit = poll_data_version(&observer).unwrap();
    assert!(
        after_commit > baseline,
        "commit from other conn should increment data_version"
    );

    // The checkpoint truncates the WAL; the observer must still see a change.
    writer
        .execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")
        .unwrap();
    let after_checkpoint = poll_data_version(&observer).unwrap();
    assert!(
        after_checkpoint > after_commit,
        "checkpoint from other conn should increment data_version"
    );

    // A commit made after the checkpoint is still detected.
    writer.execute("INSERT INTO t VALUES (1)", []).unwrap();
    let after_second_commit = poll_data_version(&observer).unwrap();
    assert!(
        after_second_commit > after_checkpoint,
        "post-checkpoint commit should increment data_version"
    );

    let _ = std::fs::remove_file(&db_file);
}
1602
+
1603
// Only built on platforms where stat_identity returns real values.
// On other targets the function returns (0, 0) for every call, which
// would trip the assert_ne! below.
#[cfg(any(unix, windows))]
#[test]
fn stat_identity_detects_file_replacement() {
    let pid = std::process::id();
    let original = std::env::temp_dir().join(format!("honker-id-test-{}", pid));
    let replacement = std::env::temp_dir().join(format!("honker-id-test2-{}", pid));
    let _ = std::fs::remove_file(&original);
    let _ = std::fs::remove_file(&replacement);

    // Two distinct files on disk.
    std::fs::write(&original, b"original").unwrap();
    std::fs::write(&replacement, b"replacement").unwrap();

    let original_id = stat_identity(&original).unwrap();
    let replacement_id = stat_identity(&replacement).unwrap();
    assert_ne!(
        original_id, replacement_id,
        "different files should have different identities"
    );

    // Atomically rename the replacement over the original path; that
    // path must now report the replacement's identity.
    std::fs::rename(&replacement, &original).unwrap();
    let id_after_rename = stat_identity(&original).unwrap();
    assert_eq!(
        id_after_rename, replacement_id,
        "renamed file should carry the replacement's identity"
    );

    let _ = std::fs::remove_file(&original);
}
1632
+
1633
/// Exercises the XOR-fold directly on synthetic 128-bit inputs. CI
/// runners use NTFS, so the live `stat_identity` test never reaches
/// the `HighRes` arm with non-zero upper bits; this unit test does.
#[cfg(any(unix, windows))]
#[test]
fn fold_high_res_uses_both_halves() {
    // NTFS-shaped input: the upper half is zero, so the fold is just
    // the lower half.
    let ntfs = fold_high_res(0xAABB, 0x0000_0000_0000_0000_DEAD_BEEF_CAFE_F00D);
    assert_eq!(ntfs.0, 0xAABB);
    assert_eq!(ntfs.1, 0xDEAD_BEEF_CAFE_F00D);

    // ReFS-shaped input: both halves are non-zero, so the fold is
    // upper XOR lower.
    let upper = 0x1111_2222_3333_4444u64;
    let lower = 0x5555_6666_7777_8888u64;
    let refs = fold_high_res(0xCCDD, (u128::from(upper) << 64) | u128::from(lower));
    assert_eq!(refs.0, 0xCCDD);
    assert_eq!(refs.1, upper ^ lower);

    // Adversarial input: identical halves fold to zero. This is the
    // known XOR weakness; documented and acceptable because ReFS
    // file_ids aren't constructed to satisfy this property.
    let half = 0xDEAD_BEEF_CAFE_F00Du64;
    let mirrored = (u128::from(half) << 64) | u128::from(half);
    let folded = fold_high_res(0, mirrored).1;
    assert_eq!(folded, 0);
}
1661
+
1662
/// Watcher's dead-man's switch panics when the db file is replaced
/// under it.
///
/// Unix-only: rename-over-open works on Linux/macOS (the litestream /
/// NFS-remount scenario) but Windows rejects it even with
/// FILE_SHARE_DELETE, so the trigger is unreachable there. Windows
/// behavior is intentionally untested — replacement isn't a typical
/// Windows pattern.
#[cfg(unix)]
#[test]
fn update_watcher_panics_on_file_replacement() {
    let tmp =
        std::env::temp_dir().join(format!("honker-watcher-replace-{}", std::process::id()));
    let _ = std::fs::remove_file(&tmp);

    // Create the DB so the watcher can open + stat it.
    {
        let conn = Connection::open(&tmp).unwrap();
        conn.execute_batch("PRAGMA journal_mode = WAL;").unwrap();
    }

    let watcher = UpdateWatcher::spawn(tmp.clone(), || {});

    // Give the watcher thread time to open and capture the initial
    // file identity before we replace the file.
    std::thread::sleep(Duration::from_millis(200));

    // Replace the file with an atomic rename instead of delete+create:
    // the path switches to a different file in one step while the
    // watcher still has the original open — the rename-over-open case
    // this unix-only test targets.
    let tmp2 =
        std::env::temp_dir().join(format!("honker-watcher-replace-new-{}", std::process::id()));
    let _ = std::fs::remove_file(&tmp2);
    {
        let conn = Connection::open(&tmp2).unwrap();
        conn.execute_batch("PRAGMA journal_mode = WAL;").unwrap();
    }
    std::fs::rename(&tmp2, &tmp).unwrap();

    // Wait for the next time-based identity check to fire and panic.
    std::thread::sleep(Duration::from_millis(500));

    // Stop and join. Must be Err because the thread panicked.
    let Err(payload) = watcher.join() else {
        panic!("watcher should have panicked on file replacement, instead got Ok");
    };
    // Panic payloads are `String` for formatted panics and `&str` for
    // literal ones; accept either.
    let msg = if let Some(s) = payload.downcast_ref::<String>() {
        s.clone()
    } else if let Some(s) = payload.downcast_ref::<&str>() {
        (*s).to_string()
    } else {
        String::from("<panic payload not a string>")
    };
    assert!(
        msg.contains("database file replaced"),
        "panic message should mention replacement; got: {msg}"
    );

    // Best-effort cleanup, including any WAL sidecar files left behind.
    let _ = std::fs::remove_file(&tmp);
    let _ = std::fs::remove_file(format!("{}-wal", tmp.display()));
    let _ = std::fs::remove_file(format!("{}-shm", tmp.display()));
}
1725
+
1726
+ /// Verify `poll_data_version` detects cross-connection commits in
1727
+ /// every supported journal mode. WAL was the only mode that had
1728
+ /// explicit coverage before; the bootstrap-without-database update in
1729
+ /// commit `c6716d5` made the watcher work in any mode but never
1730
+ /// added tests for the others. This closes that gap.
1731
+ fn poll_data_version_works_in_journal_mode(mode: &str) {
1732
+ let tmp = std::env::temp_dir().join(format!(
1733
+ "honker-jm-{}-{}",
1734
+ mode.to_ascii_lowercase(),
1735
+ std::process::id()
1736
+ ));
1737
+ let _ = std::fs::remove_file(&tmp);
1738
+
1739
+ let watcher = Connection::open(&tmp).unwrap();
1740
+ watcher
1741
+ .execute_batch(&format!("PRAGMA journal_mode = {mode};"))
1742
+ .unwrap();
1743
+
1744
+ // Verify the mode actually took effect. SQLite returns the
1745
+ // resulting mode from the PRAGMA, but `execute_batch`
1746
+ // discards the result — without this assertion, a silent
1747
+ // fallback (e.g., to `MEMORY` for `:memory:` databases, or
1748
+ // a sticky setting that won't change) would leave the test
1749
+ // green while exercising a different mode entirely.
1750
+ let actual: String = watcher
1751
+ .pragma_query_value(None, "journal_mode", |r| r.get(0))
1752
+ .unwrap();
1753
+ assert_eq!(
1754
+ actual.to_ascii_uppercase(),
1755
+ mode.to_ascii_uppercase(),
1756
+ "PRAGMA journal_mode = {mode} silently fell back to {actual}"
1757
+ );
1758
+
1759
+ let writer = Connection::open(&tmp).unwrap();
1760
+
1761
+ let v0 = poll_data_version(&watcher).unwrap();
1762
+
1763
+ // Commit increments data_version (observed across connections).
1764
+ writer.execute("CREATE TABLE t(x INTEGER)", []).unwrap();
1765
+ let v1 = poll_data_version(&watcher).unwrap();
1766
+ assert!(
1767
+ v1 > v0,
1768
+ "journal_mode={mode}: cross-conn commit should bump \
1769
+ data_version; saw {v0} -> {v1}"
1770
+ );
1771
+
1772
+ // Rollback should NOT increment data_version (still true in
1773
+ // non-WAL modes — the docs are journal-mode-agnostic on this).
1774
+ writer.execute_batch("BEGIN IMMEDIATE;").unwrap();
1775
+ writer.execute("INSERT INTO t VALUES (1)", []).unwrap();
1776
+ writer.execute_batch("ROLLBACK;").unwrap();
1777
+ let v2 = poll_data_version(&watcher).unwrap();
1778
+ assert_eq!(
1779
+ v2, v1,
1780
+ "journal_mode={mode}: rollback should not bump data_version"
1781
+ );
1782
+
1783
+ let _ = std::fs::remove_file(&tmp);
1784
+ let _ = std::fs::remove_file(format!("{}-wal", tmp.display()));
1785
+ let _ = std::fs::remove_file(format!("{}-shm", tmp.display()));
1786
+ let _ = std::fs::remove_file(format!("{}-journal", tmp.display()));
1787
+ }
1788
+
1789
/// `poll_data_version` coverage for `journal_mode = WAL`.
#[test]
fn poll_data_version_works_in_wal() {
    poll_data_version_works_in_journal_mode("WAL")
}

/// `poll_data_version` coverage for `journal_mode = DELETE`.
#[test]
fn poll_data_version_works_in_delete() {
    poll_data_version_works_in_journal_mode("DELETE")
}

/// `poll_data_version` coverage for `journal_mode = TRUNCATE`.
#[test]
fn poll_data_version_works_in_truncate() {
    poll_data_version_works_in_journal_mode("TRUNCATE")
}

/// `poll_data_version` coverage for `journal_mode = PERSIST`.
#[test]
fn poll_data_version_works_in_persist() {
    poll_data_version_works_in_journal_mode("PERSIST")
}
1808
+
1809
+ // MEMORY journal mode is per-connection (the journal lives in
1810
+ // RAM, not a file), so cross-connection rollback semantics are
1811
+ // different. SQLite's docs are clear that MEMORY is intended for
1812
+ // single-process use. honker doesn't promise MEMORY support, so
1813
+ // we don't test it here — flagging in case it ever becomes a
1814
+ // user-visible question.
1815
+
1816
/// Crash-recovery: a Python child commits in a loop, the parent
/// hard-kills it mid-flight, reopens the database, and asserts
/// integrity_check=ok, that the rows observed before the kill survive,
/// and that the reopen itself works (WAL replay).
/// Cross-platform — Windows tests that file-handle release on kill is
/// clean enough for reopen to succeed.
///
/// Skips with a message on stderr when neither `python3` nor `python`
/// is on PATH.
#[test]
fn writer_killed_mid_workload_leaves_db_consistent() {
    use std::process::{Child, Command, Stdio};

    /// Kills and reaps the wrapped child when dropped, so a panic
    /// anywhere between spawn and the deliberate kill cannot leave the
    /// infinite-loop writer running after the test ends.
    struct KillOnDrop(Child);

    impl Drop for KillOnDrop {
        fn drop(&mut self) {
            let _ = self.0.kill();
            let _ = self.0.wait();
        }
    }

    // Try `python3` then `python`. CI always has one; dev boxes
    // may not. Skip loudly rather than fail silently.
    let python = ["python3", "python"].iter().copied().find(|cmd| {
        Command::new(cmd)
            .arg("--version")
            .stdout(Stdio::null())
            .stderr(Stdio::null())
            .status()
            .map(|s| s.success())
            .unwrap_or(false)
    });
    let Some(python) = python else {
        eprintln!(
            "writer_killed_mid_workload_leaves_db_consistent: \
             no `python3` or `python` on PATH; skipping (set up Python \
             to exercise the crash-recovery path)"
        );
        return;
    };

    let tmp = std::env::temp_dir().join(format!("honker-crash-{}", std::process::id()));
    let _ = std::fs::remove_file(&tmp);
    let _ = std::fs::remove_file(format!("{}-wal", tmp.display()));
    let _ = std::fs::remove_file(format!("{}-shm", tmp.display()));

    // Bootstrap schema + WAL mode in the parent.
    {
        let conn = Connection::open(&tmp).unwrap();
        conn.execute_batch(
            "PRAGMA journal_mode = WAL;
             PRAGMA synchronous = NORMAL;
             CREATE TABLE q(id INTEGER PRIMARY KEY AUTOINCREMENT, v INTEGER);",
        )
        .unwrap();
    }

    // Spawn a Python child that writes committed rows in a tight
    // loop. Open DB in WAL mode + synchronous=NORMAL to match
    // honker's default. Each iteration is its own committed
    // transaction. The path is debug-formatted so it arrives in the
    // script as a quoted, escaped string literal (Windows backslashes
    // are escaped).
    let writer_script = format!(
        r#"
import sqlite3
conn = sqlite3.connect({path:?})
conn.execute("PRAGMA journal_mode = WAL")
conn.execute("PRAGMA synchronous = NORMAL")
i = 0
while True:
    conn.execute("INSERT INTO q(v) VALUES (?)", (i,))
    conn.commit()
    i += 1
"#,
        path = tmp.to_str().unwrap()
    );

    let mut child = KillOnDrop(
        Command::new(python)
            .arg("-c")
            .arg(&writer_script)
            .stdout(Stdio::null())
            .stderr(Stdio::piped())
            .spawn()
            .unwrap_or_else(|e| panic!("spawn {python} child writer: {e}")),
    );

    // Poll the database from a separate connection until we see
    // at least one committed row. This turns a timing-fragile
    // "sleep N ms and hope" into a deterministic "kill once
    // we've observed a commit" — robust across slow-Python
    // startup on Windows, loaded CI runners, etc.
    let read_conn = Connection::open(&tmp).unwrap();
    let deadline = std::time::Instant::now() + Duration::from_secs(15);
    let mut high_water: i64 = 0;
    while std::time::Instant::now() < deadline {
        // Bail early if the child died — surface its stderr
        // rather than the downstream "got 0 rows" symptom.
        if let Ok(Some(status)) = child.0.try_wait() {
            let mut stderr = String::new();
            if let Some(mut s) = child.0.stderr.take() {
                use std::io::Read;
                let _ = s.read_to_string(&mut stderr);
            }
            panic!(
                "python child exited before kill (status={status:?}); \
                 stderr: {stderr}"
            );
        }
        if let Ok(c) = read_conn.query_row("SELECT count(*) FROM q", [], |r| r.get::<_, i64>(0))
        {
            if c > 0 {
                high_water = c;
                break;
            }
        }
        std::thread::sleep(Duration::from_millis(50));
    }
    // Drop the read connection before kill so we don't hold any
    // shared lock when the child's process is reaped.
    drop(read_conn);

    // Let a few more commits accumulate so we test "lots of
    // committed transactions, then sudden death" rather than
    // "exactly one commit" — gives the WAL-replay path
    // something more interesting to recover.
    std::thread::sleep(Duration::from_millis(200));

    // Hard kill. Dropping the guard calls `Child::kill` (SIGKILL on
    // unix, `TerminateProcess` on Windows) and then reaps the child.
    // No chance for graceful close — file handles are released by the
    // OS, and any outstanding writes-since-last-fsync are lost.
    drop(child);

    // Reopen and verify. On Windows the OS may take a moment
    // to fully release the killed process's file locks; a retry loop
    // with a growing backoff on the open absorbs that without flaking.
    let conn = (0..20)
        .find_map(|i| match Connection::open(&tmp) {
            Ok(c) => Some(c),
            Err(_) => {
                std::thread::sleep(Duration::from_millis(50 * (i + 1)));
                None
            }
        })
        .unwrap_or_else(|| {
            Connection::open(&tmp).expect("reopen after retry budget exhausted")
        });
    let integrity: String = conn
        .query_row("PRAGMA integrity_check", [], |r| r.get(0))
        .unwrap();
    assert_eq!(
        integrity, "ok",
        "DB should be intact after writer hard-kill during WAL writes"
    );

    let count: i64 = conn
        .query_row("SELECT count(*) FROM q", [], |r| r.get(0))
        .unwrap();
    // Stronger durability assertion: at least the rows we
    // observed before kill must still be there. (Likely many
    // more committed in the +200ms window before the kill —
    // we're checking the floor, not the exact count.)
    assert!(
        count >= high_water,
        "lost committed rows: observed {high_water} before kill, \
         only {count} present after reopen"
    );
    assert!(
        count > 0,
        "expected the child to commit some rows in the 15s window \
         before timeout; got {count}"
    );

    // Drop the connection before cleanup — Windows can't unlink
    // open files. (Linux/macOS tolerate this either way.)
    drop(conn);

    let _ = std::fs::remove_file(&tmp);
    let _ = std::fs::remove_file(format!("{}-wal", tmp.display()));
    let _ = std::fs::remove_file(format!("{}-shm", tmp.display()));
}
1989
+
1990
/// Long-running soak: watcher + committer for
/// `HONKER_SOAK_DURATION_SECS` (default 1h when unset). Asserts
/// integrity_check=ok, exact row count, and ≥10% of expected
/// wake rate. Doesn't track leaks (run under valgrind/heaptrack
/// for that — issue #12). Ignored by default; CI never runs it.
///
/// Panics up front if `HONKER_SOAK_DURATION_SECS` is set but is not a
/// whole number of seconds, rather than silently running the 1h default.
///
/// ```sh
/// HONKER_SOAK_DURATION_SECS=600 \
///   cargo test -p honker-core --release --lib \
///   soak_watcher_durability -- --ignored --nocapture
/// ```
#[test]
#[ignore]
fn soak_watcher_durability() {
    use std::sync::Arc;
    use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};

    let duration_secs: u64 = match std::env::var("HONKER_SOAK_DURATION_SECS") {
        Ok(raw) => raw.parse().unwrap_or_else(|e| {
            panic!("HONKER_SOAK_DURATION_SECS={raw:?} is not a whole number of seconds: {e}")
        }),
        // Unset (or not valid unicode): fall back to the default.
        Err(_) => 3600,
    };

    eprintln!("soak: running for {duration_secs} seconds");

    let tmp = std::env::temp_dir().join(format!("honker-soak-{}", std::process::id()));
    let _ = std::fs::remove_file(&tmp);
    let _ = std::fs::remove_file(format!("{}-wal", tmp.display()));
    let _ = std::fs::remove_file(format!("{}-shm", tmp.display()));

    {
        let conn = Connection::open(&tmp).unwrap();
        conn.execute_batch(
            "PRAGMA journal_mode = WAL;
             PRAGMA synchronous = NORMAL;
             CREATE TABLE q(id INTEGER PRIMARY KEY AUTOINCREMENT, v INTEGER);",
        )
        .unwrap();
    }

    // Counts every `on_change` callback the watcher fires.
    let observed = Arc::new(AtomicU64::new(0));
    let observed_w = observed.clone();
    let watcher = UpdateWatcher::spawn(tmp.clone(), move || {
        observed_w.fetch_add(1, Ordering::Relaxed);
    });

    // Committer thread. Commits ~100/sec — pacing keeps WAL from
    // growing unboundedly between checkpoints and gives the
    // watcher time to actually observe each change.
    let stop = Arc::new(AtomicBool::new(false));
    let stop_w = stop.clone();
    let tmp_w = tmp.clone();
    let writer_handle = std::thread::Builder::new()
        .name("soak-writer".into())
        .spawn(move || {
            let conn = Connection::open(&tmp_w).unwrap();
            let mut i: i64 = 0;
            while !stop_w.load(Ordering::Acquire) {
                conn.execute("INSERT INTO q(v) VALUES (?1)", [i]).unwrap();
                i += 1;
                std::thread::sleep(Duration::from_millis(10));
            }
            // Number of rows inserted; compared against the table below.
            i
        })
        .unwrap();

    // Run the soak.
    std::thread::sleep(Duration::from_secs(duration_secs));

    // Stop the writer and join. join() returns Err if the
    // thread panicked; surface that explicitly rather than the
    // opaque `unwrap` panic-on-Err message.
    stop.store(true, Ordering::Release);
    let writer_result = writer_handle.join();
    assert!(
        writer_result.is_ok(),
        "writer thread panicked during soak: {writer_result:?}"
    );
    let writes = writer_result.unwrap();

    // Stop the watcher and join. join() returns Err if it
    // panicked; for a clean soak we expect Ok.
    let watcher_result = watcher.join();
    assert!(
        watcher_result.is_ok(),
        "watcher thread panicked during soak: {watcher_result:?}"
    );

    // Verify integrity, row count, and that the watcher observed
    // a reasonable fraction of the writes.
    let conn = Connection::open(&tmp).unwrap();
    let integrity: String = conn
        .query_row("PRAGMA integrity_check", [], |r| r.get(0))
        .unwrap();
    assert_eq!(integrity, "ok", "soak ended with corrupt DB");

    let count: i64 = conn
        .query_row("SELECT count(*) FROM q", [], |r| r.get(0))
        .unwrap();
    assert_eq!(
        count, writes,
        "row count {count} should match writer's reported {writes}"
    );

    let observed_count = observed.load(Ordering::Relaxed);
    // The committer commits every 10ms → ~100 wakes/sec
    // expected. Floor at 10% of that absorbs runner jitter,
    // merged ticks (multiple commits in one watcher poll), and
    // initial-warmup time. Anything below this floor means the
    // watcher silently stalled or fired far below the commit
    // rate — both real regressions worth catching.
    let expected = duration_secs * 100;
    let floor = expected / 10;
    assert!(
        observed_count >= floor,
        "watcher saw only {observed_count} wakes in {duration_secs}s; \
         expected ≥ {floor} (10% of theoretical {expected}; writer committed {writes})"
    );

    eprintln!(
        "soak: {duration_secs}s, {writes} writes, {observed_count} observed wakes, integrity ok"
    );

    // Drop the connection before cleanup — Windows can't unlink open
    // files, and the removal errors below are ignored.
    drop(conn);

    let _ = std::fs::remove_file(&tmp);
    let _ = std::fs::remove_file(format!("{}-wal", tmp.display()));
    let _ = std::fs::remove_file(format!("{}-shm", tmp.display()));
}
2116
+
2117
/// Bootstrapping a database whose `_honker_scheduler_tasks` table
/// predates the `enabled` column must add the column, keep existing
/// rows, and stay idempotent.
#[test]
fn bootstrap_pre_mantle_database_gets_enabled_column() {
    // Hand-build `_honker_scheduler_tasks` the way a pre-Mantle database
    // has it — no `enabled` column — so bootstrap has to detect the gap
    // and ALTER TABLE ADD the column.
    let db = mem();
    db.execute_batch(
        "CREATE TABLE _honker_scheduler_tasks (
           name TEXT PRIMARY KEY,
           queue TEXT NOT NULL,
           cron_expr TEXT NOT NULL,
           payload TEXT NOT NULL,
           priority INTEGER NOT NULL DEFAULT 0,
           expires_s INTEGER,
           next_fire_at INTEGER NOT NULL
         );",
    )
    .unwrap();
    // Seed one row; it must still be there after the migration.
    db.execute(
        "INSERT INTO _honker_scheduler_tasks
           (name, queue, cron_expr, payload, priority, expires_s, next_fire_at)
         VALUES ('legacy', 'q', '0 9 * * *', '{}', 0, NULL, 1)",
        [],
    )
    .unwrap();

    bootstrap_honker_schema(&db).unwrap();

    // The column is now listed by pragma_table_info. Any query failure
    // (including "no such row") counts as "column absent".
    let column_present = db
        .query_row(
            "SELECT 1 FROM pragma_table_info('_honker_scheduler_tasks') WHERE name='enabled'",
            [],
            |_| Ok(()),
        )
        .is_ok();
    assert!(
        column_present,
        "enabled column should be present after bootstrap"
    );

    // The seeded row survived and picked up the column default.
    let (row_count, enabled_flag): (i64, i64) = db
        .query_row(
            "SELECT COUNT(*), COALESCE(MAX(enabled), -1) FROM _honker_scheduler_tasks WHERE name='legacy'",
            [],
            |row| {
                let rows = row.get(0)?;
                let flag = row.get(1)?;
                Ok((rows, flag))
            },
        )
        .unwrap();
    assert_eq!(row_count, 1, "existing row must survive migration");
    assert_eq!(enabled_flag, 1, "existing row must default to enabled=1");

    // A second bootstrap must succeed as a no-op (idempotent).
    bootstrap_honker_schema(&db).unwrap();
}
2170
+
2171
/// `bootstrap_honker_schema` on an empty database creates every honker
/// table with the expected columns plus the partial claim index, and
/// can be run twice without error.
#[test]
fn bootstrap_honker_schema_creates_tables_and_index() {
    let conn = mem();
    bootstrap_honker_schema(&conn).unwrap();

    // Idempotent.
    bootstrap_honker_schema(&conn).unwrap();

    // Column names of `table`, in declaration order, as reported by
    // `pragma_table_info`. Panics on any SQLite error.
    let columns_of = |table: &str| -> Vec<String> {
        let mut stmt = conn
            .prepare(&format!("SELECT name FROM pragma_table_info('{table}')"))
            .unwrap();
        let cols = stmt
            .query_map([], |r| r.get::<_, String>(0))
            .unwrap()
            .collect::<Result<Vec<_>, _>>()
            .unwrap();
        cols
    };

    // _honker_live has the 12 columns we expect (Python binding
    // and the extension have historically disagreed on _honker_dead
    // column count; this pins both).
    let live_cols = columns_of("_honker_live");
    assert_eq!(live_cols.len(), 12);
    assert!(live_cols.contains(&"expires_at".to_string()));

    let dead_cols = columns_of("_honker_dead");
    assert_eq!(dead_cols.len(), 10);
    assert!(dead_cols.contains(&"priority".to_string()));
    assert!(dead_cols.contains(&"run_at".to_string()));
    assert!(dead_cols.contains(&"max_attempts".to_string()));
    assert!(dead_cols.contains(&"created_at".to_string()));

    // Partial index present.
    let idx: i64 = conn
        .query_row(
            "SELECT COUNT(*) FROM sqlite_master
             WHERE type='index' AND name='_honker_live_claim'",
            [],
            |r| r.get(0),
        )
        .unwrap();
    assert_eq!(idx, 1);

    // _honker_locks table present for db.lock() support.
    let locks_cols = columns_of("_honker_locks");
    assert_eq!(locks_cols, vec!["name", "owner", "expires_at"]);

    // _honker_rate_limits table present for db.try_rate_limit().
    let rl_cols = columns_of("_honker_rate_limits");
    assert_eq!(rl_cols, vec!["name", "window_start", "count"]);

    // _honker_scheduler_tasks table present for Scheduler's
    // per-task registration + next_fire_at persistence.
    let sched_cols = columns_of("_honker_scheduler_tasks");
    assert_eq!(
        sched_cols,
        vec![
            "name",
            "queue",
            "cron_expr",
            "payload",
            "priority",
            "expires_s",
            "next_fire_at",
            "enabled",
        ],
    );

    // _honker_results table present with its four columns.
    let res_cols = columns_of("_honker_results");
    assert_eq!(
        res_cols,
        vec!["job_id", "value", "created_at", "expires_at"]
    );

    // _honker_stream + _honker_stream_consumers tables for
    // durable pub/sub streams.
    let stream_cols = columns_of("_honker_stream");
    assert_eq!(
        stream_cols,
        vec!["offset", "topic", "key", "payload", "created_at"]
    );
    let sc_cols = columns_of("_honker_stream_consumers");
    assert_eq!(sc_cols, vec!["name", "topic", "offset"]);
}
2292
+
2293
+ // -----------------------------------------------------------------
2294
+ // Optional backend tests
2295
+ // -----------------------------------------------------------------
2296
+
2297
/// Run the wake/listen suite against the kernel-watch backend.
///
/// Commits are spaced 30 ms apart so the event loop can process each
/// one individually before the next arrives. Spurious wakes are
/// tolerated; a missed commit (fewer wakes than commits) fails the test.
#[test]
#[cfg(feature = "kernel-watcher")]
fn kernel_watcher_detects_all_commits() {
    use std::sync::atomic::{AtomicU32, Ordering as AO};

    let tmp = std::env::temp_dir().join(format!(
        "honker-kernel-watcher-{}-{}",
        std::process::id(),
        std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap()
            .subsec_nanos()
    ));
    let _ = std::fs::remove_file(&tmp);

    let writer = open_conn(tmp.to_str().unwrap(), false).unwrap();
    writer.execute_batch("CREATE TABLE t (x INT)").unwrap();
    // One initial write ensures the -wal file exists so the watcher
    // can attach a per-file watch at startup (kqueue watches the file
    // descriptor, not the directory, for write events).
    writer.execute("INSERT INTO t VALUES (0)", []).unwrap();
    std::thread::sleep(Duration::from_millis(20));

    // Counts every `on_change` callback the watcher fires.
    let count = Arc::new(AtomicU32::new(0));
    let count_t = count.clone();
    let watcher = UpdateWatcher::spawn_with_config(
        tmp.clone(),
        move || {
            count_t.fetch_add(1, AO::Relaxed);
        },
        WatcherConfig {
            backend: WatcherBackend::KernelWatch,
        },
    );

    // Drain any initialization wakes.
    std::thread::sleep(Duration::from_millis(50));
    count.store(0, AO::SeqCst);

    // n commits spaced 30 ms apart — gives the event loop time to
    // process each event individually before the next arrives.
    let n: u32 = 5;
    for i in 1..=n {
        writer
            .execute(&format!("INSERT INTO t VALUES ({i})"), [])
            .unwrap();
        std::thread::sleep(Duration::from_millis(30));
    }
    // Wait longer than both the event delivery latency and the
    // safety-net interval to drain any pending events.
    std::thread::sleep(Duration::from_millis(600));

    let observed = count.load(AO::SeqCst);
    // Close the watcher and the writer before cleanup — Windows can't
    // unlink open files, and the removal errors below are ignored.
    drop(watcher);
    drop(writer);
    let _ = std::fs::remove_file(&tmp);
    let _ = std::fs::remove_file(format!("{}-wal", tmp.display()));
    let _ = std::fs::remove_file(format!("{}-shm", tmp.display()));

    // Experimental contract: spurious wakes are allowed (the backend
    // fires on every filesystem event, and SQLite produces several
    // events per commit). The thing that must not happen is a *missed*
    // commit — assert at least n wakes.
    assert!(
        observed >= n,
        "kernel watcher detected {observed} wakes for {n} commits — \
         missed at least one"
    );
}
2368
+
2369
/// Shows that the shm fast path wakes on exactly the same commits as
/// the baseline `PRAGMA data_version` poller.
///
/// Phase gate: with N commits spaced far enough apart that neither
/// detector can batch them, both must report exactly N wakes.
#[test]
#[cfg(feature = "shm-fast-path")]
fn shm_fast_path_equivalence_with_pragma_baseline() {
    use std::sync::atomic::{AtomicU32, Ordering as AO};

    let nanos = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap()
        .subsec_nanos();
    let tmp = std::env::temp_dir().join(format!(
        "honker-shm-equiv-{}-{}",
        std::process::id(),
        nanos
    ));
    let _ = std::fs::remove_file(&tmp);

    let writer = open_conn(tmp.to_str().unwrap(), false).unwrap();
    writer.execute_batch("CREATE TABLE t (x INT)").unwrap();
    // A first write makes sure the -shm file exists before the shm
    // watcher is spawned.
    writer.execute("INSERT INTO t VALUES (0)", []).unwrap();
    std::thread::sleep(Duration::from_millis(20));

    // Baseline watcher (default backend) with its own wake counter.
    let baseline_count = Arc::new(AtomicU32::new(0));
    let baseline = {
        let hits = baseline_count.clone();
        UpdateWatcher::spawn(tmp.clone(), move || {
            hits.fetch_add(1, AO::Relaxed);
        })
    };

    // Shm-fast-path watcher with a separate wake counter.
    let shm_count = Arc::new(AtomicU32::new(0));
    let shm = {
        let hits = shm_count.clone();
        let config = WatcherConfig {
            backend: WatcherBackend::ShmFastPath,
        };
        UpdateWatcher::spawn_with_config(
            tmp.clone(),
            move || {
                hits.fetch_add(1, AO::Relaxed);
            },
            config,
        )
    };

    // Let initialization wakes land, then zero both counters.
    std::thread::sleep(Duration::from_millis(30));
    baseline_count.store(0, AO::SeqCst);
    shm_count.store(0, AO::SeqCst);

    // Commits spaced 20 ms apart — well above both polling intervals.
    let n: u32 = 5;
    (1..=n).for_each(|i| {
        let insert = format!("INSERT INTO t VALUES ({i})");
        writer.execute(&insert, []).unwrap();
        std::thread::sleep(Duration::from_millis(20));
    });
    std::thread::sleep(Duration::from_millis(100));

    let b = baseline_count.load(AO::SeqCst);
    let s = shm_count.load(AO::SeqCst);

    // Tear everything down before asserting so a failure still cleans up.
    drop(baseline);
    drop(shm);
    drop(writer);
    let _ = std::fs::remove_file(&tmp);
    for suffix in ["wal", "shm"] {
        let _ = std::fs::remove_file(format!("{}-{}", tmp.display(), suffix));
    }

    assert_eq!(b, n, "baseline detected {b} wakes, expected {n}");
    assert_eq!(
        s, n,
        "shm fast path detected {s} wakes, expected {n} (same as baseline {b})"
    );
}
2444
+
2445
+ // -----------------------------------------------------------------
2446
+ // Journal-mode coverage for the experimental backends
2447
+ //
2448
+ // honker's `open_conn` always sets WAL, so the public Python/Node
2449
+ // surface is WAL-only. These tests poke the watchers directly at
2450
+ // databases pre-set to non-WAL modes so we can prove behavior when
2451
+ // the file is in DELETE / TRUNCATE / PERSIST. Justification per
2452
+ // backend:
2453
+ //
2454
+ // - Polling — universally works because `PRAGMA data_version`
2455
+ // advances on every commit regardless of journal mode. Already
2456
+ // exercised by `poll_data_version_works_in_*`.
2457
+ //
2458
+ // - Kernel watcher — in non-WAL modes there is no `-wal` file to
2459
+ // watch directly; we must rely on the parent-directory watch to
2460
+ // pick up `-journal` create / modify / delete events around each
2461
+ // commit. The PRAGMA verification step still gates `on_change()`,
2462
+ // so spurious events (e.g. another file in the dir) just produce
2463
+ // harmless no-op checks.
2464
+ //
2465
+ // - SHM fast path — in non-WAL modes there is no `-shm` file;
2466
+ // `read_ichange` returns `None` and the loop falls back to the
2467
+ // PRAGMA check on every iteration. Effectively becomes a 100 µs
2468
+ // PRAGMA poller — correct, just CPU-heavier than the polling
2469
+ // backend.
2470
+ // -----------------------------------------------------------------
2471
+
2472
+ /// Drive `n` committed inserts through `writer`, spaced
2473
+ /// `spacing_ms` apart, and return how many `on_change()` calls the
2474
+ /// watcher observed (with the initial drain already deducted).
2475
+ fn drive_and_count_wakes(
2476
+ backend: WatcherBackend,
2477
+ db_path: PathBuf,
2478
+ n: u32,
2479
+ spacing_ms: u64,
2480
+ ) -> u32 {
2481
+ use std::sync::atomic::{AtomicU32, Ordering as AO};
2482
+
2483
+ let count = Arc::new(AtomicU32::new(0));
2484
+ let count_t = count.clone();
2485
+ let watcher = UpdateWatcher::spawn_with_config(
2486
+ db_path.clone(),
2487
+ move || {
2488
+ count_t.fetch_add(1, AO::Relaxed);
2489
+ },
2490
+ WatcherConfig { backend },
2491
+ );
2492
+
2493
+ // Drain init wakes (covers shm + kernel setup) before baseline.
2494
+ std::thread::sleep(Duration::from_millis(80));
2495
+ count.store(0, AO::SeqCst);
2496
+
2497
+ let writer = Connection::open(&db_path).unwrap();
2498
+ for i in 1..=n {
2499
+ writer
2500
+ .execute(&format!("INSERT INTO t VALUES ({i})"), [])
2501
+ .unwrap();
2502
+ std::thread::sleep(Duration::from_millis(spacing_ms));
2503
+ }
2504
+ // Drain the slowest safety net (kernel = 500 ms) + one cycle.
2505
+ std::thread::sleep(Duration::from_millis(700));
2506
+
2507
+ let observed = count.load(AO::SeqCst);
2508
+ drop(watcher);
2509
+ drop(writer);
2510
+ observed
2511
+ }
2512
+
2513
+ /// Set up a fresh database file in `mode` and verify the watcher
2514
+ /// detects every committed insert. Tolerates +1 wake (a commit
2515
+ /// straddling the drain boundary) but does not tolerate misses.
2516
+ fn watcher_works_in_journal_mode(backend: WatcherBackend, mode: &str) {
2517
+ let tmp = std::env::temp_dir().join(format!(
2518
+ "honker-watcher-{}-{}-{}-{}",
2519
+ mode.to_ascii_lowercase(),
2520
+ std::process::id(),
2521
+ std::time::SystemTime::now()
2522
+ .duration_since(std::time::UNIX_EPOCH)
2523
+ .unwrap()
2524
+ .subsec_nanos(),
2525
+ match backend {
2526
+ WatcherBackend::Polling => "poll",
2527
+ #[cfg(feature = "kernel-watcher")]
2528
+ WatcherBackend::KernelWatch => "kw",
2529
+ #[cfg(feature = "shm-fast-path")]
2530
+ WatcherBackend::ShmFastPath => "shm",
2531
+ },
2532
+ ));
2533
+ let _ = std::fs::remove_file(&tmp);
2534
+
2535
+ // Watcher inherits the file's journal mode, so set it before opening.
2536
+ let setup = Connection::open(&tmp).unwrap();
2537
+ setup
2538
+ .execute_batch(&format!("PRAGMA journal_mode = {mode};"))
2539
+ .unwrap();
2540
+ let actual: String = setup
2541
+ .pragma_query_value(None, "journal_mode", |r| r.get(0))
2542
+ .unwrap();
2543
+ assert_eq!(
2544
+ actual.to_ascii_uppercase(),
2545
+ mode.to_ascii_uppercase(),
2546
+ "PRAGMA journal_mode = {mode} silently fell back to {actual}"
2547
+ );
2548
+ setup.execute("CREATE TABLE t (x INTEGER)", []).unwrap();
2549
+ // One prior write so -shm exists at watcher startup (shm fast path
2550
+ // needs it; harmless otherwise).
2551
+ setup.execute("INSERT INTO t VALUES (0)", []).unwrap();
2552
+ // Pin -shm only for shm+WAL: Linux/Windows reap -shm on last close.
2553
+ // Other modes must drop setup or Windows errors on shared non-WAL db.
2554
+ #[cfg(feature = "shm-fast-path")]
2555
+ let keep_setup_open =
2556
+ matches!(backend, WatcherBackend::ShmFastPath) && mode.eq_ignore_ascii_case("WAL");
2557
+ #[cfg(not(feature = "shm-fast-path"))]
2558
+ let keep_setup_open = false;
2559
+ let _pinning = if keep_setup_open {
2560
+ Some(setup)
2561
+ } else {
2562
+ drop(setup);
2563
+ None
2564
+ };
2565
+
2566
+ let n: u32 = 5;
2567
+ let observed = drive_and_count_wakes(backend.clone(), tmp.clone(), n, 30);
2568
+
2569
+ drop(_pinning);
2570
+ let _ = std::fs::remove_file(&tmp);
2571
+ let _ = std::fs::remove_file(format!("{}-wal", tmp.display()));
2572
+ let _ = std::fs::remove_file(format!("{}-shm", tmp.display()));
2573
+ let _ = std::fs::remove_file(format!("{}-journal", tmp.display()));
2574
+
2575
+ // Polling/shm dedupe → ~1 wake per commit. Kernel fires per
2576
+ // filesystem event (inotify is granular) → upper bound is just
2577
+ // a runaway-watcher guard, not a precise expectation.
2578
+ let upper = match backend {
2579
+ WatcherBackend::Polling => n + 1,
2580
+ #[cfg(feature = "kernel-watcher")]
2581
+ WatcherBackend::KernelWatch => n * 200,
2582
+ #[cfg(feature = "shm-fast-path")]
2583
+ WatcherBackend::ShmFastPath => n + 1,
2584
+ };
2585
+ assert!(
2586
+ observed >= n,
2587
+ "journal_mode={mode}: observed {observed} wakes for {n} commits \
2588
+ (missed at least one)"
2589
+ );
2590
+ assert!(
2591
+ observed <= upper,
2592
+ "journal_mode={mode}: observed {observed} wakes for {n} commits, \
2593
+ upper bound {upper} (runaway watcher?)"
2594
+ );
2595
+ }
2596
+
2597
+ // ---- Polling × every supported journal mode (regression coverage) ----
2598
+
2599
/// The polling backend must see every commit on a WAL-mode database.
#[test]
fn polling_watcher_works_in_wal() {
    let journal_mode = "WAL";
    watcher_works_in_journal_mode(WatcherBackend::Polling, journal_mode);
}
2603
+
2604
/// The polling backend must see every commit on a DELETE-journal database.
#[test]
fn polling_watcher_works_in_delete() {
    let journal_mode = "DELETE";
    watcher_works_in_journal_mode(WatcherBackend::Polling, journal_mode);
}
2608
+
2609
/// The polling backend must see every commit on a TRUNCATE-journal database.
#[test]
fn polling_watcher_works_in_truncate() {
    let journal_mode = "TRUNCATE";
    watcher_works_in_journal_mode(WatcherBackend::Polling, journal_mode);
}
2613
+
2614
/// The polling backend must see every commit on a PERSIST-journal database.
#[test]
fn polling_watcher_works_in_persist() {
    let journal_mode = "PERSIST";
    watcher_works_in_journal_mode(WatcherBackend::Polling, journal_mode);
}
2618
+
2619
+ // ---- Kernel watcher × every supported journal mode ----
2620
+
2621
+ // macOS kqueue limitation: directory-level watches do NOT fire on
2622
+ // writes within existing files (only on entry create/delete/rename).
2623
+ // Per-file watches fire on regular-file writes, but rollback-journal
2624
+ // commit dances on a loaded macOS CI runner produce so few kqueue
2625
+ // events that the wake count is unreliable for non-WAL modes. We
2626
+ // attach to db + journal + dir to maximize coverage, and ship the
2627
+ // backend with documented "missed wakes possible" semantics, but
2628
+ // we don't gate CI on a behavior the kernel won't reliably deliver.
2629
+ // WAL-mode kernel coverage stays mandatory (kernel_watcher_works_in_wal).
2630
/// Kernel-watch backend against a DELETE-journal database. Skipped on
/// macOS via `ignore`; runs everywhere else the feature is compiled in.
#[test]
#[cfg(feature = "kernel-watcher")]
#[cfg_attr(
    target_os = "macos",
    ignore = "kqueue: in-place writes don't fire dir events"
)]
fn kernel_watcher_works_in_delete() {
    let journal_mode = "DELETE";
    watcher_works_in_journal_mode(WatcherBackend::KernelWatch, journal_mode);
}
2639
+
2640
/// Kernel-watch backend against a TRUNCATE-journal database. Skipped on
/// macOS via `ignore`; runs everywhere else the feature is compiled in.
#[test]
#[cfg(feature = "kernel-watcher")]
#[cfg_attr(
    target_os = "macos",
    ignore = "kqueue: in-place writes don't fire dir events"
)]
fn kernel_watcher_works_in_truncate() {
    let journal_mode = "TRUNCATE";
    watcher_works_in_journal_mode(WatcherBackend::KernelWatch, journal_mode);
}
2649
+
2650
/// Kernel-watch backend against a PERSIST-journal database. Skipped on
/// macOS via `ignore`; runs everywhere else the feature is compiled in.
#[test]
#[cfg(feature = "kernel-watcher")]
#[cfg_attr(
    target_os = "macos",
    ignore = "kqueue: in-place writes don't fire dir events"
)]
fn kernel_watcher_works_in_persist() {
    let journal_mode = "PERSIST";
    watcher_works_in_journal_mode(WatcherBackend::KernelWatch, journal_mode);
}
2659
+
2660
+ // ---- SHM fast path: WAL only (it's experimental — non-WAL is
2661
+ // explicitly out of scope, the backend logs and disables itself
2662
+ // when -shm doesn't exist). ----
2663
+
2664
/// The shm-fast-path backend is only exercised against WAL-mode databases.
#[test]
#[cfg(feature = "shm-fast-path")]
fn shm_fast_path_works_in_wal() {
    let journal_mode = "WAL";
    watcher_works_in_journal_mode(WatcherBackend::ShmFastPath, journal_mode);
}
2669
+
2670
+ // -----------------------------------------------------------------
2671
+ // Wake-latency invariants — proves the experimental backends
2672
+ // actually deliver wakes via their fast paths (kernel events /
2673
+ // mmap reads), not via some accidental fallback. The simplified
2674
+ // backends have no safety nets, so a missed wake just doesn't
2675
+ // fire — these tests would catch that immediately.
2676
+ // -----------------------------------------------------------------
2677
+
2678
/// Latency probe shared by the wake-latency tests.
///
/// Creates table `t` at `db_path` through `open_conn`, attaches a watcher
/// using `backend`, performs `n` inserts separated by `spacing_ms`, and
/// returns one entry per insert: the delay in milliseconds between the
/// moment just before the insert and the first not-yet-claimed wake
/// recorded at or after it. An insert with no such wake yields
/// `f64::INFINITY`; callers decide how to treat those.
#[cfg(any(feature = "kernel-watcher", feature = "shm-fast-path"))]
fn measure_wake_latencies_ms(
    backend: WatcherBackend,
    db_path: PathBuf,
    n: usize,
    spacing_ms: u64,
) -> Vec<f64> {
    use std::sync::Mutex as StdMutex;
    use std::time::Instant;

    let db_str = db_path.to_str().unwrap();
    let writer = open_conn(db_str, false).unwrap();
    writer.execute_batch("CREATE TABLE t (x INTEGER)").unwrap();
    // Seed a row before the watcher starts (the original notes say this
    // makes the -wal file exist at watcher start-up).
    writer.execute("INSERT INTO t VALUES (0)", []).unwrap();
    std::thread::sleep(Duration::from_millis(20));

    let wake_times: Arc<StdMutex<Vec<Instant>>> = Arc::new(StdMutex::new(Vec::new()));
    let watcher = {
        let sink = Arc::clone(&wake_times);
        UpdateWatcher::spawn_with_config(
            db_path.clone(),
            move || {
                sink.lock()
                    .expect("wake_times mutex poisoned")
                    .push(Instant::now());
            },
            WatcherConfig { backend },
        )
    };

    // Discard whatever fired while the watcher was starting up.
    std::thread::sleep(Duration::from_millis(100));
    wake_times.lock().expect("wake_times").clear();

    // Timestamp each insert just before it runs, then pause.
    let pause = Duration::from_millis(spacing_ms);
    let commit_times: Vec<Instant> = (1..=n)
        .map(|i| {
            let started = Instant::now();
            writer
                .execute(&format!("INSERT INTO t VALUES ({i})"), [])
                .unwrap();
            std::thread::sleep(pause);
            started
        })
        .collect();

    // Grace period for wakes that are still in flight.
    std::thread::sleep(Duration::from_millis(700));

    let wakes = wake_times.lock().expect("wake_times").clone();
    drop(watcher);
    drop(writer);

    // Both sequences are in chronological order, so one forward sweep
    // over the wakes is enough; each wake is claimed by at most one insert.
    let mut pending = wakes.iter().peekable();
    commit_times
        .iter()
        .map(|&commit_t| {
            // Skip wakes that happened before this insert started.
            while pending.next_if(|&&wake| wake < commit_t).is_some() {}
            match pending.next() {
                Some(&wake) => wake.duration_since(commit_t).as_secs_f64() * 1000.0,
                // No wake left for this insert.
                None => f64::INFINITY,
            }
        })
        .collect()
}
2753
+
2754
/// Return the sample found at index `floor(len * pct)` of `samples`
/// sorted ascending, clamped to the last element.
///
/// `pct` is a fraction (`0.50` for the median). Ordering uses
/// `f64::total_cmp`, so NaN values sort after every finite value and
/// infinity instead of aborting the sort.
///
/// # Panics
///
/// Panics with an explicit message when `samples` is empty; there is no
/// meaningful percentile to report in that case.
#[cfg(any(feature = "kernel-watcher", feature = "shm-fast-path"))]
fn percentile(mut samples: Vec<f64>, pct: f64) -> f64 {
    assert!(
        !samples.is_empty(),
        "percentile() requires at least one sample"
    );
    samples.sort_by(f64::total_cmp);
    // Float-to-usize `as` saturates, so a negative `pct` selects index 0.
    let idx = ((samples.len() as f64) * pct) as usize;
    samples[idx.min(samples.len() - 1)]
}
2760
+
2761
/// Kernel watcher: wakes must be driven by kernel events rather than by a
/// slow fallback poll.
///
/// Runs 10 commits spaced 50 ms apart and asserts that (a) at least one
/// wake arrived and (b) the median (p50) latency over the wakes that did
/// arrive is below 500 ms. Individual missed wakes are tolerated.
#[test]
#[cfg_attr(
    target_os = "macos",
    ignore = "notify/kqueue can drop the watcher thread under CI load; functional kernel watcher tests still run"
)]
#[cfg(feature = "kernel-watcher")]
fn kernel_watcher_wake_latency_is_event_driven() {
    let tmp = std::env::temp_dir().join(format!(
        "honker-kw-lat-{}-{}",
        std::process::id(),
        std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap()
            .subsec_nanos()
    ));
    let _ = std::fs::remove_file(&tmp);

    let lats = measure_wake_latencies_ms(WatcherBackend::KernelWatch, tmp.clone(), 10, 50);

    // Clean up before asserting so a failure does not leak temp files.
    let _ = std::fs::remove_file(&tmp);
    let _ = std::fs::remove_file(format!("{}-wal", tmp.display()));
    let _ = std::fs::remove_file(format!("{}-shm", tmp.display()));

    // Contract allows missed wakes (kqueue/inotify/ReadDir can
    // coalesce). Assert: some wakes arrived AND p50 is well below
    // the 5 s `idle_poll_s` fallback — proves we're event-driven,
    // not riding the paranoia poll. Windows ReadDirectoryChangesW
    // under CI load can stretch past 100 ms; 500 ms threshold
    // still rules out the fallback.
    let arrived: Vec<f64> = lats.iter().copied().filter(|l| l.is_finite()).collect();
    assert!(
        !arrived.is_empty(),
        "kernel watcher delivered zero wakes for 10 commits — events \
         aren't being delivered at all on this platform: {lats:?}"
    );
    // `percentile` consumes its input; `arrived` is still needed for the
    // failure message below, hence the clone.
    let p50 = percentile(arrived.clone(), 0.50);
    assert!(
        p50 < 500.0,
        "kernel watcher p50 wake latency = {p50:.1} ms, expected < 500 \
         (high median latency means events arrive but slowly — possibly \
         a stuck-thread fallback). Arrived: {arrived:?}, all samples \
         (inf = no wake): {lats:?}"
    );
}
2810
+
2811
/// SHM fast path: wakes must come via the mmap tickle.
///
/// Runs 10 commits spaced 50 ms apart and asserts that at least one wake
/// arrived and that the median (p50) latency over the arrived wakes is
/// below 50 ms. Individual missed wakes are tolerated.
#[test]
#[cfg(feature = "shm-fast-path")]
fn shm_fast_path_wake_latency_is_event_driven() {
    let tmp = std::env::temp_dir().join(format!(
        "honker-shm-lat-{}-{}",
        std::process::id(),
        std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap()
            .subsec_nanos()
    ));
    let _ = std::fs::remove_file(&tmp);

    let lats = measure_wake_latencies_ms(WatcherBackend::ShmFastPath, tmp.clone(), 10, 50);

    // Clean up before asserting so a failure does not leak temp files.
    let _ = std::fs::remove_file(&tmp);
    let _ = std::fs::remove_file(format!("{}-wal", tmp.display()));
    let _ = std::fs::remove_file(format!("{}-shm", tmp.display()));

    // Same shape as the kernel-watcher latency test: assert that
    // *some* wakes arrived and that they were fast. Missed wakes
    // are part of the documented experimental contract.
    let arrived: Vec<f64> = lats.iter().copied().filter(|l| l.is_finite()).collect();
    assert!(
        !arrived.is_empty(),
        "shm fast path delivered zero wakes for 10 commits: {lats:?}"
    );
    // `arrived` is not needed afterwards, so hand it over without cloning.
    let p50 = percentile(arrived, 0.50);
    assert!(
        p50 < 50.0,
        "shm fast path p50 wake latency (over arrived wakes only) = {p50:.1} ms, expected < 50 \
         (high latency means iChange isn't \
         being read via mmap). Samples: {lats:?}"
    );
}
2847
+
2848
/// Joining an idle kernel watcher must return quickly: the test fails if
/// `join()` takes 150 ms or longer after the watcher has had 200 ms to
/// settle.
#[test]
#[cfg_attr(
    target_os = "macos",
    ignore = "notify/kqueue shutdown can hang under CI load; functional kernel watcher tests still run"
)]
#[cfg(feature = "kernel-watcher")]
fn kernel_watcher_shutdown_is_responsive() {
    let nanos = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap()
        .subsec_nanos();
    let tmp = std::env::temp_dir().join(format!(
        "honker-kw-shutdown-{}-{}",
        std::process::id(),
        nanos
    ));
    let _ = std::fs::remove_file(&tmp);

    // A real database with one table and one row for the watcher to observe.
    let writer = open_conn(tmp.to_str().unwrap(), false).unwrap();
    writer.execute_batch("CREATE TABLE t (x INTEGER)").unwrap();
    writer.execute("INSERT INTO t VALUES (0)", []).unwrap();

    let config = WatcherConfig {
        backend: WatcherBackend::KernelWatch,
    };
    let watcher = UpdateWatcher::spawn_with_config(tmp.clone(), || {}, config);

    // Give the watcher thread time to settle into its idle wait.
    std::thread::sleep(Duration::from_millis(200));

    // Time only the join itself.
    let join_started = std::time::Instant::now();
    let _ = watcher.join();
    let elapsed = join_started.elapsed();

    // Tidy up before asserting.
    drop(writer);
    let _ = std::fs::remove_file(&tmp);
    for suffix in ["-wal", "-shm"] {
        let _ = std::fs::remove_file(format!("{}{suffix}", tmp.display()));
    }

    assert!(
        elapsed < Duration::from_millis(150),
        "kernel watcher shutdown took {elapsed:?}, expected < 150 ms \
         (RX_POLL_MS = 50 ms; if this exceeds 500 ms the recv_timeout \
         is blocking on the safety-net interval again)"
    );
}
2897
+
2898
+ // -----------------------------------------------------------------
2899
+ // Probe failures must surface as Err — proving "no silent fallback"
2900
+ // when an experimental backend can't initialize.
2901
+ // -----------------------------------------------------------------
2902
+
2903
/// Probing the polling backend must return `Ok` even for a path whose
/// parent directories do not exist.
#[test]
fn watcher_backend_polling_probe_always_succeeds() {
    let missing = std::path::PathBuf::from("/nonexistent/no/way/this/exists.db");
    let outcome = WatcherBackend::Polling.probe(&missing);
    assert!(outcome.is_ok());
}
2909
+
2910
/// Unrecognised names, wrong case, and surrounding whitespace must all be
/// rejected with an "unknown watcher backend" error.
#[test]
fn watcher_backend_parse_rejects_unknown_names() {
    let rejected = ["bogus", "KERNEL", " polling "];
    for name in rejected {
        let err = WatcherBackend::parse(Some(name)).unwrap_err();
        let mentions_unknown = err.contains("unknown watcher backend");
        assert!(mentions_unknown, "got: {err}");
    }
}
2917
+
2918
/// `None`, "poll", and "polling" must all resolve to the polling backend.
#[test]
fn watcher_backend_parse_accepts_polling_aliases() {
    for spec in [None, Some("poll"), Some("polling")] {
        let parsed = WatcherBackend::parse(spec);
        assert!(matches!(parsed, Ok(WatcherBackend::Polling)));
    }
}
2933
+
2934
/// Without the `kernel-watcher` feature, asking for "kernel" must fail
/// with an error that names the missing Cargo feature.
#[test]
#[cfg(not(feature = "kernel-watcher"))]
fn watcher_backend_parse_rejects_uncompiled_kernel() {
    let err = WatcherBackend::parse(Some("kernel")).unwrap_err();
    let names_feature = err.contains("requires the kernel-watcher Cargo feature");
    assert!(names_feature, "got: {err}");
}
2943
+
2944
/// Without the `shm-fast-path` feature, asking for "shm" must fail with
/// an error that names the missing Cargo feature.
#[test]
#[cfg(not(feature = "shm-fast-path"))]
fn watcher_backend_parse_rejects_uncompiled_shm() {
    let err = WatcherBackend::parse(Some("shm")).unwrap_err();
    let names_feature = err.contains("requires the shm-fast-path Cargo feature");
    assert!(names_feature, "got: {err}");
}
2953
+
2954
/// With the `kernel-watcher` feature compiled in, both spellings must
/// resolve to the kernel-watch backend.
#[test]
#[cfg(feature = "kernel-watcher")]
fn watcher_backend_parse_accepts_compiled_kernel_aliases() {
    for alias in ["kernel", "kernel-watcher"] {
        let parsed = WatcherBackend::parse(Some(alias));
        assert!(matches!(parsed, Ok(WatcherBackend::KernelWatch)));
    }
}
2966
+
2967
/// With the `shm-fast-path` feature compiled in, both spellings must
/// resolve to the shm-fast-path backend.
#[test]
#[cfg(feature = "shm-fast-path")]
fn watcher_backend_parse_accepts_compiled_shm_aliases() {
    for alias in ["shm", "shm-fast-path"] {
        let parsed = WatcherBackend::parse(Some(alias));
        assert!(matches!(parsed, Ok(WatcherBackend::ShmFastPath)));
    }
}
2979
+
2980
/// Probing the shm backend on a path that has no `-shm` companion file
/// must return an error whose text contains "-shm unavailable".
#[test]
#[cfg(feature = "shm-fast-path")]
fn watcher_backend_shm_probe_fails_when_shm_missing() {
    let nanos = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap()
        .subsec_nanos();
    let tmp = std::env::temp_dir().join(format!(
        "honker-shm-probe-{}-{}",
        std::process::id(),
        nanos
    ));
    // Make sure nothing is left over at the path from an earlier run.
    let _ = std::fs::remove_file(&tmp);

    let result = WatcherBackend::ShmFastPath.probe(&tmp);
    assert!(result.is_err(), "expected probe to fail for missing -shm");

    let msg = result.unwrap_err();
    let explains = msg.contains("-shm unavailable");
    assert!(
        explains,
        "probe error message should explain why; got: {msg}"
    );
}
3002
+
3003
/// Replacing the database file underneath a running kernel-watch backend
/// must make the watcher thread panic rather than carry on silently.
///
/// The shared helper `replacement_panic_test` does the work: it uses a
/// plain empty file as the "database", swaps it for a different file via
/// rename, and asserts that joining the watcher reports an error.
#[test]
#[cfg(feature = "kernel-watcher")]
fn kernel_watcher_panics_on_file_replacement() {
    let backend = WatcherBackend::KernelWatch;
    replacement_panic_test(backend);
}
3020
+
3021
/// Replacing the database file underneath a running shm-fast-path
/// backend must make the watcher thread panic. Ignored on Windows.
///
/// Delegates to `replacement_panic_test`, which also writes a 4 KiB
/// stand-in `-shm` file for this backend before spawning the watcher.
#[test]
#[cfg_attr(
    windows,
    ignore = "Windows prevents replacing the watched db path while the SHM watcher is open"
)]
#[cfg(feature = "shm-fast-path")]
fn shm_fast_path_panics_on_file_replacement() {
    let backend = WatcherBackend::ShmFastPath;
    replacement_panic_test(backend);
}
3033
+
3034
/// Shared body for the "watcher panics when the db file is replaced"
/// tests.
///
/// Creates an empty file at a fresh temp path (plus a 4 KiB stand-in
/// `-shm` file for the shm backend), spawns a watcher with `backend`,
/// replaces the file with a different one via rename, waits one second,
/// and asserts that joining the watcher returns `Err`.
///
/// Temp-file names include a per-backend tag so the kernel and shm tests,
/// which run in the same process and may start at nearly the same instant,
/// can never pick the same path.
#[cfg(any(feature = "kernel-watcher", feature = "shm-fast-path"))]
fn replacement_panic_test(backend: WatcherBackend) {
    let backend_tag = match backend {
        WatcherBackend::Polling => "poll",
        #[cfg(feature = "kernel-watcher")]
        WatcherBackend::KernelWatch => "kw",
        #[cfg(feature = "shm-fast-path")]
        WatcherBackend::ShmFastPath => "shm",
    };
    // Builds "<prefix>-<backend>-<pid>-<nanos>" inside the temp directory.
    let unique_path = |prefix: &str| {
        std::env::temp_dir().join(format!(
            "{prefix}-{backend_tag}-{}-{}",
            std::process::id(),
            std::time::SystemTime::now()
                .duration_since(std::time::UNIX_EPOCH)
                .unwrap()
                .subsec_nanos()
        ))
    };

    let tmp = unique_path("honker-replace");
    let _ = std::fs::remove_file(&tmp);
    // Plain empty file at the db path. Per the original notes, the
    // kernel/shm watchers only stat / watch / mmap files and never open a
    // SQLite connection, so a real database is not required here.
    std::fs::File::create(&tmp).unwrap();

    // For the SHM backend, also write a fake -shm: one 4 KiB page whose
    // first four bytes hold the WAL-index version constant. The block is
    // feature-gated because sibling code gates `WatcherBackend::ShmFastPath`
    // behind `shm-fast-path`; left ungated, this helper would presumably
    // not compile with only `kernel-watcher` enabled.
    #[cfg(feature = "shm-fast-path")]
    {
        use std::io::Write;

        if matches!(backend, WatcherBackend::ShmFastPath) {
            let shm_path = std::path::PathBuf::from(format!("{}-shm", tmp.display()));
            let mut buf = [0u8; 4096];
            buf[0..4].copy_from_slice(&3_007_000u32.to_ne_bytes()); // WALINDEX_MAX_VERSION
            // The rest of the page stays zeroed; its contents do not
            // matter for this test.
            let mut f = std::fs::File::create(&shm_path).unwrap();
            f.write_all(&buf).unwrap();
        }
    }

    let watcher =
        UpdateWatcher::spawn_with_config(tmp.clone(), || {}, WatcherConfig { backend });
    // Generous initial wait so the watcher has recorded the original
    // file's identity even under CI scheduling pressure.
    std::thread::sleep(Duration::from_millis(300));

    // Swap in a different file at the same path.
    let other = unique_path("honker-replace-other");
    let _ = std::fs::remove_file(&other);
    std::fs::File::create(&other).unwrap();
    #[cfg(windows)]
    {
        // Windows does not replace an existing destination with
        // rename(). Remove first; the watcher will still observe the
        // replacement when the new file appears with a different id.
        std::fs::remove_file(&tmp).unwrap();
    }
    std::fs::rename(&other, &tmp).unwrap();
    // Wait long enough for the replacement check to fire on a slow CI
    // runner (the original notes cite a 100 ms check interval).
    std::thread::sleep(Duration::from_millis(1000));

    let result = watcher.join();
    // Clean up before asserting so a failure does not leak temp files.
    let _ = std::fs::remove_file(&tmp);
    let _ = std::fs::remove_file(format!("{}-wal", tmp.display()));
    let _ = std::fs::remove_file(format!("{}-shm", tmp.display()));
    assert!(
        result.is_err(),
        "expected watcher thread to panic on db file replacement"
    );
}
3104
+
3105
/// Probing the kernel-watch backend for a database whose parent directory
/// does not exist must return an error.
#[test]
#[cfg(feature = "kernel-watcher")]
fn watcher_backend_kernel_probe_fails_for_inaccessible_dir() {
    let missing =
        std::path::PathBuf::from("/this/parent/does/not/exist/honker-kernel-probe.db");
    let outcome = WatcherBackend::KernelWatch.probe(&missing);
    let failed = outcome.is_err();
    assert!(
        failed,
        "expected probe to fail for inaccessible dir, got Ok"
    );
}
3116
+ }