@kitlangton/motel 0.2.0 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/AGENTS.md +5 -0
  2. package/package.json +7 -5
  3. package/src/App.tsx +233 -59
  4. package/src/daemon.test.ts +213 -6
  5. package/src/daemon.ts +174 -38
  6. package/src/domain.test.ts +62 -0
  7. package/src/domain.ts +16 -0
  8. package/src/localServer.ts +114 -128
  9. package/src/mcp.ts +172 -0
  10. package/src/motelClient.ts +166 -14
  11. package/src/registry.ts +26 -23
  12. package/src/runtime.ts +8 -2
  13. package/src/server.ts +10 -9
  14. package/src/services/AsyncIngest.ts +68 -0
  15. package/src/services/TelemetryStore.ts +262 -119
  16. package/src/services/TraceQueryService.ts +3 -1
  17. package/src/services/ingestRpc.ts +41 -0
  18. package/src/services/telemetryWorker.ts +62 -0
  19. package/src/storybook/aiChatStory.tsx +244 -0
  20. package/src/storybook/fixtures/errorState.ts +44 -0
  21. package/src/storybook/fixtures/imagePaste.ts +34 -0
  22. package/src/storybook/fixtures/index.ts +62 -0
  23. package/src/storybook/fixtures/kitchenSink.ts +148 -0
  24. package/src/storybook/fixtures/rawPrompt.ts +15 -0
  25. package/src/storybook/fixtures/short.ts +27 -0
  26. package/src/storybook/fixtures/toolHeavy.ts +65 -0
  27. package/src/telemetry.test.ts +28 -0
  28. package/src/ui/AiChatView.tsx +308 -0
  29. package/src/ui/SpanContentView.tsx +181 -0
  30. package/src/ui/SpanDetail.tsx +98 -17
  31. package/src/ui/TraceDetailsPane.tsx +11 -28
  32. package/src/ui/Waterfall.tsx +43 -148
  33. package/src/ui/aiChatModel.test.ts +391 -0
  34. package/src/ui/aiChatModel.ts +773 -0
  35. package/src/ui/aiState.ts +71 -0
  36. package/src/ui/app/TraceWorkspace.tsx +288 -124
  37. package/src/ui/app/useAppLayout.ts +14 -11
  38. package/src/ui/app/useTraceScreenData.ts +174 -40
  39. package/src/ui/atoms.ts +131 -0
  40. package/src/ui/loaders.ts +120 -0
  41. package/src/ui/persistence.ts +41 -0
  42. package/src/ui/primitives.tsx +27 -13
  43. package/src/ui/state.ts +4 -199
  44. package/src/ui/useAttrFilterPicker.ts +63 -23
  45. package/src/ui/useKeyboardNav.ts +552 -364
  46. package/src/ui/waterfallModel.ts +130 -0
  47. package/src/ui/waterfallNav.test.ts +17 -1
  48. package/src/ui/waterfallNav.ts +1 -1
@@ -7,6 +7,9 @@ import type { AiCallDetail, AiCallSummary, FacetItem, LogItem, SpanItem, StatsIt
7
7
  import { AI_ATTR_MAP, AI_FTS_KEYS, AI_TEXT_SEARCH_KEYS, truncatePreview } from "../domain.js"
8
8
  import { attributeMap, nanosToMilliseconds, parseAnyValue, spanKindLabel, spanStatusLabel, stringifyValue, type OtlpLogExportRequest, type OtlpTraceExportRequest } from "../otlp.js"
9
9
 
10
+ const isSqliteLockError = (error: unknown) =>
11
+ error instanceof Error && /(database is locked|database table is locked|SQLITE_BUSY)/i.test(error.message)
12
+
10
13
  interface SpanRow {
11
14
  readonly trace_id: string
12
15
  readonly span_id: string
@@ -447,117 +450,185 @@ export class TelemetryStore extends Context.Service<
447
450
  >()("motel/TelemetryStore") {}
448
451
 
449
452
 
450
- export const TelemetryStoreLive = Layer.effect(
453
+ /**
454
+ * How this TelemetryStore instance behaves:
455
+ *
456
+ * - `readonly` — opens the SQLite connection read-only and skips every
457
+ * DDL/DML initialisation. Use this from the TUI (and anywhere else
458
+ * that only queries); it avoids the "database is locked" race that
459
+ * happens when a TUI process races a daemon's writer for the schema
460
+ * pragmas on startup. Writes through the service interface become
461
+ * runtime errors — but readers don't call them.
462
+ *
463
+ * - `runRetention` — fork the background cleanup loop (age + size cap
464
+ * eviction, WAL checkpoint). Only one process should own this at a
465
+ * time. Currently the main daemon (localServer) does; the ingest
466
+ * worker and the TUI skip it.
467
+ */
468
+ export interface TelemetryStoreOptions {
469
+ readonly readonly: boolean
470
+ readonly runRetention: boolean
471
+ }
472
+
473
+ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.effect(
451
474
  TelemetryStore,
452
475
  Effect.gen(function* () {
453
476
  mkdirSync(dirname(config.otel.databasePath), { recursive: true })
454
477
  const db = yield* Effect.acquireRelease(
455
- Effect.sync(() => new Database(config.otel.databasePath, { create: true })),
478
+ Effect.sync(() => new Database(config.otel.databasePath, {
479
+ create: !opts.readonly,
480
+ readonly: opts.readonly,
481
+ })),
456
482
  (db) => Effect.sync(() => {
457
- // `PRAGMA optimize` at close persists any stats SQLite gathered
458
- // during the session, so the next process start gets an accurate
459
- // query planner on the first query instead of a 3-second cold
460
- // run. Cheap: it skips work unless stats have drifted.
461
- try { db.exec(`PRAGMA optimize;`) } catch { /* nothing */ }
483
+ if (!opts.readonly) {
484
+ // `PRAGMA optimize` at close persists any stats SQLite gathered
485
+ // during the session, so the next process start gets an accurate
486
+ // query planner on the first query instead of a 3-second cold
487
+ // run. Cheap: it skips work unless stats have drifted.
488
+ try { db.exec(`PRAGMA optimize;`) } catch { /* nothing */ }
489
+ }
462
490
  db.close()
463
491
  }),
464
492
  )
465
- db.exec(`
466
- PRAGMA journal_mode = WAL;
467
- PRAGMA synchronous = NORMAL;
468
- PRAGMA temp_store = MEMORY;
469
- PRAGMA busy_timeout = 5000;
470
- -- Bump cache above the 2MB default. 64MB fits most hot index pages
471
- -- (trace_summaries, spans, span_attributes indexes) in RAM even on
472
- -- multi-GB databases, cutting cold-read latency meaningfully on
473
- -- picker / search queries that sweep the index.
474
- PRAGMA cache_size = -65536;
475
- -- Let SQLite memory-map the first 256MB of the file. This is a
476
- -- cheap way to avoid read() syscalls on hot pages and lets the OS
477
- -- page cache serve index lookups directly. Safe on macOS and Linux;
478
- -- SQLite silently caps at actual file size for smaller DBs.
479
- PRAGMA mmap_size = 268435456;
480
-
481
- CREATE TABLE IF NOT EXISTS spans (
482
- trace_id TEXT NOT NULL,
483
- span_id TEXT NOT NULL,
484
- parent_span_id TEXT,
485
- service_name TEXT NOT NULL,
486
- scope_name TEXT,
487
- operation_name TEXT NOT NULL,
488
- kind TEXT,
489
- start_time_ms INTEGER NOT NULL,
490
- end_time_ms INTEGER NOT NULL,
491
- duration_ms REAL NOT NULL,
492
- status TEXT NOT NULL,
493
- attributes_json TEXT NOT NULL,
494
- resource_json TEXT NOT NULL,
495
- events_json TEXT NOT NULL,
496
- PRIMARY KEY (trace_id, span_id)
497
- );
498
-
499
- CREATE INDEX IF NOT EXISTS idx_spans_service_time ON spans(service_name, start_time_ms DESC);
500
- CREATE INDEX IF NOT EXISTS idx_spans_trace_time ON spans(trace_id, start_time_ms ASC);
501
- CREATE INDEX IF NOT EXISTS idx_spans_span_id ON spans(span_id);
502
- CREATE INDEX IF NOT EXISTS idx_spans_status_time ON spans(status, start_time_ms DESC);
503
-
504
- CREATE TABLE IF NOT EXISTS logs (
505
- id INTEGER PRIMARY KEY AUTOINCREMENT,
506
- trace_id TEXT,
507
- span_id TEXT,
508
- service_name TEXT NOT NULL,
509
- scope_name TEXT,
510
- severity_text TEXT NOT NULL,
511
- timestamp_ms INTEGER NOT NULL,
512
- body TEXT NOT NULL,
513
- attributes_json TEXT NOT NULL,
514
- resource_json TEXT NOT NULL
515
- );
516
-
517
- CREATE INDEX IF NOT EXISTS idx_logs_service_time ON logs(service_name, timestamp_ms DESC);
518
- CREATE INDEX IF NOT EXISTS idx_logs_trace_time ON logs(trace_id, timestamp_ms DESC);
519
- CREATE INDEX IF NOT EXISTS idx_logs_span_time ON logs(span_id, timestamp_ms DESC);
520
- CREATE INDEX IF NOT EXISTS idx_logs_severity_time ON logs(severity_text, timestamp_ms DESC);
521
-
522
- CREATE TABLE IF NOT EXISTS trace_summaries (
523
- trace_id TEXT PRIMARY KEY,
524
- service_name TEXT NOT NULL,
525
- root_operation_name TEXT NOT NULL,
526
- started_at_ms INTEGER NOT NULL,
527
- ended_at_ms INTEGER NOT NULL,
528
- active_span_count INTEGER NOT NULL DEFAULT 0,
529
- duration_ms REAL NOT NULL,
530
- span_count INTEGER NOT NULL,
531
- error_count INTEGER NOT NULL
532
- );
533
-
534
- CREATE INDEX IF NOT EXISTS idx_trace_summaries_started_at ON trace_summaries(started_at_ms DESC, trace_id DESC);
535
- CREATE INDEX IF NOT EXISTS idx_trace_summaries_service_started_at ON trace_summaries(service_name, started_at_ms DESC, trace_id DESC);
536
- CREATE INDEX IF NOT EXISTS idx_trace_summaries_duration ON trace_summaries(duration_ms DESC);
537
-
538
- CREATE TABLE IF NOT EXISTS span_attributes (
539
- trace_id TEXT NOT NULL,
540
- span_id TEXT NOT NULL,
541
- key TEXT NOT NULL,
542
- value TEXT NOT NULL,
543
- PRIMARY KEY (trace_id, span_id, key)
544
- );
545
-
546
- CREATE INDEX IF NOT EXISTS idx_span_attributes_key_value ON span_attributes(key, value, trace_id, span_id);
547
- CREATE INDEX IF NOT EXISTS idx_span_attributes_trace_span ON span_attributes(trace_id, span_id);
548
-
549
- CREATE TABLE IF NOT EXISTS log_attributes (
550
- log_id INTEGER NOT NULL,
551
- key TEXT NOT NULL,
552
- value TEXT NOT NULL,
553
- PRIMARY KEY (log_id, key)
554
- );
555
-
556
- CREATE INDEX IF NOT EXISTS idx_log_attributes_key_value ON log_attributes(key, value, log_id);
557
- CREATE INDEX IF NOT EXISTS idx_log_attributes_log_id ON log_attributes(log_id);
558
- `)
493
+ if (opts.readonly) {
494
+ // Readonly connections skip schema init entirely — the schema
495
+ // already exists (a writer created it) and any `CREATE TABLE IF
496
+ // NOT EXISTS` / `PRAGMA journal_mode = WAL` statement would
497
+ // attempt a write and fight the daemon for the write lock.
498
+ // `query_only = 1` logically blocks any DML the app might
499
+ // accidentally send; still bump cache + mmap since those are
500
+ // safe and keep queries fast.
501
+ db.exec(`
502
+ PRAGMA query_only = 1;
503
+ PRAGMA busy_timeout = 15000;
504
+ PRAGMA cache_size = -65536;
505
+ PRAGMA mmap_size = 268435456;
506
+ `)
507
+ } else {
508
+ db.exec(`
509
+ -- Bump cache above the 2MB default. 64MB fits most hot index pages
510
+ -- (trace_summaries, spans, span_attributes indexes) in RAM even on
511
+ -- multi-GB databases, cutting cold-read latency meaningfully on
512
+ -- picker / search queries that sweep the index.
513
+ PRAGMA cache_size = -65536;
514
+ -- Let SQLite memory-map the first 256MB of the file. This is a
515
+ -- cheap way to avoid read() syscalls on hot pages and lets the OS
516
+ -- page cache serve index lookups directly. Safe on macOS and Linux;
517
+ -- SQLite silently caps at actual file size for smaller DBs.
518
+ PRAGMA mmap_size = 268435456;
519
+ `)
520
+ try {
521
+ db.exec(`
522
+ PRAGMA journal_mode = WAL;
523
+ PRAGMA synchronous = NORMAL;
524
+ PRAGMA temp_store = MEMORY;
525
+ -- WAL checkpoint automatically when it grows past ~16MB. Without
526
+ -- this the WAL happily runs into the hundreds of MB and queries
527
+ -- start paying the cost of walking the WAL on every read.
528
+ PRAGMA wal_autocheckpoint = 4000;
529
+
530
+ CREATE TABLE IF NOT EXISTS spans (
531
+ trace_id TEXT NOT NULL,
532
+ span_id TEXT NOT NULL,
533
+ parent_span_id TEXT,
534
+ service_name TEXT NOT NULL,
535
+ scope_name TEXT,
536
+ operation_name TEXT NOT NULL,
537
+ kind TEXT,
538
+ start_time_ms INTEGER NOT NULL,
539
+ end_time_ms INTEGER NOT NULL,
540
+ duration_ms REAL NOT NULL,
541
+ status TEXT NOT NULL,
542
+ attributes_json TEXT NOT NULL,
543
+ resource_json TEXT NOT NULL,
544
+ events_json TEXT NOT NULL,
545
+ PRIMARY KEY (trace_id, span_id)
546
+ );
547
+
548
+ CREATE INDEX IF NOT EXISTS idx_spans_service_time ON spans(service_name, start_time_ms DESC);
549
+ CREATE INDEX IF NOT EXISTS idx_spans_trace_time ON spans(trace_id, start_time_ms ASC);
550
+ CREATE INDEX IF NOT EXISTS idx_spans_span_id ON spans(span_id);
551
+ CREATE INDEX IF NOT EXISTS idx_spans_status_time ON spans(status, start_time_ms DESC);
552
+
553
+ CREATE TABLE IF NOT EXISTS logs (
554
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
555
+ trace_id TEXT,
556
+ span_id TEXT,
557
+ service_name TEXT NOT NULL,
558
+ scope_name TEXT,
559
+ severity_text TEXT NOT NULL,
560
+ timestamp_ms INTEGER NOT NULL,
561
+ body TEXT NOT NULL,
562
+ attributes_json TEXT NOT NULL,
563
+ resource_json TEXT NOT NULL
564
+ );
565
+
566
+ CREATE INDEX IF NOT EXISTS idx_logs_service_time ON logs(service_name, timestamp_ms DESC);
567
+ CREATE INDEX IF NOT EXISTS idx_logs_trace_time ON logs(trace_id, timestamp_ms DESC);
568
+ CREATE INDEX IF NOT EXISTS idx_logs_span_time ON logs(span_id, timestamp_ms DESC);
569
+ CREATE INDEX IF NOT EXISTS idx_logs_severity_time ON logs(severity_text, timestamp_ms DESC);
570
+
571
+ CREATE TABLE IF NOT EXISTS trace_summaries (
572
+ trace_id TEXT PRIMARY KEY,
573
+ service_name TEXT NOT NULL,
574
+ root_operation_name TEXT NOT NULL,
575
+ started_at_ms INTEGER NOT NULL,
576
+ ended_at_ms INTEGER NOT NULL,
577
+ active_span_count INTEGER NOT NULL DEFAULT 0,
578
+ duration_ms REAL NOT NULL,
579
+ span_count INTEGER NOT NULL,
580
+ error_count INTEGER NOT NULL
581
+ );
582
+
583
+ CREATE INDEX IF NOT EXISTS idx_trace_summaries_started_at ON trace_summaries(started_at_ms DESC, trace_id DESC);
584
+ CREATE INDEX IF NOT EXISTS idx_trace_summaries_service_started_at ON trace_summaries(service_name, started_at_ms DESC, trace_id DESC);
585
+ CREATE INDEX IF NOT EXISTS idx_trace_summaries_duration ON trace_summaries(duration_ms DESC);
586
+
587
+ CREATE TABLE IF NOT EXISTS span_attributes (
588
+ trace_id TEXT NOT NULL,
589
+ span_id TEXT NOT NULL,
590
+ key TEXT NOT NULL,
591
+ value TEXT NOT NULL,
592
+ PRIMARY KEY (trace_id, span_id, key)
593
+ );
594
+
595
+ CREATE INDEX IF NOT EXISTS idx_span_attributes_key_value ON span_attributes(key, value, trace_id, span_id);
596
+ CREATE INDEX IF NOT EXISTS idx_span_attributes_trace_span ON span_attributes(trace_id, span_id);
559
597
 
598
+ CREATE TABLE IF NOT EXISTS log_attributes (
599
+ log_id INTEGER NOT NULL,
600
+ key TEXT NOT NULL,
601
+ value TEXT NOT NULL,
602
+ PRIMARY KEY (log_id, key)
603
+ );
604
+
605
+ CREATE INDEX IF NOT EXISTS idx_log_attributes_key_value ON log_attributes(key, value, log_id);
606
+ CREATE INDEX IF NOT EXISTS idx_log_attributes_log_id ON log_attributes(log_id);
607
+ `)
608
+ } catch (err) {
609
+ if (!isSqliteLockError(err)) throw err
610
+ console.warn(`motel: writer bootstrap skipped during startup: ${(err as Error).message}`)
611
+ }
612
+ }
613
+
614
+ // Tables detected at runtime. For writer connections these flags are
615
+ // set by the FTS `CREATE VIRTUAL TABLE IF NOT EXISTS` try/catch; for
616
+ // readonly connections we probe `sqlite_master` and set them based on
617
+ // what the writer has already provisioned.
560
618
  let hasFts = true
619
+ let hasAttrFts = true
620
+ if (opts.readonly) {
621
+ try {
622
+ const row = db.query(`SELECT name FROM sqlite_master WHERE type='table' AND name='span_operation_fts'`).get()
623
+ hasFts = row !== null
624
+ } catch { hasFts = false }
625
+ try {
626
+ const row = db.query(`SELECT name FROM sqlite_master WHERE type='table' AND name='span_attr_fts'`).get()
627
+ hasAttrFts = row !== null
628
+ } catch { hasAttrFts = false }
629
+ }
630
+
631
+ if (!opts.readonly) {
561
632
  try {
562
633
  db.exec(`
563
634
  CREATE VIRTUAL TABLE IF NOT EXISTS span_operation_fts USING fts5(
@@ -589,7 +660,6 @@ export const TelemetryStoreLive = Layer.effect(
589
660
  // Keys are inlined into the trigger DDL rather than looked up in a
590
661
  // side table so the `WHEN` guard stays constant-cost (a subquery
591
662
  // would run on every span_attributes insert — ~60/span).
592
- let hasAttrFts = hasFts
593
663
  if (hasFts) {
594
664
  try {
595
665
  const keyList = AI_FTS_KEYS.map((k) => `'${k.replace(/'/g, "''")}'`).join(", ")
@@ -661,6 +731,13 @@ export const TelemetryStoreLive = Layer.effect(
661
731
  // ANALYZE / optimize failures are never fatal — queries still work,
662
732
  // they just run with default row estimates.
663
733
  }
734
+ // Longer busy timeout: the ingest worker holds the write lock for up
735
+ // to a few seconds during big OTLP batches, and the daemon's retention
736
+ // passes can do the same. Apply this AFTER startup maintenance so
737
+ // lock-conflicted bootstrap steps fail fast instead of stalling health
738
+ // for the full 15s timeout.
739
+ try { db.exec(`PRAGMA busy_timeout = 15000;`) } catch { /* ignore */ }
740
+ } // end: if (!opts.readonly) writer init
664
741
 
665
742
  const insertSpan = db.query(`
666
743
  INSERT INTO spans (
@@ -708,8 +785,15 @@ export const TelemetryStoreLive = Layer.effect(
708
785
  GROUP BY trace_id
709
786
  `)
710
787
 
711
- db.query(`DELETE FROM trace_summaries`).run()
712
- rebuildTraceSummaries.run()
788
+ const reconcileTraceSummaries = Effect.sync(() => {
789
+ try {
790
+ db.query(`DELETE FROM trace_summaries`).run()
791
+ rebuildTraceSummaries.run()
792
+ } catch (err) {
793
+ if (!isSqliteLockError(err)) throw err
794
+ console.warn(`motel: trace summary rebuild skipped during startup: ${(err as Error).message}`)
795
+ }
796
+ })
713
797
 
714
798
  const deleteSpanAttributes = db.query(`DELETE FROM span_attributes WHERE trace_id = ? AND span_id = ?`)
715
799
  const insertSpanAttribute = db.query(`INSERT INTO span_attributes (trace_id, span_id, key, value) VALUES (?, ?, ?, ?)`)
@@ -792,28 +876,61 @@ export const TelemetryStoreLive = Layer.effect(
792
876
  } catch {
793
877
  // FTS table may not exist on old DBs.
794
878
  }
879
+
880
+ // Checkpoint the WAL after a big delete pass. Without this the
881
+ // WAL keeps growing (observed: 640MB) because wal_autocheckpoint
882
+ // only triggers when WAL pages exceed the threshold during
883
+ // writes — a retention pass that evicts millions of rows can
884
+ // blow far past that before the auto-checkpoint fires. Using
885
+ // PASSIVE so active readers aren't interrupted; if the WAL
886
+ // can't be fully reclaimed right now, we'll try again next
887
+ // cycle.
888
+ try { db.exec(`PRAGMA wal_checkpoint(PASSIVE);`) } catch { /* ignore */ }
889
+
890
+ // Incremental vacuum reclaims some of the freed pages back
891
+ // to the OS so the file size actually shrinks over time
892
+ // instead of just growing the freelist. Bounded to 2000
893
+ // pages per pass (≈8MB) to avoid a long-running transaction.
894
+ try { db.exec(`PRAGMA incremental_vacuum(2000);`) } catch { /* ignore */ }
795
895
  })
796
896
  })
797
897
 
798
- // Run cleanup every 60 seconds in the background, tied to the layer's scope
799
- yield* Effect.forkScoped(Effect.repeat(cleanupExpired(), Schedule.spaced("60 seconds")))
800
-
801
- // Periodically refresh query planner stats. `PRAGMA optimize` is a
802
- // no-op when nothing has changed, so this is essentially free on idle
803
- // servers and keeps facet/search planner estimates accurate as data
804
- // grows. 15 minutes is slower than ingestion rates we care about but
805
- // frequent enough that the attribute picker stays snappy.
806
- const refreshPlannerStats = Effect.sync(() => {
807
- try { db.exec(`PRAGMA optimize;`) } catch { /* ignore */ }
808
- })
809
- yield* Effect.forkScoped(Effect.repeat(refreshPlannerStats, Schedule.spaced("15 minutes")))
898
+ // Retention only runs in processes that opt in (currently the main
899
+ // daemon). The ingest worker and TUI skip it to avoid two writers
900
+ // competing for the write lock with overlapping DELETE passes.
901
+ if (opts.runRetention) {
902
+ // Reconcile any summary drift from interrupted ingests, but do it
903
+ // after the server becomes healthy. Running this synchronously at
904
+ // open can sit behind another writer's lock for ~15s and make the
905
+ // daemon look hung even though the port is already bound.
906
+ yield* Effect.forkScoped(reconcileTraceSummaries)
907
+
908
+ // Enable incremental vacuum so retention can reclaim freed
909
+ // pages over time instead of needing a stop-the-world VACUUM.
910
+ // Idempotent: repeat calls after the first are no-ops.
911
+ try { db.exec(`PRAGMA auto_vacuum = INCREMENTAL;`) } catch { /* ignore */ }
912
+
913
+ // Run cleanup every 60 seconds in the background, tied to the layer's scope
914
+ yield* Effect.forkScoped(Effect.repeat(cleanupExpired(), Schedule.spaced("60 seconds")))
915
+
916
+ // Periodically refresh query planner stats. `PRAGMA optimize` is a
917
+ // no-op when nothing has changed, so this is essentially free on idle
918
+ // servers and keeps facet/search planner estimates accurate as data
919
+ // grows. 15 minutes is slower than ingestion rates we care about but
920
+ // frequent enough that the attribute picker stays snappy.
921
+ const refreshPlannerStats = Effect.sync(() => {
922
+ try { db.exec(`PRAGMA optimize;`) } catch { /* ignore */ }
923
+ })
924
+ yield* Effect.forkScoped(Effect.repeat(refreshPlannerStats, Schedule.spaced("15 minutes")))
925
+ }
810
926
 
811
927
  // One-time backfill for existing DBs: if span_attr_fts is empty but
812
928
  // span_attributes has rows with AI_FTS_KEYS, populate the index.
813
929
  // Runs forked so server startup isn't blocked; queries hitting the
814
930
  // FTS will just return empty until the fill lands. On a 2 GB DB with
815
- // ~400 matching rows this takes ~3-8 seconds.
816
- if (hasAttrFts) {
931
+ // ~400 matching rows this takes ~3-8 seconds. Writer-only because
932
+ // it does INSERT INTO ... — readonly connections would error.
933
+ if (hasAttrFts && !opts.readonly) {
817
934
  const backfillAttrFts = Effect.sync(() => {
818
935
  try {
819
936
  const ftsCount = (db.query(`SELECT COUNT(*) AS c FROM span_attr_fts`).get() as { c: number }).c
@@ -1705,7 +1822,11 @@ export const TelemetryStoreLive = Layer.effect(
1705
1822
 
1706
1823
  /** Builds WHERE clauses for AI call search against the spans table (aliased as s) */
1707
1824
  const buildAiWhereClauses = (input: AiCallSearch | AiCallStatsSearch, cutoff: number) => {
1708
- const clauses: string[] = ["s.operation_name LIKE 'ai.%'", "s.start_time_ms >= ?"]
1825
+ const clauses: string[] = [
1826
+ "s.operation_name LIKE 'ai.%'",
1827
+ "s.operation_name NOT LIKE 'ai.%.do%'",
1828
+ "s.start_time_ms >= ?",
1829
+ ]
1709
1830
  const params: Array<string | number> = [cutoff]
1710
1831
 
1711
1832
  if (input.service) {
@@ -2094,3 +2215,25 @@ export const TelemetryStoreLive = Layer.effect(
2094
2215
  })
2095
2216
  }),
2096
2217
  )
2218
+
2219
+ /**
2220
+ * Default writer instance: the main daemon uses this. Owns schema
2221
+ * migrations, FTS backfill, and the retention loop.
2222
+ */
2223
+ export const TelemetryStoreLive = makeTelemetryStoreLayer({ readonly: false, runRetention: true })
2224
+
2225
+ /**
2226
+ * Writer instance that SKIPS retention. The ingest worker uses this
2227
+ * so the daemon and the worker aren't both running DELETE passes at
2228
+ * the same time (they'd just serialise behind the write lock and
2229
+ * duplicate work).
2230
+ */
2231
+ export const TelemetryStoreWorkerLive = makeTelemetryStoreLayer({ readonly: false, runRetention: false })
2232
+
2233
+ /**
2234
+ * Read-only instance for query-only processes (currently the TUI).
2235
+ * Skips every DDL/DML statement at startup so the connection can be
2236
+ * opened while a writer is mid-transaction without racing for the
2237
+ * write lock. Writes through the service interface will throw.
2238
+ */
2239
+ export const TelemetryStoreReadonlyLive = makeTelemetryStoreLayer({ readonly: true, runRetention: false })
@@ -1,5 +1,5 @@
1
1
  import { Effect, Layer, Context } from "effect"
2
- import type { SpanItem, TraceItem, TraceSummaryItem } from "../domain.js"
2
+ import type { AiCallDetail, SpanItem, TraceItem, TraceSummaryItem } from "../domain.js"
3
3
  import { TelemetryStore } from "./TelemetryStore.js"
4
4
 
5
5
  export class TraceQueryService extends Context.Service<
@@ -14,6 +14,7 @@ export class TraceQueryService extends Context.Service<
14
14
  readonly traceStats: (input: { readonly groupBy: string; readonly agg: "count" | "avg_duration" | "p95_duration" | "error_rate"; readonly serviceName?: string | null; readonly operation?: string | null; readonly status?: "ok" | "error" | null; readonly minDurationMs?: number | null; readonly lookbackMinutes?: number; readonly limit?: number; readonly attributeFilters?: Readonly<Record<string, string>> }) => Effect.Effect<readonly { readonly group: string; readonly value: number; readonly count: number }[], Error>
15
15
  readonly getTrace: (traceId: string) => Effect.Effect<TraceItem | null, Error>
16
16
  readonly getSpan: (spanId: string) => Effect.Effect<SpanItem | null, Error>
17
+ readonly getAiCall: (spanId: string) => Effect.Effect<AiCallDetail | null, Error>
17
18
  readonly listTraceSpans: (traceId: string) => Effect.Effect<readonly SpanItem[], Error>
18
19
  readonly searchSpans: (input: { readonly serviceName?: string | null; readonly operation?: string | null; readonly parentOperation?: string | null; readonly status?: "ok" | "error" | null; readonly lookbackMinutes?: number; readonly limit?: number; readonly attributeFilters?: Readonly<Record<string, string>> }) => Effect.Effect<readonly SpanItem[], Error>
19
20
  }
@@ -68,6 +69,7 @@ export const TraceQueryServiceLive = Layer.effect(
68
69
  traceStats: store.traceStats,
69
70
  getTrace,
70
71
  getSpan,
72
+ getAiCall: store.getAiCall,
71
73
  listTraceSpans: store.listTraceSpans,
72
74
  searchSpans: store.searchSpans,
73
75
  })
@@ -0,0 +1,41 @@
1
+ /**
2
+ * RPC contract for OTLP ingest. Lives in its own file so both the
3
+ * main thread (client) and the telemetry worker (server) can import
4
+ * the schema without pulling in each other's runtime code.
5
+ *
6
+ * Only ingestTraces and ingestLogs run through RPC — those are the
7
+ * methods whose SQLite writes used to block the main event loop for
8
+ * seconds at a time. Every other TelemetryStore method stays on the
9
+ * main thread with its own direct DB connection; SQLite's WAL mode
10
+ * lets the reader (main) and writer (worker) hold independent
11
+ * connections to the same file concurrently without contention.
12
+ *
13
+ * Payloads are typed as Schema.Unknown because OTLP's protobuf-JSON
14
+ * shape is enormous and nested — the store validates structurally
15
+ * during the actual insert loop, and serialising a schema through
16
+ * the worker boundary would add overhead that defeats the purpose of
17
+ * the offload. If a payload is malformed we surface it as an
18
+ * IngestError rather than a RpcSchemaError, which keeps the failure
19
+ * mode consistent with the old direct-call behaviour.
20
+ */
21
+
22
+ import { Schema } from "effect"
23
+ import * as Rpc from "effect/unstable/rpc/Rpc"
24
+ import * as RpcGroup from "effect/unstable/rpc/RpcGroup"
25
+
26
+ export class IngestError extends Schema.TaggedErrorClass<IngestError>()("IngestError", {
27
+ message: Schema.String,
28
+ }) {}
29
+
30
+ export const IngestRpcs = RpcGroup.make(
31
+ Rpc.make("ingestTraces", {
32
+ payload: { payload: Schema.Unknown },
33
+ success: Schema.Struct({ insertedSpans: Schema.Number }),
34
+ error: IngestError,
35
+ }),
36
+ Rpc.make("ingestLogs", {
37
+ payload: { payload: Schema.Unknown },
38
+ success: Schema.Struct({ insertedLogs: Schema.Number }),
39
+ error: IngestError,
40
+ }),
41
+ )
@@ -0,0 +1,62 @@
1
+ /**
2
+ * Worker-thread entry point for OTLP ingest.
3
+ *
4
+ * Spawned by the main process via `new Worker(new URL("./telemetryWorker.ts", import.meta.url))`.
5
+ * This file runs inside a Bun Worker, so anything it imports is
6
+ * evaluated in a FRESH module graph on the worker side. In particular
7
+ * `TelemetryStoreWorkerLive` opens its own SQLite connection here — the main
8
+ * thread's store connection is unrelated. SQLite's WAL journal mode
9
+ * lets both connections coexist against the same `.sqlite` file: the
10
+ * worker writes, the main thread reads, and neither blocks the other.
11
+ *
12
+ * The worker only exposes `ingestTraces` / `ingestLogs` (see
13
+ * ingestRpc.ts). Query methods stay on the main thread because they're
14
+ * already fast (1-14ms) and round-tripping them through structured-
15
+ * clone would add more overhead than it saves. This is a deliberately
16
+ * narrow interface — the payoff is that main-thread HTTP queries
17
+ * never queue behind a heavy OTLP batch again.
18
+ */
19
+
20
+ import { BunRuntime } from "@effect/platform-bun"
21
+ import * as BunWorkerRunner from "@effect/platform-bun/BunWorkerRunner"
22
+ import { Effect, Layer } from "effect"
23
+ import * as RpcSerialization from "effect/unstable/rpc/RpcSerialization"
24
+ import * as RpcServer from "effect/unstable/rpc/RpcServer"
25
+ import type { OtlpLogExportRequest, OtlpTraceExportRequest } from "../otlp.ts"
26
+ import { IngestError, IngestRpcs } from "./ingestRpc.ts"
27
+ import { TelemetryStore, TelemetryStoreWorkerLive } from "./TelemetryStore.ts"
28
+
29
+ // Wire the two RPC methods to the existing TelemetryStore service.
30
+ // The store's ingest methods already carry their own Effect.fn spans,
31
+ // so the worker-side traces show up correctly attributed — the RPC
32
+ // framework also auto-spans each incoming request with method +
33
+ // payload-size attributes, giving us visibility into how ingest is
34
+ // splitting its time across the queue / wire / SQL stages.
35
+ const IngestHandlers = IngestRpcs.toLayer(
36
+ Effect.gen(function*() {
37
+ const store = yield* TelemetryStore
38
+ return {
39
+ ingestTraces: ({ payload }) =>
40
+ store.ingestTraces(payload as OtlpTraceExportRequest).pipe(
41
+ Effect.mapError((cause) => new IngestError({ message: String(cause) })),
42
+ ),
43
+ ingestLogs: ({ payload }) =>
44
+ store.ingestLogs(payload as OtlpLogExportRequest).pipe(
45
+ Effect.mapError((cause) => new IngestError({ message: String(cause) })),
46
+ ),
47
+ }
48
+ }),
49
+ )
50
+
51
+ const WorkerLive = RpcServer.layer(IngestRpcs).pipe(
52
+ Layer.provide(IngestHandlers),
53
+ Layer.provide(TelemetryStoreWorkerLive),
54
+ Layer.provide(RpcServer.layerProtocolWorkerRunner),
55
+ Layer.provide(RpcSerialization.layerMsgPack),
56
+ Layer.provide(BunWorkerRunner.layer),
57
+ )
58
+
59
+ // BunRuntime.runMain installs signal handlers so the scope closes
60
+ // cleanly on termination; the BunHttpServer layer pattern from the
61
+ // main server carries over here.
62
+ Layer.launch(WorkerLive).pipe(BunRuntime.runMain)