@kitlangton/motel 0.2.5 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. package/AGENTS.md +11 -8
  2. package/README.md +13 -2
  3. package/package.json +31 -19
  4. package/skills/motel-debug/SKILL.md +203 -0
  5. package/skills/motel-debug/references/effect.md +38 -0
  6. package/src/App.tsx +3 -5
  7. package/src/StartupGate.tsx +8 -10
  8. package/src/cli.ts +15 -16
  9. package/src/config.ts +7 -1
  10. package/src/daemon.test.ts +332 -51
  11. package/src/daemon.ts +103 -152
  12. package/src/httpApi.ts +1 -0
  13. package/src/httpListPolicy.test.ts +76 -0
  14. package/src/httpListPolicy.ts +129 -0
  15. package/src/localServer.ts +194 -323
  16. package/src/mcp.ts +2 -1
  17. package/src/opentui-jsx.d.ts +11 -0
  18. package/src/otlp.test.ts +65 -0
  19. package/src/otlp.ts +20 -0
  20. package/src/otlpProtobuf.ts +35 -0
  21. package/src/registry.ts +37 -11
  22. package/src/runtime.ts +2 -6
  23. package/src/services/AsyncIngest.ts +20 -8
  24. package/src/services/LogQueryService.ts +11 -25
  25. package/src/services/TelemetryQuery.ts +62 -0
  26. package/src/services/TelemetryStore.ts +433 -249
  27. package/src/services/TraceQueryService.ts +18 -52
  28. package/src/services/ingestRpc.ts +2 -4
  29. package/src/services/queryRpc.ts +15 -0
  30. package/src/services/telemetryQueryWorker.ts +32 -0
  31. package/src/services/telemetryWorker.ts +5 -8
  32. package/src/storybook/aiChatStory.tsx +1 -1
  33. package/src/telemetry.test.ts +307 -41
  34. package/src/ui/AiChatView.tsx +1 -1
  35. package/src/ui/AttrFilterModal.tsx +1 -1
  36. package/src/ui/ServiceLogs.tsx +10 -7
  37. package/src/ui/SpanContentView.tsx +24 -21
  38. package/src/ui/TraceDetailsPane.tsx +1 -1
  39. package/src/ui/TraceList.tsx +1 -1
  40. package/src/ui/aiState.ts +10 -22
  41. package/src/ui/app/TraceWorkspace.tsx +2 -1
  42. package/src/ui/app/useAppLayout.ts +1 -1
  43. package/src/ui/app/useTraceScreenData.ts +22 -18
  44. package/src/ui/cachedLoader.test.ts +23 -0
  45. package/src/ui/cachedLoader.ts +60 -0
  46. package/src/ui/loaders.ts +34 -53
  47. package/src/ui/primitives.tsx +1 -1
  48. package/src/ui/state.ts +2 -0
  49. package/src/ui/traceDetailsWidth.repro.test.ts +12 -1
  50. package/src/ui/traceSortNav.repro.seed.ts +1 -1
  51. package/src/ui/traceSortNav.repro.test.ts +12 -2
  52. package/src/ui/useAttrFilterPicker.ts +10 -8
  53. package/src/ui/useKeyboardNav.ts +3 -6
  54. package/src/ui/waterfallNav.repro.seed.ts +1 -1
  55. package/src/ui/waterfallNav.repro.test.ts +16 -8
  56. package/web/dist/assets/index-B01z9BaO.css +2 -0
  57. package/web/dist/assets/index-M86tcih5.js +22 -0
  58. package/web/dist/index.html +2 -2
  59. package/web/dist/assets/index-DnyVo03x.js +0 -27
  60. package/web/dist/assets/index-DzuHNBGV.css +0 -2
@@ -1,11 +1,11 @@
1
1
  import { Database } from "bun:sqlite"
2
- import { mkdirSync } from "node:fs"
2
+ import * as BunFileSystem from "@effect/platform-bun/BunFileSystem"
3
3
  import { dirname } from "node:path"
4
- import { Clock, Effect, Layer, Schedule, Context } from "effect"
4
+ import { Cause, Clock, Effect, FileSystem, Layer, Schedule, Context } from "effect"
5
5
  import { config } from "../config.js"
6
6
  import type { AiCallDetail, AiCallSummary, FacetItem, LogItem, SpanItem, StatsItem, TraceItem, TraceSummaryItem, TraceSpanEvent, TraceSpanItem } from "../domain.js"
7
7
  import { AI_ATTR_MAP, AI_FTS_KEYS, AI_TEXT_SEARCH_KEYS, truncatePreview } from "../domain.js"
8
- import { attributeMap, nanosToMilliseconds, parseAnyValue, spanKindLabel, spanStatusLabel, stringifyValue, type OtlpLogExportRequest, type OtlpTraceExportRequest } from "../otlp.js"
8
+ import { attributeMap, nanosToMilliseconds, normalizeOtlpBinaryId, parseAnyValue, spanKindLabel, spanStatusLabel, stringifyValue, type OtlpLogExportRequest, type OtlpTraceExportRequest } from "../otlp.js"
9
9
 
10
10
  const isSqliteLockError = (error: unknown) =>
11
11
  error instanceof Error && /(database is locked|database table is locked|SQLITE_BUSY)/i.test(error.message)
@@ -196,18 +196,40 @@ const TRACE_SUMMARY_SELECT_SQL = `
196
196
  FROM spans
197
197
  `
198
198
 
199
+ // Memoize small repeated JSON records. Resource attributes are the primary
200
+ // beneficiary because many spans share the same serialized value; compact
201
+ // repeated span attributes also benefit while large unique payloads bypass
202
+ // the cache to keep memory bounded for long-running daemons.
203
+ const RECORD_PARSE_CACHE_MAX_VALUE_LEN = 1024
204
+ const RECORD_PARSE_CACHE_LIMIT = 256
205
+ const recordParseCache = new Map<string, Record<string, string>>()
206
+ const EMPTY_RECORD: Record<string, string> = {}
207
+
199
208
  const parseRecord = (value: string): Record<string, string> => {
209
+ if (value === "" || value === "{}") return EMPTY_RECORD
210
+ const cacheable = value.length <= RECORD_PARSE_CACHE_MAX_VALUE_LEN
211
+ if (cacheable) {
212
+ const cached = recordParseCache.get(value)
213
+ if (cached !== undefined) return cached
214
+ }
215
+ let parsed: Record<string, string>
200
216
  try {
201
- const parsed = JSON.parse(value) as Record<string, unknown>
202
- return Object.fromEntries(Object.entries(parsed).map(([key, entry]) => [key, stringifyValue(entry)]))
217
+ const json = JSON.parse(value) as Record<string, unknown>
218
+ parsed = Object.fromEntries(Object.entries(json).map(([key, entry]) => [key, stringifyValue(entry)]))
203
219
  } catch {
204
- return {}
220
+ parsed = EMPTY_RECORD
221
+ }
222
+ if (cacheable && recordParseCache.size < RECORD_PARSE_CACHE_LIMIT) {
223
+ recordParseCache.set(value, parsed)
205
224
  }
225
+ return parsed
206
226
  }
207
227
 
208
228
  const parseEvents = (value: string): readonly TraceSpanEvent[] => {
229
+ if (value === "" || value === "[]") return []
209
230
  try {
210
231
  const parsed = JSON.parse(value) as Array<{ name: string; timestamp: number; attributes: Record<string, string> }>
232
+ if (parsed.length === 0) return []
211
233
  return parsed.map((event) => ({
212
234
  name: event.name,
213
235
  timestamp: new Date(event.timestamp),
@@ -423,29 +445,41 @@ const buildContainsAttributeMatchSubquery = (
423
445
  }
424
446
  }
425
447
 
448
+ // Read-only surface of the telemetry store. Pulled out so a readonly
449
+ // SQLite connection (TUI / HTTP query handlers) can be expressed as a
450
+ // distinct service identifier from the writer, without re-declaring
451
+ // every query in a wrapper layer. The writer's value still satisfies
452
+ // this shape — TelemetryStoreLive can provide both identifiers from
453
+ // one underlying object if needed.
454
+ export interface TelemetryStoreReader {
455
+ readonly listServices: Effect.Effect<readonly string[], Error>
456
+ readonly listRecentTraces: (serviceName: string | null, options?: { readonly lookbackMinutes?: number; readonly limit?: number; readonly cursorStartedAtMs?: number; readonly cursorTraceId?: string }) => Effect.Effect<readonly TraceItem[], Error>
457
+ readonly listTraceSummaries: (serviceName: string | null, options?: { readonly lookbackMinutes?: number; readonly limit?: number; readonly cursorStartedAtMs?: number; readonly cursorTraceId?: string }) => Effect.Effect<readonly TraceSummaryItem[], Error>
458
+ readonly searchTraces: (input: TraceSearch) => Effect.Effect<readonly TraceItem[], Error>
459
+ readonly searchTraceSummaries: (input: TraceSearch) => Effect.Effect<readonly TraceSummaryItem[], Error>
460
+ readonly traceStats: (input: TraceStatsSearch) => Effect.Effect<readonly StatsItem[], Error>
461
+ readonly getTrace: (traceId: string) => Effect.Effect<TraceItem | null, Error>
462
+ readonly getSpan: (spanId: string) => Effect.Effect<SpanItem | null, Error>
463
+ readonly listTraceSpans: (traceId: string) => Effect.Effect<readonly SpanItem[], Error>
464
+ readonly searchSpans: (input: SpanSearch) => Effect.Effect<readonly SpanItem[], Error>
465
+ readonly searchLogs: (input: LogSearch) => Effect.Effect<readonly LogItem[], Error>
466
+ readonly logStats: (input: LogStatsSearch) => Effect.Effect<readonly StatsItem[], Error>
467
+ readonly listFacets: (input: FacetSearch) => Effect.Effect<readonly FacetItem[], Error>
468
+ readonly listRecentLogs: (serviceName: string) => Effect.Effect<readonly LogItem[], Error>
469
+ readonly listTraceLogs: (traceId: string) => Effect.Effect<readonly LogItem[], Error>
470
+ readonly searchAiCalls: (input: AiCallSearch) => Effect.Effect<readonly AiCallSummary[], Error>
471
+ readonly getAiCall: (spanId: string) => Effect.Effect<AiCallDetail | null, Error>
472
+ readonly aiCallStats: (input: AiCallStatsSearch) => Effect.Effect<readonly StatsItem[], Error>
473
+ }
474
+
475
+ export class TelemetryStoreReadonly extends Context.Service<TelemetryStoreReadonly, TelemetryStoreReader>()("motel/TelemetryStoreReadonly") {}
476
+
426
477
  export class TelemetryStore extends Context.Service<
427
478
  TelemetryStore,
428
- {
479
+ TelemetryStoreReader & {
429
480
  readonly ingestTraces: (payload: OtlpTraceExportRequest) => Effect.Effect<{ readonly insertedSpans: number }, Error>
430
481
  readonly ingestLogs: (payload: OtlpLogExportRequest) => Effect.Effect<{ readonly insertedLogs: number }, Error>
431
- readonly listServices: Effect.Effect<readonly string[], Error>
432
- readonly listRecentTraces: (serviceName: string | null, options?: { readonly lookbackMinutes?: number; readonly limit?: number; readonly cursorStartedAtMs?: number; readonly cursorTraceId?: string }) => Effect.Effect<readonly TraceItem[], Error>
433
- readonly listTraceSummaries: (serviceName: string | null, options?: { readonly lookbackMinutes?: number; readonly limit?: number; readonly cursorStartedAtMs?: number; readonly cursorTraceId?: string }) => Effect.Effect<readonly TraceSummaryItem[], Error>
434
- readonly searchTraces: (input: TraceSearch) => Effect.Effect<readonly TraceItem[], Error>
435
- readonly searchTraceSummaries: (input: TraceSearch) => Effect.Effect<readonly TraceSummaryItem[], Error>
436
- readonly traceStats: (input: TraceStatsSearch) => Effect.Effect<readonly StatsItem[], Error>
437
- readonly getTrace: (traceId: string) => Effect.Effect<TraceItem | null, Error>
438
- readonly getSpan: (spanId: string) => Effect.Effect<SpanItem | null, Error>
439
- readonly listTraceSpans: (traceId: string) => Effect.Effect<readonly SpanItem[], Error>
440
- readonly searchSpans: (input: SpanSearch) => Effect.Effect<readonly SpanItem[], Error>
441
- readonly searchLogs: (input: LogSearch) => Effect.Effect<readonly LogItem[], Error>
442
- readonly logStats: (input: LogStatsSearch) => Effect.Effect<readonly StatsItem[], Error>
443
- readonly listFacets: (input: FacetSearch) => Effect.Effect<readonly FacetItem[], Error>
444
- readonly listRecentLogs: (serviceName: string) => Effect.Effect<readonly LogItem[], Error>
445
- readonly listTraceLogs: (traceId: string) => Effect.Effect<readonly LogItem[], Error>
446
- readonly searchAiCalls: (input: AiCallSearch) => Effect.Effect<readonly AiCallSummary[], Error>
447
- readonly getAiCall: (spanId: string) => Effect.Effect<AiCallDetail | null, Error>
448
- readonly aiCallStats: (input: AiCallStatsSearch) => Effect.Effect<readonly StatsItem[], Error>
482
+ readonly runRetentionNow: Effect.Effect<void, Error>
449
483
  }
450
484
  >()("motel/TelemetryStore") {}
451
485
 
@@ -462,18 +496,17 @@ export class TelemetryStore extends Context.Service<
462
496
  *
463
497
  * - `runRetention` — fork the background cleanup loop (age + size cap
464
498
  * eviction, WAL checkpoint). Only one process should own this at a
465
- * time. Currently the main daemon (localServer) does; the ingest
466
- * worker and the TUI skip it.
499
+ * time. The ingest worker owns it; the HTTP thread and TUI skip it.
467
500
  */
468
501
  export interface TelemetryStoreOptions {
469
502
  readonly readonly: boolean
470
503
  readonly runRetention: boolean
471
504
  }
472
505
 
473
- export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.effect(
474
- TelemetryStore,
506
+ const makeTelemetryStoreEffect = (opts: TelemetryStoreOptions) =>
475
507
  Effect.gen(function* () {
476
- mkdirSync(dirname(config.otel.databasePath), { recursive: true })
508
+ const fileSystem = yield* FileSystem.FileSystem
509
+ yield* fileSystem.makeDirectory(dirname(config.otel.databasePath), { recursive: true })
477
510
  const db = yield* Effect.acquireRelease(
478
511
  Effect.sync(() => new Database(config.otel.databasePath, {
479
512
  create: !opts.readonly,
@@ -517,6 +550,13 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
517
550
  -- SQLite silently caps at actual file size for smaller DBs.
518
551
  PRAGMA mmap_size = 268435456;
519
552
  `)
553
+ // auto_vacuum is a header-level setting: it only takes effect on
554
+ // an empty DB, or on the next VACUUM after a change. Setting it
555
+ // here, BEFORE the first CREATE TABLE, is the only path that
556
+ // makes incremental_vacuum work without a full VACUUM.
557
+ // Existing DBs that predate this setting keep their current mode;
558
+ // Motel never performs a surprise full-file VACUUM at startup.
559
+ try { db.exec(`PRAGMA auto_vacuum = INCREMENTAL;`) } catch { /* ignore */ }
520
560
  try {
521
561
  db.exec(`
522
562
  PRAGMA journal_mode = WAL;
@@ -526,6 +566,13 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
526
566
  -- this the WAL happily runs into the hundreds of MB and queries
527
567
  -- start paying the cost of walking the WAL on every read.
528
568
  PRAGMA wal_autocheckpoint = 4000;
569
+ -- Hard floor for the WAL file. Auto-checkpoint controls *when*
570
+ -- pages move out of the WAL; size_limit controls how much the
571
+ -- WAL file is allowed to grow on disk. 128MB is generous enough
572
+ -- to absorb a long write burst without blocking on truncation,
573
+ -- tight enough that a wedged retention loop can't hide a 20GB
574
+ -- WAL the way a default no-limit configuration can.
575
+ PRAGMA journal_size_limit = 134217728;
529
576
 
530
577
  CREATE TABLE IF NOT EXISTS spans (
531
578
  trace_id TEXT NOT NULL,
@@ -604,6 +651,11 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
604
651
 
605
652
  CREATE INDEX IF NOT EXISTS idx_log_attributes_key_value ON log_attributes(key, value, log_id);
606
653
  CREATE INDEX IF NOT EXISTS idx_log_attributes_log_id ON log_attributes(log_id);
654
+
655
+ CREATE TABLE IF NOT EXISTS motel_maintenance (
656
+ key TEXT PRIMARY KEY,
657
+ value TEXT NOT NULL
658
+ );
607
659
  `)
608
660
  } catch (err) {
609
661
  if (!isSqliteLockError(err)) throw err
@@ -624,7 +676,8 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
624
676
  } catch { hasFts = false }
625
677
  try {
626
678
  const row = db.query(`SELECT name FROM sqlite_master WHERE type='table' AND name='span_attr_fts'`).get()
627
- hasAttrFts = row !== null
679
+ const backfill = db.query(`SELECT value FROM motel_maintenance WHERE key = 'span_attr_fts_v1'`).get() as { value: string } | null
680
+ hasAttrFts = row !== null && backfill?.value === "complete"
628
681
  } catch { hasAttrFts = false }
629
682
  }
630
683
 
@@ -723,10 +776,6 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
723
776
  // pay 3-4s on cold open instead of 400ms.
724
777
  try {
725
778
  db.exec(`PRAGMA analysis_limit = 1000; PRAGMA optimize;`)
726
- // First-time databases won't have sqlite_stat1 until we run a
727
- // real ANALYZE. Force it once if stats haven't been collected.
728
- const hasStats = db.query(`SELECT 1 FROM sqlite_master WHERE name = 'sqlite_stat1' LIMIT 1`).get() !== null
729
- if (!hasStats) db.exec(`ANALYZE;`)
730
779
  } catch {
731
780
  // ANALYZE / optimize failures are never fatal — queries still work,
732
781
  // they just run with default row estimates.
@@ -777,22 +826,19 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
777
826
  )
778
827
  `)
779
828
 
780
- const rebuildTraceSummaries = db.query(`
781
- INSERT INTO trace_summaries (
782
- trace_id, service_name, root_operation_name, started_at_ms, ended_at_ms, active_span_count, duration_ms, span_count, error_count
783
- )
784
- ${TRACE_SUMMARY_SELECT_SQL}
785
- GROUP BY trace_id
786
- `)
787
-
788
829
  const reconcileTraceSummaries = Effect.sync(() => {
789
- try {
790
- db.query(`DELETE FROM trace_summaries`).run()
791
- rebuildTraceSummaries.run()
792
- } catch (err) {
793
- if (!isSqliteLockError(err)) throw err
794
- console.warn(`motel: trace summary rebuild skipped during startup: ${(err as Error).message}`)
830
+ const marker = db.query(`SELECT value FROM motel_maintenance WHERE key = 'trace_summary_cursor'`).get() as { value: string } | null
831
+ const cursor = Number(marker?.value ?? 0)
832
+ const rows = db.query(`SELECT rowid, trace_id FROM spans WHERE rowid > ? ORDER BY rowid ASC LIMIT ?`).all(cursor, config.otel.retentionTraceBatch) as Array<{ rowid: number; trace_id: string }>
833
+ if (rows.length === 0) {
834
+ db.query(`INSERT OR REPLACE INTO motel_maintenance(key, value) VALUES ('trace_summary_cursor', '0')`).run()
835
+ return
795
836
  }
837
+ const transaction = db.transaction(() => {
838
+ for (const traceId of new Set(rows.map((row) => row.trace_id))) upsertTraceSummary.run(traceId)
839
+ db.query(`INSERT OR REPLACE INTO motel_maintenance(key, value) VALUES ('trace_summary_cursor', ?)`).run(String(rows.at(-1)!.rowid))
840
+ })
841
+ transaction()
796
842
  })
797
843
 
798
844
  const deleteSpanAttributes = db.query(`DELETE FROM span_attributes WHERE trace_id = ? AND span_id = ?`)
@@ -876,6 +922,61 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
876
922
 
877
923
  const maxDbSizeBytes = config.otel.maxDbSizeMb * 1024 * 1024
878
924
 
925
+ // Freelist-ratio thresholds for the adaptive reclaim loop. Below the
926
+ // LOW threshold there's nothing worth doing; above HIGH we are in the
927
+ // 17GB-DB-with-10GB-freelist failure mode and need to reclaim aggressively
928
+ // even if it costs writer-lock time.
929
+ const FREELIST_LOW_RATIO = 0.05
930
+ const FREELIST_MID_RATIO = 0.20
931
+ const FREELIST_HIGH_RATIO = 0.50
932
+ const VACUUM_PAGES_NORMAL = 2000 // ~8MB/pass
933
+ const VACUUM_PAGES_BUSY = 20000 // ~80MB/pass — used when freelist > 20%
934
+ const VACUUM_PAGES_PANIC = 50000 // ~200MB/pass — only when ratio > 50%
935
+
936
+ const ftsTableNames = ["span_attr_fts", "log_body_fts", "span_operation_fts"] as const
937
+
938
+ const incrementalFtsMerge = (pages: number) => {
939
+ // FTS5 segment merges drop tombstone rows that DELETE leaves behind.
940
+ // Without periodic merges, deleted FTS rows stay on disk indefinitely
941
+ // — a major source of freelist pages on a heavy-deletion workload.
942
+ // `merge=N` is a bounded, online operation: it merges at most N
943
+ // pages of work and returns. The INSERT throws when a table is
944
+ // missing; we swallow the error because not every DB has every FTS table.
945
+ for (const name of ftsTableNames) {
946
+ try { db.query(`INSERT INTO ${name}(${name}) VALUES (?)`).run(`merge=${pages}`) } catch { /* table absent or older schema */ }
947
+ }
948
+ }
949
+
950
+ const reclaimSpace = Effect.fn("motel/TelemetryStore.reclaimSpace")(function* () {
951
+ yield* Effect.sync(() => {
952
+ const pageCount = (db.query(`PRAGMA page_count`).get() as { page_count: number }).page_count
953
+ const freePages = (db.query(`PRAGMA freelist_count`).get() as { freelist_count: number }).freelist_count
954
+ if (pageCount === 0) return
955
+ const ratio = freePages / pageCount
956
+ if (ratio < FREELIST_LOW_RATIO) return
957
+
958
+ // Adaptive vacuum sizing — fixed 2000 pages/min could not keep
959
+ // up with sustained deletions, leaking 10GB of freelist over
960
+ // time. Scale the per-pass work to the size of the backlog so
961
+ // we stay roughly proportional to the deficit.
962
+ const pages =
963
+ ratio >= FREELIST_HIGH_RATIO ? VACUUM_PAGES_PANIC :
964
+ ratio >= FREELIST_MID_RATIO ? VACUUM_PAGES_BUSY :
965
+ VACUUM_PAGES_NORMAL
966
+
967
+ try { db.exec(`PRAGMA incremental_vacuum(${pages});`) } catch { /* ignore */ }
968
+
969
+ // In WAL mode incremental_vacuum only moves pages — the file
970
+ // shrinks on the next checkpoint. PASSIVE silently skips when
971
+ // readers are active (the failure mode the agent's research
972
+ // flagged: checkpoint starvation). Use RESTART normally and
973
+ // TRUNCATE in panic mode to physically shrink the WAL when it
974
+ // has grown.
975
+ const mode = ratio >= FREELIST_HIGH_RATIO ? "TRUNCATE" : "RESTART"
976
+ try { db.exec(`PRAGMA wal_checkpoint(${mode});`) } catch { /* ignore */ }
977
+ })
978
+ })
979
+
879
980
  const cleanupExpired = Effect.fn("motel/TelemetryStore.cleanupExpired")(function* () {
880
981
  const now = yield* Clock.currentTimeMillis
881
982
 
@@ -891,12 +992,12 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
891
992
 
892
993
  // Time-based: completed traces whose last span ended before cutoff.
893
994
  const timeExpired = db.query(
894
- `SELECT trace_id FROM trace_summaries WHERE active_span_count = 0 AND ended_at_ms > 0 AND ended_at_ms < ?`,
895
- ).all(cutoff) as readonly { trace_id: string }[]
995
+ `SELECT trace_id FROM trace_summaries WHERE active_span_count = 0 AND ended_at_ms > 0 AND ended_at_ms < ? ORDER BY ended_at_ms ASC LIMIT ?`,
996
+ ).all(cutoff, config.otel.retentionTraceBatch) as readonly { trace_id: string }[]
896
997
  for (const row of timeExpired) toEvict.add(row.trace_id)
897
998
 
898
- // Size-based: if actual data exceeds cap, drop oldest 20% of the
899
- // remaining completed traces. `(page_count - freelist_count)`
999
+ // Size-based: if actual data exceeds the target, drop one bounded
1000
+ // batch of the oldest completed traces. `(page_count - freelist_count)`
900
1001
  // ignores freed-but-not-vacuumed pages so a large freelist doesn't
901
1002
  // trigger a deletion death spiral.
902
1003
  const pageCount = (db.query(`PRAGMA page_count`).get() as { page_count: number }).page_count
@@ -904,22 +1005,22 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
904
1005
  const pageSize = (db.query(`PRAGMA page_size`).get() as { page_size: number }).page_size
905
1006
  const dbSize = (pageCount - freePages) * pageSize
906
1007
  if (dbSize > maxDbSizeBytes) {
907
- const completedCount = (db.query(
908
- `SELECT COUNT(*) AS c FROM trace_summaries WHERE active_span_count = 0`,
909
- ).get() as { c: number }).c
910
- const traceCutCount = Math.max(1, Math.floor(completedCount * 0.2))
911
1008
  const oldest = db.query(
912
1009
  `SELECT trace_id FROM trace_summaries WHERE active_span_count = 0 ORDER BY started_at_ms ASC LIMIT ?`,
913
- ).all(traceCutCount) as readonly { trace_id: string }[]
1010
+ ).all(config.otel.retentionTraceBatch) as readonly { trace_id: string }[]
914
1011
  // Set.add dedupes overlap with the time-expired batch above.
915
1012
  for (const row of oldest) toEvict.add(row.trace_id)
916
1013
  }
917
1014
 
918
- // Always prune orphan logs (no trace_id) by timestamp — they're
919
- // not covered by trace eviction.
920
- db.query(`DELETE FROM logs WHERE trace_id IS NULL AND timestamp_ms < ?`).run(cutoff)
921
-
922
- if (toEvict.size === 0) return
1015
+ // Logs have their own retention boundary. A correlated log may refer
1016
+ // to a trace that was sampled elsewhere or never reached Motel, so
1017
+ // tying log eviction to trace_summaries lets those rows grow forever.
1018
+ const expiredLogs = db.query(`DELETE FROM logs WHERE id IN (SELECT id FROM logs WHERE timestamp_ms < ? ORDER BY timestamp_ms ASC LIMIT ?)`).run(cutoff, config.otel.retentionLogBatch)
1019
+ let deletedLogs = Number(expiredLogs.changes) > 0
1020
+ if (dbSize > maxDbSizeBytes) {
1021
+ const oversizedLogs = db.query(`DELETE FROM logs WHERE id IN (SELECT id FROM logs ORDER BY timestamp_ms ASC LIMIT ?)`).run(config.otel.retentionLogBatch)
1022
+ deletedLogs = deletedLogs || Number(oversizedLogs.changes) > 0
1023
+ }
923
1024
 
924
1025
  // Batch the trace-id list so the IN placeholders stay under
925
1026
  // SQLite's default limit (~999). Each batch wipes every row
@@ -942,48 +1043,54 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
942
1043
 
943
1044
  // Log-side orphans (log_attributes + FTS) are keyed by log.id,
944
1045
  // so prune what no longer has a parent log row.
945
- db.query(`DELETE FROM log_attributes WHERE NOT EXISTS (SELECT 1 FROM logs WHERE logs.id = log_attributes.log_id)`).run()
1046
+ const orphanAttributes = db.query(`DELETE FROM log_attributes WHERE rowid IN (SELECT log_attributes.rowid FROM log_attributes WHERE NOT EXISTS (SELECT 1 FROM logs WHERE logs.id = log_attributes.log_id) LIMIT ?)`).run(config.otel.retentionLogBatch)
1047
+ let deletedOrphans = Number(orphanAttributes.changes) > 0
946
1048
  try {
947
- db.query(`DELETE FROM log_body_fts WHERE NOT EXISTS (SELECT 1 FROM logs WHERE logs.id = CAST(log_body_fts.log_id AS INTEGER))`).run()
1049
+ const orphanFts = db.query(`DELETE FROM log_body_fts WHERE rowid IN (SELECT rowid FROM log_body_fts WHERE NOT EXISTS (SELECT 1 FROM logs WHERE logs.id = CAST(log_body_fts.log_id AS INTEGER)) LIMIT ?)`).run(config.otel.retentionLogBatch)
1050
+ deletedOrphans = deletedOrphans || Number(orphanFts.changes) > 0
948
1051
  } catch {
949
1052
  // FTS table may not exist on old DBs.
950
1053
  }
951
1054
 
952
- // Truncate the WAL after a big delete pass. Without this the
953
- // WAL keeps growing (observed: 640MB) because wal_autocheckpoint
954
- // only triggers when WAL pages exceed the threshold during
955
- // writes — a retention pass that evicts millions of rows can
956
- // blow far past that before the auto-checkpoint fires. Using
957
- // PASSIVE so active readers aren't interrupted; if the WAL
958
- // can't be fully reclaimed right now, we'll try again next
959
- // cycle.
960
- try { db.exec(`PRAGMA wal_checkpoint(PASSIVE);`) } catch { /* ignore */ }
961
-
962
- // Incremental vacuum reclaims some of the freed pages back
963
- // to the OS so the file size actually shrinks over time
964
- // instead of just growing the freelist. Bounded to 2000
965
- // pages per pass (≈8MB) to avoid a long-running transaction.
966
- try { db.exec(`PRAGMA incremental_vacuum(2000);`) } catch { /* ignore */ }
1055
+ // Checkpoint after a big delete pass so the freed pages land
1056
+ // in the main DB file and become eligible for incremental
1057
+ // vacuum. Use RESTART (not PASSIVE): PASSIVE silently no-ops
1058
+ // when readers are active, which is the documented mechanism
1059
+ // behind WAL/freelist starvation when ingest is busy.
1060
+ if (toEvict.size === 0 && !deletedLogs && !deletedOrphans) return
1061
+ try { db.exec(`PRAGMA wal_checkpoint(RESTART);`) } catch { /* ignore */ }
1062
+
1063
+ // Incremental FTS5 merge — DELETE on an FTS5-indexed row
1064
+ // leaves a tombstone in the segment tree that only `merge`
1065
+ // reclaims. Skipping this is the second compounding cause
1066
+ // (after fixed-size vacuum) of the slow freelist accretion
1067
+ // that took the DB to 17GB. 100 pages of merge work per
1068
+ // retention tick is bounded and runs in milliseconds.
1069
+ incrementalFtsMerge(100)
1070
+
1071
+ // Actual page reclamation lives in `reclaimSpace`, which
1072
+ // runs on its own faster cadence so the file shrinks even
1073
+ // when no traces are evicted in a given retention tick (e.g.
1074
+ // after a large historical eviction has already happened).
967
1075
  })
968
1076
  })
969
1077
 
970
- // Retention only runs in processes that opt in (currently the main
971
- // daemon). The ingest worker and TUI skip it to avoid two writers
972
- // competing for the write lock with overlapping DELETE passes.
1078
+ // Retention only runs in the ingest worker so maintenance never blocks
1079
+ // the HTTP event loop and no second writer duplicates cleanup work.
973
1080
  if (opts.runRetention) {
974
- // Reconcile any summary drift from interrupted ingests, but do it
975
- // after the server becomes healthy. Running this synchronously at
976
- // open can sit behind another writer's lock for ~15s and make the
977
- // daemon look hung even though the port is already bound.
978
- yield* Effect.forkScoped(reconcileTraceSummaries)
979
-
980
- // Enable incremental vacuum so retention can reclaim freed
981
- // pages over time instead of needing a stop-the-world VACUUM.
982
- // Idempotent: repeat calls after the first are no-ops.
983
- try { db.exec(`PRAGMA auto_vacuum = INCREMENTAL;`) } catch { /* ignore */ }
984
-
985
- // Run cleanup every 60 seconds in the background, tied to the layer's scope
986
- yield* Effect.forkScoped(Effect.repeat(cleanupExpired(), Schedule.spaced("60 seconds")))
1081
+ // Cleanup runs on the telemetry worker, never the HTTP event loop.
1082
+ yield* Effect.forkScoped(Effect.repeat(
1083
+ Effect.andThen(reconcileTraceSummaries, cleanupExpired()).pipe(Effect.catchCause((cause) => Effect.logWarning(`motel: maintenance pass failed: ${Cause.pretty(cause)}`))),
1084
+ Schedule.spaced(`${config.otel.retentionIntervalSeconds} seconds`),
1085
+ ))
1086
+
1087
+ // Page reclamation runs on a separate, faster cadence (10s) and
1088
+ // is independent of the eviction loop. The reason: a single sweep
1089
+ // at 60s intervals can move only ~8MB of pages before the next
1090
+ // burst of inserts grows the freelist again. Decoupling lets us
1091
+ // catch up adaptively (see VACUUM_PAGES_BUSY/PANIC) without
1092
+ // changing the cost of the heavier delete sweep.
1093
+ yield* Effect.forkScoped(Effect.repeat(reclaimSpace(), Schedule.spaced("10 seconds")))
987
1094
 
988
1095
  // Periodically refresh query planner stats. `PRAGMA optimize` is a
989
1096
  // no-op when nothing has changed, so this is essentially free on idle
@@ -996,35 +1103,48 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
996
1103
  yield* Effect.forkScoped(Effect.repeat(refreshPlannerStats, Schedule.spaced("15 minutes")))
997
1104
  }
998
1105
 
999
- // One-time backfill for existing DBs: if span_attr_fts is empty but
1000
- // span_attributes has rows with AI_FTS_KEYS, populate the index.
1001
- // Runs forked so server startup isn't blocked; queries hitting the
1002
- // FTS will just return empty until the fill lands. On a 2 GB DB with
1003
- // ~400 matching rows this takes ~3-8 seconds. Writer-only because
1004
- // it does INSERT INTO ... — readonly connections would error.
1106
+ // Incrementally rebuild historical AI attributes in bounded batches.
1107
+ // Queries fall back to LIKE until the persistent marker is complete.
1005
1108
  if (hasAttrFts && !opts.readonly) {
1006
- const backfillAttrFts = Effect.sync(() => {
1109
+ const backfillAttrFtsBatch = Effect.sync(() => {
1007
1110
  try {
1008
- const ftsCount = (db.query(`SELECT COUNT(*) AS c FROM span_attr_fts`).get() as { c: number }).c
1009
- if (ftsCount > 0) return
1010
1111
  const keyList = AI_FTS_KEYS.map((k) => `'${k.replace(/'/g, "''")}'`).join(", ")
1011
- const attrCount = (db.query(
1012
- `SELECT COUNT(*) AS c FROM span_attributes WHERE key IN (${keyList})`,
1013
- ).get() as { c: number }).c
1014
- if (attrCount === 0) return
1015
- // Single INSERT..SELECT is atomic and fast; FTS5 batches
1016
- // its internal segment writes. No transaction wrapper
1017
- // needed — it runs as one statement.
1018
- db.exec(`
1019
- INSERT INTO span_attr_fts(rowid, value)
1020
- SELECT rowid, value FROM span_attributes WHERE key IN (${keyList})
1021
- `)
1112
+ const marker = db.query(`SELECT value FROM motel_maintenance WHERE key = 'span_attr_fts_v1'`).get() as { value: string } | null
1113
+ if (marker?.value === "complete") return false
1114
+ let cursor = 0
1115
+ let maxRowId = 0
1116
+ if (marker) {
1117
+ [cursor, maxRowId] = marker.value.split(":").map(Number)
1118
+ } else {
1119
+ maxRowId = (db.query(`SELECT COALESCE(MAX(rowid), 0) AS value FROM span_attributes`).get() as { value: number }).value
1120
+ db.query(`INSERT INTO span_attr_fts(span_attr_fts) VALUES ('delete-all')`).run()
1121
+ db.query(`INSERT OR REPLACE INTO motel_maintenance(key, value) VALUES ('span_attr_fts_v1', ?)`).run(`0:${maxRowId}`)
1122
+ }
1123
+ const rows = db.query(`SELECT rowid, value FROM span_attributes WHERE key IN (${keyList}) AND rowid > ? AND rowid <= ? ORDER BY rowid ASC LIMIT 500`).all(cursor, maxRowId) as Array<{ rowid: number; value: string }>
1124
+ if (rows.length === 0) {
1125
+ db.query(`UPDATE motel_maintenance SET value = 'complete' WHERE key = 'span_attr_fts_v1'`).run()
1126
+ hasAttrFts = true
1127
+ return false
1128
+ }
1129
+ const insert = db.query(`INSERT INTO span_attr_fts(rowid, value) VALUES (?, ?)`)
1130
+ const transaction = db.transaction(() => {
1131
+ for (const row of rows) insert.run(row.rowid, row.value)
1132
+ db.query(`UPDATE motel_maintenance SET value = ? WHERE key = 'span_attr_fts_v1'`).run(`${rows.at(-1)!.rowid}:${maxRowId}`)
1133
+ })
1134
+ transaction()
1135
+ return true
1022
1136
  } catch {
1023
1137
  // Backfill failure is never fatal — new ingests still
1024
1138
  // populate FTS via the trigger, and queries fall back to
1025
1139
  // LIKE when FTS lookups return empty.
1140
+ return true
1026
1141
  }
1027
1142
  })
1143
+ const backfillAttrFts: Effect.Effect<void> = Effect.suspend(() =>
1144
+ Effect.flatMap(backfillAttrFtsBatch, (pending) =>
1145
+ pending ? Effect.andThen(Effect.sleep("100 millis"), backfillAttrFts) : Effect.void,
1146
+ ),
1147
+ )
1028
1148
  yield* Effect.forkScoped(backfillAttrFts)
1029
1149
  }
1030
1150
 
@@ -1042,6 +1162,10 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
1042
1162
  const scopeName = scopeSpans.scope?.name ?? null
1043
1163
 
1044
1164
  for (const span of scopeSpans.spans ?? []) {
1165
+ const traceId = normalizeOtlpBinaryId(span.traceId, 16)
1166
+ const spanId = normalizeOtlpBinaryId(span.spanId, 8)
1167
+ if (!traceId || !spanId) continue
1168
+ const parentSpanId = normalizeOtlpBinaryId(span.parentSpanId, 8)
1045
1169
  const spanAttributes = attributeMap(span.attributes)
1046
1170
  const mergedAttributes = { ...resourceAttributes, ...spanAttributes }
1047
1171
  const startTimeMs = nanosToMilliseconds(span.startTimeUnixNano)
@@ -1053,9 +1177,9 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
1053
1177
  }))
1054
1178
 
1055
1179
  insertSpan.run(
1056
- span.traceId,
1057
- span.spanId,
1058
- span.parentSpanId ?? null,
1180
+ traceId,
1181
+ spanId,
1182
+ parentSpanId,
1059
1183
  serviceName,
1060
1184
  scopeName,
1061
1185
  span.name ?? "unknown",
@@ -1068,10 +1192,10 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
1068
1192
  JSON.stringify(resourceAttributes),
1069
1193
  JSON.stringify(events),
1070
1194
  )
1071
- deleteSpanAttributes.run(span.traceId, span.spanId)
1072
- insertSpanAttributesMany(span.traceId, span.spanId, mergedAttributes)
1073
- touchedOperations.push([span.traceId, span.spanId, span.name ?? "unknown"])
1074
- touchedTraceIds.add(span.traceId)
1195
+ deleteSpanAttributes.run(traceId, spanId)
1196
+ insertSpanAttributesMany(traceId, spanId, mergedAttributes)
1197
+ touchedOperations.push([traceId, spanId, span.name ?? "unknown"])
1198
+ touchedTraceIds.add(traceId)
1075
1199
  insertedSpans += 1
1076
1200
  }
1077
1201
  }
@@ -1111,9 +1235,11 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
1111
1235
  const mergedAttributes = { ...resourceAttributes, ...attributes }
1112
1236
  const timestampMs = nanosToMilliseconds(record.timeUnixNano ?? record.observedTimeUnixNano)
1113
1237
  const body = stringifyValue(parseAnyValue(record.body))
1238
+ const rawTraceId = attributes.traceId || attributes.trace_id || record.traceId || null
1239
+ const rawSpanId = attributes.spanId || attributes.span_id || record.spanId || null
1114
1240
  const result = insertLog.run(
1115
- attributes.traceId || attributes.trace_id || record.traceId || null,
1116
- attributes.spanId || attributes.span_id || record.spanId || null,
1241
+ normalizeOtlpBinaryId(rawTraceId, 16),
1242
+ normalizeOtlpBinaryId(rawSpanId, 8),
1117
1243
  serviceName,
1118
1244
  scopeName,
1119
1245
  record.severityText ?? "INFO",
@@ -1145,9 +1271,11 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
1145
1271
  })
1146
1272
 
1147
1273
  const listServices = Effect.fn("motel/TelemetryStore.listServices")(function* () {
1148
-
1149
1274
  const cutoff = (yield* Clock.currentTimeMillis) - config.otel.traceLookbackMinutes * 60 * 1000
1150
- return yield* Effect.sync(() => {
1275
+ const services = yield* Effect.sync(() => {
1276
+ // Discover recent activity from span rows, not trace starts: a
1277
+ // long-running trace can emit a current child after its root ages
1278
+ // outside the lookback window.
1151
1279
  const rows = db.query(`
1152
1280
  SELECT service_name FROM spans WHERE start_time_ms >= ?
1153
1281
  UNION
@@ -1156,6 +1284,8 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
1156
1284
  `).all(cutoff, cutoff) as Array<{ service_name: string }>
1157
1285
  return rows.map((row) => row.service_name)
1158
1286
  })
1287
+ yield* Effect.annotateCurrentSpan("trace.service_count", services.length)
1288
+ return services
1159
1289
  })()
1160
1290
 
1161
1291
  const loadTracesByIds = (traceIds: readonly string[]) => {
@@ -1181,15 +1311,19 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
1181
1311
  }
1182
1312
 
1183
1313
  const listRecentTraces = Effect.fn("motel/TelemetryStore.listRecentTraces")(function* (serviceName: string | null, options?: { readonly lookbackMinutes?: number; readonly limit?: number }) {
1314
+ yield* Effect.annotateCurrentSpan("trace.service_name", serviceName ?? "all")
1184
1315
  const summaries = yield* listTraceSummaries(serviceName, options)
1185
- return yield* Effect.sync(() => loadTracesByIds(summaries.map((summary) => summary.traceId)))
1316
+ const traces = yield* Effect.sync(() => loadTracesByIds(summaries.map((summary) => summary.traceId)))
1317
+ yield* Effect.annotateCurrentSpan("trace.result_count", traces.length)
1318
+ return traces
1186
1319
  })
1187
1320
 
1188
1321
  const listTraceSummaries = Effect.fn("motel/TelemetryStore.listTraceSummaries")(function* (serviceName: string | null, options?: { readonly lookbackMinutes?: number; readonly limit?: number; readonly cursorStartedAtMs?: number; readonly cursorTraceId?: string }) {
1322
+ yield* Effect.annotateCurrentSpan("trace.service_name", serviceName ?? "all")
1189
1323
  const cutoff = (yield* Clock.currentTimeMillis) - (options?.lookbackMinutes ?? config.otel.traceLookbackMinutes) * 60 * 1000
1190
1324
  const limit = options?.limit ?? config.otel.traceFetchLimit
1191
1325
 
1192
- return yield* Effect.sync(() => {
1326
+ const summaries = yield* Effect.sync(() => {
1193
1327
  const clauses = ["started_at_ms >= ?"]
1194
1328
  const params: Array<string | number> = [cutoff]
1195
1329
 
@@ -1211,6 +1345,8 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
1211
1345
  LIMIT ?
1212
1346
  `).all(...params, limit) as TraceSummaryRow[]
1213
1347
  }).pipe(Effect.map((rows) => rows.map(parseSummaryRow)))
1348
+ yield* Effect.annotateCurrentSpan("trace.result_count", summaries.length)
1349
+ return summaries
1214
1350
  })
1215
1351
 
1216
1352
  const searchTraceSummaries = Effect.fn("motel/TelemetryStore.searchTraceSummaries")(function* (input: TraceSearch) {
@@ -1289,6 +1425,7 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
1289
1425
  })
1290
1426
 
1291
1427
  const getTrace = Effect.fn("motel/TelemetryStore.getTrace")(function* (traceId: string) {
1428
+ yield* Effect.annotateCurrentSpan("trace.trace_id", traceId)
1292
1429
  return yield* Effect.sync(() => {
1293
1430
  const rows = db.query(`
1294
1431
  SELECT * FROM spans WHERE trace_id = ? ORDER BY start_time_ms ASC
@@ -1298,6 +1435,7 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
1298
1435
  })
1299
1436
 
1300
1437
  const getSpan = Effect.fn("motel/TelemetryStore.getSpan")(function* (spanId: string) {
1438
+ yield* Effect.annotateCurrentSpan("trace.span_id", spanId)
1301
1439
  return yield* Effect.sync(() => {
1302
1440
  // Fetch only the target span row (uses idx_spans_span_id)
1303
1441
  const spanRow = db.query(`SELECT * FROM spans WHERE span_id = ? LIMIT 1`).get(spanId) as SpanRow | null
@@ -1305,7 +1443,28 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
1305
1443
 
1306
1444
  const traceId = spanRow.trace_id
1307
1445
 
1308
- // Get root operation name (indexed by trace_id)
1446
+ // Walk the parent chain in one recursive CTE instead of one query
1447
+ // per hop. Root context remains the earliest root in the trace,
1448
+ // matching full trace hydration even when input has multiple roots.
1449
+ let parentOperationName: string | null = null
1450
+ let depth = 0
1451
+ if (spanRow.parent_span_id) {
1452
+ const ancestors = db.query(`
1453
+ WITH RECURSIVE ancestors(span_id, parent_span_id, operation_name, hop) AS (
1454
+ SELECT span_id, parent_span_id, operation_name, 1
1455
+ FROM spans WHERE trace_id = ? AND span_id = ?
1456
+ UNION ALL
1457
+ SELECT s.span_id, s.parent_span_id, s.operation_name, a.hop + 1
1458
+ FROM ancestors a
1459
+ JOIN spans s ON s.trace_id = ? AND s.span_id = a.parent_span_id
1460
+ )
1461
+ SELECT span_id, parent_span_id, operation_name, hop FROM ancestors ORDER BY hop ASC
1462
+ `).all(traceId, spanRow.parent_span_id, traceId) as Array<{ span_id: string; parent_span_id: string | null; operation_name: string; hop: number }>
1463
+
1464
+ parentOperationName = ancestors[0]?.operation_name ?? null
1465
+ depth = ancestors.length
1466
+ }
1467
+
1309
1468
  const rootRow = db.query(`
1310
1469
  SELECT operation_name FROM spans
1311
1470
  WHERE trace_id = ? AND parent_span_id IS NULL
@@ -1313,28 +1472,6 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
1313
1472
  `).get(traceId) as { operation_name: string } | null
1314
1473
  const rootOperationName = rootRow?.operation_name ?? "unknown"
1315
1474
 
1316
- // Get parent operation name if span has a parent (PK lookup)
1317
- let parentOperationName: string | null = null
1318
- if (spanRow.parent_span_id) {
1319
- const parentRow = db.query(`
1320
- SELECT operation_name FROM spans
1321
- WHERE trace_id = ? AND span_id = ?
1322
- `).get(traceId, spanRow.parent_span_id) as { operation_name: string } | null
1323
- parentOperationName = parentRow?.operation_name ?? null
1324
- }
1325
-
1326
- // Compute depth by walking up parent chain (typically 3-5 hops)
1327
- let depth = 0
1328
- let currentParentId = spanRow.parent_span_id
1329
- while (currentParentId) {
1330
- const parentRow = db.query(`
1331
- SELECT parent_span_id FROM spans WHERE trace_id = ? AND span_id = ?
1332
- `).get(traceId, currentParentId) as { parent_span_id: string | null } | null
1333
- if (!parentRow) break
1334
- depth++
1335
- currentParentId = parentRow.parent_span_id
1336
- }
1337
-
1338
1475
  const parsed = parseSpanRow(spanRow)
1339
1476
  return {
1340
1477
  traceId,
@@ -1356,9 +1493,22 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
1356
1493
  const cutoff = (yield* Clock.currentTimeMillis) - (input.lookbackMinutes ?? config.otel.traceLookbackMinutes) * 60 * 1000
1357
1494
  const limit = input.limit ?? 100
1358
1495
  const hasContainsFilters = Object.keys(input.attributeContainsFilters ?? {}).length > 0
1359
- const candidateLimit = hasContainsFilters ? Math.max(limit * 20, 500) : Math.max(limit * 10, 200)
1496
+ // Only over-fetch when post-filtering will discard rows. Without
1497
+ // a parentOperation filter the SQL `LIMIT` already returns the
1498
+ // final set, and over-fetching just makes us parse JSON blobs
1499
+ // for rows we'll throw away.
1500
+ const needsPostFilter = !!input.parentOperation
1501
+ const candidateLimit = !needsPostFilter
1502
+ ? limit
1503
+ : hasContainsFilters
1504
+ ? Math.max(limit * 20, 500)
1505
+ : Math.max(limit * 10, 200)
1360
1506
 
1361
1507
  return yield* Effect.sync(() => {
1508
+ // First pass: fetch only the columns needed to filter and
1509
+ // to drive the parent-context lookup. Parsing the heavy
1510
+ // `*_json` blobs is deferred until after we've sliced down
1511
+ // to the final `limit`.
1362
1512
  let fromSql = "FROM spans AS s"
1363
1513
  const joinParams: Array<string | number> = []
1364
1514
  const clauses: string[] = ["s.start_time_ms >= ?"]
@@ -1399,60 +1549,47 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
1399
1549
  params.push(...containsAttrMatch.params)
1400
1550
  }
1401
1551
 
1402
- const rows = db.query(`
1403
- SELECT *
1552
+ const candidateRows = db.query(`
1553
+ SELECT s.trace_id, s.span_id, s.parent_span_id, s.operation_name, s.start_time_ms
1404
1554
  ${fromSql}
1405
1555
  WHERE ${clauses.join(" AND ")}
1406
1556
  ORDER BY s.start_time_ms DESC
1407
1557
  LIMIT ?
1408
- `).all(...joinParams, ...params, candidateLimit) as SpanRow[]
1558
+ `).all(...joinParams, ...params, candidateLimit) as Array<{ trace_id: string; span_id: string; parent_span_id: string | null; operation_name: string; start_time_ms: number }>
1409
1559
 
1410
- const traceIds = [...new Set(rows.map((row) => row.trace_id))]
1560
+ const traceIds = [...new Set(candidateRows.map((row) => row.trace_id))]
1411
1561
  if (traceIds.length === 0) return [] as readonly SpanItem[]
1412
1562
 
1413
1563
  const keyOf = (traceId: string, spanId: string) => `${traceId}:${spanId}`
1414
1564
  const spanContextById = new Map<string, { readonly parentSpanId: string | null; readonly operationName: string }>()
1415
- for (const row of rows) {
1416
- spanContextById.set(keyOf(row.trace_id, row.span_id), {
1417
- parentSpanId: row.parent_span_id,
1418
- operationName: row.operation_name,
1419
- })
1420
- }
1421
1565
 
1566
+ // Bulk-prefetch parent metadata for every span in every trace
1567
+ // touched by the candidate set. One indexed scan per trace_id
1568
+ // is much cheaper than a per-span lookup loop while computing
1569
+ // depth, and we get the trace-root lookup in the same pass.
1422
1570
  const placeholders = traceIds.map(() => "?").join(", ")
1423
- const rootRows = db.query(`
1424
- SELECT trace_id, operation_name
1571
+ const allSpanRows = db.query(`
1572
+ SELECT trace_id, span_id, parent_span_id, operation_name, start_time_ms
1425
1573
  FROM spans
1426
- WHERE trace_id IN (${placeholders}) AND parent_span_id IS NULL
1427
- ORDER BY start_time_ms ASC
1428
- `).all(...traceIds) as Array<{ trace_id: string; operation_name: string }>
1429
- const rootOperationByTraceId = new Map<string, string>()
1430
- for (const row of rootRows) {
1431
- if (!rootOperationByTraceId.has(row.trace_id)) {
1432
- rootOperationByTraceId.set(row.trace_id, row.operation_name)
1433
- }
1434
- }
1574
+ WHERE trace_id IN (${placeholders})
1575
+ `).all(...traceIds) as Array<{ trace_id: string; span_id: string; parent_span_id: string | null; operation_name: string; start_time_ms: number }>
1435
1576
 
1436
- const spanContextLookup = db.query(`
1437
- SELECT parent_span_id, operation_name
1438
- FROM spans
1439
- WHERE trace_id = ? AND span_id = ?
1440
- `)
1441
-
1442
- const getSpanContext = (traceId: string, spanId: string) => {
1443
- const key = keyOf(traceId, spanId)
1444
- const cached = spanContextById.get(key)
1445
- if (cached !== undefined) return cached
1446
- const row = spanContextLookup.get(traceId, spanId) as { parent_span_id: string | null; operation_name: string } | null
1447
- if (!row) return null
1448
- const value = {
1577
+ const rootOperationByTraceId = new Map<string, { operationName: string; startTimeMs: number }>()
1578
+ for (const row of allSpanRows) {
1579
+ spanContextById.set(keyOf(row.trace_id, row.span_id), {
1449
1580
  parentSpanId: row.parent_span_id,
1450
1581
  operationName: row.operation_name,
1582
+ })
1583
+ if (row.parent_span_id === null) {
1584
+ const existing = rootOperationByTraceId.get(row.trace_id)
1585
+ if (!existing || row.start_time_ms < existing.startTimeMs) {
1586
+ rootOperationByTraceId.set(row.trace_id, { operationName: row.operation_name, startTimeMs: row.start_time_ms })
1587
+ }
1451
1588
  }
1452
- spanContextById.set(key, value)
1453
- return value
1454
1589
  }
1455
1590
 
1591
+ const getSpanContext = (traceId: string, spanId: string) => spanContextById.get(keyOf(traceId, spanId)) ?? null
1592
+
1456
1593
  const depthById = new Map<string, number>()
1457
1594
  const getDepth = (traceId: string, spanId: string, visiting = new Set<string>()): number => {
1458
1595
  const key = keyOf(traceId, spanId)
@@ -1466,32 +1603,57 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
1466
1603
  return depth
1467
1604
  }
1468
1605
 
1469
- return rows
1470
- .map((row) => {
1471
- const parentContext = row.parent_span_id ? getSpanContext(row.trace_id, row.parent_span_id) : null
1472
- const parsedSpan = parseSpanRow(row)
1473
- const span = {
1474
- ...parsedSpan,
1475
- depth: getDepth(row.trace_id, row.span_id),
1476
- warnings: row.parent_span_id && !parentContext
1477
- ? [`missing span ${row.parent_span_id} (1 child)`]
1478
- : parsedSpan.warnings,
1479
- }
1480
- return {
1481
- traceId: row.trace_id,
1482
- rootOperationName: rootOperationByTraceId.get(row.trace_id) ?? span.operationName,
1483
- parentOperationName: parentContext?.operationName ?? null,
1484
- span,
1485
- } satisfies SpanItem
1486
- })
1487
- .filter((item) => {
1488
- if (input.parentOperation) {
1489
- const needle = input.parentOperation.toLowerCase()
1490
- if (!item.parentOperationName?.toLowerCase().includes(needle)) return false
1491
- }
1492
- return true
1606
+ // Apply parentOperation post-filter on the lite candidate set
1607
+ // (cheap — string compare against cached parent op) and then
1608
+ // slice down to the final result size before parsing any JSON.
1609
+ const parentOperationNeedle = input.parentOperation?.toLowerCase() ?? null
1610
+ const filteredLite: typeof candidateRows = []
1611
+ for (const row of candidateRows) {
1612
+ if (parentOperationNeedle) {
1613
+ const parent = row.parent_span_id ? getSpanContext(row.trace_id, row.parent_span_id) : null
1614
+ if (!parent?.operationName.toLowerCase().includes(parentOperationNeedle)) continue
1615
+ }
1616
+ filteredLite.push(row)
1617
+ if (filteredLite.length >= limit) break
1618
+ }
1619
+
1620
+ if (filteredLite.length === 0) return [] as readonly SpanItem[]
1621
+
1622
+ // Hydrate only the kept rows: one batched fetch of the full
1623
+ // SpanRow (with resource_json / attributes_json / events_json)
1624
+ // using SQLite's row-value `IN` syntax, then parseSpanRow per
1625
+ // kept row. Result order follows `filteredLite` so the caller
1626
+ // sees the same ordering the candidate scan produced.
1627
+ const keptValues = filteredLite.map(() => "(?, ?)").join(", ")
1628
+ const fullRows = db.query(`
1629
+ SELECT * FROM spans WHERE (trace_id, span_id) IN (VALUES ${keptValues})
1630
+ `).all(...filteredLite.flatMap((row) => [row.trace_id, row.span_id])) as SpanRow[]
1631
+ const fullRowByKey = new Map<string, SpanRow>()
1632
+ for (const row of fullRows) {
1633
+ fullRowByKey.set(keyOf(row.trace_id, row.span_id), row)
1634
+ }
1635
+
1636
+ const items: SpanItem[] = []
1637
+ for (const lite of filteredLite) {
1638
+ const row = fullRowByKey.get(keyOf(lite.trace_id, lite.span_id))
1639
+ if (!row) continue
1640
+ const parentContext = row.parent_span_id ? getSpanContext(row.trace_id, row.parent_span_id) : null
1641
+ const parsedSpan = parseSpanRow(row)
1642
+ const span = {
1643
+ ...parsedSpan,
1644
+ depth: getDepth(row.trace_id, row.span_id),
1645
+ warnings: row.parent_span_id && !parentContext
1646
+ ? [`missing span ${row.parent_span_id} (1 child)`]
1647
+ : parsedSpan.warnings,
1648
+ }
1649
+ items.push({
1650
+ traceId: row.trace_id,
1651
+ rootOperationName: rootOperationByTraceId.get(row.trace_id)?.operationName ?? span.operationName,
1652
+ parentOperationName: parentContext?.operationName ?? null,
1653
+ span,
1493
1654
  })
1494
- .slice(0, limit)
1655
+ }
1656
+ return items
1495
1657
  })
1496
1658
  })
1497
1659
 
@@ -1789,7 +1951,10 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
1789
1951
  })
1790
1952
 
1791
1953
  const listRecentLogs = Effect.fn("motel/TelemetryStore.listRecentLogs")(function* (serviceName: string) {
1792
- return yield* searchLogs({ serviceName, limit: config.otel.logFetchLimit })
1954
+ yield* Effect.annotateCurrentSpan("log.service_name", serviceName)
1955
+ const logs = yield* searchLogs({ serviceName, limit: config.otel.logFetchLimit })
1956
+ yield* Effect.annotateCurrentSpan("log.result_count", logs.length)
1957
+ return logs
1793
1958
  })
1794
1959
 
1795
1960
  const listFacets = Effect.fn("motel/TelemetryStore.listFacets")(function* (input: FacetSearch) {
@@ -1882,26 +2047,30 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
1882
2047
  // FACET_VALUE_MAX_LEN. For opencode this hides `ai.prompt`,
1883
2048
  // `ai.prompt.messages`, and `ai.prompt.tools` — which are 1-6MB text
1884
2049
  // blobs that you'd never want to filter by exact match anyway. The
1885
- // WHERE clause lets SQLite skip reading those pages from disk. We also
1886
- // dedupe to one (trace, key, value) row before grouping so repeated
1887
- // span-level duplicates don't blow up the temp B-trees used for the
1888
- // picker ranking query.
1889
- const params: Array<string | number> = [FACET_VALUE_MAX_LEN, cutoff]
1890
- if (input.serviceName) params.push(input.serviceName)
1891
- params.push(limit)
2050
+ // WHERE clause lets SQLite skip reading those pages from disk.
2051
+ // COUNT(DISTINCT ...) does its own per-group dedup via a temp B-tree,
2052
+ // so the outer query needs no DISTINCT subquery in front of it. We
2053
+ // pre-filter trace_ids through trace_summaries (an indexed lookup) so
2054
+ // the planner can use a SEMI JOIN against the small in-window set
2055
+ // instead of joining every span_attributes row to trace_summaries.
2056
+ const params: Array<string | number> = []
2057
+ let traceFilter: string
2058
+ if (input.serviceName) {
2059
+ traceFilter = `(SELECT trace_id FROM trace_summaries WHERE started_at_ms >= ? AND service_name = ?)`
2060
+ params.push(cutoff, input.serviceName)
2061
+ } else {
2062
+ traceFilter = `(SELECT trace_id FROM trace_summaries WHERE started_at_ms >= ?)`
2063
+ params.push(cutoff)
2064
+ }
2065
+ params.push(FACET_VALUE_MAX_LEN, limit)
1892
2066
  const rows = db.query(`
1893
- SELECT scoped.key AS value,
1894
- COUNT(DISTINCT scoped.trace_id) AS count,
1895
- COUNT(DISTINCT scoped.value) AS distinct_values
1896
- FROM (
1897
- SELECT DISTINCT sa.trace_id, sa.key, sa.value
1898
- FROM span_attributes sa
1899
- JOIN trace_summaries ts ON ts.trace_id = sa.trace_id
1900
- WHERE LENGTH(sa.value) < ?
1901
- AND ts.started_at_ms >= ?
1902
- ${input.serviceName ? "AND ts.service_name = ?" : ""}
1903
- ) AS scoped
1904
- GROUP BY scoped.key
2067
+ SELECT key AS value,
2068
+ COUNT(DISTINCT trace_id) AS count,
2069
+ COUNT(DISTINCT value) AS distinct_values
2070
+ FROM span_attributes
2071
+ WHERE trace_id IN ${traceFilter}
2072
+ AND LENGTH(value) < ?
2073
+ GROUP BY key
1905
2074
  ORDER BY (CASE WHEN distinct_values = 1 THEN 1 ELSE 0 END) ASC,
1906
2075
  distinct_values DESC,
1907
2076
  count DESC,
@@ -1938,7 +2107,10 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
1938
2107
  })
1939
2108
 
1940
2109
  const listTraceLogs = Effect.fn("motel/TelemetryStore.listTraceLogs")(function* (traceId: string) {
1941
- return yield* searchLogs({ traceId, limit: config.otel.logFetchLimit })
2110
+ yield* Effect.annotateCurrentSpan("log.trace_id", traceId)
2111
+ const logs = yield* searchLogs({ traceId, limit: config.otel.logFetchLimit })
2112
+ yield* Effect.annotateCurrentSpan("log.result_count", logs.length)
2113
+ return logs
1942
2114
  })
1943
2115
 
1944
2116
  // ---------------------------------------------------------------------------
@@ -2343,28 +2515,40 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
2343
2515
  searchAiCalls,
2344
2516
  getAiCall,
2345
2517
  aiCallStats,
2518
+ runRetentionNow: cleanupExpired(),
2346
2519
  })
2347
- }),
2348
- )
2520
+ })
2521
+
2522
+ /** Compatibility factory for callers constructing a writer/query-capable store layer. */
2523
+ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) =>
2524
+ Layer.effect(TelemetryStore, makeTelemetryStoreEffect(opts)).pipe(Layer.provide(BunFileSystem.layer))
2349
2525
 
2350
2526
  /**
2351
- * Default writer instance: the main daemon uses this. Owns schema
2352
- * migrations, FTS backfill, and the retention loop.
2527
+ * Default writer runtime used by tests and direct store consumers.
2353
2528
  */
2354
2529
  export const TelemetryStoreLive = makeTelemetryStoreLayer({ readonly: false, runRetention: true })
2355
2530
 
2356
2531
  /**
2357
- * Writer instance that SKIPS retention. The ingest worker uses this
2358
- * so the daemon and the worker aren't both running DELETE passes at
2359
- * the same time (they'd just serialise behind the write lock and
2360
- * duplicate work).
2532
+ * The ingest worker's writer. It is the managed daemon's sole owner of
2533
+ * schema migrations, FTS backfill, retention, and page reclamation.
2361
2534
  */
2362
- export const TelemetryStoreWorkerLive = makeTelemetryStoreLayer({ readonly: false, runRetention: false })
2535
+ export const TelemetryStoreWorkerLive = TelemetryStoreLive
2363
2536
 
2364
2537
  /**
2365
- * Read-only instance for query-only processes (currently the TUI).
2366
- * Skips every DDL/DML statement at startup so the connection can be
2367
- * opened while a writer is mid-transaction without racing for the
2368
- * write lock. Writes through the service interface will throw.
2538
+ * Read-only instance for query-only processes (currently the TUI and
2539
+ * HTTP query handlers). Skips every DDL/DML statement at startup so
2540
+ * the connection can be opened while a writer is mid-transaction
2541
+ * without racing for the write lock. Provided as TelemetryStoreReadonly
2542
+ * — a distinct service identifier so it can coexist with the writer
2543
+ * TelemetryStore in the same runtime.
2369
2544
  */
2370
- export const TelemetryStoreReadonlyLive = makeTelemetryStoreLayer({ readonly: true, runRetention: false })
2545
+ export const TelemetryStoreReadonlyLive = Layer.effect(TelemetryStoreReadonly, makeTelemetryStoreEffect({ readonly: true, runRetention: false })).pipe(Layer.provide(BunFileSystem.layer))
2546
+
2547
+ /** Query-worker reader that waits for the sole writer to finish schema bootstrap. */
2548
+ export const TelemetryStoreQueryWorkerLive = Layer.effect(
2549
+ TelemetryStoreReadonly,
2550
+ makeTelemetryStoreEffect({ readonly: true, runRetention: false }).pipe(
2551
+ Effect.map((store) => TelemetryStoreReadonly.of(store)),
2552
+ Effect.retry(Schedule.spaced("50 millis")),
2553
+ ),
2554
+ ).pipe(Layer.provide(BunFileSystem.layer))