@kitlangton/motel 0.1.3 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +11 -1
- package/package.json +5 -3
- package/src/App.tsx +239 -59
- package/src/daemon.test.ts +144 -7
- package/src/daemon.ts +113 -8
- package/src/domain.test.ts +62 -0
- package/src/domain.ts +62 -4
- package/src/httpApi.ts +4 -1
- package/src/localServer.ts +112 -121
- package/src/mcp.ts +172 -0
- package/src/motelClient.ts +166 -14
- package/src/registry.ts +26 -23
- package/src/runtime.ts +8 -2
- package/src/server.ts +10 -9
- package/src/services/AsyncIngest.ts +52 -0
- package/src/services/TelemetryStore.ts +285 -27
- package/src/services/TraceQueryService.ts +4 -2
- package/src/services/ingestRpc.ts +41 -0
- package/src/services/telemetryWorker.ts +62 -0
- package/src/storybook/aiChatStory.tsx +243 -0
- package/src/storybook/fixtures/errorState.ts +44 -0
- package/src/storybook/fixtures/imagePaste.ts +34 -0
- package/src/storybook/fixtures/index.ts +62 -0
- package/src/storybook/fixtures/kitchenSink.ts +148 -0
- package/src/storybook/fixtures/rawPrompt.ts +15 -0
- package/src/storybook/fixtures/short.ts +27 -0
- package/src/storybook/fixtures/toolHeavy.ts +65 -0
- package/src/telemetry.test.ts +61 -0
- package/src/ui/AiChatView.tsx +292 -0
- package/src/ui/SpanContentView.tsx +181 -0
- package/src/ui/SpanDetail.tsx +98 -17
- package/src/ui/TraceDetailsPane.tsx +35 -3
- package/src/ui/Waterfall.tsx +94 -167
- package/src/ui/aiChatModel.test.ts +347 -0
- package/src/ui/aiChatModel.ts +736 -0
- package/src/ui/aiState.ts +71 -0
- package/src/ui/app/TraceWorkspace.tsx +295 -120
- package/src/ui/app/useAppLayout.ts +14 -11
- package/src/ui/app/useTraceScreenData.ts +191 -35
- package/src/ui/atoms.ts +131 -0
- package/src/ui/filterParser.test.ts +56 -0
- package/src/ui/filterParser.ts +45 -0
- package/src/ui/loaders.ts +120 -0
- package/src/ui/persistence.ts +41 -0
- package/src/ui/primitives.tsx +47 -21
- package/src/ui/state.ts +4 -169
- package/src/ui/useAttrFilterPicker.ts +63 -23
- package/src/ui/useKeyboardNav.ts +576 -300
- package/src/ui/waterfallFilter.test.ts +84 -0
- package/src/ui/waterfallFilter.ts +59 -0
- package/src/ui/waterfallModel.ts +130 -0
- package/src/ui/waterfallNav.test.ts +17 -1
- package/src/ui/waterfallNav.ts +1 -1
- package/web/dist/assets/{index-DKinj-OE.js → index-DnyVo03x.js} +1 -1
- package/web/dist/index.html +1 -1
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Main-thread client for the telemetry worker's ingest RPCs.
|
|
3
|
+
*
|
|
4
|
+
* The HTTP handlers for POST /v1/traces and POST /v1/logs call into
|
|
5
|
+
* this service instead of `TelemetryStore.ingestTraces/Logs`. Each
|
|
6
|
+
* method sends a typed message to the worker, awaits the reply, and
|
|
7
|
+
* returns the worker's result as an Effect. While the worker is
|
|
8
|
+
* serialising a big batch into SQLite, the main thread's event loop
|
|
9
|
+
* is FREE to answer /api/* queries — that's the whole point of the
|
|
10
|
+
* offload. Without this, /api/health and friends queued behind long
|
|
11
|
+
* ingests and reported p95 latencies of 3-5 seconds; after, they
|
|
12
|
+
* stay responsive regardless of ingest load.
|
|
13
|
+
*
|
|
14
|
+
* The worker is spawned as a scoped resource inside the layer. The
|
|
15
|
+
* protocol pool is sized at 1 because SQLite only supports a single
|
|
16
|
+
* writer at a time anyway — running N concurrent workers would just
|
|
17
|
+
* queue them on SQLite's lock. When the outer scope closes (server
|
|
18
|
+
* shutdown), `BunWorker.layer`'s finalizer sends a close message and
|
|
19
|
+
* terminates the worker if it doesn't exit gracefully in 5s.
|
|
20
|
+
*/
|
|
21
|
+
|
|
22
|
+
import * as BunWorker from "@effect/platform-bun/BunWorker"
|
|
23
|
+
import { Context, Layer } from "effect"
|
|
24
|
+
import * as RpcClient from "effect/unstable/rpc/RpcClient"
|
|
25
|
+
import type { RpcClientError } from "effect/unstable/rpc/RpcClientError"
|
|
26
|
+
import * as RpcSerialization from "effect/unstable/rpc/RpcSerialization"
|
|
27
|
+
import { IngestRpcs } from "./ingestRpc.ts"
|
|
28
|
+
|
|
29
|
+
// RpcClient.make always surfaces RpcClientError in addition to the
|
|
30
|
+
// group's declared errors (transport failures, worker crashes, etc.),
|
|
31
|
+
// so the service shape has to mirror that. Without the explicit error
|
|
32
|
+
// type param, TS treats the declared and observed client types as
|
|
33
|
+
// unrelated structural mismatches.
|
|
34
|
+
export class AsyncIngest extends Context.Service<
|
|
35
|
+
AsyncIngest,
|
|
36
|
+
RpcClient.FromGroup<typeof IngestRpcs, RpcClientError>
|
|
37
|
+
>()("@motel/AsyncIngest") {}
|
|
38
|
+
|
|
39
|
+
// Protocol: RpcClient.layerProtocolWorker manages a worker pool and
|
|
40
|
+
// speaks msgpack over structured-clone messages. `size: 1` matches
|
|
41
|
+
// SQLite's single-writer constraint.
|
|
42
|
+
const WorkerProtocol = RpcClient.layerProtocolWorker({ size: 1 }).pipe(
|
|
43
|
+
Layer.provide(RpcSerialization.layerMsgPack),
|
|
44
|
+
Layer.provide(
|
|
45
|
+
BunWorker.layer(() => new Worker(new URL("./telemetryWorker.ts", import.meta.url))),
|
|
46
|
+
),
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
export const AsyncIngestLive = Layer.effect(
|
|
50
|
+
AsyncIngest,
|
|
51
|
+
RpcClient.make(IngestRpcs),
|
|
52
|
+
).pipe(Layer.provide(WorkerProtocol))
|
|
@@ -4,7 +4,7 @@ import { dirname } from "node:path"
|
|
|
4
4
|
import { Clock, Effect, Layer, Schedule, Context } from "effect"
|
|
5
5
|
import { config } from "../config.js"
|
|
6
6
|
import type { AiCallDetail, AiCallSummary, FacetItem, LogItem, SpanItem, StatsItem, TraceItem, TraceSummaryItem, TraceSpanEvent, TraceSpanItem } from "../domain.js"
|
|
7
|
-
import { AI_ATTR_MAP, AI_TEXT_SEARCH_KEYS, truncatePreview } from "../domain.js"
|
|
7
|
+
import { AI_ATTR_MAP, AI_FTS_KEYS, AI_TEXT_SEARCH_KEYS, truncatePreview } from "../domain.js"
|
|
8
8
|
import { attributeMap, nanosToMilliseconds, parseAnyValue, spanKindLabel, spanStatusLabel, stringifyValue, type OtlpLogExportRequest, type OtlpTraceExportRequest } from "../otlp.js"
|
|
9
9
|
|
|
10
10
|
interface SpanRow {
|
|
@@ -57,6 +57,13 @@ interface TraceSearch {
|
|
|
57
57
|
readonly status?: "ok" | "error" | null
|
|
58
58
|
readonly minDurationMs?: number | null
|
|
59
59
|
readonly attributeFilters?: Readonly<Record<string, string>>
|
|
60
|
+
/**
|
|
61
|
+
* Full-text match against the AI prompt/response/tool attribute values
|
|
62
|
+
* on any span in the trace (see AI_FTS_KEYS). When set, traces are
|
|
63
|
+
* filtered to those containing at least one span whose indexed LLM
|
|
64
|
+
* content matches. Powered by span_attr_fts (FTS5).
|
|
65
|
+
*/
|
|
66
|
+
readonly aiText?: string | null
|
|
60
67
|
readonly lookbackMinutes?: number
|
|
61
68
|
readonly limit?: number
|
|
62
69
|
readonly cursorStartedAtMs?: number
|
|
@@ -440,26 +447,74 @@ export class TelemetryStore extends Context.Service<
|
|
|
440
447
|
>()("motel/TelemetryStore") {}
|
|
441
448
|
|
|
442
449
|
|
|
443
|
-
|
|
450
|
+
/**
|
|
451
|
+
* How this TelemetryStore instance behaves:
|
|
452
|
+
*
|
|
453
|
+
* - `readonly` — opens the SQLite connection read-only and skips every
|
|
454
|
+
* DDL/DML initialisation. Use this from the TUI (and anywhere else
|
|
455
|
+
* that only queries); it avoids the "database is locked" race that
|
|
456
|
+
* happens when a TUI process races a daemon's writer for the schema
|
|
457
|
+
* pragmas on startup. Writes through the service interface become
|
|
458
|
+
* runtime errors — but readers don't call them.
|
|
459
|
+
*
|
|
460
|
+
* - `runRetention` — fork the background cleanup loop (age + size cap
|
|
461
|
+
* eviction, WAL checkpoint). Only one process should own this at a
|
|
462
|
+
* time. Currently the main daemon (localServer) does; the ingest
|
|
463
|
+
* worker and the TUI skip it.
|
|
464
|
+
*/
|
|
465
|
+
export interface TelemetryStoreOptions {
|
|
466
|
+
readonly readonly: boolean
|
|
467
|
+
readonly runRetention: boolean
|
|
468
|
+
}
|
|
469
|
+
|
|
470
|
+
export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.effect(
|
|
444
471
|
TelemetryStore,
|
|
445
472
|
Effect.gen(function* () {
|
|
446
473
|
mkdirSync(dirname(config.otel.databasePath), { recursive: true })
|
|
447
474
|
const db = yield* Effect.acquireRelease(
|
|
448
|
-
Effect.sync(() => new Database(config.otel.databasePath, {
|
|
475
|
+
Effect.sync(() => new Database(config.otel.databasePath, {
|
|
476
|
+
create: !opts.readonly,
|
|
477
|
+
readonly: opts.readonly,
|
|
478
|
+
})),
|
|
449
479
|
(db) => Effect.sync(() => {
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
480
|
+
if (!opts.readonly) {
|
|
481
|
+
// `PRAGMA optimize` at close persists any stats SQLite gathered
|
|
482
|
+
// during the session, so the next process start gets an accurate
|
|
483
|
+
// query planner on the first query instead of a 3-second cold
|
|
484
|
+
// run. Cheap: it skips work unless stats have drifted.
|
|
485
|
+
try { db.exec(`PRAGMA optimize;`) } catch { /* nothing */ }
|
|
486
|
+
}
|
|
455
487
|
db.close()
|
|
456
488
|
}),
|
|
457
489
|
)
|
|
490
|
+
if (opts.readonly) {
|
|
491
|
+
// Readonly connections skip schema init entirely — the schema
|
|
492
|
+
// already exists (a writer created it) and any `CREATE TABLE IF
|
|
493
|
+
// NOT EXISTS` / `PRAGMA journal_mode = WAL` statement would
|
|
494
|
+
// attempt a write and fight the daemon for the write lock.
|
|
495
|
+
// `query_only = 1` logically blocks any DML the app might
|
|
496
|
+
// accidentally send; still bump cache + mmap since those are
|
|
497
|
+
// safe and keep queries fast.
|
|
498
|
+
db.exec(`
|
|
499
|
+
PRAGMA query_only = 1;
|
|
500
|
+
PRAGMA busy_timeout = 15000;
|
|
501
|
+
PRAGMA cache_size = -65536;
|
|
502
|
+
PRAGMA mmap_size = 268435456;
|
|
503
|
+
`)
|
|
504
|
+
} else {
|
|
458
505
|
db.exec(`
|
|
459
506
|
PRAGMA journal_mode = WAL;
|
|
460
507
|
PRAGMA synchronous = NORMAL;
|
|
461
508
|
PRAGMA temp_store = MEMORY;
|
|
462
|
-
|
|
509
|
+
-- Longer busy timeout: the ingest worker holds the write lock
|
|
510
|
+
-- for up to a few seconds during big OTLP batches, and the main
|
|
511
|
+
-- daemon's retention passes can do the same. 15s gives either
|
|
512
|
+
-- side enough slack to serialise instead of erroring.
|
|
513
|
+
PRAGMA busy_timeout = 15000;
|
|
514
|
+
-- WAL checkpoint automatically when it grows past ~16MB. Without
|
|
515
|
+
-- this the WAL happily runs into the hundreds of MB and queries
|
|
516
|
+
-- start paying the cost of walking the WAL on every read.
|
|
517
|
+
PRAGMA wal_autocheckpoint = 4000;
|
|
463
518
|
-- Bump cache above the 2MB default. 64MB fits most hot index pages
|
|
464
519
|
-- (trace_summaries, spans, span_attributes indexes) in RAM even on
|
|
465
520
|
-- multi-GB databases, cutting cold-read latency meaningfully on
|
|
@@ -549,8 +604,26 @@ export const TelemetryStoreLive = Layer.effect(
|
|
|
549
604
|
CREATE INDEX IF NOT EXISTS idx_log_attributes_key_value ON log_attributes(key, value, log_id);
|
|
550
605
|
CREATE INDEX IF NOT EXISTS idx_log_attributes_log_id ON log_attributes(log_id);
|
|
551
606
|
`)
|
|
607
|
+
}
|
|
552
608
|
|
|
609
|
+
// Tables detected at runtime. For writer connections these flags are
|
|
610
|
+
// set by the FTS `CREATE VIRTUAL TABLE IF NOT EXISTS` try/catch; for
|
|
611
|
+
// readonly connections we probe `sqlite_master` and set them based on
|
|
612
|
+
// what the writer has already provisioned.
|
|
553
613
|
let hasFts = true
|
|
614
|
+
let hasAttrFts = true
|
|
615
|
+
if (opts.readonly) {
|
|
616
|
+
try {
|
|
617
|
+
const row = db.query(`SELECT name FROM sqlite_master WHERE type='table' AND name='span_operation_fts'`).get()
|
|
618
|
+
hasFts = row !== null
|
|
619
|
+
} catch { hasFts = false }
|
|
620
|
+
try {
|
|
621
|
+
const row = db.query(`SELECT name FROM sqlite_master WHERE type='table' AND name='span_attr_fts'`).get()
|
|
622
|
+
hasAttrFts = row !== null
|
|
623
|
+
} catch { hasAttrFts = false }
|
|
624
|
+
}
|
|
625
|
+
|
|
626
|
+
if (!opts.readonly) {
|
|
554
627
|
try {
|
|
555
628
|
db.exec(`
|
|
556
629
|
CREATE VIRTUAL TABLE IF NOT EXISTS span_operation_fts USING fts5(
|
|
@@ -571,6 +644,65 @@ export const TelemetryStoreLive = Layer.effect(
|
|
|
571
644
|
// FTS is optional; queries will fall back to LIKE if unavailable.
|
|
572
645
|
}
|
|
573
646
|
|
|
647
|
+
// External-content FTS5 over the subset of span_attributes.value rows
|
|
648
|
+
// whose key is in AI_FTS_KEYS (LLM prompts, responses, tool calls,
|
|
649
|
+
// etc.). External content means the inverted index is the only
|
|
650
|
+
// FTS storage — the value text itself continues to live once in
|
|
651
|
+
// span_attributes, not duplicated into the FTS table. On a 2 GB DB
|
|
652
|
+
// with 270 MB of prompt JSON this typically adds ~50-120 MB of
|
|
653
|
+
// index, turning a 500-800ms LIKE scan into a <50ms MATCH.
|
|
654
|
+
//
|
|
655
|
+
// Keys are inlined into the trigger DDL rather than looked up in a
|
|
656
|
+
// side table so the `WHEN` guard stays constant-cost (a subquery
|
|
657
|
+
// would run on every span_attributes insert — ~60/span).
|
|
658
|
+
if (hasFts) {
|
|
659
|
+
try {
|
|
660
|
+
const keyList = AI_FTS_KEYS.map((k) => `'${k.replace(/'/g, "''")}'`).join(", ")
|
|
661
|
+
db.exec(`
|
|
662
|
+
CREATE VIRTUAL TABLE IF NOT EXISTS span_attr_fts USING fts5(
|
|
663
|
+
value,
|
|
664
|
+
content='span_attributes',
|
|
665
|
+
content_rowid='rowid',
|
|
666
|
+
tokenize='unicode61 remove_diacritics 2'
|
|
667
|
+
);
|
|
668
|
+
|
|
669
|
+
-- Mirror inserts into FTS when the key carries LLM content.
|
|
670
|
+
-- NOTE: triggers MUST use fully-qualified name (new.rowid,
|
|
671
|
+
-- new.value) and emit rowid so external-content FTS can
|
|
672
|
+
-- fetch the value back via span_attributes.rowid.
|
|
673
|
+
CREATE TRIGGER IF NOT EXISTS span_attr_fts_ai AFTER INSERT ON span_attributes
|
|
674
|
+
WHEN new.key IN (${keyList})
|
|
675
|
+
BEGIN
|
|
676
|
+
INSERT INTO span_attr_fts(rowid, value) VALUES (new.rowid, new.value);
|
|
677
|
+
END;
|
|
678
|
+
|
|
679
|
+
-- Delete with the same guard so retention & re-ingest stay
|
|
680
|
+
-- in sync. External-content 'delete' command needs the
|
|
681
|
+
-- original value to remove from the inverted index.
|
|
682
|
+
CREATE TRIGGER IF NOT EXISTS span_attr_fts_ad AFTER DELETE ON span_attributes
|
|
683
|
+
WHEN old.key IN (${keyList})
|
|
684
|
+
BEGIN
|
|
685
|
+
INSERT INTO span_attr_fts(span_attr_fts, rowid, value)
|
|
686
|
+
VALUES ('delete', old.rowid, old.value);
|
|
687
|
+
END;
|
|
688
|
+
|
|
689
|
+
-- Handle in-place updates (rare; re-ingest usually goes
|
|
690
|
+
-- DELETE then INSERT but belt-and-braces).
|
|
691
|
+
CREATE TRIGGER IF NOT EXISTS span_attr_fts_au AFTER UPDATE ON span_attributes
|
|
692
|
+
WHEN old.key IN (${keyList}) OR new.key IN (${keyList})
|
|
693
|
+
BEGIN
|
|
694
|
+
INSERT INTO span_attr_fts(span_attr_fts, rowid, value)
|
|
695
|
+
VALUES ('delete', old.rowid, old.value);
|
|
696
|
+
INSERT INTO span_attr_fts(rowid, value)
|
|
697
|
+
SELECT new.rowid, new.value
|
|
698
|
+
WHERE new.key IN (${keyList});
|
|
699
|
+
END;
|
|
700
|
+
`)
|
|
701
|
+
} catch {
|
|
702
|
+
hasAttrFts = false
|
|
703
|
+
}
|
|
704
|
+
}
|
|
705
|
+
|
|
574
706
|
try {
|
|
575
707
|
db.exec(`ALTER TABLE trace_summaries ADD COLUMN active_span_count INTEGER NOT NULL DEFAULT 0`)
|
|
576
708
|
} catch {
|
|
@@ -594,6 +726,7 @@ export const TelemetryStoreLive = Layer.effect(
|
|
|
594
726
|
// ANALYZE / optimize failures are never fatal — queries still work,
|
|
595
727
|
// they just run with default row estimates.
|
|
596
728
|
}
|
|
729
|
+
} // end: if (!opts.readonly) writer init
|
|
597
730
|
|
|
598
731
|
const insertSpan = db.query(`
|
|
599
732
|
INSERT INTO spans (
|
|
@@ -641,8 +774,14 @@ export const TelemetryStoreLive = Layer.effect(
|
|
|
641
774
|
GROUP BY trace_id
|
|
642
775
|
`)
|
|
643
776
|
|
|
644
|
-
|
|
645
|
-
|
|
777
|
+
// One-time full rebuild of the trace_summaries table at open so
|
|
778
|
+
// any drift from interrupted ingests gets reconciled. Writer-only
|
|
779
|
+
// because the DELETE + INSERT would fail on a readonly connection
|
|
780
|
+
// (and would fight the daemon's writer for the lock anyway).
|
|
781
|
+
if (!opts.readonly) {
|
|
782
|
+
db.query(`DELETE FROM trace_summaries`).run()
|
|
783
|
+
rebuildTraceSummaries.run()
|
|
784
|
+
}
|
|
646
785
|
|
|
647
786
|
const deleteSpanAttributes = db.query(`DELETE FROM span_attributes WHERE trace_id = ? AND span_id = ?`)
|
|
648
787
|
const insertSpanAttribute = db.query(`INSERT INTO span_attributes (trace_id, span_id, key, value) VALUES (?, ?, ?, ?)`)
|
|
@@ -725,21 +864,79 @@ export const TelemetryStoreLive = Layer.effect(
|
|
|
725
864
|
} catch {
|
|
726
865
|
// FTS table may not exist on old DBs.
|
|
727
866
|
}
|
|
867
|
+
|
|
868
|
+
// Checkpoint the WAL after a big delete pass. Without this the
|
|
869
|
+
// WAL keeps growing (observed: 640MB) because wal_autocheckpoint
|
|
870
|
+
// only triggers when WAL pages exceed the threshold during
|
|
871
|
+
// writes — a retention pass that evicts millions of rows can
|
|
872
|
+
// blow far past that before the auto-checkpoint fires. Using
|
|
873
|
+
// PASSIVE so active readers aren't interrupted; if the WAL
|
|
874
|
+
// can't be fully reclaimed right now, we'll try again next
|
|
875
|
+
// cycle.
|
|
876
|
+
try { db.exec(`PRAGMA wal_checkpoint(PASSIVE);`) } catch { /* ignore */ }
|
|
877
|
+
|
|
878
|
+
// Incremental vacuum reclaims some of the freed pages back
|
|
879
|
+
// to the OS so the file size actually shrinks over time
|
|
880
|
+
// instead of just growing the freelist. Bounded to 2000
|
|
881
|
+
// pages per pass (≈8MB) to avoid a long-running transaction.
|
|
882
|
+
try { db.exec(`PRAGMA incremental_vacuum(2000);`) } catch { /* ignore */ }
|
|
728
883
|
})
|
|
729
884
|
})
|
|
730
885
|
|
|
731
|
-
//
|
|
732
|
-
|
|
886
|
+
// Retention only runs in processes that opt in (currently the main
|
|
887
|
+
// daemon). The ingest worker and TUI skip it to avoid two writers
|
|
888
|
+
// competing for the write lock with overlapping DELETE passes.
|
|
889
|
+
if (opts.runRetention) {
|
|
890
|
+
// Enable incremental vacuum so retention can reclaim freed
|
|
891
|
+
// pages over time instead of needing a stop-the-world VACUUM.
|
|
892
|
+
// Idempotent. NOTE(review): SQLite applies a NONE -> INCREMENTAL switch on an existing DB only after a VACUUM — confirm.
|
|
893
|
+
try { db.exec(`PRAGMA auto_vacuum = INCREMENTAL;`) } catch { /* ignore */ }
|
|
894
|
+
|
|
895
|
+
// Run cleanup every 60 seconds in the background, tied to the layer's scope
|
|
896
|
+
yield* Effect.forkScoped(Effect.repeat(cleanupExpired(), Schedule.spaced("60 seconds")))
|
|
897
|
+
|
|
898
|
+
// Periodically refresh query planner stats. `PRAGMA optimize` is a
|
|
899
|
+
// no-op when nothing has changed, so this is essentially free on idle
|
|
900
|
+
// servers and keeps facet/search planner estimates accurate as data
|
|
901
|
+
// grows. 15 minutes is slower than ingestion rates we care about but
|
|
902
|
+
// frequent enough that the attribute picker stays snappy.
|
|
903
|
+
const refreshPlannerStats = Effect.sync(() => {
|
|
904
|
+
try { db.exec(`PRAGMA optimize;`) } catch { /* ignore */ }
|
|
905
|
+
})
|
|
906
|
+
yield* Effect.forkScoped(Effect.repeat(refreshPlannerStats, Schedule.spaced("15 minutes")))
|
|
907
|
+
}
|
|
733
908
|
|
|
734
|
-
//
|
|
735
|
-
//
|
|
736
|
-
//
|
|
737
|
-
//
|
|
738
|
-
//
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
909
|
+
// One-time backfill for existing DBs: if span_attr_fts is empty but
|
|
910
|
+
// span_attributes has rows with AI_FTS_KEYS, populate the index.
|
|
911
|
+
// Runs forked so server startup isn't blocked; queries hitting the
|
|
912
|
+
// FTS will just return empty until the fill lands. On a 2 GB DB with
|
|
913
|
+
// ~400 matching rows this takes ~3-8 seconds. Writer-only because
|
|
914
|
+
// it does INSERT INTO ... — readonly connections would error.
|
|
915
|
+
if (hasAttrFts && !opts.readonly) {
|
|
916
|
+
const backfillAttrFts = Effect.sync(() => {
|
|
917
|
+
try {
|
|
918
|
+
const ftsCount = (db.query(`SELECT COUNT(*) AS c FROM span_attr_fts`).get() as { c: number }).c
|
|
919
|
+
if (ftsCount > 0) return
|
|
920
|
+
const keyList = AI_FTS_KEYS.map((k) => `'${k.replace(/'/g, "''")}'`).join(", ")
|
|
921
|
+
const attrCount = (db.query(
|
|
922
|
+
`SELECT COUNT(*) AS c FROM span_attributes WHERE key IN (${keyList})`,
|
|
923
|
+
).get() as { c: number }).c
|
|
924
|
+
if (attrCount === 0) return
|
|
925
|
+
// Single INSERT..SELECT is atomic and fast; FTS5 batches
|
|
926
|
+
// its internal segment writes. No transaction wrapper
|
|
927
|
+
// needed — it runs as one statement.
|
|
928
|
+
db.exec(`
|
|
929
|
+
INSERT INTO span_attr_fts(rowid, value)
|
|
930
|
+
SELECT rowid, value FROM span_attributes WHERE key IN (${keyList})
|
|
931
|
+
`)
|
|
932
|
+
} catch {
|
|
933
|
+
// Backfill failure is never fatal — new ingests still
|
|
934
|
+
// populate FTS via the trigger, and queries fall back to
|
|
935
|
+
// LIKE only when span_attr_fts is unavailable; rows that were never backfilled simply won't match.
|
|
936
|
+
}
|
|
937
|
+
})
|
|
938
|
+
yield* Effect.forkScoped(backfillAttrFts)
|
|
939
|
+
}
|
|
743
940
|
|
|
744
941
|
const ingestTraces = Effect.fn("motel/TelemetryStore.ingestTraces")(function* (payload: OtlpTraceExportRequest) {
|
|
745
942
|
return yield* Effect.sync(() => {
|
|
@@ -965,6 +1162,25 @@ export const TelemetryStoreLive = Layer.effect(
|
|
|
965
1162
|
params.push(...exactAttrMatch.params)
|
|
966
1163
|
}
|
|
967
1164
|
|
|
1165
|
+
// `:ai <query>` — FTS match against LLM content keys. Joins
|
|
1166
|
+
// span_attr_fts back to span_attributes to collect trace_ids
|
|
1167
|
+
// whose spans carry matching prompt/response content. Falls
|
|
1168
|
+
// through to no-op when the query tokenizes empty (e.g. only
|
|
1169
|
+
// stopwords or operator-chars) so users don't get a silently
|
|
1170
|
+
// empty list. The filter is likewise skipped when span_attr_fts is unavailable.
|
|
1171
|
+
if (input.aiText) {
|
|
1172
|
+
const aiFtsQuery = toFtsMatchQuery(input.aiText)
|
|
1173
|
+
if (hasAttrFts && aiFtsQuery) {
|
|
1174
|
+
clauses.push(`trace_id IN (
|
|
1175
|
+
SELECT DISTINCT sa.trace_id
|
|
1176
|
+
FROM span_attr_fts fts
|
|
1177
|
+
JOIN span_attributes sa ON sa.rowid = fts.rowid
|
|
1178
|
+
WHERE fts.value MATCH ?
|
|
1179
|
+
)`)
|
|
1180
|
+
params.push(aiFtsQuery)
|
|
1181
|
+
}
|
|
1182
|
+
}
|
|
1183
|
+
|
|
968
1184
|
const rows = db.query(`
|
|
969
1185
|
SELECT trace_id, service_name, root_operation_name, started_at_ms, ended_at_ms, active_span_count, duration_ms, span_count, error_count
|
|
970
1186
|
FROM trace_summaries
|
|
@@ -1588,7 +1804,11 @@ export const TelemetryStoreLive = Layer.effect(
|
|
|
1588
1804
|
|
|
1589
1805
|
/** Builds WHERE clauses for AI call search against the spans table (aliased as s) */
|
|
1590
1806
|
const buildAiWhereClauses = (input: AiCallSearch | AiCallStatsSearch, cutoff: number) => {
|
|
1591
|
-
const clauses: string[] = [
|
|
1807
|
+
const clauses: string[] = [
|
|
1808
|
+
"s.operation_name LIKE 'ai.%'",
|
|
1809
|
+
"s.operation_name NOT LIKE 'ai.%.do%'",
|
|
1810
|
+
"s.start_time_ms >= ?",
|
|
1811
|
+
]
|
|
1592
1812
|
const params: Array<string | number> = [cutoff]
|
|
1593
1813
|
|
|
1594
1814
|
if (input.service) {
|
|
@@ -1624,11 +1844,27 @@ export const TelemetryStoreLive = Layer.effect(
|
|
|
1624
1844
|
params.push(key, value)
|
|
1625
1845
|
}
|
|
1626
1846
|
|
|
1627
|
-
// Text search across prompt/response/tool attribute values
|
|
1847
|
+
// Text search across prompt/response/tool attribute values via
|
|
1848
|
+
// FTS5. Prefers the external-content span_attr_fts index when
|
|
1849
|
+
// available, falls back to case-insensitive LIKE so old DBs
|
|
1850
|
+
// without FTS still work. FTS turns ~500ms full scans of 3 MB
|
|
1851
|
+
// prompt JSON into <50ms MATCH lookups.
|
|
1628
1852
|
if ("text" in input && input.text) {
|
|
1629
|
-
const
|
|
1630
|
-
|
|
1631
|
-
|
|
1853
|
+
const ftsQuery = toFtsMatchQuery(input.text)
|
|
1854
|
+
if (hasAttrFts && ftsQuery) {
|
|
1855
|
+
clauses.push(`EXISTS (
|
|
1856
|
+
SELECT 1 FROM span_attr_fts fts
|
|
1857
|
+
JOIN span_attributes sa ON sa.rowid = fts.rowid
|
|
1858
|
+
WHERE sa.trace_id = s.trace_id
|
|
1859
|
+
AND sa.span_id = s.span_id
|
|
1860
|
+
AND fts.value MATCH ?
|
|
1861
|
+
)`)
|
|
1862
|
+
params.push(ftsQuery)
|
|
1863
|
+
} else {
|
|
1864
|
+
const textKeys = AI_TEXT_SEARCH_KEYS.map(() => "?").join(", ")
|
|
1865
|
+
clauses.push(`EXISTS (SELECT 1 FROM span_attributes WHERE span_attributes.trace_id = s.trace_id AND span_attributes.span_id = s.span_id AND key IN (${textKeys}) AND value LIKE ? COLLATE NOCASE)`)
|
|
1866
|
+
params.push(...AI_TEXT_SEARCH_KEYS, `%${input.text}%`)
|
|
1867
|
+
}
|
|
1632
1868
|
}
|
|
1633
1869
|
|
|
1634
1870
|
return { clauses, params }
|
|
@@ -1961,3 +2197,25 @@ export const TelemetryStoreLive = Layer.effect(
|
|
|
1961
2197
|
})
|
|
1962
2198
|
}),
|
|
1963
2199
|
)
|
|
2200
|
+
|
|
2201
|
+
/**
|
|
2202
|
+
* Default writer instance: the main daemon uses this. Owns schema
|
|
2203
|
+
* migrations, FTS backfill, and the retention loop.
|
|
2204
|
+
*/
|
|
2205
|
+
export const TelemetryStoreLive = makeTelemetryStoreLayer({ readonly: false, runRetention: true })
|
|
2206
|
+
|
|
2207
|
+
/**
|
|
2208
|
+
* Writer instance that SKIPS retention. The ingest worker uses this
|
|
2209
|
+
* so the daemon and the worker aren't both running DELETE passes at
|
|
2210
|
+
* the same time (they'd just serialise behind the write lock and
|
|
2211
|
+
* duplicate work).
|
|
2212
|
+
*/
|
|
2213
|
+
export const TelemetryStoreWorkerLive = makeTelemetryStoreLayer({ readonly: false, runRetention: false })
|
|
2214
|
+
|
|
2215
|
+
/**
|
|
2216
|
+
* Read-only instance for query-only processes (currently the TUI).
|
|
2217
|
+
* Skips every DDL/DML statement at startup so the connection can be
|
|
2218
|
+
* opened while a writer is mid-transaction without racing for the
|
|
2219
|
+
* write lock. Writes through the service interface will throw.
|
|
2220
|
+
*/
|
|
2221
|
+
export const TelemetryStoreReadonlyLive = makeTelemetryStoreLayer({ readonly: true, runRetention: false })
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { Effect, Layer, Context } from "effect"
|
|
2
|
-
import type { SpanItem, TraceItem, TraceSummaryItem } from "../domain.js"
|
|
2
|
+
import type { AiCallDetail, SpanItem, TraceItem, TraceSummaryItem } from "../domain.js"
|
|
3
3
|
import { TelemetryStore } from "./TelemetryStore.js"
|
|
4
4
|
|
|
5
5
|
export class TraceQueryService extends Context.Service<
|
|
@@ -8,12 +8,13 @@ export class TraceQueryService extends Context.Service<
|
|
|
8
8
|
readonly listServices: Effect.Effect<readonly string[], Error>
|
|
9
9
|
readonly listRecentTraces: (serviceName: string, options?: { readonly lookbackMinutes?: number; readonly limit?: number }) => Effect.Effect<readonly TraceItem[], Error>
|
|
10
10
|
readonly listTraceSummaries: (serviceName: string, options?: { readonly lookbackMinutes?: number; readonly limit?: number }) => Effect.Effect<readonly TraceSummaryItem[], Error>
|
|
11
|
-
readonly searchTraceSummaries: (input: { readonly serviceName?: string | null; readonly operation?: string | null; readonly status?: "ok" | "error" | null; readonly minDurationMs?: number | null; readonly lookbackMinutes?: number; readonly limit?: number; readonly attributeFilters?: Readonly<Record<string, string
|
|
11
|
+
readonly searchTraceSummaries: (input: { readonly serviceName?: string | null; readonly operation?: string | null; readonly status?: "ok" | "error" | null; readonly minDurationMs?: number | null; readonly lookbackMinutes?: number; readonly limit?: number; readonly attributeFilters?: Readonly<Record<string, string>>; readonly aiText?: string | null }) => Effect.Effect<readonly TraceSummaryItem[], Error>
|
|
12
12
|
readonly listFacets: (input: { readonly type: "traces" | "logs"; readonly field: string; readonly serviceName?: string | null; readonly key?: string | null; readonly lookbackMinutes?: number; readonly limit?: number }) => Effect.Effect<readonly { readonly value: string; readonly count: number }[], Error>
|
|
13
13
|
readonly searchTraces: (input: { readonly serviceName?: string | null; readonly operation?: string | null; readonly status?: "ok" | "error" | null; readonly minDurationMs?: number | null; readonly lookbackMinutes?: number; readonly limit?: number; readonly attributeFilters?: Readonly<Record<string, string>> }) => Effect.Effect<readonly TraceItem[], Error>
|
|
14
14
|
readonly traceStats: (input: { readonly groupBy: string; readonly agg: "count" | "avg_duration" | "p95_duration" | "error_rate"; readonly serviceName?: string | null; readonly operation?: string | null; readonly status?: "ok" | "error" | null; readonly minDurationMs?: number | null; readonly lookbackMinutes?: number; readonly limit?: number; readonly attributeFilters?: Readonly<Record<string, string>> }) => Effect.Effect<readonly { readonly group: string; readonly value: number; readonly count: number }[], Error>
|
|
15
15
|
readonly getTrace: (traceId: string) => Effect.Effect<TraceItem | null, Error>
|
|
16
16
|
readonly getSpan: (spanId: string) => Effect.Effect<SpanItem | null, Error>
|
|
17
|
+
readonly getAiCall: (spanId: string) => Effect.Effect<AiCallDetail | null, Error>
|
|
17
18
|
readonly listTraceSpans: (traceId: string) => Effect.Effect<readonly SpanItem[], Error>
|
|
18
19
|
readonly searchSpans: (input: { readonly serviceName?: string | null; readonly operation?: string | null; readonly parentOperation?: string | null; readonly status?: "ok" | "error" | null; readonly lookbackMinutes?: number; readonly limit?: number; readonly attributeFilters?: Readonly<Record<string, string>> }) => Effect.Effect<readonly SpanItem[], Error>
|
|
19
20
|
}
|
|
@@ -68,6 +69,7 @@ export const TraceQueryServiceLive = Layer.effect(
|
|
|
68
69
|
traceStats: store.traceStats,
|
|
69
70
|
getTrace,
|
|
70
71
|
getSpan,
|
|
72
|
+
getAiCall: store.getAiCall,
|
|
71
73
|
listTraceSpans: store.listTraceSpans,
|
|
72
74
|
searchSpans: store.searchSpans,
|
|
73
75
|
})
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* RPC contract for OTLP ingest. Lives in its own file so both the
|
|
3
|
+
* main thread (client) and the telemetry worker (server) can import
|
|
4
|
+
* the schema without pulling in each other's runtime code.
|
|
5
|
+
*
|
|
6
|
+
* Only ingestTraces and ingestLogs run through RPC — those are the
|
|
7
|
+
* methods whose SQLite writes used to block the main event loop for
|
|
8
|
+
* seconds at a time. Every other TelemetryStore method stays on the
|
|
9
|
+
* main thread with its own direct DB connection; SQLite's WAL mode
|
|
10
|
+
* lets the reader (main) and writer (worker) hold independent
|
|
11
|
+
* connections to the same file concurrently without contention.
|
|
12
|
+
*
|
|
13
|
+
* Payloads are typed as Schema.Unknown because OTLP's protobuf-JSON
|
|
14
|
+
* shape is enormous and nested — the store validates structurally
|
|
15
|
+
* during the actual insert loop, and serialising a schema through
|
|
16
|
+
* the worker boundary would add overhead that defeats the purpose of
|
|
17
|
+
* the offload. If a payload is malformed we surface it as an
|
|
18
|
+
* IngestError rather than an RpcSchemaError, which keeps the failure
|
|
19
|
+
* mode consistent with the old direct-call behaviour.
|
|
20
|
+
*/
|
|
21
|
+
|
|
22
|
+
import { Schema } from "effect"
|
|
23
|
+
import * as Rpc from "effect/unstable/rpc/Rpc"
|
|
24
|
+
import * as RpcGroup from "effect/unstable/rpc/RpcGroup"
|
|
25
|
+
|
|
26
|
+
/**
 * Tagged error returned by both ingest RPCs.
 *
 * Carries a single human-readable `message` string; the `"IngestError"`
 * tag is what identifies it on either side of the RPC boundary.
 */
export class IngestError extends Schema.TaggedErrorClass<IngestError>()(
  "IngestError",
  { message: Schema.String },
) {}
|
|
29
|
+
|
|
30
|
+
/**
 * `ingestTraces`: takes an unvalidated payload and reports how many
 * spans were inserted, or fails with an `IngestError`.
 */
const ingestTracesRpc = Rpc.make("ingestTraces", {
  payload: { payload: Schema.Unknown },
  success: Schema.Struct({ insertedSpans: Schema.Number }),
  error: IngestError,
})

/**
 * `ingestLogs`: takes an unvalidated payload and reports how many
 * log records were inserted, or fails with an `IngestError`.
 */
const ingestLogsRpc = Rpc.make("ingestLogs", {
  payload: { payload: Schema.Unknown },
  success: Schema.Struct({ insertedLogs: Schema.Number }),
  error: IngestError,
})

/** The complete ingest RPC contract: the two ingest methods above. */
export const IngestRpcs = RpcGroup.make(ingestTracesRpc, ingestLogsRpc)
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Worker-thread entry point for OTLP ingest.
|
|
3
|
+
*
|
|
4
|
+
* Spawned by the main process via `new Worker(new URL("./telemetryWorker.ts", import.meta.url))`.
|
|
5
|
+
* This file runs inside a Bun Worker, so anything it imports is
|
|
6
|
+
* evaluated in a FRESH module graph on the worker side. In particular
|
|
7
|
+
* `TelemetryStoreWorkerLive` opens its own SQLite connection here — the main
|
|
8
|
+
* thread's store connection is unrelated. SQLite's WAL journal mode
|
|
9
|
+
* lets both connections coexist against the same `.sqlite` file: the
|
|
10
|
+
* worker writes, the main thread reads, and neither blocks the other.
|
|
11
|
+
*
|
|
12
|
+
* The worker only exposes `ingestTraces` / `ingestLogs` (see
|
|
13
|
+
* ingestRpc.ts). Query methods stay on the main thread because they're
|
|
14
|
+
* already fast (1-14ms) and round-tripping them through structured-
|
|
15
|
+
* clone would add more overhead than it saves. This is a deliberately
|
|
16
|
+
* narrow interface — the payoff is that main-thread HTTP queries
|
|
17
|
+
* never queue behind a heavy OTLP batch again.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
import { BunRuntime } from "@effect/platform-bun"
|
|
21
|
+
import * as BunWorkerRunner from "@effect/platform-bun/BunWorkerRunner"
|
|
22
|
+
import { Effect, Layer } from "effect"
|
|
23
|
+
import * as RpcSerialization from "effect/unstable/rpc/RpcSerialization"
|
|
24
|
+
import * as RpcServer from "effect/unstable/rpc/RpcServer"
|
|
25
|
+
import type { OtlpLogExportRequest, OtlpTraceExportRequest } from "../otlp.ts"
|
|
26
|
+
import { IngestError, IngestRpcs } from "./ingestRpc.ts"
|
|
27
|
+
import { TelemetryStore, TelemetryStoreWorkerLive } from "./TelemetryStore.ts"
|
|
28
|
+
|
|
29
|
+
// Converts whatever the store failed with into the RPC-level error by
// stringifying it, so both handlers report failures the same way.
const toIngestError = (cause: unknown) => new IngestError({ message: String(cause) })

// Handler layer for the ingest RPC group: each method forwards the raw
// request payload to the matching TelemetryStore ingest method. The
// payload is declared as unknown in the RPC contract, so it is cast to
// the corresponding OTLP request type here without further validation.
const IngestHandlers = IngestRpcs.toLayer(
  Effect.gen(function*() {
    const telemetry = yield* TelemetryStore

    return {
      ingestTraces: (request) =>
        telemetry
          .ingestTraces(request.payload as OtlpTraceExportRequest)
          .pipe(Effect.mapError(toIngestError)),
      ingestLogs: (request) =>
        telemetry
          .ingestLogs(request.payload as OtlpLogExportRequest)
          .pipe(Effect.mapError(toIngestError)),
    }
  }),
)
|
|
50
|
+
|
|
51
|
+
// RPC server for the ingest group, before any of its requirements are met.
const ingestRpcServer = RpcServer.layer(IngestRpcs)

// Full worker layer. Each `Layer.provide` supplies requirements for
// everything listed before it, so the order is kept deliberately:
// handlers first, then the store the handlers read, then the worker
// protocol, its MsgPack serialization, and finally the Bun worker runner.
const WorkerLive = ingestRpcServer.pipe(
  Layer.provide(IngestHandlers),
  Layer.provide(TelemetryStoreWorkerLive),
  Layer.provide(RpcServer.layerProtocolWorkerRunner),
  Layer.provide(RpcSerialization.layerMsgPack),
  Layer.provide(BunWorkerRunner.layer),
)
|
|
58
|
+
|
|
59
|
+
// Worker entry point: launch the layer stack and hand it to
// BunRuntime.runMain, which installs signal handlers so the scope
// closes cleanly on termination (same pattern the main server uses
// for its BunHttpServer layer).
BunRuntime.runMain(Layer.launch(WorkerLive))
|