@kitlangton/motel 0.2.4 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +23 -8
- package/README.md +13 -2
- package/package.json +35 -19
- package/skills/motel-debug/SKILL.md +203 -0
- package/skills/motel-debug/references/effect.md +38 -0
- package/src/App.tsx +12 -5
- package/src/StartupGate.tsx +289 -0
- package/src/cli.ts +15 -16
- package/src/config.ts +7 -1
- package/src/daemon.test.ts +332 -51
- package/src/daemon.ts +105 -153
- package/src/httpApi.ts +1 -0
- package/src/httpListPolicy.test.ts +76 -0
- package/src/httpListPolicy.ts +129 -0
- package/src/index.tsx +9 -2
- package/src/localServer.ts +194 -313
- package/src/mcp.ts +2 -1
- package/src/motel.ts +0 -2
- package/src/opentui-jsx.d.ts +11 -0
- package/src/otlp.test.ts +65 -0
- package/src/otlp.ts +20 -0
- package/src/otlpProtobuf.ts +35 -0
- package/src/registry.ts +37 -11
- package/src/runtime.ts +2 -6
- package/src/services/AsyncIngest.ts +22 -8
- package/src/services/LogQueryService.ts +13 -27
- package/src/services/TelemetryQuery.ts +62 -0
- package/src/services/TelemetryStore.ts +546 -231
- package/src/services/TraceQueryService.ts +22 -56
- package/src/services/ingestRpc.ts +2 -4
- package/src/services/queryRpc.ts +15 -0
- package/src/services/telemetryQueryWorker.ts +32 -0
- package/src/services/telemetryWorker.ts +5 -8
- package/src/startupBench.ts +19 -0
- package/src/storybook/aiChatStory.tsx +1 -1
- package/src/telemetry.test.ts +307 -41
- package/src/ui/AiChatView.tsx +1 -1
- package/src/ui/AttrFilterModal.tsx +1 -1
- package/src/ui/ServiceLogs.tsx +10 -7
- package/src/ui/SpanContentView.tsx +24 -21
- package/src/ui/TraceDetailsPane.tsx +1 -1
- package/src/ui/TraceList.tsx +1 -1
- package/src/ui/aiState.ts +10 -22
- package/src/ui/app/TraceWorkspace.tsx +2 -1
- package/src/ui/app/useAppLayout.ts +1 -1
- package/src/ui/app/useTraceScreenData.ts +35 -23
- package/src/ui/atoms.ts +1 -1
- package/src/ui/cachedLoader.test.ts +23 -0
- package/src/ui/cachedLoader.ts +60 -0
- package/src/ui/loaders.ts +34 -53
- package/src/ui/persistence.ts +3 -3
- package/src/ui/primitives.tsx +1 -1
- package/src/ui/state.ts +2 -0
- package/src/ui/theme.ts +7 -5
- package/src/ui/traceDetailsWidth.repro.test.ts +12 -1
- package/src/ui/traceSortNav.repro.seed.ts +1 -1
- package/src/ui/traceSortNav.repro.test.ts +12 -2
- package/src/ui/useAttrFilterPicker.ts +10 -8
- package/src/ui/useKeyboardNav.ts +28 -5
- package/src/ui/waterfallNav.repro.seed.ts +1 -1
- package/src/ui/waterfallNav.repro.test.ts +16 -8
- package/web/dist/assets/index-B01z9BaO.css +2 -0
- package/web/dist/assets/index-M86tcih5.js +22 -0
- package/web/dist/index.html +2 -2
- package/web/dist/assets/index-DnyVo03x.js +0 -27
- package/web/dist/assets/index-DzuHNBGV.css +0 -2
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
import { Database } from "bun:sqlite"
|
|
2
|
-
import
|
|
2
|
+
import * as BunFileSystem from "@effect/platform-bun/BunFileSystem"
|
|
3
3
|
import { dirname } from "node:path"
|
|
4
|
-
import { Clock, Effect, Layer, Schedule, Context } from "effect"
|
|
4
|
+
import { Cause, Clock, Effect, FileSystem, Layer, Schedule, Context } from "effect"
|
|
5
5
|
import { config } from "../config.js"
|
|
6
6
|
import type { AiCallDetail, AiCallSummary, FacetItem, LogItem, SpanItem, StatsItem, TraceItem, TraceSummaryItem, TraceSpanEvent, TraceSpanItem } from "../domain.js"
|
|
7
7
|
import { AI_ATTR_MAP, AI_FTS_KEYS, AI_TEXT_SEARCH_KEYS, truncatePreview } from "../domain.js"
|
|
8
|
-
import { attributeMap, nanosToMilliseconds, parseAnyValue, spanKindLabel, spanStatusLabel, stringifyValue, type OtlpLogExportRequest, type OtlpTraceExportRequest } from "../otlp.js"
|
|
8
|
+
import { attributeMap, nanosToMilliseconds, normalizeOtlpBinaryId, parseAnyValue, spanKindLabel, spanStatusLabel, stringifyValue, type OtlpLogExportRequest, type OtlpTraceExportRequest } from "../otlp.js"
|
|
9
9
|
|
|
10
10
|
const isSqliteLockError = (error: unknown) =>
|
|
11
11
|
error instanceof Error && /(database is locked|database table is locked|SQLITE_BUSY)/i.test(error.message)
|
|
@@ -196,18 +196,40 @@ const TRACE_SUMMARY_SELECT_SQL = `
|
|
|
196
196
|
FROM spans
|
|
197
197
|
`
|
|
198
198
|
|
|
199
|
+
// Memoize small repeated JSON records. Resource attributes are the primary
|
|
200
|
+
// beneficiary because many spans share the same serialized value; compact
|
|
201
|
+
// repeated span attributes also benefit while large unique payloads bypass
|
|
202
|
+
// the cache to keep memory bounded for long-running daemons.
|
|
203
|
+
const RECORD_PARSE_CACHE_MAX_VALUE_LEN = 1024
|
|
204
|
+
const RECORD_PARSE_CACHE_LIMIT = 256
|
|
205
|
+
const recordParseCache = new Map<string, Record<string, string>>()
|
|
206
|
+
const EMPTY_RECORD: Record<string, string> = {}
|
|
207
|
+
|
|
199
208
|
const parseRecord = (value: string): Record<string, string> => {
|
|
209
|
+
if (value === "" || value === "{}") return EMPTY_RECORD
|
|
210
|
+
const cacheable = value.length <= RECORD_PARSE_CACHE_MAX_VALUE_LEN
|
|
211
|
+
if (cacheable) {
|
|
212
|
+
const cached = recordParseCache.get(value)
|
|
213
|
+
if (cached !== undefined) return cached
|
|
214
|
+
}
|
|
215
|
+
let parsed: Record<string, string>
|
|
200
216
|
try {
|
|
201
|
-
const
|
|
202
|
-
|
|
217
|
+
const json = JSON.parse(value) as Record<string, unknown>
|
|
218
|
+
parsed = Object.fromEntries(Object.entries(json).map(([key, entry]) => [key, stringifyValue(entry)]))
|
|
203
219
|
} catch {
|
|
204
|
-
|
|
220
|
+
parsed = EMPTY_RECORD
|
|
205
221
|
}
|
|
222
|
+
if (cacheable && recordParseCache.size < RECORD_PARSE_CACHE_LIMIT) {
|
|
223
|
+
recordParseCache.set(value, parsed)
|
|
224
|
+
}
|
|
225
|
+
return parsed
|
|
206
226
|
}
|
|
207
227
|
|
|
208
228
|
const parseEvents = (value: string): readonly TraceSpanEvent[] => {
|
|
229
|
+
if (value === "" || value === "[]") return []
|
|
209
230
|
try {
|
|
210
231
|
const parsed = JSON.parse(value) as Array<{ name: string; timestamp: number; attributes: Record<string, string> }>
|
|
232
|
+
if (parsed.length === 0) return []
|
|
211
233
|
return parsed.map((event) => ({
|
|
212
234
|
name: event.name,
|
|
213
235
|
timestamp: new Date(event.timestamp),
|
|
@@ -423,29 +445,41 @@ const buildContainsAttributeMatchSubquery = (
|
|
|
423
445
|
}
|
|
424
446
|
}
|
|
425
447
|
|
|
448
|
+
// Read-only surface of the telemetry store. Pulled out so a readonly
|
|
449
|
+
// SQLite connection (TUI / HTTP query handlers) can be expressed as a
|
|
450
|
+
// distinct service identifier from the writer, without re-declaring
|
|
451
|
+
// every query in a wrapper layer. The writer's value still satisfies
|
|
452
|
+
// this shape — TelemetryStoreLive can provide both identifiers from
|
|
453
|
+
// one underlying object if needed.
|
|
454
|
+
export interface TelemetryStoreReader {
|
|
455
|
+
readonly listServices: Effect.Effect<readonly string[], Error>
|
|
456
|
+
readonly listRecentTraces: (serviceName: string | null, options?: { readonly lookbackMinutes?: number; readonly limit?: number; readonly cursorStartedAtMs?: number; readonly cursorTraceId?: string }) => Effect.Effect<readonly TraceItem[], Error>
|
|
457
|
+
readonly listTraceSummaries: (serviceName: string | null, options?: { readonly lookbackMinutes?: number; readonly limit?: number; readonly cursorStartedAtMs?: number; readonly cursorTraceId?: string }) => Effect.Effect<readonly TraceSummaryItem[], Error>
|
|
458
|
+
readonly searchTraces: (input: TraceSearch) => Effect.Effect<readonly TraceItem[], Error>
|
|
459
|
+
readonly searchTraceSummaries: (input: TraceSearch) => Effect.Effect<readonly TraceSummaryItem[], Error>
|
|
460
|
+
readonly traceStats: (input: TraceStatsSearch) => Effect.Effect<readonly StatsItem[], Error>
|
|
461
|
+
readonly getTrace: (traceId: string) => Effect.Effect<TraceItem | null, Error>
|
|
462
|
+
readonly getSpan: (spanId: string) => Effect.Effect<SpanItem | null, Error>
|
|
463
|
+
readonly listTraceSpans: (traceId: string) => Effect.Effect<readonly SpanItem[], Error>
|
|
464
|
+
readonly searchSpans: (input: SpanSearch) => Effect.Effect<readonly SpanItem[], Error>
|
|
465
|
+
readonly searchLogs: (input: LogSearch) => Effect.Effect<readonly LogItem[], Error>
|
|
466
|
+
readonly logStats: (input: LogStatsSearch) => Effect.Effect<readonly StatsItem[], Error>
|
|
467
|
+
readonly listFacets: (input: FacetSearch) => Effect.Effect<readonly FacetItem[], Error>
|
|
468
|
+
readonly listRecentLogs: (serviceName: string) => Effect.Effect<readonly LogItem[], Error>
|
|
469
|
+
readonly listTraceLogs: (traceId: string) => Effect.Effect<readonly LogItem[], Error>
|
|
470
|
+
readonly searchAiCalls: (input: AiCallSearch) => Effect.Effect<readonly AiCallSummary[], Error>
|
|
471
|
+
readonly getAiCall: (spanId: string) => Effect.Effect<AiCallDetail | null, Error>
|
|
472
|
+
readonly aiCallStats: (input: AiCallStatsSearch) => Effect.Effect<readonly StatsItem[], Error>
|
|
473
|
+
}
|
|
474
|
+
|
|
475
|
+
export class TelemetryStoreReadonly extends Context.Service<TelemetryStoreReadonly, TelemetryStoreReader>()("motel/TelemetryStoreReadonly") {}
|
|
476
|
+
|
|
426
477
|
export class TelemetryStore extends Context.Service<
|
|
427
478
|
TelemetryStore,
|
|
428
|
-
{
|
|
479
|
+
TelemetryStoreReader & {
|
|
429
480
|
readonly ingestTraces: (payload: OtlpTraceExportRequest) => Effect.Effect<{ readonly insertedSpans: number }, Error>
|
|
430
481
|
readonly ingestLogs: (payload: OtlpLogExportRequest) => Effect.Effect<{ readonly insertedLogs: number }, Error>
|
|
431
|
-
readonly
|
|
432
|
-
readonly listRecentTraces: (serviceName: string | null, options?: { readonly lookbackMinutes?: number; readonly limit?: number; readonly cursorStartedAtMs?: number; readonly cursorTraceId?: string }) => Effect.Effect<readonly TraceItem[], Error>
|
|
433
|
-
readonly listTraceSummaries: (serviceName: string | null, options?: { readonly lookbackMinutes?: number; readonly limit?: number; readonly cursorStartedAtMs?: number; readonly cursorTraceId?: string }) => Effect.Effect<readonly TraceSummaryItem[], Error>
|
|
434
|
-
readonly searchTraces: (input: TraceSearch) => Effect.Effect<readonly TraceItem[], Error>
|
|
435
|
-
readonly searchTraceSummaries: (input: TraceSearch) => Effect.Effect<readonly TraceSummaryItem[], Error>
|
|
436
|
-
readonly traceStats: (input: TraceStatsSearch) => Effect.Effect<readonly StatsItem[], Error>
|
|
437
|
-
readonly getTrace: (traceId: string) => Effect.Effect<TraceItem | null, Error>
|
|
438
|
-
readonly getSpan: (spanId: string) => Effect.Effect<SpanItem | null, Error>
|
|
439
|
-
readonly listTraceSpans: (traceId: string) => Effect.Effect<readonly SpanItem[], Error>
|
|
440
|
-
readonly searchSpans: (input: SpanSearch) => Effect.Effect<readonly SpanItem[], Error>
|
|
441
|
-
readonly searchLogs: (input: LogSearch) => Effect.Effect<readonly LogItem[], Error>
|
|
442
|
-
readonly logStats: (input: LogStatsSearch) => Effect.Effect<readonly StatsItem[], Error>
|
|
443
|
-
readonly listFacets: (input: FacetSearch) => Effect.Effect<readonly FacetItem[], Error>
|
|
444
|
-
readonly listRecentLogs: (serviceName: string) => Effect.Effect<readonly LogItem[], Error>
|
|
445
|
-
readonly listTraceLogs: (traceId: string) => Effect.Effect<readonly LogItem[], Error>
|
|
446
|
-
readonly searchAiCalls: (input: AiCallSearch) => Effect.Effect<readonly AiCallSummary[], Error>
|
|
447
|
-
readonly getAiCall: (spanId: string) => Effect.Effect<AiCallDetail | null, Error>
|
|
448
|
-
readonly aiCallStats: (input: AiCallStatsSearch) => Effect.Effect<readonly StatsItem[], Error>
|
|
482
|
+
readonly runRetentionNow: Effect.Effect<void, Error>
|
|
449
483
|
}
|
|
450
484
|
>()("motel/TelemetryStore") {}
|
|
451
485
|
|
|
@@ -462,18 +496,17 @@ export class TelemetryStore extends Context.Service<
|
|
|
462
496
|
*
|
|
463
497
|
* - `runRetention` — fork the background cleanup loop (age + size cap
|
|
464
498
|
* eviction, WAL checkpoint). Only one process should own this at a
|
|
465
|
-
* time.
|
|
466
|
-
* worker and the TUI skip it.
|
|
499
|
+
* time. The ingest worker owns it; the HTTP thread and TUI skip it.
|
|
467
500
|
*/
|
|
468
501
|
export interface TelemetryStoreOptions {
|
|
469
502
|
readonly readonly: boolean
|
|
470
503
|
readonly runRetention: boolean
|
|
471
504
|
}
|
|
472
505
|
|
|
473
|
-
|
|
474
|
-
TelemetryStore,
|
|
506
|
+
const makeTelemetryStoreEffect = (opts: TelemetryStoreOptions) =>
|
|
475
507
|
Effect.gen(function* () {
|
|
476
|
-
|
|
508
|
+
const fileSystem = yield* FileSystem.FileSystem
|
|
509
|
+
yield* fileSystem.makeDirectory(dirname(config.otel.databasePath), { recursive: true })
|
|
477
510
|
const db = yield* Effect.acquireRelease(
|
|
478
511
|
Effect.sync(() => new Database(config.otel.databasePath, {
|
|
479
512
|
create: !opts.readonly,
|
|
@@ -517,6 +550,13 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
517
550
|
-- SQLite silently caps at actual file size for smaller DBs.
|
|
518
551
|
PRAGMA mmap_size = 268435456;
|
|
519
552
|
`)
|
|
553
|
+
// auto_vacuum is a header-level setting: it only takes effect on
|
|
554
|
+
// an empty DB, or on the next VACUUM after a change. Setting it
|
|
555
|
+
// here, BEFORE the first CREATE TABLE, is the only path that
|
|
556
|
+
// makes incremental_vacuum work without a full VACUUM.
|
|
557
|
+
// Existing DBs that predate this setting keep their current mode;
|
|
558
|
+
// Motel never performs a surprise full-file VACUUM at startup.
|
|
559
|
+
try { db.exec(`PRAGMA auto_vacuum = INCREMENTAL;`) } catch { /* ignore */ }
|
|
520
560
|
try {
|
|
521
561
|
db.exec(`
|
|
522
562
|
PRAGMA journal_mode = WAL;
|
|
@@ -526,6 +566,13 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
526
566
|
-- this the WAL happily runs into the hundreds of MB and queries
|
|
527
567
|
-- start paying the cost of walking the WAL on every read.
|
|
528
568
|
PRAGMA wal_autocheckpoint = 4000;
|
|
569
|
+
-- Hard cap for the WAL file. Auto-checkpoint controls *when*
|
|
570
|
+
-- pages move out of the WAL; size_limit controls how much the
|
|
571
|
+
-- WAL file is allowed to grow on disk. 128MB is generous enough
|
|
572
|
+
-- to absorb a long write burst without blocking on truncation,
|
|
573
|
+
-- tight enough that a wedged retention loop can't hide a 20GB
|
|
574
|
+
-- WAL the way a default no-limit configuration can.
|
|
575
|
+
PRAGMA journal_size_limit = 134217728;
|
|
529
576
|
|
|
530
577
|
CREATE TABLE IF NOT EXISTS spans (
|
|
531
578
|
trace_id TEXT NOT NULL,
|
|
@@ -604,6 +651,11 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
604
651
|
|
|
605
652
|
CREATE INDEX IF NOT EXISTS idx_log_attributes_key_value ON log_attributes(key, value, log_id);
|
|
606
653
|
CREATE INDEX IF NOT EXISTS idx_log_attributes_log_id ON log_attributes(log_id);
|
|
654
|
+
|
|
655
|
+
CREATE TABLE IF NOT EXISTS motel_maintenance (
|
|
656
|
+
key TEXT PRIMARY KEY,
|
|
657
|
+
value TEXT NOT NULL
|
|
658
|
+
);
|
|
607
659
|
`)
|
|
608
660
|
} catch (err) {
|
|
609
661
|
if (!isSqliteLockError(err)) throw err
|
|
@@ -624,7 +676,8 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
624
676
|
} catch { hasFts = false }
|
|
625
677
|
try {
|
|
626
678
|
const row = db.query(`SELECT name FROM sqlite_master WHERE type='table' AND name='span_attr_fts'`).get()
|
|
627
|
-
|
|
679
|
+
const backfill = db.query(`SELECT value FROM motel_maintenance WHERE key = 'span_attr_fts_v1'`).get() as { value: string } | null
|
|
680
|
+
hasAttrFts = row !== null && backfill?.value === "complete"
|
|
628
681
|
} catch { hasAttrFts = false }
|
|
629
682
|
}
|
|
630
683
|
|
|
@@ -723,10 +776,6 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
723
776
|
// pay 3-4s on cold open instead of 400ms.
|
|
724
777
|
try {
|
|
725
778
|
db.exec(`PRAGMA analysis_limit = 1000; PRAGMA optimize;`)
|
|
726
|
-
// First-time databases won't have sqlite_stat1 until we run a
|
|
727
|
-
// real ANALYZE. Force it once if stats haven't been collected.
|
|
728
|
-
const hasStats = db.query(`SELECT 1 FROM sqlite_master WHERE name = 'sqlite_stat1' LIMIT 1`).get() !== null
|
|
729
|
-
if (!hasStats) db.exec(`ANALYZE;`)
|
|
730
779
|
} catch {
|
|
731
780
|
// ANALYZE / optimize failures are never fatal — queries still work,
|
|
732
781
|
// they just run with default row estimates.
|
|
@@ -777,33 +826,157 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
777
826
|
)
|
|
778
827
|
`)
|
|
779
828
|
|
|
780
|
-
const rebuildTraceSummaries = db.query(`
|
|
781
|
-
INSERT INTO trace_summaries (
|
|
782
|
-
trace_id, service_name, root_operation_name, started_at_ms, ended_at_ms, active_span_count, duration_ms, span_count, error_count
|
|
783
|
-
)
|
|
784
|
-
${TRACE_SUMMARY_SELECT_SQL}
|
|
785
|
-
GROUP BY trace_id
|
|
786
|
-
`)
|
|
787
|
-
|
|
788
829
|
const reconcileTraceSummaries = Effect.sync(() => {
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
830
|
+
const marker = db.query(`SELECT value FROM motel_maintenance WHERE key = 'trace_summary_cursor'`).get() as { value: string } | null
|
|
831
|
+
const cursor = Number(marker?.value ?? 0)
|
|
832
|
+
const rows = db.query(`SELECT rowid, trace_id FROM spans WHERE rowid > ? ORDER BY rowid ASC LIMIT ?`).all(cursor, config.otel.retentionTraceBatch) as Array<{ rowid: number; trace_id: string }>
|
|
833
|
+
if (rows.length === 0) {
|
|
834
|
+
db.query(`INSERT OR REPLACE INTO motel_maintenance(key, value) VALUES ('trace_summary_cursor', '0')`).run()
|
|
835
|
+
return
|
|
795
836
|
}
|
|
837
|
+
const transaction = db.transaction(() => {
|
|
838
|
+
for (const traceId of new Set(rows.map((row) => row.trace_id))) upsertTraceSummary.run(traceId)
|
|
839
|
+
db.query(`INSERT OR REPLACE INTO motel_maintenance(key, value) VALUES ('trace_summary_cursor', ?)`).run(String(rows.at(-1)!.rowid))
|
|
840
|
+
})
|
|
841
|
+
transaction()
|
|
796
842
|
})
|
|
797
843
|
|
|
798
844
|
const deleteSpanAttributes = db.query(`DELETE FROM span_attributes WHERE trace_id = ? AND span_id = ?`)
|
|
799
845
|
const insertSpanAttribute = db.query(`INSERT INTO span_attributes (trace_id, span_id, key, value) VALUES (?, ?, ?, ?)`)
|
|
846
|
+
const spanAttributeInsertManyByCount = new Map<number, ReturnType<Database["query"]>>()
|
|
847
|
+
const insertSpanAttributesMany = (traceId: string, spanId: string, attributes: Readonly<Record<string, string>>) => {
|
|
848
|
+
const entries = Object.entries(attributes)
|
|
849
|
+
if (entries.length === 0) return
|
|
850
|
+
if (entries.length === 1) {
|
|
851
|
+
const [key, value] = entries[0]!
|
|
852
|
+
insertSpanAttribute.run(traceId, spanId, key, value)
|
|
853
|
+
return
|
|
854
|
+
}
|
|
855
|
+
let query = spanAttributeInsertManyByCount.get(entries.length)
|
|
856
|
+
if (!query) {
|
|
857
|
+
query = db.query(`INSERT INTO span_attributes (trace_id, span_id, key, value) VALUES ${entries.map(() => "(?, ?, ?, ?)").join(", ")}`)
|
|
858
|
+
spanAttributeInsertManyByCount.set(entries.length, query)
|
|
859
|
+
}
|
|
860
|
+
query.run(...entries.flatMap(([key, value]) => [traceId, spanId, key, value]))
|
|
861
|
+
}
|
|
800
862
|
const deleteSpanOperationSearch = db.query(`DELETE FROM span_operation_fts WHERE trace_id = ? AND span_id = ?`)
|
|
801
863
|
const insertSpanOperationSearch = db.query(`INSERT INTO span_operation_fts (trace_id, span_id, operation_name) VALUES (?, ?, ?)`)
|
|
864
|
+
const deleteSpanOperationSearchManyByCount = new Map<number, ReturnType<Database["query"]>>()
|
|
865
|
+
const insertSpanOperationSearchManyByCount = new Map<number, ReturnType<Database["query"]>>()
|
|
866
|
+
const updateSpanOperationSearchMany = (operations: ReadonlyArray<readonly [string, string, string]>) => {
|
|
867
|
+
if (operations.length === 0) return
|
|
868
|
+
if (operations.length === 1) {
|
|
869
|
+
const [traceId, spanId, operationName] = operations[0]!
|
|
870
|
+
deleteSpanOperationSearch.run(traceId, spanId)
|
|
871
|
+
insertSpanOperationSearch.run(traceId, spanId, operationName)
|
|
872
|
+
return
|
|
873
|
+
}
|
|
874
|
+
|
|
875
|
+
let deleteQuery = deleteSpanOperationSearchManyByCount.get(operations.length)
|
|
876
|
+
if (!deleteQuery) {
|
|
877
|
+
deleteQuery = db.query(`DELETE FROM span_operation_fts WHERE ${operations.map(() => "(trace_id = ? AND span_id = ?)").join(" OR ")}`)
|
|
878
|
+
deleteSpanOperationSearchManyByCount.set(operations.length, deleteQuery)
|
|
879
|
+
}
|
|
880
|
+
deleteQuery.run(...operations.flatMap(([traceId, spanId]) => [traceId, spanId]))
|
|
881
|
+
|
|
882
|
+
let insertQuery = insertSpanOperationSearchManyByCount.get(operations.length)
|
|
883
|
+
if (!insertQuery) {
|
|
884
|
+
insertQuery = db.query(`INSERT INTO span_operation_fts (trace_id, span_id, operation_name) VALUES ${operations.map(() => "(?, ?, ?)").join(", ")}`)
|
|
885
|
+
insertSpanOperationSearchManyByCount.set(operations.length, insertQuery)
|
|
886
|
+
}
|
|
887
|
+
insertQuery.run(...operations.flatMap(([traceId, spanId, operationName]) => [traceId, spanId, operationName]))
|
|
888
|
+
}
|
|
802
889
|
const insertLogAttribute = db.query(`INSERT INTO log_attributes (log_id, key, value) VALUES (?, ?, ?)`)
|
|
890
|
+
const logAttributeInsertManyByCount = new Map<number, ReturnType<Database["query"]>>()
|
|
891
|
+
const insertLogAttributesMany = (logId: number, attributes: Readonly<Record<string, string>>) => {
|
|
892
|
+
const entries = Object.entries(attributes)
|
|
893
|
+
if (entries.length === 0) return
|
|
894
|
+
if (entries.length === 1) {
|
|
895
|
+
const [key, value] = entries[0]!
|
|
896
|
+
insertLogAttribute.run(logId, key, value)
|
|
897
|
+
return
|
|
898
|
+
}
|
|
899
|
+
let query = logAttributeInsertManyByCount.get(entries.length)
|
|
900
|
+
if (!query) {
|
|
901
|
+
query = db.query(`INSERT INTO log_attributes (log_id, key, value) VALUES ${entries.map(() => "(?, ?, ?)").join(", ")}`)
|
|
902
|
+
logAttributeInsertManyByCount.set(entries.length, query)
|
|
903
|
+
}
|
|
904
|
+
query.run(...entries.flatMap(([key, value]) => [logId, key, value]))
|
|
905
|
+
}
|
|
803
906
|
const insertLogBodySearch = db.query(`INSERT INTO log_body_fts (log_id, body) VALUES (?, ?)`)
|
|
907
|
+
const insertLogBodySearchManyByCount = new Map<number, ReturnType<Database["query"]>>()
|
|
908
|
+
const insertLogBodySearchMany = (entries: ReadonlyArray<readonly [string, string]>) => {
|
|
909
|
+
if (entries.length === 0) return
|
|
910
|
+
if (entries.length === 1) {
|
|
911
|
+
const [logId, body] = entries[0]!
|
|
912
|
+
insertLogBodySearch.run(logId, body)
|
|
913
|
+
return
|
|
914
|
+
}
|
|
915
|
+
let query = insertLogBodySearchManyByCount.get(entries.length)
|
|
916
|
+
if (!query) {
|
|
917
|
+
query = db.query(`INSERT INTO log_body_fts (log_id, body) VALUES ${entries.map(() => "(?, ?)").join(", ")}`)
|
|
918
|
+
insertLogBodySearchManyByCount.set(entries.length, query)
|
|
919
|
+
}
|
|
920
|
+
query.run(...entries.flatMap(([logId, body]) => [logId, body]))
|
|
921
|
+
}
|
|
804
922
|
|
|
805
923
|
const maxDbSizeBytes = config.otel.maxDbSizeMb * 1024 * 1024
|
|
806
924
|
|
|
925
|
+
// Freelist-ratio thresholds for the adaptive reclaim loop. Below the
|
|
926
|
+
// LOW threshold there's nothing worth doing; above HIGH we are in the
|
|
927
|
+
// 17GB-DB-with-10GB-freelist failure mode and need to reclaim aggressively
|
|
928
|
+
// even if it costs writer-lock time.
|
|
929
|
+
const FREELIST_LOW_RATIO = 0.05
|
|
930
|
+
const FREELIST_MID_RATIO = 0.20
|
|
931
|
+
const FREELIST_HIGH_RATIO = 0.50
|
|
932
|
+
const VACUUM_PAGES_NORMAL = 2000 // ~8MB/pass
|
|
933
|
+
const VACUUM_PAGES_BUSY = 20000 // ~80MB/pass — used when freelist > 20%
|
|
934
|
+
const VACUUM_PAGES_PANIC = 50000 // ~200MB/pass — only when ratio > 50%
|
|
935
|
+
|
|
936
|
+
const ftsTableNames = ["span_attr_fts", "log_body_fts", "span_operation_fts"] as const
|
|
937
|
+
|
|
938
|
+
const incrementalFtsMerge = (pages: number) => {
|
|
939
|
+
// FTS5 segment merges drop tombstone rows that DELETE leaves behind.
|
|
940
|
+
// Without periodic merges, deleted FTS rows stay on disk indefinitely
|
|
941
|
+
// — a major source of freelist pages on a heavy-deletion workload.
|
|
942
|
+
// `merge=N` is a bounded, online operation: it merges at most N
|
|
943
|
+
// pages of work and returns. The INSERT throws if the table is missing;
|
|
944
|
+
// we swallow the error because not every DB has every FTS table.
|
|
945
|
+
for (const name of ftsTableNames) {
|
|
946
|
+
try { db.query(`INSERT INTO ${name}(${name}) VALUES (?)`).run(`merge=${pages}`) } catch { /* table absent or older schema */ }
|
|
947
|
+
}
|
|
948
|
+
}
|
|
949
|
+
|
|
950
|
+
const reclaimSpace = Effect.fn("motel/TelemetryStore.reclaimSpace")(function* () {
|
|
951
|
+
yield* Effect.sync(() => {
|
|
952
|
+
const pageCount = (db.query(`PRAGMA page_count`).get() as { page_count: number }).page_count
|
|
953
|
+
const freePages = (db.query(`PRAGMA freelist_count`).get() as { freelist_count: number }).freelist_count
|
|
954
|
+
if (pageCount === 0) return
|
|
955
|
+
const ratio = freePages / pageCount
|
|
956
|
+
if (ratio < FREELIST_LOW_RATIO) return
|
|
957
|
+
|
|
958
|
+
// Adaptive vacuum sizing — fixed 2000 pages/min could not keep
|
|
959
|
+
// up with sustained deletions, leaking 10GB of freelist over
|
|
960
|
+
// time. Scale the per-pass work to the size of the backlog so
|
|
961
|
+
// we stay roughly proportional to the deficit.
|
|
962
|
+
const pages =
|
|
963
|
+
ratio >= FREELIST_HIGH_RATIO ? VACUUM_PAGES_PANIC :
|
|
964
|
+
ratio >= FREELIST_MID_RATIO ? VACUUM_PAGES_BUSY :
|
|
965
|
+
VACUUM_PAGES_NORMAL
|
|
966
|
+
|
|
967
|
+
try { db.exec(`PRAGMA incremental_vacuum(${pages});`) } catch { /* ignore */ }
|
|
968
|
+
|
|
969
|
+
// In WAL mode incremental_vacuum only moves pages — the file
|
|
970
|
+
// shrinks on the next checkpoint. PASSIVE silently skips when
|
|
971
|
+
// readers are active (the checkpoint-starvation failure mode).
|
|
972
|
+
// Use RESTART normally and
|
|
973
|
+
// TRUNCATE in panic mode to physically shrink the WAL when it
|
|
974
|
+
// has grown.
|
|
975
|
+
const mode = ratio >= FREELIST_HIGH_RATIO ? "TRUNCATE" : "RESTART"
|
|
976
|
+
try { db.exec(`PRAGMA wal_checkpoint(${mode});`) } catch { /* ignore */ }
|
|
977
|
+
})
|
|
978
|
+
})
|
|
979
|
+
|
|
807
980
|
const cleanupExpired = Effect.fn("motel/TelemetryStore.cleanupExpired")(function* () {
|
|
808
981
|
const now = yield* Clock.currentTimeMillis
|
|
809
982
|
|
|
@@ -819,12 +992,12 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
819
992
|
|
|
820
993
|
// Time-based: completed traces whose last span ended before cutoff.
|
|
821
994
|
const timeExpired = db.query(
|
|
822
|
-
`SELECT trace_id FROM trace_summaries WHERE active_span_count = 0 AND ended_at_ms > 0 AND ended_at_ms < ?`,
|
|
823
|
-
).all(cutoff) as readonly { trace_id: string }[]
|
|
995
|
+
`SELECT trace_id FROM trace_summaries WHERE active_span_count = 0 AND ended_at_ms > 0 AND ended_at_ms < ? ORDER BY ended_at_ms ASC LIMIT ?`,
|
|
996
|
+
).all(cutoff, config.otel.retentionTraceBatch) as readonly { trace_id: string }[]
|
|
824
997
|
for (const row of timeExpired) toEvict.add(row.trace_id)
|
|
825
998
|
|
|
826
|
-
// Size-based: if actual data exceeds
|
|
827
|
-
//
|
|
999
|
+
// Size-based: if actual data exceeds the target, drop one bounded
|
|
1000
|
+
// batch of the oldest completed traces. `(page_count - freelist_count)`
|
|
828
1001
|
// ignores freed-but-not-vacuumed pages so a large freelist doesn't
|
|
829
1002
|
// trigger a deletion death spiral.
|
|
830
1003
|
const pageCount = (db.query(`PRAGMA page_count`).get() as { page_count: number }).page_count
|
|
@@ -832,22 +1005,22 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
832
1005
|
const pageSize = (db.query(`PRAGMA page_size`).get() as { page_size: number }).page_size
|
|
833
1006
|
const dbSize = (pageCount - freePages) * pageSize
|
|
834
1007
|
if (dbSize > maxDbSizeBytes) {
|
|
835
|
-
const completedCount = (db.query(
|
|
836
|
-
`SELECT COUNT(*) AS c FROM trace_summaries WHERE active_span_count = 0`,
|
|
837
|
-
).get() as { c: number }).c
|
|
838
|
-
const traceCutCount = Math.max(1, Math.floor(completedCount * 0.2))
|
|
839
1008
|
const oldest = db.query(
|
|
840
1009
|
`SELECT trace_id FROM trace_summaries WHERE active_span_count = 0 ORDER BY started_at_ms ASC LIMIT ?`,
|
|
841
|
-
).all(
|
|
1010
|
+
).all(config.otel.retentionTraceBatch) as readonly { trace_id: string }[]
|
|
842
1011
|
// Set.add dedupes overlap with the time-expired batch above.
|
|
843
1012
|
for (const row of oldest) toEvict.add(row.trace_id)
|
|
844
1013
|
}
|
|
845
1014
|
|
|
846
|
-
//
|
|
847
|
-
//
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
1015
|
+
// Logs have their own retention boundary. A correlated log may refer
|
|
1016
|
+
// to a trace that was sampled elsewhere or never reached Motel, so
|
|
1017
|
+
// tying log eviction to trace_summaries lets those rows grow forever.
|
|
1018
|
+
const expiredLogs = db.query(`DELETE FROM logs WHERE id IN (SELECT id FROM logs WHERE timestamp_ms < ? ORDER BY timestamp_ms ASC LIMIT ?)`).run(cutoff, config.otel.retentionLogBatch)
|
|
1019
|
+
let deletedLogs = Number(expiredLogs.changes) > 0
|
|
1020
|
+
if (dbSize > maxDbSizeBytes) {
|
|
1021
|
+
const oversizedLogs = db.query(`DELETE FROM logs WHERE id IN (SELECT id FROM logs ORDER BY timestamp_ms ASC LIMIT ?)`).run(config.otel.retentionLogBatch)
|
|
1022
|
+
deletedLogs = deletedLogs || Number(oversizedLogs.changes) > 0
|
|
1023
|
+
}
|
|
851
1024
|
|
|
852
1025
|
// Batch the trace-id list so the IN placeholders stay under
|
|
853
1026
|
// SQLite's default limit (~999). Each batch wipes every row
|
|
@@ -870,48 +1043,54 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
870
1043
|
|
|
871
1044
|
// Log-side orphans (log_attributes + FTS) are keyed by log.id,
|
|
872
1045
|
// so prune what no longer has a parent log row.
|
|
873
|
-
db.query(`DELETE FROM log_attributes WHERE NOT EXISTS (SELECT 1 FROM logs WHERE logs.id = log_attributes.log_id)`).run()
|
|
1046
|
+
const orphanAttributes = db.query(`DELETE FROM log_attributes WHERE rowid IN (SELECT log_attributes.rowid FROM log_attributes WHERE NOT EXISTS (SELECT 1 FROM logs WHERE logs.id = log_attributes.log_id) LIMIT ?)`).run(config.otel.retentionLogBatch)
|
|
1047
|
+
let deletedOrphans = Number(orphanAttributes.changes) > 0
|
|
874
1048
|
try {
|
|
875
|
-
db.query(`DELETE FROM log_body_fts WHERE NOT EXISTS (SELECT 1 FROM logs WHERE logs.id = CAST(log_body_fts.log_id AS INTEGER))`).run()
|
|
1049
|
+
const orphanFts = db.query(`DELETE FROM log_body_fts WHERE rowid IN (SELECT rowid FROM log_body_fts WHERE NOT EXISTS (SELECT 1 FROM logs WHERE logs.id = CAST(log_body_fts.log_id AS INTEGER)) LIMIT ?)`).run(config.otel.retentionLogBatch)
|
|
1050
|
+
deletedOrphans = deletedOrphans || Number(orphanFts.changes) > 0
|
|
876
1051
|
} catch {
|
|
877
1052
|
// FTS table may not exist on old DBs.
|
|
878
1053
|
}
|
|
879
1054
|
|
|
880
|
-
//
|
|
881
|
-
//
|
|
882
|
-
//
|
|
883
|
-
//
|
|
884
|
-
//
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
//
|
|
891
|
-
//
|
|
892
|
-
//
|
|
893
|
-
//
|
|
894
|
-
|
|
1055
|
+
// Checkpoint after a big delete pass so the freed pages land
|
|
1056
|
+
// in the main DB file and become eligible for incremental
|
|
1057
|
+
// vacuum. Use RESTART (not PASSIVE): PASSIVE silently no-ops
|
|
1058
|
+
// when readers are active, which is the documented mechanism
|
|
1059
|
+
// behind WAL/freelist starvation when ingest is busy.
|
|
1060
|
+
if (toEvict.size === 0 && !deletedLogs && !deletedOrphans) return
|
|
1061
|
+
try { db.exec(`PRAGMA wal_checkpoint(RESTART);`) } catch { /* ignore */ }
|
|
1062
|
+
|
|
1063
|
+
// Incremental FTS5 merge — DELETE on an FTS5-indexed row
|
|
1064
|
+
// leaves a tombstone in the segment tree that only `merge`
|
|
1065
|
+
// reclaims. Skipping this is the second compounding cause
|
|
1066
|
+
// (after fixed-size vacuum) of the slow freelist accretion
|
|
1067
|
+
// that took the DB to 17GB. 100 pages of merge work per
|
|
1068
|
+
// retention tick is bounded and runs in milliseconds.
|
|
1069
|
+
incrementalFtsMerge(100)
|
|
1070
|
+
|
|
1071
|
+
// Actual page reclamation lives in `reclaimSpace`, which
|
|
1072
|
+
// runs on its own faster cadence so the file shrinks even
|
|
1073
|
+
// when no traces are evicted in a given retention tick (e.g.
|
|
1074
|
+
// after a large historical eviction has already happened).
|
|
895
1075
|
})
|
|
896
1076
|
})
|
|
897
1077
|
|
|
898
|
-
// Retention only runs in
|
|
899
|
-
//
|
|
900
|
-
// competing for the write lock with overlapping DELETE passes.
|
|
1078
|
+
// Retention only runs in the ingest worker so maintenance never blocks
|
|
1079
|
+
// the HTTP event loop and no second writer duplicates cleanup work.
|
|
901
1080
|
if (opts.runRetention) {
|
|
902
|
-
//
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
//
|
|
909
|
-
//
|
|
910
|
-
//
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
//
|
|
914
|
-
yield* Effect.forkScoped(Effect.repeat(
|
|
1081
|
+
// Cleanup runs on the telemetry worker, never the HTTP event loop.
|
|
1082
|
+
yield* Effect.forkScoped(Effect.repeat(
|
|
1083
|
+
Effect.andThen(reconcileTraceSummaries, cleanupExpired()).pipe(Effect.catchCause((cause) => Effect.logWarning(`motel: maintenance pass failed: ${Cause.pretty(cause)}`))),
|
|
1084
|
+
Schedule.spaced(`${config.otel.retentionIntervalSeconds} seconds`),
|
|
1085
|
+
))
|
|
1086
|
+
|
|
1087
|
+
// Page reclamation runs on a separate, faster cadence (10s) and
|
|
1088
|
+
// is independent of the eviction loop. The reason: a single sweep
|
|
1089
|
+
// at 60s intervals can move only ~8MB of pages before the next
|
|
1090
|
+
// burst of inserts grows the freelist again. Decoupling lets us
|
|
1091
|
+
// catch up adaptively (see VACUUM_PAGES_BUSY/PANIC) without
|
|
1092
|
+
// changing the cost of the heavier delete sweep.
|
|
1093
|
+
yield* Effect.forkScoped(Effect.repeat(reclaimSpace(), Schedule.spaced("10 seconds")))
|
|
915
1094
|
|
|
916
1095
|
// Periodically refresh query planner stats. `PRAGMA optimize` is a
|
|
917
1096
|
// no-op when nothing has changed, so this is essentially free on idle
|
|
@@ -924,35 +1103,48 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
924
1103
|
yield* Effect.forkScoped(Effect.repeat(refreshPlannerStats, Schedule.spaced("15 minutes")))
|
|
925
1104
|
}
|
|
926
1105
|
|
|
927
|
-
//
|
|
928
|
-
//
|
|
929
|
-
// Runs forked so server startup isn't blocked; queries hitting the
|
|
930
|
-
// FTS will just return empty until the fill lands. On a 2 GB DB with
|
|
931
|
-
// ~400 matching rows this takes ~3-8 seconds. Writer-only because
|
|
932
|
-
// it does INSERT INTO ... — readonly connections would error.
|
|
1106
|
+
// Incrementally rebuild historical AI attributes in bounded batches.
|
|
1107
|
+
// Queries fall back to LIKE until the persistent marker is complete.
|
|
933
1108
|
if (hasAttrFts && !opts.readonly) {
|
|
934
|
-
const
|
|
1109
|
+
const backfillAttrFtsBatch = Effect.sync(() => {
|
|
935
1110
|
try {
|
|
936
|
-
const ftsCount = (db.query(`SELECT COUNT(*) AS c FROM span_attr_fts`).get() as { c: number }).c
|
|
937
|
-
if (ftsCount > 0) return
|
|
938
1111
|
const keyList = AI_FTS_KEYS.map((k) => `'${k.replace(/'/g, "''")}'`).join(", ")
|
|
939
|
-
const
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
INSERT INTO span_attr_fts(
|
|
948
|
-
|
|
949
|
-
|
|
1112
|
+
const marker = db.query(`SELECT value FROM motel_maintenance WHERE key = 'span_attr_fts_v1'`).get() as { value: string } | null
|
|
1113
|
+
if (marker?.value === "complete") return false
|
|
1114
|
+
let cursor = 0
|
|
1115
|
+
let maxRowId = 0
|
|
1116
|
+
if (marker) {
|
|
1117
|
+
[cursor, maxRowId] = marker.value.split(":").map(Number)
|
|
1118
|
+
} else {
|
|
1119
|
+
maxRowId = (db.query(`SELECT COALESCE(MAX(rowid), 0) AS value FROM span_attributes`).get() as { value: number }).value
|
|
1120
|
+
db.query(`INSERT INTO span_attr_fts(span_attr_fts) VALUES ('delete-all')`).run()
|
|
1121
|
+
db.query(`INSERT OR REPLACE INTO motel_maintenance(key, value) VALUES ('span_attr_fts_v1', ?)`).run(`0:${maxRowId}`)
|
|
1122
|
+
}
|
|
1123
|
+
const rows = db.query(`SELECT rowid, value FROM span_attributes WHERE key IN (${keyList}) AND rowid > ? AND rowid <= ? ORDER BY rowid ASC LIMIT 500`).all(cursor, maxRowId) as Array<{ rowid: number; value: string }>
|
|
1124
|
+
if (rows.length === 0) {
|
|
1125
|
+
db.query(`UPDATE motel_maintenance SET value = 'complete' WHERE key = 'span_attr_fts_v1'`).run()
|
|
1126
|
+
hasAttrFts = true
|
|
1127
|
+
return false
|
|
1128
|
+
}
|
|
1129
|
+
const insert = db.query(`INSERT INTO span_attr_fts(rowid, value) VALUES (?, ?)`)
|
|
1130
|
+
const transaction = db.transaction(() => {
|
|
1131
|
+
for (const row of rows) insert.run(row.rowid, row.value)
|
|
1132
|
+
db.query(`UPDATE motel_maintenance SET value = ? WHERE key = 'span_attr_fts_v1'`).run(`${rows.at(-1)!.rowid}:${maxRowId}`)
|
|
1133
|
+
})
|
|
1134
|
+
transaction()
|
|
1135
|
+
return true
|
|
950
1136
|
} catch {
|
|
951
1137
|
// Backfill failure is never fatal — new ingests still
|
|
952
1138
|
// populate FTS via the trigger, and queries fall back to
|
|
953
1139
|
// LIKE when FTS lookups return empty.
|
|
1140
|
+
return true
|
|
954
1141
|
}
|
|
955
1142
|
})
|
|
1143
|
+
const backfillAttrFts: Effect.Effect<void> = Effect.suspend(() =>
|
|
1144
|
+
Effect.flatMap(backfillAttrFtsBatch, (pending) =>
|
|
1145
|
+
pending ? Effect.andThen(Effect.sleep("100 millis"), backfillAttrFts) : Effect.void,
|
|
1146
|
+
),
|
|
1147
|
+
)
|
|
956
1148
|
yield* Effect.forkScoped(backfillAttrFts)
|
|
957
1149
|
}
|
|
958
1150
|
|
|
@@ -961,6 +1153,7 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
961
1153
|
let insertedSpans = 0
|
|
962
1154
|
const transaction = db.transaction((request: OtlpTraceExportRequest) => {
|
|
963
1155
|
const touchedTraceIds = new Set<string>()
|
|
1156
|
+
const touchedOperations: Array<readonly [string, string, string]> = []
|
|
964
1157
|
for (const resourceSpans of request.resourceSpans ?? []) {
|
|
965
1158
|
const resourceAttributes = attributeMap(resourceSpans.resource?.attributes)
|
|
966
1159
|
const serviceName = resourceAttributes["service.name"] || resourceAttributes["service_name"] || "unknown"
|
|
@@ -969,6 +1162,10 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
969
1162
|
const scopeName = scopeSpans.scope?.name ?? null
|
|
970
1163
|
|
|
971
1164
|
for (const span of scopeSpans.spans ?? []) {
|
|
1165
|
+
const traceId = normalizeOtlpBinaryId(span.traceId, 16)
|
|
1166
|
+
const spanId = normalizeOtlpBinaryId(span.spanId, 8)
|
|
1167
|
+
if (!traceId || !spanId) continue
|
|
1168
|
+
const parentSpanId = normalizeOtlpBinaryId(span.parentSpanId, 8)
|
|
972
1169
|
const spanAttributes = attributeMap(span.attributes)
|
|
973
1170
|
const mergedAttributes = { ...resourceAttributes, ...spanAttributes }
|
|
974
1171
|
const startTimeMs = nanosToMilliseconds(span.startTimeUnixNano)
|
|
@@ -980,9 +1177,9 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
980
1177
|
}))
|
|
981
1178
|
|
|
982
1179
|
insertSpan.run(
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
1180
|
+
traceId,
|
|
1181
|
+
spanId,
|
|
1182
|
+
parentSpanId,
|
|
986
1183
|
serviceName,
|
|
987
1184
|
scopeName,
|
|
988
1185
|
span.name ?? "unknown",
|
|
@@ -995,21 +1192,22 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
995
1192
|
JSON.stringify(resourceAttributes),
|
|
996
1193
|
JSON.stringify(events),
|
|
997
1194
|
)
|
|
998
|
-
deleteSpanAttributes.run(
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
try {
|
|
1003
|
-
deleteSpanOperationSearch.run(span.traceId, span.spanId)
|
|
1004
|
-
insertSpanOperationSearch.run(span.traceId, span.spanId, span.name ?? "unknown")
|
|
1005
|
-
} catch {
|
|
1006
|
-
// FTS is optional.
|
|
1007
|
-
}
|
|
1008
|
-
touchedTraceIds.add(span.traceId)
|
|
1195
|
+
deleteSpanAttributes.run(traceId, spanId)
|
|
1196
|
+
insertSpanAttributesMany(traceId, spanId, mergedAttributes)
|
|
1197
|
+
touchedOperations.push([traceId, spanId, span.name ?? "unknown"])
|
|
1198
|
+
touchedTraceIds.add(traceId)
|
|
1009
1199
|
insertedSpans += 1
|
|
1010
1200
|
}
|
|
1011
1201
|
}
|
|
1012
1202
|
}
|
|
1203
|
+
try {
|
|
1204
|
+
const BATCH_SIZE = 500
|
|
1205
|
+
for (let offset = 0; offset < touchedOperations.length; offset += BATCH_SIZE) {
|
|
1206
|
+
updateSpanOperationSearchMany(touchedOperations.slice(offset, offset + BATCH_SIZE))
|
|
1207
|
+
}
|
|
1208
|
+
} catch {
|
|
1209
|
+
// FTS is optional.
|
|
1210
|
+
}
|
|
1013
1211
|
for (const traceId of touchedTraceIds) {
|
|
1014
1212
|
upsertTraceSummary.run(traceId)
|
|
1015
1213
|
}
|
|
@@ -1024,6 +1222,7 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
1024
1222
|
return yield* Effect.sync(() => {
|
|
1025
1223
|
let insertedLogs = 0
|
|
1026
1224
|
const transaction = db.transaction((request: OtlpLogExportRequest) => {
|
|
1225
|
+
const touchedLogBodies: Array<readonly [string, string]> = []
|
|
1027
1226
|
for (const resourceLogs of request.resourceLogs ?? []) {
|
|
1028
1227
|
const resourceAttributes = attributeMap(resourceLogs.resource?.attributes)
|
|
1029
1228
|
const serviceName = resourceAttributes["service.name"] || resourceAttributes["service_name"] || "unknown"
|
|
@@ -1036,9 +1235,11 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
1036
1235
|
const mergedAttributes = { ...resourceAttributes, ...attributes }
|
|
1037
1236
|
const timestampMs = nanosToMilliseconds(record.timeUnixNano ?? record.observedTimeUnixNano)
|
|
1038
1237
|
const body = stringifyValue(parseAnyValue(record.body))
|
|
1238
|
+
const rawTraceId = attributes.traceId || attributes.trace_id || record.traceId || null
|
|
1239
|
+
const rawSpanId = attributes.spanId || attributes.span_id || record.spanId || null
|
|
1039
1240
|
const result = insertLog.run(
|
|
1040
|
-
|
|
1041
|
-
|
|
1241
|
+
normalizeOtlpBinaryId(rawTraceId, 16),
|
|
1242
|
+
normalizeOtlpBinaryId(rawSpanId, 8),
|
|
1042
1243
|
serviceName,
|
|
1043
1244
|
scopeName,
|
|
1044
1245
|
record.severityText ?? "INFO",
|
|
@@ -1048,18 +1249,20 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
1048
1249
|
JSON.stringify(resourceAttributes),
|
|
1049
1250
|
)
|
|
1050
1251
|
const logId = Number((result as { lastInsertRowid: number | bigint }).lastInsertRowid)
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
}
|
|
1054
|
-
try {
|
|
1055
|
-
insertLogBodySearch.run(String(logId), body)
|
|
1056
|
-
} catch {
|
|
1057
|
-
// FTS is optional.
|
|
1058
|
-
}
|
|
1252
|
+
insertLogAttributesMany(logId, mergedAttributes)
|
|
1253
|
+
touchedLogBodies.push([String(logId), body])
|
|
1059
1254
|
insertedLogs += 1
|
|
1060
1255
|
}
|
|
1061
1256
|
}
|
|
1062
1257
|
}
|
|
1258
|
+
try {
|
|
1259
|
+
const BATCH_SIZE = 500
|
|
1260
|
+
for (let offset = 0; offset < touchedLogBodies.length; offset += BATCH_SIZE) {
|
|
1261
|
+
insertLogBodySearchMany(touchedLogBodies.slice(offset, offset + BATCH_SIZE))
|
|
1262
|
+
}
|
|
1263
|
+
} catch {
|
|
1264
|
+
// FTS is optional.
|
|
1265
|
+
}
|
|
1063
1266
|
})
|
|
1064
1267
|
|
|
1065
1268
|
transaction(payload)
|
|
@@ -1068,9 +1271,11 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
1068
1271
|
})
|
|
1069
1272
|
|
|
1070
1273
|
const listServices = Effect.fn("motel/TelemetryStore.listServices")(function* () {
|
|
1071
|
-
|
|
1072
1274
|
const cutoff = (yield* Clock.currentTimeMillis) - config.otel.traceLookbackMinutes * 60 * 1000
|
|
1073
|
-
|
|
1275
|
+
const services = yield* Effect.sync(() => {
|
|
1276
|
+
// Discover recent activity from span rows, not trace starts: a
|
|
1277
|
+
// long-running trace can emit a current child after its root ages
|
|
1278
|
+
// outside the lookback window.
|
|
1074
1279
|
const rows = db.query(`
|
|
1075
1280
|
SELECT service_name FROM spans WHERE start_time_ms >= ?
|
|
1076
1281
|
UNION
|
|
@@ -1079,6 +1284,8 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
1079
1284
|
`).all(cutoff, cutoff) as Array<{ service_name: string }>
|
|
1080
1285
|
return rows.map((row) => row.service_name)
|
|
1081
1286
|
})
|
|
1287
|
+
yield* Effect.annotateCurrentSpan("trace.service_count", services.length)
|
|
1288
|
+
return services
|
|
1082
1289
|
})()
|
|
1083
1290
|
|
|
1084
1291
|
const loadTracesByIds = (traceIds: readonly string[]) => {
|
|
@@ -1104,15 +1311,19 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
1104
1311
|
}
|
|
1105
1312
|
|
|
1106
1313
|
const listRecentTraces = Effect.fn("motel/TelemetryStore.listRecentTraces")(function* (serviceName: string | null, options?: { readonly lookbackMinutes?: number; readonly limit?: number }) {
|
|
1314
|
+
yield* Effect.annotateCurrentSpan("trace.service_name", serviceName ?? "all")
|
|
1107
1315
|
const summaries = yield* listTraceSummaries(serviceName, options)
|
|
1108
|
-
|
|
1316
|
+
const traces = yield* Effect.sync(() => loadTracesByIds(summaries.map((summary) => summary.traceId)))
|
|
1317
|
+
yield* Effect.annotateCurrentSpan("trace.result_count", traces.length)
|
|
1318
|
+
return traces
|
|
1109
1319
|
})
|
|
1110
1320
|
|
|
1111
1321
|
const listTraceSummaries = Effect.fn("motel/TelemetryStore.listTraceSummaries")(function* (serviceName: string | null, options?: { readonly lookbackMinutes?: number; readonly limit?: number; readonly cursorStartedAtMs?: number; readonly cursorTraceId?: string }) {
|
|
1322
|
+
yield* Effect.annotateCurrentSpan("trace.service_name", serviceName ?? "all")
|
|
1112
1323
|
const cutoff = (yield* Clock.currentTimeMillis) - (options?.lookbackMinutes ?? config.otel.traceLookbackMinutes) * 60 * 1000
|
|
1113
1324
|
const limit = options?.limit ?? config.otel.traceFetchLimit
|
|
1114
1325
|
|
|
1115
|
-
|
|
1326
|
+
const summaries = yield* Effect.sync(() => {
|
|
1116
1327
|
const clauses = ["started_at_ms >= ?"]
|
|
1117
1328
|
const params: Array<string | number> = [cutoff]
|
|
1118
1329
|
|
|
@@ -1134,6 +1345,8 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
1134
1345
|
LIMIT ?
|
|
1135
1346
|
`).all(...params, limit) as TraceSummaryRow[]
|
|
1136
1347
|
}).pipe(Effect.map((rows) => rows.map(parseSummaryRow)))
|
|
1348
|
+
yield* Effect.annotateCurrentSpan("trace.result_count", summaries.length)
|
|
1349
|
+
return summaries
|
|
1137
1350
|
})
|
|
1138
1351
|
|
|
1139
1352
|
const searchTraceSummaries = Effect.fn("motel/TelemetryStore.searchTraceSummaries")(function* (input: TraceSearch) {
|
|
@@ -1212,6 +1425,7 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
1212
1425
|
})
|
|
1213
1426
|
|
|
1214
1427
|
const getTrace = Effect.fn("motel/TelemetryStore.getTrace")(function* (traceId: string) {
|
|
1428
|
+
yield* Effect.annotateCurrentSpan("trace.trace_id", traceId)
|
|
1215
1429
|
return yield* Effect.sync(() => {
|
|
1216
1430
|
const rows = db.query(`
|
|
1217
1431
|
SELECT * FROM spans WHERE trace_id = ? ORDER BY start_time_ms ASC
|
|
@@ -1221,6 +1435,7 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
1221
1435
|
})
|
|
1222
1436
|
|
|
1223
1437
|
const getSpan = Effect.fn("motel/TelemetryStore.getSpan")(function* (spanId: string) {
|
|
1438
|
+
yield* Effect.annotateCurrentSpan("trace.span_id", spanId)
|
|
1224
1439
|
return yield* Effect.sync(() => {
|
|
1225
1440
|
// Fetch only the target span row (uses idx_spans_span_id)
|
|
1226
1441
|
const spanRow = db.query(`SELECT * FROM spans WHERE span_id = ? LIMIT 1`).get(spanId) as SpanRow | null
|
|
@@ -1228,7 +1443,28 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
1228
1443
|
|
|
1229
1444
|
const traceId = spanRow.trace_id
|
|
1230
1445
|
|
|
1231
|
-
//
|
|
1446
|
+
// Walk the parent chain in one recursive CTE instead of one query
|
|
1447
|
+
// per hop. Root context remains the earliest root in the trace,
|
|
1448
|
+
// matching full trace hydration even when input has multiple roots.
|
|
1449
|
+
let parentOperationName: string | null = null
|
|
1450
|
+
let depth = 0
|
|
1451
|
+
if (spanRow.parent_span_id) {
|
|
1452
|
+
const ancestors = db.query(`
|
|
1453
|
+
WITH RECURSIVE ancestors(span_id, parent_span_id, operation_name, hop) AS (
|
|
1454
|
+
SELECT span_id, parent_span_id, operation_name, 1
|
|
1455
|
+
FROM spans WHERE trace_id = ? AND span_id = ?
|
|
1456
|
+
UNION ALL
|
|
1457
|
+
SELECT s.span_id, s.parent_span_id, s.operation_name, a.hop + 1
|
|
1458
|
+
FROM ancestors a
|
|
1459
|
+
JOIN spans s ON s.trace_id = ? AND s.span_id = a.parent_span_id
|
|
1460
|
+
)
|
|
1461
|
+
SELECT span_id, parent_span_id, operation_name, hop FROM ancestors ORDER BY hop ASC
|
|
1462
|
+
`).all(traceId, spanRow.parent_span_id, traceId) as Array<{ span_id: string; parent_span_id: string | null; operation_name: string; hop: number }>
|
|
1463
|
+
|
|
1464
|
+
parentOperationName = ancestors[0]?.operation_name ?? null
|
|
1465
|
+
depth = ancestors.length
|
|
1466
|
+
}
|
|
1467
|
+
|
|
1232
1468
|
const rootRow = db.query(`
|
|
1233
1469
|
SELECT operation_name FROM spans
|
|
1234
1470
|
WHERE trace_id = ? AND parent_span_id IS NULL
|
|
@@ -1236,28 +1472,6 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
1236
1472
|
`).get(traceId) as { operation_name: string } | null
|
|
1237
1473
|
const rootOperationName = rootRow?.operation_name ?? "unknown"
|
|
1238
1474
|
|
|
1239
|
-
// Get parent operation name if span has a parent (PK lookup)
|
|
1240
|
-
let parentOperationName: string | null = null
|
|
1241
|
-
if (spanRow.parent_span_id) {
|
|
1242
|
-
const parentRow = db.query(`
|
|
1243
|
-
SELECT operation_name FROM spans
|
|
1244
|
-
WHERE trace_id = ? AND span_id = ?
|
|
1245
|
-
`).get(traceId, spanRow.parent_span_id) as { operation_name: string } | null
|
|
1246
|
-
parentOperationName = parentRow?.operation_name ?? null
|
|
1247
|
-
}
|
|
1248
|
-
|
|
1249
|
-
// Compute depth by walking up parent chain (typically 3-5 hops)
|
|
1250
|
-
let depth = 0
|
|
1251
|
-
let currentParentId = spanRow.parent_span_id
|
|
1252
|
-
while (currentParentId) {
|
|
1253
|
-
const parentRow = db.query(`
|
|
1254
|
-
SELECT parent_span_id FROM spans WHERE trace_id = ? AND span_id = ?
|
|
1255
|
-
`).get(traceId, currentParentId) as { parent_span_id: string | null } | null
|
|
1256
|
-
if (!parentRow) break
|
|
1257
|
-
depth++
|
|
1258
|
-
currentParentId = parentRow.parent_span_id
|
|
1259
|
-
}
|
|
1260
|
-
|
|
1261
1475
|
const parsed = parseSpanRow(spanRow)
|
|
1262
1476
|
return {
|
|
1263
1477
|
traceId,
|
|
@@ -1279,9 +1493,24 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
1279
1493
|
const cutoff = (yield* Clock.currentTimeMillis) - (input.lookbackMinutes ?? config.otel.traceLookbackMinutes) * 60 * 1000
|
|
1280
1494
|
const limit = input.limit ?? 100
|
|
1281
1495
|
const hasContainsFilters = Object.keys(input.attributeContainsFilters ?? {}).length > 0
|
|
1282
|
-
|
|
1496
|
+
// Only over-fetch when post-filtering will discard rows. Without
|
|
1497
|
+
// a parentOperation filter the SQL `LIMIT` already returns the
|
|
1498
|
+
// final set, and over-fetching just makes us parse JSON blobs
|
|
1499
|
+
// for rows we'll throw away.
|
|
1500
|
+
const needsPostFilter = !!input.parentOperation
|
|
1501
|
+
const candidateLimit = !needsPostFilter
|
|
1502
|
+
? limit
|
|
1503
|
+
: hasContainsFilters
|
|
1504
|
+
? Math.max(limit * 20, 500)
|
|
1505
|
+
: Math.max(limit * 10, 200)
|
|
1283
1506
|
|
|
1284
1507
|
return yield* Effect.sync(() => {
|
|
1508
|
+
// First pass: fetch only the columns needed to filter and
|
|
1509
|
+
// to drive the parent-context lookup. Parsing the heavy
|
|
1510
|
+
// `*_json` blobs is deferred until after we've sliced down
|
|
1511
|
+
// to the final `limit`.
|
|
1512
|
+
let fromSql = "FROM spans AS s"
|
|
1513
|
+
const joinParams: Array<string | number> = []
|
|
1285
1514
|
const clauses: string[] = ["s.start_time_ms >= ?"]
|
|
1286
1515
|
const params: Array<string | number> = [cutoff]
|
|
1287
1516
|
|
|
@@ -1296,8 +1525,8 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
1296
1525
|
if (input.operation) {
|
|
1297
1526
|
const ftsQuery = toFtsMatchQuery(input.operation)
|
|
1298
1527
|
if (hasFts && ftsQuery) {
|
|
1299
|
-
|
|
1300
|
-
|
|
1528
|
+
fromSql += ` INNER JOIN (SELECT trace_id, span_id FROM span_operation_fts WHERE span_operation_fts MATCH ?) AS span_operation_match ON span_operation_match.trace_id = s.trace_id AND span_operation_match.span_id = s.span_id`
|
|
1529
|
+
joinParams.push(ftsQuery)
|
|
1301
1530
|
} else {
|
|
1302
1531
|
clauses.push("s.operation_name LIKE ? COLLATE NOCASE")
|
|
1303
1532
|
params.push(`%${input.operation}%`)
|
|
@@ -1320,51 +1549,111 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
1320
1549
|
params.push(...containsAttrMatch.params)
|
|
1321
1550
|
}
|
|
1322
1551
|
|
|
1323
|
-
const
|
|
1324
|
-
SELECT trace_id, span_id
|
|
1325
|
-
|
|
1552
|
+
const candidateRows = db.query(`
|
|
1553
|
+
SELECT s.trace_id, s.span_id, s.parent_span_id, s.operation_name, s.start_time_ms
|
|
1554
|
+
${fromSql}
|
|
1326
1555
|
WHERE ${clauses.join(" AND ")}
|
|
1327
1556
|
ORDER BY s.start_time_ms DESC
|
|
1328
1557
|
LIMIT ?
|
|
1329
|
-
`).all(...params, candidateLimit) as Array<{ trace_id: string; span_id: string }>
|
|
1558
|
+
`).all(...joinParams, ...params, candidateLimit) as Array<{ trace_id: string; span_id: string; parent_span_id: string | null; operation_name: string; start_time_ms: number }>
|
|
1330
1559
|
|
|
1331
|
-
const traceIds = [...new Set(
|
|
1560
|
+
const traceIds = [...new Set(candidateRows.map((row) => row.trace_id))]
|
|
1332
1561
|
if (traceIds.length === 0) return [] as readonly SpanItem[]
|
|
1333
1562
|
|
|
1563
|
+
const keyOf = (traceId: string, spanId: string) => `${traceId}:${spanId}`
|
|
1564
|
+
const spanContextById = new Map<string, { readonly parentSpanId: string | null; readonly operationName: string }>()
|
|
1565
|
+
|
|
1566
|
+
// Bulk-prefetch parent metadata for every span in every trace
|
|
1567
|
+
// touched by the candidate set. One indexed scan per trace_id
|
|
1568
|
+
// is much cheaper than a per-span lookup loop while computing
|
|
1569
|
+
// depth, and we get the trace-root lookup in the same pass.
|
|
1334
1570
|
const placeholders = traceIds.map(() => "?").join(", ")
|
|
1335
|
-
const
|
|
1336
|
-
SELECT
|
|
1571
|
+
const allSpanRows = db.query(`
|
|
1572
|
+
SELECT trace_id, span_id, parent_span_id, operation_name, start_time_ms
|
|
1573
|
+
FROM spans
|
|
1337
1574
|
WHERE trace_id IN (${placeholders})
|
|
1338
|
-
|
|
1339
|
-
`).all(...traceIds) as SpanRow[]
|
|
1575
|
+
`).all(...traceIds) as Array<{ trace_id: string; span_id: string; parent_span_id: string | null; operation_name: string; start_time_ms: number }>
|
|
1340
1576
|
|
|
1341
|
-
const
|
|
1342
|
-
for (const row of
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
|
|
1577
|
+
const rootOperationByTraceId = new Map<string, { operationName: string; startTimeMs: number }>()
|
|
1578
|
+
for (const row of allSpanRows) {
|
|
1579
|
+
spanContextById.set(keyOf(row.trace_id, row.span_id), {
|
|
1580
|
+
parentSpanId: row.parent_span_id,
|
|
1581
|
+
operationName: row.operation_name,
|
|
1582
|
+
})
|
|
1583
|
+
if (row.parent_span_id === null) {
|
|
1584
|
+
const existing = rootOperationByTraceId.get(row.trace_id)
|
|
1585
|
+
if (!existing || row.start_time_ms < existing.startTimeMs) {
|
|
1586
|
+
rootOperationByTraceId.set(row.trace_id, { operationName: row.operation_name, startTimeMs: row.start_time_ms })
|
|
1587
|
+
}
|
|
1588
|
+
}
|
|
1346
1589
|
}
|
|
1347
1590
|
|
|
1348
|
-
const
|
|
1349
|
-
|
|
1350
|
-
|
|
1351
|
-
|
|
1352
|
-
|
|
1353
|
-
|
|
1591
|
+
const getSpanContext = (traceId: string, spanId: string) => spanContextById.get(keyOf(traceId, spanId)) ?? null
|
|
1592
|
+
|
|
1593
|
+
const depthById = new Map<string, number>()
|
|
1594
|
+
const getDepth = (traceId: string, spanId: string, visiting = new Set<string>()): number => {
|
|
1595
|
+
const key = keyOf(traceId, spanId)
|
|
1596
|
+
const cached = depthById.get(key)
|
|
1597
|
+
if (cached !== undefined) return cached
|
|
1598
|
+
if (visiting.has(key)) return 0
|
|
1599
|
+
visiting.add(key)
|
|
1600
|
+
const context = getSpanContext(traceId, spanId)
|
|
1601
|
+
const depth = context?.parentSpanId ? getDepth(traceId, context.parentSpanId, visiting) + 1 : 0
|
|
1602
|
+
depthById.set(key, depth)
|
|
1603
|
+
return depth
|
|
1604
|
+
}
|
|
1605
|
+
|
|
1606
|
+
// Apply parentOperation post-filter on the lite candidate set
|
|
1607
|
+
// (cheap — string compare against cached parent op) and then
|
|
1608
|
+
// slice down to the final result size before parsing any JSON.
|
|
1609
|
+
const parentOperationNeedle = input.parentOperation?.toLowerCase() ?? null
|
|
1610
|
+
const filteredLite: typeof candidateRows = []
|
|
1611
|
+
for (const row of candidateRows) {
|
|
1612
|
+
if (parentOperationNeedle) {
|
|
1613
|
+
const parent = row.parent_span_id ? getSpanContext(row.trace_id, row.parent_span_id) : null
|
|
1614
|
+
if (!parent?.operationName.toLowerCase().includes(parentOperationNeedle)) continue
|
|
1354
1615
|
}
|
|
1616
|
+
filteredLite.push(row)
|
|
1617
|
+
if (filteredLite.length >= limit) break
|
|
1355
1618
|
}
|
|
1356
1619
|
|
|
1357
|
-
return
|
|
1358
|
-
|
|
1359
|
-
|
|
1360
|
-
|
|
1361
|
-
|
|
1362
|
-
|
|
1363
|
-
|
|
1364
|
-
|
|
1365
|
-
|
|
1620
|
+
if (filteredLite.length === 0) return [] as readonly SpanItem[]
|
|
1621
|
+
|
|
1622
|
+
// Hydrate only the kept rows: one batched fetch of the full
|
|
1623
|
+
// SpanRow (with resource_json / attributes_json / events_json)
|
|
1624
|
+
// using SQLite's row-value `IN` syntax, then parseSpanRow per
|
|
1625
|
+
// kept row. Result order follows `filteredLite` so the caller
|
|
1626
|
+
// sees the same ordering the candidate scan produced.
|
|
1627
|
+
const keptValues = filteredLite.map(() => "(?, ?)").join(", ")
|
|
1628
|
+
const fullRows = db.query(`
|
|
1629
|
+
SELECT * FROM spans WHERE (trace_id, span_id) IN (VALUES ${keptValues})
|
|
1630
|
+
`).all(...filteredLite.flatMap((row) => [row.trace_id, row.span_id])) as SpanRow[]
|
|
1631
|
+
const fullRowByKey = new Map<string, SpanRow>()
|
|
1632
|
+
for (const row of fullRows) {
|
|
1633
|
+
fullRowByKey.set(keyOf(row.trace_id, row.span_id), row)
|
|
1634
|
+
}
|
|
1635
|
+
|
|
1636
|
+
const items: SpanItem[] = []
|
|
1637
|
+
for (const lite of filteredLite) {
|
|
1638
|
+
const row = fullRowByKey.get(keyOf(lite.trace_id, lite.span_id))
|
|
1639
|
+
if (!row) continue
|
|
1640
|
+
const parentContext = row.parent_span_id ? getSpanContext(row.trace_id, row.parent_span_id) : null
|
|
1641
|
+
const parsedSpan = parseSpanRow(row)
|
|
1642
|
+
const span = {
|
|
1643
|
+
...parsedSpan,
|
|
1644
|
+
depth: getDepth(row.trace_id, row.span_id),
|
|
1645
|
+
warnings: row.parent_span_id && !parentContext
|
|
1646
|
+
? [`missing span ${row.parent_span_id} (1 child)`]
|
|
1647
|
+
: parsedSpan.warnings,
|
|
1648
|
+
}
|
|
1649
|
+
items.push({
|
|
1650
|
+
traceId: row.trace_id,
|
|
1651
|
+
rootOperationName: rootOperationByTraceId.get(row.trace_id)?.operationName ?? span.operationName,
|
|
1652
|
+
parentOperationName: parentContext?.operationName ?? null,
|
|
1653
|
+
span,
|
|
1366
1654
|
})
|
|
1367
|
-
|
|
1655
|
+
}
|
|
1656
|
+
return items
|
|
1368
1657
|
})
|
|
1369
1658
|
})
|
|
1370
1659
|
|
|
@@ -1662,11 +1951,13 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
1662
1951
|
})
|
|
1663
1952
|
|
|
1664
1953
|
const listRecentLogs = Effect.fn("motel/TelemetryStore.listRecentLogs")(function* (serviceName: string) {
|
|
1665
|
-
|
|
1954
|
+
yield* Effect.annotateCurrentSpan("log.service_name", serviceName)
|
|
1955
|
+
const logs = yield* searchLogs({ serviceName, limit: config.otel.logFetchLimit })
|
|
1956
|
+
yield* Effect.annotateCurrentSpan("log.result_count", logs.length)
|
|
1957
|
+
return logs
|
|
1666
1958
|
})
|
|
1667
1959
|
|
|
1668
1960
|
const listFacets = Effect.fn("motel/TelemetryStore.listFacets")(function* (input: FacetSearch) {
|
|
1669
|
-
|
|
1670
1961
|
const cutoff = (yield* Clock.currentTimeMillis) - (input.lookbackMinutes ?? config.otel.traceLookbackMinutes) * 60 * 1000
|
|
1671
1962
|
const limit = input.limit ?? 20
|
|
1672
1963
|
|
|
@@ -1756,21 +2047,30 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
1756
2047
|
// FACET_VALUE_MAX_LEN. For opencode this hides `ai.prompt`,
|
|
1757
2048
|
// `ai.prompt.messages`, and `ai.prompt.tools` — which are 1-6MB text
|
|
1758
2049
|
// blobs that you'd never want to filter by exact match anyway. The
|
|
1759
|
-
// WHERE clause lets SQLite skip reading those pages from disk
|
|
1760
|
-
//
|
|
1761
|
-
|
|
1762
|
-
|
|
1763
|
-
|
|
2050
|
+
// WHERE clause lets SQLite skip reading those pages from disk.
|
|
2051
|
+
// COUNT(DISTINCT ...) does its own per-group dedup via a temp B-tree,
|
|
2052
|
+
// so the outer query needs no DISTINCT subquery in front of it. We
|
|
2053
|
+
// pre-filter trace_ids through trace_summaries (an indexed lookup) so
|
|
2054
|
+
// the planner can use a SEMI JOIN against the small in-window set
|
|
2055
|
+
// instead of joining every span_attributes row to trace_summaries.
|
|
2056
|
+
const params: Array<string | number> = []
|
|
2057
|
+
let traceFilter: string
|
|
2058
|
+
if (input.serviceName) {
|
|
2059
|
+
traceFilter = `(SELECT trace_id FROM trace_summaries WHERE started_at_ms >= ? AND service_name = ?)`
|
|
2060
|
+
params.push(cutoff, input.serviceName)
|
|
2061
|
+
} else {
|
|
2062
|
+
traceFilter = `(SELECT trace_id FROM trace_summaries WHERE started_at_ms >= ?)`
|
|
2063
|
+
params.push(cutoff)
|
|
2064
|
+
}
|
|
2065
|
+
params.push(FACET_VALUE_MAX_LEN, limit)
|
|
1764
2066
|
const rows = db.query(`
|
|
1765
|
-
SELECT
|
|
1766
|
-
COUNT(DISTINCT
|
|
1767
|
-
COUNT(DISTINCT
|
|
1768
|
-
FROM span_attributes
|
|
1769
|
-
|
|
1770
|
-
|
|
1771
|
-
|
|
1772
|
-
${input.serviceName ? "AND s.service_name = ?" : ""}
|
|
1773
|
-
GROUP BY sa.key
|
|
2067
|
+
SELECT key AS value,
|
|
2068
|
+
COUNT(DISTINCT trace_id) AS count,
|
|
2069
|
+
COUNT(DISTINCT value) AS distinct_values
|
|
2070
|
+
FROM span_attributes
|
|
2071
|
+
WHERE trace_id IN ${traceFilter}
|
|
2072
|
+
AND LENGTH(value) < ?
|
|
2073
|
+
GROUP BY key
|
|
1774
2074
|
ORDER BY (CASE WHEN distinct_values = 1 THEN 1 ELSE 0 END) ASC,
|
|
1775
2075
|
distinct_values DESC,
|
|
1776
2076
|
count DESC,
|
|
@@ -1807,7 +2107,10 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
1807
2107
|
})
|
|
1808
2108
|
|
|
1809
2109
|
const listTraceLogs = Effect.fn("motel/TelemetryStore.listTraceLogs")(function* (traceId: string) {
|
|
1810
|
-
|
|
2110
|
+
yield* Effect.annotateCurrentSpan("log.trace_id", traceId)
|
|
2111
|
+
const logs = yield* searchLogs({ traceId, limit: config.otel.logFetchLimit })
|
|
2112
|
+
yield* Effect.annotateCurrentSpan("log.result_count", logs.length)
|
|
2113
|
+
return logs
|
|
1811
2114
|
})
|
|
1812
2115
|
|
|
1813
2116
|
// ---------------------------------------------------------------------------
|
|
@@ -2212,28 +2515,40 @@ export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) => Layer.ef
|
|
|
2212
2515
|
searchAiCalls,
|
|
2213
2516
|
getAiCall,
|
|
2214
2517
|
aiCallStats,
|
|
2518
|
+
runRetentionNow: cleanupExpired(),
|
|
2215
2519
|
})
|
|
2216
|
-
})
|
|
2217
|
-
|
|
2520
|
+
})
|
|
2521
|
+
|
|
2522
|
+
/** Compatibility factory for callers constructing a writer/query-capable store layer. */
|
|
2523
|
+
export const makeTelemetryStoreLayer = (opts: TelemetryStoreOptions) =>
|
|
2524
|
+
Layer.effect(TelemetryStore, makeTelemetryStoreEffect(opts)).pipe(Layer.provide(BunFileSystem.layer))
|
|
2218
2525
|
|
|
2219
2526
|
/**
|
|
2220
|
-
* Default writer
|
|
2221
|
-
* migrations, FTS backfill, and the retention loop.
|
|
2527
|
+
* Default writer runtime used by tests and direct store consumers.
|
|
2222
2528
|
*/
|
|
2223
2529
|
export const TelemetryStoreLive = makeTelemetryStoreLayer({ readonly: false, runRetention: true })
|
|
2224
2530
|
|
|
2225
2531
|
/**
|
|
2226
|
-
*
|
|
2227
|
-
*
|
|
2228
|
-
* the same time (they'd just serialise behind the write lock and
|
|
2229
|
-
* duplicate work).
|
|
2532
|
+
* The ingest worker's writer. It is the managed daemon's sole owner of
|
|
2533
|
+
* schema migrations, FTS backfill, retention, and page reclamation.
|
|
2230
2534
|
*/
|
|
2231
|
-
export const TelemetryStoreWorkerLive =
|
|
2535
|
+
export const TelemetryStoreWorkerLive = TelemetryStoreLive
|
|
2232
2536
|
|
|
2233
2537
|
/**
|
|
2234
|
-
* Read-only instance for query-only processes (currently the TUI
|
|
2235
|
-
* Skips every DDL/DML statement at startup so
|
|
2236
|
-
* opened while a writer is mid-transaction
|
|
2237
|
-
*
|
|
2538
|
+
* Read-only instance for query-only processes (currently the TUI and
|
|
2539
|
+
* HTTP query handlers). Skips every DDL/DML statement at startup so
|
|
2540
|
+
* the connection can be opened while a writer is mid-transaction
|
|
2541
|
+
* without racing for the write lock. Provided as TelemetryStoreReadonly
|
|
2542
|
+
* — a distinct service identifier so it can coexist with the writer
|
|
2543
|
+
* TelemetryStore in the same runtime.
|
|
2238
2544
|
*/
|
|
2239
|
-
export const TelemetryStoreReadonlyLive =
|
|
2545
|
+
export const TelemetryStoreReadonlyLive = Layer.effect(TelemetryStoreReadonly, makeTelemetryStoreEffect({ readonly: true, runRetention: false })).pipe(Layer.provide(BunFileSystem.layer))
|
|
2546
|
+
|
|
2547
|
+
/** Query-worker reader that waits for the sole writer to finish schema bootstrap. */
|
|
2548
|
+
export const TelemetryStoreQueryWorkerLive = Layer.effect(
|
|
2549
|
+
TelemetryStoreReadonly,
|
|
2550
|
+
makeTelemetryStoreEffect({ readonly: true, runRetention: false }).pipe(
|
|
2551
|
+
Effect.map((store) => TelemetryStoreReadonly.of(store)),
|
|
2552
|
+
Effect.retry(Schedule.spaced("50 millis")),
|
|
2553
|
+
),
|
|
2554
|
+
).pipe(Layer.provide(BunFileSystem.layer))
|