costhawk 1.5.11 → 1.5.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,888 @@
1
+ /**
2
+ * Cursor Local SQLite Parser
3
+ *
4
+ * Parses Cursor IDE chat history from the local SQLite database to extract
5
+ * token usage and timestamps. Read-only. Does not push to any backend.
6
+ *
7
+ * Storage:
8
+ * macOS: ~/Library/Application Support/Cursor/User/globalStorage/state.vscdb
9
+ * Linux: ~/.config/Cursor/User/globalStorage/state.vscdb
10
+ * Windows: %APPDATA%/Cursor/User/globalStorage/state.vscdb
11
+ *
12
+ * Schema:
13
+ * Table cursorDiskKV (key TEXT, value BLOB)
14
+ * Conversations: composerData:<composerId>
15
+ * Messages: bubbleId:<composerId>:<bubbleId>
16
+ *
17
+ * Token data lives at $.tokenCount.inputTokens and $.tokenCount.outputTokens
18
+ * on bubble rows. Model name at $.modelInfo.modelName. Server-side dedup id
19
+ * at $.serverBubbleId.
20
+ *
21
+ * Timestamps (verified in Task #30 against a real state.vscdb):
22
+ * - $.createdAt on bubbles is an ISO 8601 string (~56% coverage, all-or-
23
+ * nothing per composer — likely added in a newer Cursor version).
24
+ * - $.createdAt on composerData rows is a Unix milliseconds number (100%
25
+ * coverage). Same field name, different type — parser handles both.
26
+ * - $.lastUpdatedAt on composerData rows is Unix ms (~13% coverage).
27
+ * - $.timingInfo.client* on bubbles is performance.now()-style relative
28
+ * (seconds since Cursor process start), NOT absolute — never use it as
29
+ * a wall-clock timestamp.
30
+ *
31
+ * Fallback ladder for per-session timestamps: prefer min/max of bubble
32
+ * createdAt when present, otherwise use composerData.createdAt with optional
33
+ * composerData.lastUpdatedAt as end time. A session with neither source
34
+ * gets null timestamps; the `timestampSource` / `timestampQuality` fields
35
+ * surface whether the values are precise, approximate, or absent ("none").
36
+ *
37
+ * Workspace metadata fields (workspaceHash/workspaceName) remain unverified
38
+ * and return null. composerData.name is a candidate for workspaceName but
39
+ * has not been confirmed yet.
40
+ */
41
+ import { execFileSync } from "child_process";
42
+ import { existsSync, statSync } from "fs";
43
+ import { homedir, platform } from "os";
44
+ import { join } from "path";
45
// Default location of the sqlite3 binary. Of the values in this group, only
// this one has an environment override (COSTHAWK_SQLITE3_PATH).
const DEFAULT_SQLITE3_PATH = "/usr/bin/sqlite3";
// Limits applied to every sqlite3 child process: wall-clock timeout and the
// maximum number of stdout bytes we are willing to buffer.
const SQLITE_TIMEOUT_MS = 10 * 1000;
const SQLITE_MAX_BUFFER_BYTES = 32 * 1024 ** 2;
// Plausibility window for Unix-ms timestamps: [2020-01-01, 2100-01-01) UTC.
// Values outside it are treated as "not a wall-clock timestamp" — this
// filters out `performance.now`-style relative values as well as negative
// or otherwise nonsensical numbers coming from malformed data.
const MIN_UNIX_MS = Date.UTC(2020, 0, 1);
const MAX_UNIX_MS = Date.UTC(2100, 0, 1);
// Self-test tolerance: a bubble may be stamped up to five minutes before its
// composerData row (clock skew, write-order races) without raising a warning.
const INVARIANT_SKEW_TOLERANCE_MS = 5 * 60_000;
59
/**
 * Convert a timestamp string (expected to be ISO 8601) to Unix milliseconds.
 *
 * The string is handed to `Date.parse`; the result is accepted only when it
 * is a finite number inside [MIN_UNIX_MS, MAX_UNIX_MS).
 *
 * @param {unknown} value - Candidate timestamp.
 * @returns {number | null} Unix ms, or null when `value` is not a string,
 *   cannot be parsed, or falls outside the plausibility window. Callers
 *   treat null as "no usable timestamp" and try their next source.
 */
function parseIsoToMs(value) {
    if (typeof value === "string") {
        const parsedMs = Date.parse(value);
        const plausible = Number.isFinite(parsedMs) &&
            parsedMs >= MIN_UNIX_MS &&
            parsedMs < MAX_UNIX_MS;
        if (plausible) {
            return parsedMs;
        }
    }
    return null;
}
73
/**
 * Validate a numeric value that is supposed to be Unix milliseconds.
 *
 * Small relative values such as `timingInfo.clientStartTime` (seconds since
 * process start) land far below MIN_UNIX_MS and are rejected here.
 *
 * @param {unknown} value - Candidate timestamp.
 * @returns {number | null} The value floored to an integer, or null when it
 *   is not a finite number inside [MIN_UNIX_MS, MAX_UNIX_MS).
 */
function parseUnixMsLoose(value) {
    const isFiniteNumber = typeof value === "number" && Number.isFinite(value);
    if (!isFiniteNumber) {
        return null;
    }
    const withinWindow = value >= MIN_UNIX_MS && value < MAX_UNIX_MS;
    return withinWindow ? Math.floor(value) : null;
}
86
/**
 * Parse a `createdAt`-style field that may be stored in either shape:
 * a Unix-ms number (composerData rows) or an ISO string (bubble rows).
 * The numeric interpretation is tried first, then the string one; the
 * caller is not told which one matched.
 *
 * @param {unknown} value - Raw field value.
 * @returns {number | null} Unix ms, or null when neither shape applies.
 */
function parseTimestampField(value) {
    const fromNumber = parseUnixMsLoose(value);
    if (fromNumber !== null) {
        return fromNumber;
    }
    return parseIsoToMs(value);
}
94
/**
 * Format Unix milliseconds as an ISO 8601 UTC string.
 *
 * @param {number} ms - Unix milliseconds.
 * @returns {string} e.g. "2020-01-01T00:00:00.000Z".
 * @throws {RangeError} When `ms` does not describe a valid date.
 */
function msToIso(ms) {
    const instant = new Date(ms);
    return instant.toISOString();
}
97
/**
 * Derive the UTC calendar-day key for a Unix-ms timestamp: the part of the
 * ISO 8601 string that precedes the "T" separator.
 *
 * @param {number} ms - Unix milliseconds.
 * @returns {string} e.g. "2020-01-01".
 */
function msToUtcDateKey(ms) {
    const iso = new Date(ms).toISOString();
    return iso.slice(0, iso.indexOf("T"));
}
100
/**
 * Build a fresh token-usage record with every counter set to zero.
 *
 * @returns {{inputTokens: number, outputTokens: number, cacheCreationTokens: number, cacheReadTokens: number}}
 *   A new object on every call, safe for the caller to mutate.
 */
function createEmptyTokenUsage() {
    const usage = {};
    for (const field of ["inputTokens", "outputTokens", "cacheCreationTokens", "cacheReadTokens"]) {
        usage[field] = 0;
    }
    return usage;
}
108
/**
 * Resolve the path of Cursor's `state.vscdb` SQLite file.
 *
 * A non-empty COSTHAWK_CURSOR_DB_PATH environment variable wins outright.
 * Otherwise the path is built per platform:
 *   - darwin: <home>/Library/Application Support/Cursor/User/globalStorage
 *   - win32:  %APPDATA%/Cursor/User/globalStorage, falling back to
 *             <home>/AppData/Roaming/... when APPDATA is unset or empty
 *   - other:  <home>/.config/Cursor/User/globalStorage
 *
 * @returns {string} The resolved path; its existence is not checked here.
 */
export function getCursorDbPath() {
    const override = process.env.COSTHAWK_CURSOR_DB_PATH;
    if (typeof override === "string" && override !== "") {
        return override;
    }
    const home = homedir();
    const tail = ["Cursor", "User", "globalStorage", "state.vscdb"];
    switch (platform()) {
        case "darwin":
            return join(home, "Library", "Application Support", ...tail);
        case "win32": {
            const appData = process.env.APPDATA;
            return appData
                ? join(appData, ...tail)
                : join(home, "AppData", "Roaming", ...tail);
        }
        default:
            // Linux and other unix-likes
            return join(home, ".config", ...tail);
    }
}
131
/**
 * Report whether a file exists at the path returned by `getCursorDbPath()`.
 *
 * @returns {boolean} True when the resolved path exists on disk.
 */
export function cursorDbExists() {
    const candidatePath = getCursorDbPath();
    return existsSync(candidatePath);
}
137
/**
 * Resolve the sqlite3 binary path.
 *
 * A non-empty COSTHAWK_SQLITE3_PATH environment variable overrides the
 * default. An empty value is treated as "not set" — the same rule
 * `getCursorDbPath` applies to its override — so that an exported-but-blank
 * variable does not turn into an attempt to execute "".
 *
 * @returns {string} Path handed to `execFileSync` by the query runner.
 */
function getSqlite3Path() {
    const override = process.env.COSTHAWK_SQLITE3_PATH;
    if (override && override.length > 0) {
        return override;
    }
    return DEFAULT_SQLITE3_PATH;
}
144
/**
 * Type guard for the error objects thrown by this module.
 *
 * @param {unknown} value - Anything caught from a try block.
 * @returns {boolean} True when `value` is a non-null object whose `code` is
 *   one of the three parser error codes and whose `message` is a string.
 */
function isCursorParserError(value) {
    if (value === null || typeof value !== "object") {
        return false;
    }
    const knownCodes = [
        "CURSOR_DB_NOT_FOUND",
        "CURSOR_SQLITE3_NOT_FOUND",
        "CURSOR_SQLITE_QUERY_FAILED",
    ];
    if (!knownCodes.includes(value.code)) {
        return false;
    }
    return typeof value.message === "string";
}
158
/**
 * Execute one SQL statement against the Cursor database by spawning the
 * sqlite3 binary in read-only, batch, JSON-output mode.
 *
 * The binary is invoked through `execFileSync` with an argument array, so no
 * shell is involved and the SQL text is passed as a single argument. The
 * child is bounded by SQLITE_TIMEOUT_MS and SQLITE_MAX_BUFFER_BYTES.
 *
 * @param {string} sql - Statement whose result columns are named `key` and
 *   `value`.
 * @returns {Array<{key: string, value: string}>} Result rows. Rows whose
 *   `key` or `value` is not a string are dropped. Empty or whitespace-only
 *   output yields [].
 * @throws {{code: string, message: string}} A plain object (not an Error)
 *   with code CURSOR_DB_NOT_FOUND when the database file is missing,
 *   CURSOR_SQLITE3_NOT_FOUND when spawning fails with ENOENT, or
 *   CURSOR_SQLITE_QUERY_FAILED for any other child failure, invalid JSON, or
 *   a non-array JSON payload.
 */
function runCursorQuery(sql) {
    const makeError = (code, message) => ({ code, message });
    const sqlite3Path = getSqlite3Path();
    const dbPath = getCursorDbPath();
    if (!existsSync(dbPath)) {
        throw makeError("CURSOR_DB_NOT_FOUND", `Cursor SQLite database not found at ${dbPath}. Make sure Cursor is installed and you have used it at least once. Set COSTHAWK_CURSOR_DB_PATH to override.`);
    }
    const cliArgs = ["-readonly", "-batch", "-json", "--", dbPath, sql];
    let rawOutput;
    try {
        rawOutput = execFileSync(sqlite3Path, cliArgs, {
            encoding: "utf8",
            timeout: SQLITE_TIMEOUT_MS,
            maxBuffer: SQLITE_MAX_BUFFER_BYTES,
        });
    }
    catch (err) {
        if (err.code === "ENOENT") {
            throw makeError("CURSOR_SQLITE3_NOT_FOUND", `sqlite3 binary not found at ${sqlite3Path}. Set COSTHAWK_SQLITE3_PATH to override the default path.`);
        }
        throw makeError("CURSOR_SQLITE_QUERY_FAILED", err instanceof Error ? err.message : String(err));
    }
    // No output at all means the query matched no rows.
    if (!rawOutput || rawOutput.trim().length === 0) {
        return [];
    }
    // Anything that is present but is not a JSON array is a genuine failure.
    // Returning [] here would disguise a broken parse as "no sessions found".
    let payload;
    try {
        payload = JSON.parse(rawOutput);
    }
    catch {
        throw makeError("CURSOR_SQLITE_QUERY_FAILED", "sqlite3 returned invalid JSON output");
    }
    if (!Array.isArray(payload)) {
        throw makeError("CURSOR_SQLITE_QUERY_FAILED", "sqlite3 returned a non-array JSON payload");
    }
    const isKeyValueRow = (row) => typeof row === "object" &&
        row !== null &&
        "key" in row &&
        "value" in row &&
        typeof row.key === "string" &&
        typeof row.value === "string";
    return payload.filter(isKeyValueRow);
}
233
/**
 * Tell whether a parsed bubble carries any positive token count.
 *
 * @param {{inputTokens: number, outputTokens: number}} bubble
 * @returns {boolean} True when input or output tokens are greater than zero.
 */
function hasTokenUsage(bubble) {
    return [bubble.inputTokens, bubble.outputTokens].some((count) => count > 0);
}
236
// Row key of a message: "bubbleId:<composerId>:<bubbleId>". The composer id
// is everything up to the second colon (it may not contain one); the bubble
// id is the non-empty remainder and may itself contain colons.
const BUBBLE_KEY_REGEX = /^bubbleId:([^:]+):(.+)$/;
// Row key of a conversation: "composerData:<composerId>" with a non-empty id.
const COMPOSER_KEY_REGEX = /^composerData:(.+)$/;
238
/**
 * Parse a single `bubbleId:<composerId>:<bubbleId>` row into bubble data.
 *
 * A row is kept when it carries at least one usable signal: a non-empty
 * model name, a positive token count, or a parseable `createdAt`. Model
 * metadata, token usage, and timestamps can live on different rows, so each
 * signal is accepted in isolation and merged later per composer.
 *
 * Token counts are sanitized: only finite numbers greater than zero are
 * kept, everything else becomes 0. JSON can encode negative numbers and
 * overflowing literals such as 1e999 (which parse to Infinity); letting those
 * through would corrupt the per-session and per-day sums.
 *
 * `createdAt` is accepted as an ISO 8601 string or a Unix-ms number.
 *
 * @param {{key: string, value: string}} row - Raw cursorDiskKV row.
 * @returns {{composerId: string, bubbleId: string, serverBubbleId: (string|undefined), modelName: (string|undefined), inputTokens: number, outputTokens: number, createdAtMs: (number|null)} | null}
 *   Parsed bubble, or null when the key is malformed, the value is not a
 *   JSON object, or the row has no usable signal.
 */
function parseBubble(row) {
    const match = BUBBLE_KEY_REGEX.exec(row.key);
    if (!match) {
        return null;
    }
    const [, composerId, bubbleId] = match;
    let value;
    try {
        value = JSON.parse(row.value);
    }
    catch {
        return null;
    }
    if (typeof value !== "object" || value === null) {
        return null;
    }
    const obj = value;
    // Keep only finite, strictly positive counts; anything else counts as 0.
    const toTokenCount = (raw) => typeof raw === "number" && Number.isFinite(raw) && raw > 0 ? raw : 0;
    let inputTokens = 0;
    let outputTokens = 0;
    const tokenCount = obj.tokenCount;
    if (typeof tokenCount === "object" && tokenCount !== null) {
        inputTokens = toTokenCount(tokenCount.inputTokens);
        outputTokens = toTokenCount(tokenCount.outputTokens);
    }
    let modelName;
    const modelInfo = obj.modelInfo;
    if (typeof modelInfo === "object" && modelInfo !== null) {
        if (typeof modelInfo.modelName === "string" && modelInfo.modelName.length > 0) {
            modelName = modelInfo.modelName;
        }
    }
    const createdAtMs = parseTimestampField(obj.createdAt);
    // Skip rows with no usable signal at all — neither model metadata,
    // positive token counts, nor a parseable timestamp.
    if (!modelName && inputTokens === 0 && outputTokens === 0 && createdAtMs === null) {
        return null;
    }
    let serverBubbleId;
    if (typeof obj.serverBubbleId === "string" &&
        obj.serverBubbleId.length > 0) {
        serverBubbleId = obj.serverBubbleId;
    }
    return {
        composerId,
        bubbleId,
        serverBubbleId,
        modelName,
        inputTokens,
        outputTokens,
        createdAtMs,
    };
}
307
/**
 * Parse a `composerData:<composerId>` row into composer timestamp metadata.
 *
 * Both `createdAt` and `lastUpdatedAt` are read through
 * `parseTimestampField`, so either a Unix-ms number or an ISO string is
 * accepted. A `lastUpdatedAt` that precedes `createdAt` is discarded so that
 * later aggregation cannot produce an end time before the start time.
 *
 * @param {{key: string, value: string}} row - Raw cursorDiskKV row.
 * @returns {{composerId: string, createdAtMs: (number|null), lastUpdatedAtMs: (number|null)} | null}
 *   Metadata, or null when the key is malformed, the value is not a JSON
 *   object, or neither timestamp is usable.
 */
function parseComposerData(row) {
    const keyMatch = COMPOSER_KEY_REGEX.exec(row.key);
    if (keyMatch === null) {
        return null;
    }
    const composerId = keyMatch[1];
    let payload;
    try {
        payload = JSON.parse(row.value);
    }
    catch {
        return null;
    }
    if (payload === null || typeof payload !== "object") {
        return null;
    }
    const createdAtMs = parseTimestampField(payload.createdAt);
    let lastUpdatedAtMs = parseTimestampField(payload.lastUpdatedAt);
    // Enforce the ordering invariant: an update cannot precede creation.
    if (createdAtMs !== null && lastUpdatedAtMs !== null && lastUpdatedAtMs < createdAtMs) {
        lastUpdatedAtMs = null;
    }
    if (createdAtMs === null && lastUpdatedAtMs === null) {
        return null;
    }
    return { composerId, createdAtMs, lastUpdatedAtMs };
}
354
/**
 * Choose a session's start/end time and record where the values came from.
 *
 * Fallback ladder:
 *   1. Bubble timestamps present: start = min, end = max of the bubble
 *      values (source "bubble", quality "precise"). If the composer's
 *      lastUpdatedAt is later than the max bubble value it replaces the end
 *      time, and the result becomes source "mixed", quality "approximate".
 *   2. No bubble timestamps but composer createdAt present: start =
 *      createdAt, end = lastUpdatedAt when it is not earlier than createdAt,
 *      otherwise createdAt (source "composer", quality "approximate").
 *   3. Neither: start and end are null, source and quality are "none".
 *
 * @param {number[]} bubbleCreatedAtsMs - Unix-ms values from bubble rows.
 * @param {{createdAtMs: (number|null), lastUpdatedAtMs: (number|null)} | undefined} composerMeta
 * @returns {{startTime: (string|null), endTime: (string|null), source: string, quality: string}}
 */
function resolveSessionTimestamps(bubbleCreatedAtsMs, composerMeta) {
    const composerStartMs = composerMeta?.createdAtMs ?? null;
    const composerEndMs = composerMeta?.lastUpdatedAtMs ?? null;
    if (bubbleCreatedAtsMs.length > 0) {
        const seed = bubbleCreatedAtsMs[0];
        const earliestMs = bubbleCreatedAtsMs.reduce((lo, ms) => (ms < lo ? ms : lo), seed);
        const latestMs = bubbleCreatedAtsMs.reduce((hi, ms) => (ms > hi ? ms : hi), seed);
        // The composer row can be persisted after the last bubble that
        // carries a timestamp, so a later lastUpdatedAt extends the session.
        const composerExtendsEnd = composerEndMs !== null && composerEndMs > latestMs;
        return {
            startTime: msToIso(earliestMs),
            endTime: msToIso(composerExtendsEnd ? composerEndMs : latestMs),
            source: composerExtendsEnd ? "mixed" : "bubble",
            quality: composerExtendsEnd ? "approximate" : "precise",
        };
    }
    if (composerStartMs === null) {
        return {
            startTime: null,
            endTime: null,
            source: "none",
            quality: "none",
        };
    }
    let composerOnlyEndMs = composerStartMs;
    if (composerEndMs !== null && composerEndMs >= composerStartMs) {
        composerOnlyEndMs = composerEndMs;
    }
    return {
        startTime: msToIso(composerStartMs),
        endTime: msToIso(composerOnlyEndMs),
        source: "composer",
        quality: "approximate",
    };
}
416
/**
 * Read Cursor usage from the local SQLite database and aggregate it into one
 * session per composer. Only SELECT queries are issued (through
 * `runCursorQuery`); the result is returned to the caller.
 *
 * Per session the result carries token totals, the number of deduplicated
 * token-bearing bubbles, resolved start/end timestamps with their
 * provenance, and per-UTC-day token buckets.
 *
 * Dedup: within a composer, one bubble is kept per
 * (serverBubbleId ?? bubbleId); on collision the one with the larger
 * input+output total wins.
 *
 * Model: the single model name seen on any bubble of the composer, "mixed"
 * when several distinct names were seen, "unknown" when none was.
 *
 * Only composers with at least one token-bearing bubble produce a session.
 * Sessions are sorted by total tokens, descending.
 *
 * @returns {{sessions: object[], filePath: string}}
 * @throws {{code: string, message: string}} Whatever `runCursorQuery` throws
 *   (missing DB, missing sqlite3 binary, query or output failure).
 */
export function parseCursorUsage() {
    const dbPath = getCursorDbPath();
    const bubbleRows = runCursorQuery("SELECT key, value FROM cursorDiskKV WHERE key LIKE 'bubbleId:%'");
    const composerRows = runCursorQuery("SELECT key, value FROM cursorDiskKV WHERE key LIKE 'composerData:%'");
    // Return the map entry for `key`, creating it with `init` on first use.
    const fetchOrInit = (map, key, init) => {
        let slot = map.get(key);
        if (slot === undefined) {
            slot = init();
            map.set(key, slot);
        }
        return slot;
    };
    const tokenTotal = (usage) => usage.inputTokens + usage.outputTokens;
    // Composer-level metadata: the fallback source for timestamps.
    const composerMetaById = new Map();
    for (const row of composerRows) {
        const meta = parseComposerData(row);
        if (meta) {
            composerMetaById.set(meta.composerId, meta);
        }
    }
    // Bubble rows can carry a model name, token counts, and a timestamp
    // independently of each other, so each signal is collected separately:
    //   - modelsByComposer: every distinct model name seen on any bubble
    //   - bubbleCreatedAtsByComposer: every parsed bubble timestamp (not
    //     deduped; only min/max are taken from it)
    //   - tokenBubblesByComposer: deduplicated token-bearing bubbles
    const modelsByComposer = new Map();
    const bubbleCreatedAtsByComposer = new Map();
    const tokenBubblesByComposer = new Map();
    for (const row of bubbleRows) {
        const bubble = parseBubble(row);
        if (!bubble) {
            continue;
        }
        const owner = bubble.composerId;
        if (bubble.modelName) {
            fetchOrInit(modelsByComposer, owner, () => new Set()).add(bubble.modelName);
        }
        if (bubble.createdAtMs !== null) {
            fetchOrInit(bubbleCreatedAtsByComposer, owner, () => []).push(bubble.createdAtMs);
        }
        if (!hasTokenUsage(bubble)) {
            continue;
        }
        const dedupMap = fetchOrInit(tokenBubblesByComposer, owner, () => new Map());
        const dedupKey = bubble.serverBubbleId ?? bubble.bubbleId;
        const incumbent = dedupMap.get(dedupKey);
        if (incumbent === undefined || tokenTotal(bubble) > tokenTotal(incumbent)) {
            dedupMap.set(dedupKey, bubble);
        }
    }
    // Fold each composer's deduplicated bubbles into one session record.
    const sessions = [];
    for (const [composerId, dedupedBubbles] of tokenBubblesByComposer) {
        const messageCount = dedupedBubbles.size;
        if (messageCount === 0) {
            continue;
        }
        const composerMeta = composerMetaById.get(composerId);
        const composerBucketMs = composerMeta?.createdAtMs ?? null;
        // Daily bucketing: a bubble lands on the day of its own createdAt;
        // without one it lands on the day of the composer's createdAt so its
        // tokens still appear in the daily view. A bubble with neither
        // contributes to the session totals only.
        const dailyUsage = {};
        let usedComposerFallback = false;
        let inputTokens = 0;
        let outputTokens = 0;
        for (const bubble of dedupedBubbles.values()) {
            inputTokens += bubble.inputTokens;
            outputTokens += bubble.outputTokens;
            const hasOwnTimestamp = bubble.createdAtMs !== null;
            const bucketMs = hasOwnTimestamp ? bubble.createdAtMs : composerBucketMs;
            if (bucketMs === null) {
                continue;
            }
            if (!hasOwnTimestamp) {
                usedComposerFallback = true;
            }
            const bucket = (dailyUsage[msToUtcDateKey(bucketMs)] ??= createEmptyTokenUsage());
            bucket.inputTokens += bubble.inputTokens;
            bucket.outputTokens += bubble.outputTokens;
        }
        const modelsSeen = modelsByComposer.get(composerId) ?? new Set();
        let model = "mixed";
        if (modelsSeen.size === 0) {
            model = "unknown";
        }
        else if (modelsSeen.size === 1) {
            [model] = modelsSeen;
        }
        const timing = resolveSessionTimestamps(bubbleCreatedAtsByComposer.get(composerId) ?? [], composerMeta);
        // dailyUsageSource:
        //   "none"     — no bucket could be dated; dailyUsage is empty
        //   "composer" — at least one bucket used the composer fallback, so
        //                the whole per-day view is treated as approximate
        //   "bubble"   — every bucket was dated by its own bubble
        let dailyUsageSource = "bubble";
        if (Object.keys(dailyUsage).length === 0) {
            dailyUsageSource = "none";
        }
        else if (usedComposerFallback) {
            dailyUsageSource = "composer";
        }
        sessions.push({
            sessionId: composerId,
            workspaceHash: null,
            workspaceName: null,
            model,
            tokens: {
                inputTokens,
                outputTokens,
                cacheCreationTokens: 0,
                cacheReadTokens: 0,
            },
            messageCount,
            filePath: dbPath,
            startTime: timing.startTime,
            endTime: timing.endTime,
            timestampSource: timing.source,
            timestampQuality: timing.quality,
            dailyUsage,
            dailyUsageSource,
        });
    }
    // Largest sessions first; callers may re-sort chronologically.
    sessions.sort((a, b) => tokenTotal(b.tokens) - tokenTotal(a.tokens));
    return {
        sessions,
        filePath: dbPath,
    };
}
610
/**
 * Former name of `parseCursorUsage`, kept as an alias so existing callers
 * keep working for one release after the rename. Both names refer to the
 * same function object.
 *
 * @deprecated Call `parseCursorUsage` instead.
 */
export const parseCursorUsageDryRun = parseCursorUsage;
615
/**
 * Shorten an identifier to at most its first 8 characters for display in
 * transparency output: enough for a person to tell keys apart, too little to
 * act as a stable correlation handle if the output leaks.
 *
 * @param {string} id - Full identifier (typically a UUID).
 * @returns {string} `id` unchanged when it has 8 characters or fewer,
 *   otherwise its first 8 characters.
 */
function truncateId(id) {
    return id.slice(0, 8);
}
624
/**
 * Collect transparency metadata about the Cursor database: file size, table
 * names, a histogram of cursorDiskKV key prefixes, and up to five sample
 * bubble and composer keys with their identifiers truncated.
 *
 * @returns {{filePath: string, dbFileSize: number, tables: string[], keyPrefixes: Object<string, number>, sampleBubbleKeys: string[], sampleComposerKeys: string[]}}
 *   `dbFileSize` is 0 when the file cannot be stat'ed.
 * @throws {{code: string, message: string}} CURSOR_DB_NOT_FOUND when the
 *   database file is missing, plus anything `runCursorQuery` throws.
 */
export function getCursorMeta() {
    const dbPath = getCursorDbPath();
    if (!existsSync(dbPath)) {
        throw {
            code: "CURSOR_DB_NOT_FOUND",
            message: `Cursor SQLite database not found at ${dbPath}. Make sure Cursor is installed and you have used it at least once. Set COSTHAWK_CURSOR_DB_PATH to override.`,
        };
    }
    // The size is informational only, so a failed stat degrades to 0.
    let dbFileSize;
    try {
        dbFileSize = statSync(dbPath).size;
    }
    catch {
        dbFileSize = 0;
    }
    // Every query aliases its columns to key/value because runCursorQuery
    // only returns rows of that shape with string values.
    const tables = runCursorQuery("SELECT name AS key, 'table' AS value FROM sqlite_master WHERE type='table' ORDER BY name").map(({ key }) => key);
    // Prefix = key text before the first colon, or the whole key when there
    // is none. The count is cast to TEXT to satisfy the row filter, while
    // ORDER BY uses the numeric count(*) so that ordering is numeric rather
    // than lexicographic.
    const prefixRows = runCursorQuery("SELECT CASE WHEN instr(key,':')>0 THEN substr(key,1,instr(key,':')-1) ELSE key END AS key, CAST(count(*) AS TEXT) AS value FROM cursorDiskKV GROUP BY 1 ORDER BY count(*) DESC");
    const keyPrefixes = {};
    for (const { key: prefix, value: countText } of prefixRows) {
        const count = Number.parseInt(countText, 10);
        if (Number.isFinite(count)) {
            keyPrefixes[prefix] = count;
        }
    }
    const bubbleSampleRows = runCursorQuery("SELECT key, '' AS value FROM cursorDiskKV WHERE key LIKE 'bubbleId:%' LIMIT 5");
    const composerSampleRows = runCursorQuery("SELECT key, '' AS value FROM cursorDiskKV WHERE key LIKE 'composerData:%' LIMIT 5");
    // Keys that do not match the expected shape are passed through as-is.
    const redactBubbleKey = (key) => {
        const parts = BUBBLE_KEY_REGEX.exec(key);
        return parts
            ? `bubbleId:${truncateId(parts[1])}:${truncateId(parts[2])}`
            : key;
    };
    const redactComposerKey = (key) => {
        const parts = COMPOSER_KEY_REGEX.exec(key);
        return parts ? `composerData:${truncateId(parts[1])}` : key;
    };
    const sampleBubbleKeys = bubbleSampleRows.map((row) => redactBubbleKey(row.key));
    const sampleComposerKeys = composerSampleRows.map((row) => redactComposerKey(row.key));
    return {
        filePath: dbPath,
        dbFileSize,
        tables,
        keyPrefixes,
        sampleBubbleKeys,
        sampleComposerKeys,
    };
}
688
/**
 * Run a parser health check against the live database and classify the
 * outcome.
 *
 * - FAIL: an unrecoverable problem was recorded in `errors` (database
 *   missing, query failure, or the parser itself threw).
 * - DEGRADED: the parser ran but at least one warning was recorded (sessions
 *   without any timestamp source, start/end ordering violations, or bubbles
 *   stamped more than the tolerance before their composer).
 * - PASS: no errors and no warnings.
 *
 * Failures of the queries and of the parser are caught and reported in the
 * returned structure rather than propagated. Errors thrown by this module
 * are plain `{code, message}` objects, so messages are extracted from both
 * Error instances and such objects.
 *
 * @returns {object} Report with filePath, dbExists, sqlite3Path, canQuery,
 *   tokenBubbleCount, composerCount, sessionsWithTokens, timestampCoverage,
 *   invariantChecks, warnings, errors, and overallStatus.
 */
export function runCursorSelfTest() {
    const dbPath = getCursorDbPath();
    const sqlite3Path = getSqlite3Path();
    const errors = [];
    const warnings = [];
    const invariantChecks = [];
    const result = {
        filePath: dbPath,
        dbExists: existsSync(dbPath),
        sqlite3Path,
        canQuery: false,
        tokenBubbleCount: 0,
        composerCount: 0,
        sessionsWithTokens: 0,
        timestampCoverage: {
            bubblesWithCreatedAt: 0,
            totalBubbles: 0,
            composersWithCreatedAt: 0,
            totalComposers: 0,
        },
        invariantChecks,
        warnings,
        errors,
        overallStatus: "FAIL",
    };
    // Pull a readable message out of whatever was thrown: a real Error, a
    // plain {code, message} parser error, or anything else (-> fallback).
    const messageOf = (err, fallback) => {
        if (err instanceof Error) {
            return err.message;
        }
        if (typeof err === "object" && err !== null && "message" in err) {
            return String(err.message);
        }
        return fallback;
    };
    if (!result.dbExists) {
        errors.push(`Cursor SQLite database not found at ${dbPath}. Set COSTHAWK_CURSOR_DB_PATH to override.`);
        return result;
    }
    let bubbleRows;
    let composerRows;
    try {
        bubbleRows = runCursorQuery("SELECT key, value FROM cursorDiskKV WHERE key LIKE 'bubbleId:%'");
        composerRows = runCursorQuery("SELECT key, value FROM cursorDiskKV WHERE key LIKE 'composerData:%'");
        result.canQuery = true;
    }
    catch (err) {
        const code = isCursorParserError(err) ? err.code : "UNKNOWN";
        errors.push(`[${code}] ${messageOf(err, "Unknown error")}`);
        return result;
    }
    result.timestampCoverage.totalBubbles = bubbleRows.length;
    result.timestampCoverage.totalComposers = composerRows.length;
    // Count bubbles with a usable createdAt (string or number), using the
    // same field parser as parseBubble so the reported coverage matches what
    // the parser will actually use.
    for (const row of bubbleRows) {
        let obj;
        try {
            obj = JSON.parse(row.value);
        }
        catch {
            continue;
        }
        if (typeof obj !== "object" || obj === null) {
            continue;
        }
        if (parseTimestampField(obj.createdAt) !== null) {
            result.timestampCoverage.bubblesWithCreatedAt += 1;
        }
    }
    // Parse composer metadata once: it feeds both the coverage count here
    // and the skew check (invariant 4) below.
    const composerMetaByIdForCheck = new Map();
    for (const row of composerRows) {
        const meta = parseComposerData(row);
        if (!meta) {
            continue;
        }
        composerMetaByIdForCheck.set(meta.composerId, meta);
        if (meta.createdAtMs !== null) {
            result.timestampCoverage.composersWithCreatedAt += 1;
        }
    }
    // Invariant 1: the parser runs without throwing.
    let parserResult = null;
    try {
        parserResult = parseCursorUsage();
        invariantChecks.push({ name: "parser_runs", passed: true });
    }
    catch (err) {
        // String(err) alone would render a thrown {code, message} object as
        // "[object Object]"; it is only the last-resort fallback.
        const message = messageOf(err, String(err));
        invariantChecks.push({
            name: "parser_runs",
            passed: false,
            details: message,
        });
        errors.push(`Parser threw: ${message}`);
        return result;
    }
    result.sessionsWithTokens = parserResult.sessions.length;
    result.tokenBubbleCount = parserResult.sessions.reduce((acc, s) => acc + s.messageCount, 0);
    result.composerCount = composerRows.length;
    // Invariant 2: every session resolved some timestamp source.
    const sessionsWithoutTiming = parserResult.sessions.filter((s) => s.timestampSource === "none");
    if (sessionsWithoutTiming.length > 0) {
        invariantChecks.push({
            name: "all_sessions_have_timestamp_source",
            passed: false,
            details: `${sessionsWithoutTiming.length} sessions have timestampSource="none"`,
        });
        warnings.push(`${sessionsWithoutTiming.length}/${parserResult.sessions.length} sessions have no parseable timestamp source. They will appear with null startTime/endTime in usage output.`);
    }
    else {
        invariantChecks.push({
            name: "all_sessions_have_timestamp_source",
            passed: true,
        });
    }
    // Invariant 3: for every session that resolved start AND end, start <= end.
    const badOrdering = parserResult.sessions.filter((s) => s.startTime !== null &&
        s.endTime !== null &&
        Date.parse(s.startTime) > Date.parse(s.endTime));
    if (badOrdering.length > 0) {
        invariantChecks.push({
            name: "start_time_le_end_time",
            passed: false,
            details: `${badOrdering.length} sessions violate start <= end`,
        });
        warnings.push(`${badOrdering.length} sessions have startTime > endTime after resolution. This is a parser bug — please report.`);
    }
    else {
        invariantChecks.push({
            name: "start_time_le_end_time",
            passed: true,
        });
    }
    // Invariant 4: where both bubble and composer timestamps exist, the
    // earliest bubble must not precede composerData.createdAt by more than
    // the tolerance. The per-composer minimum is derived from bubbleRows
    // directly so the check does not depend on the main aggregation logic.
    const minBubbleCreatedAtByComposer = new Map();
    for (const row of bubbleRows) {
        const bubble = parseBubble(row);
        if (!bubble || bubble.createdAtMs === null) {
            continue;
        }
        const prior = minBubbleCreatedAtByComposer.get(bubble.composerId);
        if (prior === undefined || bubble.createdAtMs < prior) {
            minBubbleCreatedAtByComposer.set(bubble.composerId, bubble.createdAtMs);
        }
    }
    let skewWarnings = 0;
    for (const [composerId, minBubbleMs] of minBubbleCreatedAtByComposer) {
        const meta = composerMetaByIdForCheck.get(composerId);
        if (!meta || meta.createdAtMs === null) {
            continue;
        }
        const skew = meta.createdAtMs - minBubbleMs;
        if (skew > INVARIANT_SKEW_TOLERANCE_MS) {
            skewWarnings += 1;
        }
    }
    if (skewWarnings > 0) {
        invariantChecks.push({
            name: "bubble_composer_createdat_skew",
            passed: false,
            details: `${skewWarnings} composers where min(bubble.createdAt) is more than ${INVARIANT_SKEW_TOLERANCE_MS / 1000}s before composerData.createdAt`,
        });
        warnings.push(`${skewWarnings} composers show unexpected clock skew between bubble and composer timestamps. Values are still usable but may indicate schema drift.`);
    }
    else {
        invariantChecks.push({
            name: "bubble_composer_createdat_skew",
            passed: true,
        });
    }
    if (errors.length > 0) {
        result.overallStatus = "FAIL";
    }
    else if (warnings.length > 0) {
        result.overallStatus = "DEGRADED";
    }
    else {
        result.overallStatus = "PASS";
    }
    return result;
}
885
+ // Re-export the type guard so the MCP tool registration in index.ts can
886
+ // distinguish CursorParserError from generic Error in its catch block.
887
+ export { isCursorParserError };
888
+ //# sourceMappingURL=cursor-parser.js.map