@aperdomoll90/ledger-ai 1.3.0 → 1.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. package/dist/cli.js +177 -221
  2. package/dist/commands/add.js +51 -100
  3. package/dist/commands/backfill.js +55 -0
  4. package/dist/commands/backup.js +10 -10
  5. package/dist/commands/check.js +21 -29
  6. package/dist/commands/config.js +13 -12
  7. package/dist/commands/delete.js +22 -17
  8. package/dist/commands/eval-judge.js +11 -0
  9. package/dist/commands/eval.js +321 -0
  10. package/dist/commands/export.js +8 -10
  11. package/dist/commands/get.js +9 -0
  12. package/dist/commands/hunt.js +206 -0
  13. package/dist/commands/ingest.js +15 -14
  14. package/dist/commands/init.js +18 -20
  15. package/dist/commands/list.js +21 -7
  16. package/dist/commands/migrate.js +11 -11
  17. package/dist/commands/onboard.js +2 -2
  18. package/dist/commands/pull.js +3 -2
  19. package/dist/commands/push.js +8 -8
  20. package/dist/commands/restore.js +38 -38
  21. package/dist/commands/show.js +13 -16
  22. package/dist/commands/sync.js +58 -19
  23. package/dist/commands/tag.js +20 -14
  24. package/dist/commands/update.js +50 -18
  25. package/dist/commands/wizard.js +3 -3
  26. package/dist/lib/ai-search.js +163 -0
  27. package/dist/lib/audit.js +19 -0
  28. package/dist/lib/backfill.js +60 -0
  29. package/dist/lib/config.js +19 -2
  30. package/dist/lib/document-classification.js +5 -0
  31. package/dist/lib/document-fetching.js +77 -0
  32. package/dist/lib/document-operations.js +150 -0
  33. package/dist/lib/documents/classification.js +5 -0
  34. package/dist/lib/documents/fetching.js +89 -0
  35. package/dist/lib/documents/operations.js +304 -0
  36. package/dist/lib/domains.js +116 -0
  37. package/dist/lib/embeddings.js +190 -0
  38. package/dist/lib/errors.js +3 -1
  39. package/dist/lib/eval/eval-advanced.js +289 -0
  40. package/dist/lib/eval/eval-judge-session.js +233 -0
  41. package/dist/lib/eval/eval-store.js +105 -0
  42. package/dist/lib/eval/eval.js +303 -0
  43. package/dist/lib/file-writer.js +23 -0
  44. package/dist/lib/generators.js +44 -45
  45. package/dist/lib/hunter-db.js +235 -0
  46. package/dist/lib/hunter-rss.js +30 -0
  47. package/dist/lib/hunter-scoring.js +55 -0
  48. package/dist/lib/hunter-types.js +36 -0
  49. package/dist/lib/lint-configs.js +20 -0
  50. package/dist/lib/migrate.js +2 -2
  51. package/dist/lib/notes.js +173 -59
  52. package/dist/lib/observability.js +296 -0
  53. package/dist/lib/op-add-note-types.test.js +7 -6
  54. package/dist/lib/prompt.js +8 -8
  55. package/dist/lib/rate-limiter.js +103 -0
  56. package/dist/lib/search/ai-search.js +396 -0
  57. package/dist/lib/search/chunk-context-enrichment.js +155 -0
  58. package/dist/lib/search/embeddings.js +293 -0
  59. package/dist/lib/search/reranker.js +120 -0
  60. package/dist/lib/search/semantic-cache.js +53 -0
  61. package/dist/lib/type-registry.test.js +6 -6
  62. package/dist/mcp-server.js +553 -66
  63. package/dist/migrations/migrations/005-audit-log.sql +22 -0
  64. package/dist/migrations/migrations/005_opportunities.sql +48 -0
  65. package/dist/migrations/migrations/006-audited-operations.sql +235 -0
  66. package/dist/migrations/migrations/006_hunt_analytics.sql +38 -0
  67. package/dist/migrations/migrations/007-eval-golden-judgments.sql +119 -0
  68. package/dist/migrations/migrations/008-drop-expected-doc-ids.sql +9 -0
  69. package/dist/migrations/migrations/008-judge-helpers.sql +21 -0
  70. package/dist/migrations/migrations/009-semantic-cache.sql +216 -0
  71. package/dist/scripts/batch-grade.js +344 -0
  72. package/dist/scripts/benchmark-ingestion.js +376 -0
  73. package/dist/scripts/convert-judgments-to-graded.js +88 -0
  74. package/dist/scripts/diagnose-first-result.js +333 -0
  75. package/dist/scripts/drop-golden-query.js +53 -0
  76. package/dist/scripts/eval-search.js +115 -0
  77. package/dist/scripts/grade-unjudged-top1.js +138 -0
  78. package/dist/scripts/hunter-analytics.js +38 -0
  79. package/dist/scripts/hunter-cron.js +63 -0
  80. package/dist/scripts/hunter-purge.js +25 -0
  81. package/dist/scripts/migrate-v2.js +140 -0
  82. package/dist/scripts/reindex.js +74 -0
  83. package/dist/scripts/sync-local-docs.js +153 -0
  84. package/package.json +7 -1
@@ -0,0 +1,55 @@
1
+ import { backfillMetadata } from '../lib/backfill.js';
2
/**
 * Rewrite the metadata of every row in the `notes` table into the shape
 * returned by `backfillMetadata`, skipping rows that are already up to date.
 *
 * Progress and the final summary are written to stderr.
 *
 * @param {object} config - CLI config; `config.supabase` is the query client.
 * @param {object} options
 * @param {boolean} [options.dryRun] - When truthy, only print what would
 *   change; no `update` is issued.
 * @returns {Promise<void>}
 */
export async function backfill(config, options) {
  const { dryRun } = options;
  console.error('Fetching all notes...');
  // NOTE(review): this is a single unpaginated select. If the backend caps the
  // number of rows returned per request, later notes would silently be left
  // out of the backfill — confirm against the deployment's row limit.
  const { data: notes, error } = await config.supabase
    .from('notes')
    .select('id, metadata')
    .order('id', { ascending: true });
  if (error) {
    console.error(`Error fetching notes: ${error.message}`);
    process.exit(1);
  }
  if (!notes || notes.length === 0) {
    console.error('No notes found.');
    return;
  }
  console.error(`Found ${notes.length} notes. Running v2 backfill...`);
  let migrated = 0;
  let skipped = 0;
  let errors = 0;
  for (const note of notes) {
    const oldMeta = note.metadata;
    const newMeta = backfillMetadata(oldMeta);
    // Idempotent skip: identical serialisation means nothing to write.
    if (JSON.stringify(oldMeta) === JSON.stringify(newMeta)) {
      skipped++;
      continue;
    }
    if (dryRun) {
      // A note may carry no metadata at all (null); optional chaining keeps
      // the preview from crashing on such rows.
      const oldType = oldMeta?.type ?? '?';
      const newType = newMeta?.type ?? '?';
      const domain = newMeta?.domain ?? '?';
      const key = oldMeta?.upsert_key ?? `id-${note.id}`;
      console.error(` [${note.id}] ${key}: ${oldType} → ${domain}/${newType}`);
      migrated++;
      continue;
    }
    const { error: updateError } = await config.supabase
      .from('notes')
      .update({ metadata: newMeta })
      .eq('id', note.id);
    if (updateError) {
      // Keep going: one failed row should not abort the rest of the backfill.
      console.error(` [${note.id}] ERROR: ${updateError.message}`);
      errors++;
    } else {
      migrated++;
    }
  }
  console.error(`\nBackfill ${dryRun ? '(dry run) ' : ''}complete:`);
  console.error(` ${migrated} migrated, ${skipped} already up-to-date, ${errors} errors`);
  if (dryRun && migrated > 0) {
    console.error('\nRun without --dry-run to apply changes.');
  }
}
@@ -6,10 +6,10 @@ export async function backup(config, options) {
6
6
  const { quiet } = options;
7
7
  const backupDir = resolve(getLedgerDir(), 'backups');
8
8
  mkdirSync(backupDir, { recursive: true });
9
- // Fetch all notes (not just cached)
9
+ // Fetch all documents (not just cached)
10
10
  const { data, error } = await config.supabase
11
- .from('notes')
12
- .select('id, content, metadata, created_at, updated_at')
11
+ .from('documents')
12
+ .select('id, name, domain, document_type, project, protection, content, description, status, created_at, updated_at')
13
13
  .order('id', { ascending: true });
14
14
  if (error) {
15
15
  console.error(`Backup failed: ${error.message}`);
@@ -17,7 +17,7 @@ export async function backup(config, options) {
17
17
  }
18
18
  if (!data || data.length === 0) {
19
19
  if (!quiet)
20
- console.error('No notes to backup.');
20
+ console.error('No documents to backup.');
21
21
  return;
22
22
  }
23
23
  const date = new Date().toISOString().split('T')[0];
@@ -25,7 +25,7 @@ export async function backup(config, options) {
25
25
  writeFileSync(filePath, JSON.stringify(data, null, 2) + '\n');
26
26
  // Keep last 5 backups, delete older
27
27
  const backups = readdirSync(backupDir)
28
- .filter(f => f.endsWith('.json'))
28
+ .filter(file => file.endsWith('.json'))
29
29
  .sort()
30
30
  .reverse();
31
31
  for (const old of backups.slice(5)) {
@@ -34,7 +34,7 @@ export async function backup(config, options) {
34
34
  console.error(` deleted old backup: ${old}`);
35
35
  }
36
36
  if (!quiet) {
37
- console.error(`Backed up ${data.length} notes to ${filePath}`);
37
+ console.error(`Backed up ${data.length} documents to ${filePath}`);
38
38
  }
39
39
  console.log(filePath);
40
40
  }
@@ -59,8 +59,8 @@ export function enableBackupCron() {
59
59
  throw new Error(result.stderr?.toString() || 'crontab failed');
60
60
  console.error('Daily backup enabled (1am). View with `crontab -l`.');
61
61
  }
62
- catch (e) {
63
- console.error(`Failed to set cron: ${e.message}`);
62
+ catch (cronError) {
63
+ console.error(`Failed to set cron: ${cronError.message}`);
64
64
  console.error(`Add manually: ${cronLine}`);
65
65
  }
66
66
  }
@@ -83,7 +83,7 @@ export function disableBackupCron() {
83
83
  throw new Error(result.stderr?.toString() || 'crontab failed');
84
84
  console.error('Backup cron disabled.');
85
85
  }
86
- catch (e) {
87
- console.error(`Failed to update cron: ${e.message}`);
86
+ catch (cronError) {
87
+ console.error(`Failed to update cron: ${cronError.message}`);
88
88
  }
89
89
  }
@@ -1,6 +1,6 @@
1
1
  import { readFileSync, readdirSync, existsSync } from 'fs';
2
- import { resolve } from 'path';
3
- import { fetchNoteHashes, checkChunkIntegrity } from '../lib/notes.js';
2
+ import { resolve, basename } from 'path';
3
+ import { fetchSyncableDocuments } from '../lib/documents/fetching.js';
4
4
  import { contentHash } from '../lib/hash.js';
5
5
  export async function check(config) {
6
6
  const result = {
@@ -16,51 +16,55 @@ export async function check(config) {
16
16
  console.error('Memory directory not found. Run `ledger pull` first.');
17
17
  return result;
18
18
  }
19
- const noteHashes = await fetchNoteHashes(config.supabase);
20
- const notesByFile = new Map(noteHashes.map(n => [n.localFile, n]));
19
+ // Fetch all auto-load documents — these are the ones that sync locally
20
+ const syncableDocuments = await fetchSyncableDocuments(config.supabase);
21
+ // Build a map of local filename → document for comparison
22
+ const documentsByFile = new Map(syncableDocuments
23
+ .filter(document => document.file_path)
24
+ .map(document => [basename(document.file_path), document]));
21
25
  const localFiles = readdirSync(config.memoryDir)
22
- .filter(f => f.endsWith('.md') && f !== 'MEMORY.md');
26
+ .filter(file => file.endsWith('.md') && file !== 'MEMORY.md');
23
27
  for (const file of localFiles) {
24
28
  const filePath = resolve(config.memoryDir, file);
25
29
  const localContent = readFileSync(filePath, 'utf-8').trim();
26
30
  const localHash = contentHash(localContent);
27
- const note = notesByFile.get(file);
28
- if (!note) {
31
+ const document = documentsByFile.get(file);
32
+ if (!document) {
29
33
  console.error(` ${file} — unknown (not in Ledger)`);
30
34
  result.files.push({ file, state: 'unknown' });
31
35
  result.unknown++;
32
- notesByFile.delete(file);
36
+ documentsByFile.delete(file);
33
37
  continue;
34
38
  }
35
- const ledgerHash = contentHash(note.content);
36
- const storedHash = note.contentHash;
39
+ const ledgerHash = contentHash(document.content);
40
+ const storedHash = document.content_hash;
37
41
  const localChanged = localHash !== storedHash;
38
42
  const ledgerChanged = ledgerHash !== storedHash;
39
43
  if (!localChanged && !ledgerChanged) {
40
44
  console.error(` ${file} — in sync`);
41
- result.files.push({ file, state: 'clean', noteId: note.id });
45
+ result.files.push({ file, state: 'clean', documentId: document.id });
42
46
  result.clean++;
43
47
  }
44
48
  else if (localChanged && !ledgerChanged) {
45
49
  console.error(` ${file} — modified locally`);
46
- result.files.push({ file, state: 'modified', noteId: note.id });
50
+ result.files.push({ file, state: 'modified', documentId: document.id });
47
51
  result.modified++;
48
52
  }
49
53
  else if (!localChanged && ledgerChanged) {
50
54
  console.error(` ${file} — updated in Ledger`);
51
- result.files.push({ file, state: 'upstream', noteId: note.id });
55
+ result.files.push({ file, state: 'upstream', documentId: document.id });
52
56
  result.upstream++;
53
57
  }
54
58
  else {
55
59
  console.error(` ${file} — CONFLICT (both changed)`);
56
- result.files.push({ file, state: 'conflict', noteId: note.id });
60
+ result.files.push({ file, state: 'conflict', documentId: document.id });
57
61
  result.conflicts++;
58
62
  }
59
- notesByFile.delete(file);
63
+ documentsByFile.delete(file);
60
64
  }
61
- for (const [file, note] of notesByFile) {
65
+ for (const [file, document] of documentsByFile) {
62
66
  console.error(` ${file} — missing locally (exists in Ledger)`);
63
- result.files.push({ file, state: 'deleted', noteId: note.id });
67
+ result.files.push({ file, state: 'deleted', documentId: document.id });
64
68
  result.deleted++;
65
69
  }
66
70
  const summary = [
@@ -77,15 +81,3 @@ export async function check(config) {
77
81
  }
78
82
  return result;
79
83
  }
80
- export async function checkChunks(config) {
81
- console.error('Checking chunk integrity...');
82
- const result = await checkChunkIntegrity(config.supabase);
83
- if (result.incompleteGroups.length === 0) {
84
- console.log('All chunk groups are complete.');
85
- return;
86
- }
87
- console.error(`Found ${result.incompleteGroups.length} incomplete chunk group(s):`);
88
- for (const group of result.incompleteGroups) {
89
- console.error(` group ${group.groupId}: expected ${group.expected} chunks, found ${group.found}`);
90
- }
91
- }
@@ -1,5 +1,5 @@
1
1
  import { getLedgerDir, saveConfigFile, loadConfigFile } from '../lib/config.js';
2
- import { BUILTIN_TYPES, getTypeRegistry, opUpdateMetadata, validateTypeName } from '../lib/notes.js';
2
+ import { BUILTIN_TYPES, getTypeRegistry, opUpdateMetadata, validateTypeName, inferDomain } from '../lib/notes.js';
3
3
  import { choose, confirm } from '../lib/prompt.js';
4
4
  import { resolve } from 'path';
5
5
  const CONFIG_PATH = resolve(getLedgerDir(), 'config.json');
@@ -156,7 +156,7 @@ export async function configSet(key, value, clients) {
156
156
  const typeName = key.slice(6);
157
157
  const delivery = value;
158
158
  if (!['persona', 'project', 'knowledge', 'protected'].includes(delivery)) {
159
- console.error(`Invalid delivery tier: "${value}". Must be: persona, project, knowledge, or protected.`);
159
+ console.error(`Invalid domain/delivery: "${value}". Must be: persona, project, knowledge, or protected.`);
160
160
  process.exit(1);
161
161
  }
162
162
  const nameError = validateTypeName(typeName);
@@ -172,30 +172,31 @@ export async function configSet(key, value, clients) {
172
172
  const isBuiltin = typeName in BUILTIN_TYPES;
173
173
  const action = isBuiltin ? 'overridden' : 'registered';
174
174
  console.error(`types.${typeName}: ${delivery} (${action})`);
175
- // Delivery change propagation — only if we have DB access and delivery actually changed
175
+ // Domain change propagation — only if we have DB access and delivery actually changed
176
176
  if (clients && oldDelivery && oldDelivery !== delivery) {
177
+ const newDomain = inferDomain(typeName);
177
178
  const { data: notes } = await clients.supabase
178
179
  .from('notes')
179
180
  .select('id, metadata')
180
181
  .eq('metadata->>type', typeName);
181
- const affected = (notes ?? []).filter((n) => n.metadata.delivery !== delivery);
182
+ const affected = (notes ?? []).filter((n) => n.metadata.domain !== newDomain);
182
183
  if (affected.length > 0) {
183
- console.error(`\n${affected.length} note(s) currently have a different delivery:`);
184
+ console.error(`\n${affected.length} note(s) currently have a different domain:`);
184
185
  for (const note of affected) {
185
186
  const meta = note.metadata;
186
187
  const uKey = meta.upsert_key || `id-${note.id}`;
187
- console.error(` [${note.id}] ${uKey} — delivery: ${meta.delivery}`);
188
+ console.error(` [${note.id}] ${uKey} — domain: ${meta.domain}`);
188
189
  }
189
- const action = await choose('\nUpdate delivery on these notes?', [
190
+ const action = await choose('\nUpdate domain on these notes?', [
190
191
  'all — update all notes',
191
192
  'select — choose individually',
192
193
  'none — only affect new notes',
193
194
  ]);
194
195
  if (action.startsWith('all')) {
195
196
  for (const note of affected) {
196
- await opUpdateMetadata(clients, note.id, { delivery });
197
+ await opUpdateMetadata(clients, note.id, { domain: newDomain });
197
198
  }
198
- console.error(`Updated delivery to "${delivery}" on ${affected.length} note(s).`);
199
+ console.error(`Updated domain to "${newDomain}" on ${affected.length} note(s).`);
199
200
  }
200
201
  else if (action.startsWith('select')) {
201
202
  let updated = 0;
@@ -204,11 +205,11 @@ export async function configSet(key, value, clients) {
204
205
  const uKey = meta.upsert_key || `id-${note.id}`;
205
206
  const yes = await confirm(` Update [${note.id}] ${uKey}?`);
206
207
  if (yes) {
207
- await opUpdateMetadata(clients, note.id, { delivery });
208
+ await opUpdateMetadata(clients, note.id, { domain: newDomain });
208
209
  updated++;
209
210
  }
210
211
  }
211
- console.error(`Updated delivery on ${updated} note(s).`);
212
+ console.error(`Updated domain on ${updated} note(s).`);
212
213
  }
213
214
  }
214
215
  }
@@ -238,7 +239,7 @@ export async function configUnset(key, clients) {
238
239
  .select('id')
239
240
  .eq('metadata->>type', typeName);
240
241
  if (notes && notes.length > 0) {
241
- console.error(`\n${notes.length} note(s) use type "${typeName}". They will become unregistered (delivery defaults to "knowledge").`);
242
+ console.error(`\n${notes.length} note(s) use type "${typeName}". They will become unregistered (domain defaults to "project").`);
242
243
  const proceed = await confirm('Proceed?');
243
244
  if (!proceed) {
244
245
  console.error('Cancelled.');
@@ -1,22 +1,27 @@
1
- import { opDeleteNote } from '../lib/notes.js';
1
+ import { getDocumentById } from '../lib/documents/fetching.js';
2
+ import { deleteDocument as deleteDocumentOperation } from '../lib/documents/operations.js';
2
3
  import { confirm } from '../lib/prompt.js';
3
- export async function deleteNote(config, id) {
4
- const clients = { supabase: config.supabase, openai: config.openai };
5
- // First call: show confirmation
6
- const preview = await opDeleteNote(clients, id, false);
7
- if (preview.status === 'error') {
8
- console.error(preview.message);
4
+ export async function removeDocument(config, id, options = {}) {
5
+ const document = await getDocumentById(config.supabase, id);
6
+ if (!document) {
7
+ console.error(`Document ${id} not found.`);
9
8
  process.exit(1);
10
9
  }
11
- console.error(preview.message);
12
- const proceed = await confirm('\nProceed with deletion?');
13
- if (!proceed) {
14
- console.error('Cancelled.');
15
- return;
16
- }
17
- // Second call: execute
18
- const result = await opDeleteNote(clients, id, true);
19
- console.error(result.message);
20
- if (result.status === 'error')
10
+ if (document.protection === 'immutable') {
11
+ console.error(`Document "${document.name}" (id: ${id}) is immutable and cannot be deleted.`);
21
12
  process.exit(1);
13
+ }
14
+ console.error(`Document: "${document.name}" (id: ${id})`);
15
+ console.error(`Domain: ${document.domain} | Type: ${document.document_type}`);
16
+ console.error(`Protection: ${document.protection}`);
17
+ console.error(`Content preview: ${document.content.slice(0, 200)}${document.content.length > 200 ? '...' : ''}`);
18
+ if (!options.yes) {
19
+ const proceed = await confirm('\nProceed with deletion?');
20
+ if (!proceed) {
21
+ console.error('Cancelled.');
22
+ return;
23
+ }
24
+ }
25
+ await deleteDocumentOperation({ supabase: config.supabase, openai: config.openai }, id, 'cli');
26
+ console.error(`Document ${id} soft-deleted. Can be restored within 30 days.`);
22
27
  }
@@ -0,0 +1,11 @@
1
+ // eval-judge.ts
2
+ // CLI command for the graded-relevance rejudging walkthrough.
3
+ import { runJudgeSession } from '../lib/eval/eval-judge-session.js';
4
/**
 * CLI entry point for the graded-relevance rejudging walkthrough.
 *
 * Picks the three client handles off `config` and hands them, together with
 * `options.query`, to `runJudgeSession`.
 *
 * @param {object} config - Provides `supabase`, `openai` and `cohereApiKey`.
 * @param {object} options - `options.query` is forwarded unchanged.
 * @returns {Promise<void>}
 */
export async function evalJudge(config, options) {
  const { supabase, openai, cohereApiKey } = config;
  await runJudgeSession({ supabase, openai, cohereApiKey }, options.query);
}
@@ -0,0 +1,321 @@
1
+ import { randomUUID } from 'node:crypto';
2
+ import { searchHybrid } from '../lib/search/ai-search.js';
3
+ import { scoreTestCase, computeMetrics, formatReport, compareRuns, formatComparison } from '../lib/eval/eval.js';
4
+ import { saveEvalRun, loadPreviousRun, loadEvalRun, CURRENT_SEARCH_CONFIG } from '../lib/eval/eval-store.js';
5
+ import { computeConfidenceIntervals, computeScoreCalibration, computeCoverageAnalysis, formatAdvancedReport } from '../lib/eval/eval-advanced.js';
6
+ import { runEvalTrace, runEvalQuerySpan, startSpan } from '../lib/observability.js';
7
+ // Search config imported from eval-store.ts (single source of truth)
8
+ // =============================================================================
9
+ // Command
10
+ // =============================================================================
11
/**
 * Run every golden-dataset query through `searchHybrid`, print a per-query
 * status line plus the aggregate reports, and — unless `options.dryRun` is
 * set — store the run with `saveEvalRun`.
 *
 * If a previous run exists (per `loadPreviousRun`) the new metrics are
 * compared against it and the comparison is printed.
 *
 * @param {object} config - Provides `supabase`, `openai`, `cohereApiKey`.
 * @param {object} options
 * @param {boolean} [options.dryRun] - Skip persisting the run.
 * @returns {Promise<void>}
 */
export async function evalSearch(config, options) {
  // Tag every search trace from this eval run with environment=eval and a
  // shared session ID so eval traffic can be filtered from prod and all
  // queries in this run are grouped under one session.
  const evalRunUuid = randomUUID();
  const clients = {
    supabase: config.supabase,
    openai: config.openai,
    cohereApiKey: config.cohereApiKey,
    sessionId: `eval-${evalRunUuid}`,
    observabilityEnvironment: 'eval',
  };
  const banner = '='.repeat(60);
  console.log(`\n${banner}`);
  console.log('Ledger Search Evaluation');
  if (options.dryRun) {
    console.log('(dry run — results will not be saved)');
  }
  console.log(`${banner}\n`);
  const previousRun = await loadPreviousRun(clients.supabase);
  if (previousRun) {
    console.log(`Previous run: ${previousRun.run_date} (id: ${previousRun.id})\n`);
  } else {
    console.log('No previous run found — this will be the first stored run.\n');
  }
  const { data: testCases, error } = await clients.supabase
    .from('eval_golden_dataset')
    .select('id, query, tags, judgments:eval_golden_judgments(document_id, grade)')
    .order('id');
  if (error || !testCases) {
    process.stderr.write(`Failed to load golden dataset: ${error?.message ?? 'no data'}\n`);
    process.exit(1);
  }
  console.log(`Loaded ${testCases.length} test cases.\n`);
  await runEvalTrace({
    sessionId: clients.sessionId,
    tags: ['eval', 'run'],
    config: CURRENT_SEARCH_CONFIG,
    dryRun: options.dryRun,
  }, async (evalTrace) => {
    const results = [];
    for (const testCase of testCases) {
      // Documents graded 2 or higher are the ones a query is expected to find.
      const expectedDocs = testCase.judgments
        .filter((judgment) => judgment.grade >= 2)
        .map((judgment) => judgment.document_id);
      const scored = await runEvalQuerySpan({
        query: testCase.query,
        goldenId: testCase.id,
        tags: testCase.tags,
        expectedDocs,
      }, async (querySpan) => {
        const startTime = Date.now();
        const searchResults = await searchHybrid(clients, {
          query: testCase.query,
          limit: CURRENT_SEARCH_CONFIG.limit,
          reranker: CURRENT_SEARCH_CONFIG.reranker,
        });
        // Measure once, right after the search returns, so the span and the
        // scored result report the same latency (and scoring time is excluded).
        const responseTimeMs = Date.now() - startTime;
        const result = scoreTestCase(testCase, searchResults, responseTimeMs);
        querySpan.update({
          output: {
            hit: result.hit,
            firstResultHit: result.firstResultHit,
            position: result.position,
            reciprocalRank: result.reciprocalRank,
            normalizedDiscountedCumulativeGain: result.normalizedDiscountedCumulativeGain,
            responseTimeMs,
          },
        });
        return result;
      });
      results.push(scored);
      const isOutOfScope = expectedDocs.length === 0;
      if (isOutOfScope) {
        const status = scored.hit ? 'PASS' : `NOISE (${scored.returnedIds.length} results)`;
        console.log(` [${status}] "${testCase.query}" (out-of-scope)`);
      } else {
        const status = scored.firstResultHit ? 'TOP' : scored.hit ? 'HIT' : 'MISS';
        const positionInfo = scored.position !== null ? `@${scored.position + 1}` : '';
        console.log(` [${status}${positionInfo}] "${testCase.query}" → found ${scored.expectedFound}/${scored.expectedTotal}`);
      }
    }
    const metrics = computeMetrics(results);
    console.log(`\n${formatReport(metrics)}`);
    // Advanced analysis
    const confidenceIntervals = computeConfidenceIntervals(results);
    const scoreCalibration = computeScoreCalibration(results);
    const coverageAnalysis = computeCoverageAnalysis(results);
    // Eval analysis span
    const analysisSpan = startSpan('eval-analysis');
    let comparisonSeverity = 'none';
    try {
      analysisSpan.update({
        input: {
          testCaseCount: results.length,
          normalCount: metrics.normalCases,
          outOfScopeCount: metrics.outOfScopeCases,
        },
      });
      if (!options.dryRun) {
        const runId = await saveEvalRun(clients.supabase, {
          metrics,
          config: CURRENT_SEARCH_CONFIG,
          results,
          confidenceIntervals,
          scoreCalibration,
          coverageAnalysis,
        });
        process.stderr.write(`\nRun saved to eval_runs (id: ${runId})\n`);
      }
      if (previousRun) {
        const comparison = compareRuns({
          hitRate: metrics.hitRate,
          firstResultAccuracy: metrics.firstResultAccuracy,
          recall: metrics.recall,
          zeroResultRate: metrics.zeroResultRate,
          meanReciprocalRank: metrics.meanReciprocalRank,
          normalizedDiscountedCumulativeGain: metrics.normalizedDiscountedCumulativeGain,
          avgResponseTimeMs: metrics.avgResponseTimeMs,
        }, {
          hitRate: previousRun.hit_rate,
          firstResultAccuracy: previousRun.first_result_accuracy,
          recall: previousRun.recall,
          zeroResultRate: previousRun.zero_result_rate,
          // Older stored runs may not have these two columns populated.
          meanReciprocalRank: previousRun.mean_reciprocal_rank ?? 0,
          normalizedDiscountedCumulativeGain: previousRun.normalized_discounted_cumulative_gain ?? 0,
          avgResponseTimeMs: previousRun.avg_response_time_ms,
        });
        console.log(`\n${formatComparison(comparison)}`);
        comparisonSeverity = comparison.severity;
      }
      analysisSpan.update({
        output: { metrics, comparisonSeverity },
      });
    } finally {
      // Close the span even when saving or comparing throws, so it is not
      // left open on the failure path.
      analysisSpan.end();
    }
    evalTrace.update({
      output: {
        hitRate: metrics.hitRate,
        firstResultAccuracy: metrics.firstResultAccuracy,
        recall: metrics.recall,
        meanReciprocalRank: metrics.meanReciprocalRank,
        normalizedDiscountedCumulativeGain: metrics.normalizedDiscountedCumulativeGain,
        comparisonSeverity,
      },
    });
    console.log(`\n${formatAdvancedReport(confidenceIntervals, scoreCalibration, coverageAnalysis)}`);
  });
}
156
+ // =============================================================================
157
+ // Threshold sweep — test multiple thresholds to find optimal value
158
+ // =============================================================================
159
/**
 * Run the golden dataset at multiple similarity thresholds and compare.
 * Prints a table showing how each metric changes with the threshold.
 *
 * Usage: ledger eval:sweep
 * ledger eval:sweep --thresholds 0.15,0.20,0.25,0.30,0.35,0.40
 *
 * Only values strictly between 0 and 1 are kept; anything else in the list is
 * dropped. If nothing usable remains (including when no list was supplied at
 * all) an error is printed and the process exits with status 1.
 *
 * @param {object} config - Provides `supabase`, `openai`, `cohereApiKey`.
 * @param {object} options
 * @param {string} [options.thresholds] - Comma-separated threshold list.
 * @returns {Promise<void>}
 */
export async function sweepThreshold(config, options) {
  const sweepRunUuid = randomUUID();
  const clients = {
    supabase: config.supabase,
    openai: config.openai,
    cohereApiKey: config.cohereApiKey,
    sessionId: `eval-sweep-${sweepRunUuid}`,
    observabilityEnvironment: 'eval',
  };
  // Coerce through String so a missing or non-string option falls into the
  // "no valid thresholds" message below rather than a TypeError on `.split`.
  const thresholds = String(options?.thresholds ?? '')
    .split(',')
    .map((value) => Number.parseFloat(value.trim()))
    .filter((value) => !Number.isNaN(value) && value > 0 && value < 1);
  if (thresholds.length === 0) {
    console.error('No valid thresholds provided. Use comma-separated values like: 0.15,0.20,0.25');
    process.exit(1);
  }
  const { data: testCases, error } = await clients.supabase
    .from('eval_golden_dataset')
    .select('id, query, tags, judgments:eval_golden_judgments(document_id, grade)')
    .order('id');
  if (error || !testCases) {
    console.error('Failed to load golden dataset:', error?.message ?? 'no data');
    process.exit(1);
  }
  const goldenCases = testCases;
  // "Normal" cases have at least one judgment graded 2 or higher.
  const normalCount = goldenCases
    .filter((testCase) => testCase.judgments.some((judgment) => judgment.grade >= 2))
    .length;
  console.log(`\nLoaded ${goldenCases.length} test cases (${normalCount} normal)\n`);
  console.log('threshold | hit_rate | first_result | recall | MRR | NDCG | avg_ms');
  console.log('----------|----------|--------------|----------|--------|--------|-------');
  for (const threshold of thresholds) {
    const results = [];
    for (const testCase of goldenCases) {
      const startTime = Date.now();
      const searchResults = await searchHybrid(clients, {
        query: testCase.query,
        limit: CURRENT_SEARCH_CONFIG.limit,
        threshold,
        reranker: CURRENT_SEARCH_CONFIG.reranker,
      });
      results.push(scoreTestCase(testCase, searchResults, Date.now() - startTime));
    }
    const metrics = computeMetrics(results);
    // metrics.hitRate etc are already percentages (0-100) from computeMetrics
    const cells = [
      threshold.toFixed(2).padStart(9),
      `${metrics.hitRate.toFixed(1).padStart(6)}%`,
      `${metrics.firstResultAccuracy.toFixed(1).padStart(10)}%`,
      `${metrics.recall.toFixed(1).padStart(6)}%`,
      metrics.meanReciprocalRank.toFixed(3).padStart(6),
      metrics.normalizedDiscountedCumulativeGain.toFixed(3).padStart(6),
      metrics.avgResponseTimeMs.toFixed(0).padStart(5),
    ];
    console.log(cells.join(' | '));
  }
  console.log(`\nCurrent threshold: ${CURRENT_SEARCH_CONFIG.threshold}`);
}
220
/**
 * Load display info for a set of document ids from the `documents` table.
 *
 * Each entry carries the document name and a snippet: the content with every
 * whitespace run collapsed to one space, cut to the first 140 characters.
 *
 * Lookup is best-effort. On a query failure a warning is written to stderr
 * and an empty map is returned, so the caller can still render ids without
 * names.
 *
 * @param {object} supabase - Query client.
 * @param {Array} docIds - Ids to resolve; an empty list skips the query.
 * @returns {Promise<Map>} Map of document id → `{ id, name, snippet }`.
 */
async function fetchDocLookup(supabase, docIds) {
  const lookup = new Map();
  if (docIds.length === 0) {
    return lookup;
  }
  const { data, error } = await supabase
    .from('documents')
    .select('id, name, content')
    .in('id', docIds);
  if (error) {
    // Say why names are missing instead of silently showing "<unknown>".
    process.stderr.write(`Warning: could not load document details: ${error.message}\n`);
    return lookup;
  }
  if (!data) {
    return lookup;
  }
  for (const { id, name, content } of data) {
    const snippet = (content ?? '').replace(/\s+/g, ' ').slice(0, 140);
    lookup.set(id, { id, name, snippet });
  }
  return lookup;
}
237
/**
 * Print a stored eval run: headline metrics, each missed query with its
 * expected and returned documents, and — with `options.full` — every
 * per-query result as one JSON line.
 *
 * Exits with status 1 when the run cannot be loaded.
 *
 * @param {object} config - `config.supabase` is the query client.
 * @param {*} runId - Identifier passed to `loadEvalRun`.
 * @param {object} [options]
 * @param {boolean} [options.full] - Also dump `per_query_results`.
 * @returns {Promise<void>}
 */
export async function showEvalRun(config, runId, options) {
  const supabase = config.supabase;
  const run = await loadEvalRun(supabase, runId);
  if (!run) {
    process.stderr.write(`Eval run ${runId} not found\n`);
    process.exit(1);
  }
  const banner = '='.repeat(60);
  console.log(`\n${banner}`);
  console.log(`Eval Run ${run.id} — ${run.run_date}`);
  console.log(banner);
  console.log(`Test cases: ${run.test_case_count}`);
  console.log(`Hit rate: ${run.hit_rate.toFixed(1)}%`);
  console.log(`First-result acc: ${run.first_result_accuracy.toFixed(1)}%`);
  console.log(`Recall: ${run.recall.toFixed(1)}%`);
  console.log(`MRR: ${(run.mean_reciprocal_rank ?? 0).toFixed(3)}`);
  console.log(`NDCG: ${(run.normalized_discounted_cumulative_gain ?? 0).toFixed(3)}`);
  console.log(`Zero-result rate: ${run.zero_result_rate.toFixed(1)}%`);
  console.log(`Avg response (ms): ${run.avg_response_time_ms.toFixed(0)}`);
  const missedQueries = run.missed_queries ?? [];
  console.log(`\nMissed queries: ${missedQueries.length}\n`);
  if (missedQueries.length === 0) {
    // No early return here: a run without misses must still honour --full.
    console.log(' (none)');
  } else {
    await printMissedQueries(supabase, missedQueries);
  }
  if (options?.full && run.per_query_results) {
    console.log(banner);
    console.log('Per-query results (full)');
    console.log(banner);
    for (const queryResult of run.per_query_results) {
      console.log(JSON.stringify(queryResult));
    }
  }
}

// Expected documents for one missed query. Supports both the graded shape
// (`judgments`, keeping grade >= 2) and the legacy binary shape (`expected`,
// a plain id list) so historical runs still render.
function listExpectedDocs(missedQuery) {
  const judgments = missedQuery.judgments;
  if (judgments && judgments.length > 0) {
    return judgments
      .filter((judgment) => judgment.grade >= 2)
      .map((judgment) => ({ id: judgment.document_id, grade: judgment.grade }));
  }
  return (missedQuery.expected ?? []).map((id) => ({ id }));
}

// Print every missed query with its expected docs and the top 3 returned docs.
async function printMissedQueries(supabase, missedQueries) {
  // Resolve doc ids → names + snippets in one batch.
  const allDocIds = new Set();
  for (const missedQuery of missedQueries) {
    for (const expected of listExpectedDocs(missedQuery)) {
      allDocIds.add(expected.id);
    }
    for (const returnedId of (missedQuery.got ?? []).slice(0, 3)) {
      allDocIds.add(returnedId);
    }
  }
  const lookup = await fetchDocLookup(supabase, Array.from(allDocIds));
  const formatDoc = (docId, score) => {
    const documentName = lookup.get(docId)?.name ?? '<unknown>';
    // Stored scores can be absent or null; only format real numbers.
    const scoreLabel = typeof score === 'number' ? ` (${score.toFixed(3)})` : '';
    return `#${docId} ${documentName}${scoreLabel}`;
  };
  for (const [missedIndex, missedQuery] of missedQueries.entries()) {
    const returnedIds = missedQuery.got ?? [];
    console.log(`[${missedIndex + 1}] "${missedQuery.query}"`);
    if (missedQuery.tags?.length > 0) {
      console.log(` tags: ${missedQuery.tags.join(', ')}`);
    }
    console.log(` expected:`);
    for (const expected of listExpectedDocs(missedQuery)) {
      const gradeLabel = expected.grade !== undefined ? ` (grade ${expected.grade})` : '';
      console.log(` - ${formatDoc(expected.id)}${gradeLabel}`);
    }
    if (returnedIds.length === 0) {
      console.log(` got: (none — zero results)`);
    } else {
      console.log(` got (top 3):`);
      const topReturned = Math.min(3, returnedIds.length);
      for (let position = 0; position < topReturned; position++) {
        console.log(` ${position + 1}. ${formatDoc(returnedIds[position], missedQuery.gotScores?.[position])}`);
      }
      const topDoc = lookup.get(returnedIds[0]);
      if (topDoc?.snippet) {
        console.log(` top1 snippet: "${topDoc.snippet}…"`);
      }
    }
    console.log('');
  }
}