@aperdomoll90/ledger-ai 1.4.0 → 1.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. package/dist/cli.js +177 -221
  2. package/dist/commands/add.js +51 -100
  3. package/dist/commands/backfill.js +55 -0
  4. package/dist/commands/backup.js +10 -10
  5. package/dist/commands/check.js +21 -29
  6. package/dist/commands/config.js +13 -12
  7. package/dist/commands/delete.js +22 -17
  8. package/dist/commands/eval-judge.js +11 -0
  9. package/dist/commands/eval.js +321 -0
  10. package/dist/commands/export.js +8 -10
  11. package/dist/commands/get.js +9 -0
  12. package/dist/commands/hunt.js +206 -0
  13. package/dist/commands/ingest.js +15 -14
  14. package/dist/commands/init.js +18 -20
  15. package/dist/commands/list.js +21 -7
  16. package/dist/commands/migrate.js +11 -11
  17. package/dist/commands/onboard.js +2 -2
  18. package/dist/commands/pull.js +3 -2
  19. package/dist/commands/push.js +8 -8
  20. package/dist/commands/restore.js +38 -38
  21. package/dist/commands/show.js +13 -16
  22. package/dist/commands/sync.js +58 -19
  23. package/dist/commands/tag.js +20 -14
  24. package/dist/commands/update.js +50 -18
  25. package/dist/commands/wizard.js +3 -3
  26. package/dist/lib/ai-search.js +163 -0
  27. package/dist/lib/audit.js +19 -0
  28. package/dist/lib/backfill.js +60 -0
  29. package/dist/lib/config.js +19 -2
  30. package/dist/lib/document-classification.js +5 -0
  31. package/dist/lib/document-fetching.js +77 -0
  32. package/dist/lib/document-operations.js +150 -0
  33. package/dist/lib/documents/classification.js +5 -0
  34. package/dist/lib/documents/fetching.js +89 -0
  35. package/dist/lib/documents/operations.js +304 -0
  36. package/dist/lib/domains.js +116 -0
  37. package/dist/lib/embeddings.js +190 -0
  38. package/dist/lib/errors.js +3 -1
  39. package/dist/lib/eval/eval-advanced.js +289 -0
  40. package/dist/lib/eval/eval-judge-session.js +233 -0
  41. package/dist/lib/eval/eval-store.js +105 -0
  42. package/dist/lib/eval/eval.js +303 -0
  43. package/dist/lib/file-writer.js +23 -0
  44. package/dist/lib/generators.js +44 -45
  45. package/dist/lib/hunter-db.js +235 -0
  46. package/dist/lib/hunter-rss.js +30 -0
  47. package/dist/lib/hunter-scoring.js +55 -0
  48. package/dist/lib/hunter-types.js +36 -0
  49. package/dist/lib/lint-configs.js +20 -0
  50. package/dist/lib/migrate.js +2 -2
  51. package/dist/lib/notes.js +173 -59
  52. package/dist/lib/observability.js +296 -0
  53. package/dist/lib/op-add-note-types.test.js +7 -6
  54. package/dist/lib/prompt.js +8 -8
  55. package/dist/lib/rate-limiter.js +103 -0
  56. package/dist/lib/search/ai-search.js +396 -0
  57. package/dist/lib/search/chunk-context-enrichment.js +155 -0
  58. package/dist/lib/search/embeddings.js +293 -0
  59. package/dist/lib/search/reranker.js +120 -0
  60. package/dist/lib/search/semantic-cache.js +53 -0
  61. package/dist/lib/type-registry.test.js +6 -6
  62. package/dist/mcp-server.js +553 -66
  63. package/dist/migrations/migrations/005-audit-log.sql +22 -0
  64. package/dist/migrations/migrations/005_opportunities.sql +48 -0
  65. package/dist/migrations/migrations/006-audited-operations.sql +235 -0
  66. package/dist/migrations/migrations/006_hunt_analytics.sql +38 -0
  67. package/dist/migrations/migrations/007-eval-golden-judgments.sql +119 -0
  68. package/dist/migrations/migrations/008-drop-expected-doc-ids.sql +9 -0
  69. package/dist/migrations/migrations/008-judge-helpers.sql +21 -0
  70. package/dist/migrations/migrations/009-semantic-cache.sql +216 -0
  71. package/dist/scripts/batch-grade.js +344 -0
  72. package/dist/scripts/benchmark-ingestion.js +376 -0
  73. package/dist/scripts/convert-judgments-to-graded.js +88 -0
  74. package/dist/scripts/diagnose-first-result.js +333 -0
  75. package/dist/scripts/drop-golden-query.js +53 -0
  76. package/dist/scripts/eval-search.js +115 -0
  77. package/dist/scripts/grade-unjudged-top1.js +138 -0
  78. package/dist/scripts/hunter-analytics.js +38 -0
  79. package/dist/scripts/hunter-cron.js +63 -0
  80. package/dist/scripts/hunter-purge.js +25 -0
  81. package/dist/scripts/migrate-v2.js +140 -0
  82. package/dist/scripts/reindex.js +74 -0
  83. package/dist/scripts/sync-local-docs.js +153 -0
  84. package/package.json +7 -1
@@ -0,0 +1,333 @@
1
+ // diagnose-first-result.ts
2
+ // Diagnostic analysis of first-result accuracy failures in eval run 16.
3
+ // Classifies every query by failure mode, analyzes score distributions,
4
+ // and writes a structured markdown report.
5
+ //
6
+ // Run: npx tsx src/scripts/diagnose-first-result.ts
7
+ // Custom run: npx tsx src/scripts/diagnose-first-result.ts --run 14
8
+ import 'dotenv/config';
9
+ import { resolve } from 'path';
10
+ import { config } from 'dotenv';
11
+ import { createClient } from '@supabase/supabase-js';
12
+ import { loadEvalRun } from '../lib/eval/eval-store.js';
13
+ import { writeFileSync, mkdirSync } from 'fs';
14
+ // Load .env from ~/.ledger/.env (same as other scripts)
15
config({ path: resolve(process.env.HOME ?? '', '.ledger', '.env') });

// Both Supabase settings are mandatory; stop early with a clear message.
const supabaseUrl = process.env.SUPABASE_URL;
const supabaseKey = process.env.SUPABASE_SERVICE_ROLE_KEY;
if (!supabaseUrl || !supabaseKey) {
  console.error('Missing SUPABASE_URL or SUPABASE_SERVICE_ROLE_KEY');
  process.exit(1);
}
const supabase = createClient(supabaseUrl, supabaseKey);

// Allow overriding run ID via --run flag, default to 16.
const runIdArg = process.argv.indexOf('--run');
const RUN_ID = runIdArg !== -1 ? Number.parseInt(process.argv[runIdArg + 1], 10) : 16;
// `--run` with a missing or non-numeric value parses to NaN; reject it here
// instead of querying for (and naming the report after) run "NaN".
if (!Number.isInteger(RUN_ID)) {
  console.error('Invalid --run value: expected an integer run ID (e.g. --run 14)');
  process.exit(1);
}
26
+ // =============================================================================
27
+ // Classification (priority order per spec)
28
+ // =============================================================================
29
// Grade cut-off named by the classification spec. NOTE(review): nothing in
// this script reads it; classify() relies on the firstResultHit / hit /
// position fields already present on each result.
const HIT_THRESHOLD = 2;

// Assign one failure-mode category to a single per-query eval result.
//
// Reads: tags, firstResultHit, hit, position (0-indexed, or null),
// returnedIds, returnedScores, judgments ({ document_id, grade } entries) and
// normalizedDiscountedCumulativeGain.
// Returns a new object with the query, its category, the top document id and
// that document's grade (null when it was never judged).
function classify(result) {
  const { returnedIds, position } = result;
  const topDocId = returnedIds.length === 0 ? null : returnedIds[0];

  // Later judgments for the same document overwrite earlier ones.
  const gradeByDocument = new Map();
  for (const judgment of result.judgments) {
    gradeByDocument.set(judgment.document_id, judgment.grade);
  }

  let topDocGrade = null;
  if (topDocId !== null) {
    topDocGrade = gradeByDocument.get(topDocId) ?? null;
  }

  const positionWithin = (low, high) => position !== null && position >= low && position <= high;

  // Checks run in priority order; the first match wins.
  const category = (() => {
    if (result.tags.includes('out-of-scope')) {
      return 'out-of-scope';
    }
    if (result.firstResultHit) {
      return 'top-1-correct';
    }
    // Top document exists but has no judgment at all.
    if (topDocId !== null && topDocGrade === null) {
      return 'unjudged-winner';
    }
    // First correct result at position 2-3 (0-indexed 1-2).
    if (positionWithin(1, 2)) {
      return 'near-miss';
    }
    // First correct result at position 4-10 (0-indexed 3-9).
    if (positionWithin(3, 9)) {
      return 'buried';
    }
    if (!result.hit) {
      return 'absent';
    }
    // A hit whose position falls outside the ranges above (shouldn't happen).
    return 'buried';
  })();

  return {
    query: result.query,
    tags: result.tags,
    category,
    position,
    returnedIds,
    returnedScores: result.returnedScores,
    judgments: result.judgments,
    topDocId,
    topDocGrade,
    ndcg: result.normalizedDiscountedCumulativeGain,
  };
}
76
// Classify every per-query result, preserving input order.
function classifyAll(results) {
  return results.map((entry, index, all) => classify(entry, index, all));
}
79
// Summarise a list of numbers as { min, median, max, mean }.
// An empty list yields all zeros. The input array is not modified.
function computeStats(values) {
  const count = values.length;
  if (count === 0) {
    return { min: 0, median: 0, max: 0, mean: 0 };
  }

  const ascending = Array.from(values).sort((left, right) => left - right);
  const middle = Math.floor(count / 2);

  let median = ascending[middle];
  if (count % 2 === 0) {
    // Even length: average the two central values.
    median = (ascending[middle - 1] + ascending[middle]) / 2;
  }

  let total = 0;
  for (const value of ascending) {
    total += value;
  }

  return {
    min: ascending[0],
    median,
    max: ascending[count - 1],
    mean: total / count,
  };
}
91
// Produce score statistics for each failure category that has at least one
// query. Categories are visited in a fixed order and empty ones are omitted.
//
// Each entry holds: category, count, stats for the top score, stats for the
// gap between the first two scores, stats about the first correct result
// (null when no query in the category has a position), and the average NDCG.
function analyzeScores(classified) {
  const failureCategories = ['near-miss', 'buried', 'absent', 'unjudged-winner'];

  return failureCategories.flatMap((category) => {
    const members = classified.filter((entry) => entry.category === category);
    if (members.length === 0) {
      return [];
    }

    const topScores = members
      .filter((entry) => entry.returnedScores.length > 0)
      .map((entry) => entry.returnedScores[0]);
    const topTwoGaps = members
      .filter((entry) => entry.returnedScores.length >= 2)
      .map((entry) => entry.returnedScores[0] - entry.returnedScores[1]);

    // Only queries with a known first-correct position contribute below.
    const located = members.filter((entry) => entry.position !== null);
    const hasLocated = located.length > 0;
    const firstCorrectScores = located.map((entry) => entry.returnedScores[entry.position]);
    const gapsToCorrect = located.map(
      (entry) => entry.returnedScores[0] - entry.returnedScores[entry.position],
    );

    const ndcgTotal = members.reduce((running, entry) => running + entry.ndcg, 0);

    return [{
      category,
      count: members.length,
      scoreAtPos1: computeStats(topScores),
      scoreGapPos1Pos2: computeStats(topTwoGaps),
      scoreOfFirstCorrect: hasLocated ? computeStats(firstCorrectScores) : null,
      scoreGapToCorrect: hasLocated ? computeStats(gapsToCorrect) : null,
      avgNdcg: ndcgTotal / members.length,
    }];
  });
}
119
// Break classification results down per tag.
//
// Out-of-scope queries are ignored. A query with several tags is counted once
// under each of them. Rows come back sorted by first-result accuracy, worst
// first.
function analyzeByTag(classified) {
  const grouped = new Map();
  for (const entry of classified) {
    if (entry.category === 'out-of-scope') {
      continue;
    }
    for (const tag of entry.tags) {
      const bucket = grouped.get(tag);
      if (bucket === undefined) {
        grouped.set(tag, [entry]);
      } else {
        bucket.push(entry);
      }
    }
  }

  const countCategory = (entries, category) =>
    entries.reduce((tally, entry) => (entry.category === category ? tally + 1 : tally), 0);

  const rows = Array.from(grouped, ([tag, entries]) => {
    const correct = countCategory(entries, 'top-1-correct');

    // Average gap between the first two scores, over queries that have both.
    let gapTotal = 0;
    let gapCount = 0;
    for (const entry of entries) {
      if (entry.returnedScores.length >= 2) {
        gapTotal += entry.returnedScores[0] - entry.returnedScores[1];
        gapCount += 1;
      }
    }

    return {
      tag,
      total: entries.length,
      correct,
      nearMiss: countCategory(entries, 'near-miss'),
      buried: countCategory(entries, 'buried'),
      absent: countCategory(entries, 'absent'),
      unjudged: countCategory(entries, 'unjudged-winner'),
      firstResultAccuracy: entries.length > 0 ? (correct / entries.length) * 100 : 0,
      avgScoreGap: gapCount > 0 ? gapTotal / gapCount : 0,
    };
  });

  rows.sort((left, right) => left.firstResultAccuracy - right.firstResultAccuracy);
  return rows;
}
154
+ // =============================================================================
155
+ // Report generation
156
+ // =============================================================================
157
// Build the markdown diagnostic report for one eval run.
//
// classified: output of classifyAll (entries carry category, position,
//             returnedScores, topDocId, topDocGrade, tags, query).
// run:        the eval run row; reads run.id, run.run_date and
//             run.first_result_accuracy.
// Returns the whole report as a single newline-joined string.
function generateReport(classified, run) {
  const scoreStats = analyzeScores(classified);
  const tagBreakdowns = analyzeByTag(classified);
  const nonScope = classified.filter(q => q.category !== 'out-of-scope');
  const correct = classified.filter(q => q.category === 'top-1-correct');
  const nearMiss = classified.filter(q => q.category === 'near-miss');
  const buried = classified.filter(q => q.category === 'buried');
  const absent = classified.filter(q => q.category === 'absent');
  const unjudged = classified.filter(q => q.category === 'unjudged-winner');
  const outOfScope = classified.filter(q => q.category === 'out-of-scope');

  // Free text goes into table cells; a raw pipe or line break would split the row.
  const cell = (text) => String(text).replace(/\|/g, '\\|').replace(/\r?\n/g, ' ');
  // Score distance between the top result and the first correct one.
  const gapToCorrect = (q) => q.returnedScores[0] - q.returnedScores[q.position];

  const lines = [];

  // --- Header ---
  lines.push(`# First-Result Accuracy Diagnostic: Run ${run.id}`);
  lines.push('');
  lines.push(`> Generated: ${new Date().toISOString().slice(0, 10)}`);
  lines.push(`> Run date: ${run.run_date}`);
  lines.push(`> Queries: ${classified.length} (${nonScope.length} scored, ${outOfScope.length} out-of-scope)`);
  lines.push(`> First-result accuracy: ${run.first_result_accuracy.toFixed(1)}%`);
  lines.push('');

  // --- Executive summary ---
  lines.push('## Executive Summary');
  lines.push('');
  const failCount = nonScope.length - correct.length;
  // pct() guards the zero-scored-queries case that would otherwise print "NaN%".
  const failShare = pct(failCount, nonScope.length);
  // With no failures there is no dominant failure mode to name.
  const dominantSentence = failCount > 0
    ? `The dominant failure mode is **${getDominantCategory(nearMiss.length, buried.length, absent.length, unjudged.length)}**. `
    : '';
  const unjudgedSentence = unjudged.length > 0
    ? `${unjudged.length} queries have unjudged documents at position 1 and need manual review before the classification is trustworthy.`
    : 'All top-1 documents have been judged.';
  lines.push(`${failCount} of ${nonScope.length} queries (${failShare}) do not have the best result at position 1. `
    + dominantSentence
    + unjudgedSentence);
  lines.push('');

  // --- Failure taxonomy ---
  lines.push('## Failure Taxonomy');
  lines.push('');
  lines.push('| Category | Count | % | Description |');
  lines.push('|-----------------|-------|-------|-----------------------------------------|');
  lines.push(`| Top-1 correct | ${pad(correct.length)} | ${pad(pct(correct.length, nonScope.length))} | Position 1 has grade >= 2 |`);
  lines.push(`| Near miss | ${pad(nearMiss.length)} | ${pad(pct(nearMiss.length, nonScope.length))} | First correct at position 2-3 |`);
  lines.push(`| Buried | ${pad(buried.length)} | ${pad(pct(buried.length, nonScope.length))} | First correct at position 4-10 |`);
  lines.push(`| Unjudged winner | ${pad(unjudged.length)} | ${pad(pct(unjudged.length, nonScope.length))} | Top doc never graded (judgment gap) |`);
  lines.push(`| Absent | ${pad(absent.length)} | ${pad(pct(absent.length, nonScope.length))} | No grade >= 2 doc in top 10 |`);
  lines.push(`| Out-of-scope | ${pad(outOfScope.length)} | n/a | Excluded from metric |`);
  lines.push('');

  // --- Tag heatmap ---
  lines.push('## Tag Breakdown');
  lines.push('');
  lines.push('Sorted by first-result accuracy (worst first).');
  lines.push('');
  lines.push('| Tag | Total | Correct | Near Miss | Buried | Absent | Unjudged | 1st-Result% | Avg Gap |');
  lines.push('|--------------------|-------|---------|-----------|--------|--------|----------|-------------|----------|');
  for (const t of tagBreakdowns) {
    lines.push(`| ${padR(t.tag, 18)} | ${pad(t.total)} | ${pad(t.correct)} | ${pad(t.nearMiss)} | ${pad(t.buried)} | ${pad(t.absent)} | ${pad(t.unjudged)} | ${padR(t.firstResultAccuracy.toFixed(1) + '%', 11)} | ${t.avgScoreGap.toFixed(4)} |`);
  }
  lines.push('');

  // --- Score analysis ---
  lines.push('## Score Distribution by Failure Category');
  lines.push('');
  for (const stat of scoreStats) {
    lines.push(`### ${stat.category} (${stat.count} queries, avg NDCG: ${stat.avgNdcg.toFixed(3)})`);
    lines.push('');
    lines.push('| Metric | Min | Median | Max | Mean |');
    lines.push('|-------------------------|--------|--------|--------|--------|');
    lines.push(`| Score at position 1 | ${fmtRow(stat.scoreAtPos1)} |`);
    lines.push(`| Gap (pos 1 vs pos 2) | ${fmtRow(stat.scoreGapPos1Pos2)} |`);
    // The two rows below exist only when some query has a first-correct position.
    if (stat.scoreOfFirstCorrect) {
      lines.push(`| Score of first correct | ${fmtRow(stat.scoreOfFirstCorrect)} |`);
    }
    if (stat.scoreGapToCorrect) {
      lines.push(`| Gap to correct | ${fmtRow(stat.scoreGapToCorrect)} |`);
    }
    lines.push('');
  }

  // --- Unjudged audit ---
  if (unjudged.length > 0) {
    lines.push('## Unjudged Audit');
    lines.push('');
    lines.push('These queries have a document at position 1 with no judgment. Grade them');
    lines.push('via `ledger eval:judge` before trusting the failure classification.');
    lines.push('');
    lines.push('| Query | Top Doc ID | Tags |');
    lines.push('|-------|------------|------|');
    for (const q of unjudged) {
      lines.push(`| ${cell(q.query)} | ${q.topDocId} | ${cell(q.tags.join(', '))} |`);
    }
    lines.push('');
  }

  // --- Worst offenders ---
  lines.push('## Worst Offenders');
  lines.push('');
  lines.push('Queries with the largest gap between expected and actual ranking (excluding absent and out-of-scope).');
  lines.push('');
  const sortKey = (q) => (q.position !== null && q.returnedScores.length > 0 ? gapToCorrect(q) : 0);
  // filter() returns a fresh array, so the in-place sort leaves `classified` untouched.
  const offenders = classified
    .filter(q => q.category === 'near-miss' || q.category === 'buried')
    .sort((a, b) => sortKey(b) - sortKey(a))
    .slice(0, 10);
  lines.push('| Query | Category | Position | Top Doc (grade) | Score Gap |');
  lines.push('|-------|----------|----------|-----------------|-----------|');
  for (const q of offenders) {
    const gradeLabel = q.topDocGrade !== null ? `g${q.topDocGrade}` : 'unjudged';
    const gap = q.position !== null ? gapToCorrect(q).toFixed(4) : 'n/a';
    // Positions are stored 0-indexed; the report shows them 1-indexed.
    lines.push(`| ${cell(q.query)} | ${q.category} | ${q.position !== null ? q.position + 1 : 'n/a'} | ${q.topDocId} (${gradeLabel}) | ${gap} |`);
  }
  lines.push('');

  // --- Hypotheses ---
  lines.push('## Hypotheses');
  lines.push('');
  lines.push('Ranked by estimated impact (number of queries affected). These are generated');
  lines.push('from the data patterns above and need human review before acting on them.');
  lines.push('');
  lines.push('*Hypotheses will be written manually after reviewing the data above.*');
  lines.push('*The script produces the data; Adrian and Charlie interpret it together.*');
  return lines.join('\n');
}
274
+ // --- Formatting helpers ---
275
// Right-align a value in a column: stringify it, then left-pad with spaces
// up to `width` characters (default 5). Longer values are returned unchanged.
function pad(value, width = 5) {
  const text = String(value);
  return text.padStart(width, ' ');
}
278
// Left-align a value in a column: stringify it, then right-pad with spaces up
// to `width` characters. Coerces with String() first, as pad() does, so
// numbers and other non-string values no longer throw.
function padR(value, width) {
  return String(value).padEnd(width);
}
281
// Format count/total as a percentage with one decimal place, e.g. "25.0%".
// A zero total yields "0.0%" rather than dividing by zero.
function pct(count, total) {
  if (total !== 0) {
    const share = (count / total) * 100;
    return share.toFixed(1) + '%';
  }
  return '0.0%';
}
286
// Render a { min, median, max, mean } stats object as four table cells,
// each fixed to 4 decimal places and separated by " | ".
function fmtRow(s) {
  const columns = [s.min, s.median, s.max, s.mean];
  return columns.map((value) => value.toFixed(4)).join(' | ');
}
289
// Name the failure category with the highest count.
//
// Ties are resolved in the order near miss, buried, unjudged winner, absent.
// When every count is zero there is no dominant category, so 'none' is
// returned instead of claiming "near miss".
function getDominantCategory(nearMiss, buried, absent, unjudged) {
  const max = Math.max(nearMiss, buried, absent, unjudged);
  if (!(max > 0)) {
    return 'none';
  }
  const ranked = [
    [nearMiss, 'near miss (position 2-3)'],
    [buried, 'buried (position 4-10)'],
    [unjudged, 'unjudged winner (judgment gap)'],
    [absent, 'absent (retrieval failure)'],
  ];
  const winner = ranked.find(([count]) => count === max);
  // Fall back to "absent" if nothing matched (e.g. non-numeric counts).
  return winner ? winner[1] : 'absent (retrieval failure)';
}
299
+ // =============================================================================
300
+ // Write report to disk
301
+ // =============================================================================
302
// Write the markdown report to <script dir>/../../docs/analysis/, creating the
// directory if needed. The file name embeds RUN_ID. The destination is logged
// to stderr.
function writeReport(report) {
  // URL.pathname is percent-encoded (a space becomes %20), so decode it before
  // using it as a filesystem path.
  let scriptDir = decodeURIComponent(new URL('.', import.meta.url).pathname);
  if (process.platform === 'win32') {
    // file:///C:/... yields "/C:/..."; drop the slash in front of the drive letter.
    scriptDir = scriptDir.replace(/^\/(?=[A-Za-z]:)/, '');
  }
  const dir = resolve(scriptDir, '../../docs/analysis');
  mkdirSync(dir, { recursive: true });
  const path = resolve(dir, `first-result-accuracy-run${RUN_ID}.md`);
  writeFileSync(path, report, 'utf-8');
  console.error(`Report written to ${path}`);
}
310
+ // =============================================================================
311
+ // Main
312
+ // =============================================================================
313
// Entry point: load the eval run, classify its per-query results, build the
// report and write it to disk. Exits with status 1 when the run is missing or
// has no per-query results.
async function main() {
  const abort = (message) => {
    console.error(message);
    process.exit(1);
  };

  console.error(`Loading eval run ${RUN_ID}...`);
  const run = await loadEvalRun(supabase, RUN_ID);
  if (!run) {
    abort(`Run ${RUN_ID} not found.`);
  }

  const results = run.per_query_results;
  const hasNoResults = !results || results.length === 0;
  if (hasNoResults) {
    abort(`Run ${RUN_ID} has no per_query_results.`);
  }

  console.error(`Loaded ${results.length} queries from run ${RUN_ID}.`);
  writeReport(generateReport(classifyAll(results), run));
}

// Any unexpected failure is printed and turned into a non-zero exit status.
main().catch((failure) => {
  console.error(failure);
  process.exit(1);
});
@@ -0,0 +1,53 @@
1
+ // drop-golden-query.ts
2
+ // One-off: remove the "all system rules and sync rules" enumeration query
3
+ // from eval_golden_dataset. It's a listing query, not a retrieval test.
4
+ //
5
+ // Run: npx tsx src/scripts/drop-golden-query.ts
6
+ import 'dotenv/config';
7
+ import { createClient } from '@supabase/supabase-js';
8
// Exact query text of the golden-dataset row to remove.
const TARGET_QUERY = 'all system rules and sync rules';

// Both Supabase settings must be present in the environment.
const {
  SUPABASE_URL: supabaseUrl,
  SUPABASE_SERVICE_ROLE_KEY: supabaseKey,
} = process.env;
if (!(supabaseUrl && supabaseKey)) {
  console.error('Missing SUPABASE_URL or SUPABASE_SERVICE_ROLE_KEY');
  process.exit(1);
}
const supabase = createClient(supabaseUrl, supabaseKey);
16
// Find the single eval_golden_dataset row whose query equals TARGET_QUERY,
// print it, then delete it by id.
//
// Exits 0 when no row matches; exits 1 on a lookup or delete error, or when
// more than one row matches (nothing is deleted in that case).
// NOTE(review): this package also ships a migration named
// 008-drop-expected-doc-ids.sql — confirm expected_doc_ids still exists on
// eval_golden_dataset before running this select.
async function main() {
  const lookup = await supabase
    .from('eval_golden_dataset')
    .select('id, query, expected_doc_ids, tags')
    .eq('query', TARGET_QUERY);
  const { data: matches, error: findError } = lookup;

  if (findError) {
    console.error('Lookup failed:', findError.message);
    process.exit(1);
  }

  const matchCount = matches ? matches.length : 0;
  if (matchCount === 0) {
    console.log(`No row found with query: "${TARGET_QUERY}"`);
    process.exit(0);
  }
  if (matchCount > 1) {
    // Ambiguous target: refuse rather than guess which row to drop.
    console.error(`Refusing to delete: found ${matches.length} matching rows. Inspect manually.`);
    console.error(matches);
    process.exit(1);
  }

  const [targetRow] = matches;
  const summary = [
    'Found row:',
    ` id: ${targetRow.id}`,
    ` query: "${targetRow.query}"`,
    ` expected_doc_ids: ${JSON.stringify(targetRow.expected_doc_ids)}`,
    ` tags: ${JSON.stringify(targetRow.tags)}`,
  ];
  for (const line of summary) {
    console.log(line);
  }

  const removal = await supabase
    .from('eval_golden_dataset')
    .delete()
    .eq('id', targetRow.id);
  if (removal.error) {
    console.error('Delete failed:', removal.error.message);
    process.exit(1);
  }
  console.log(`\nDeleted row id ${targetRow.id}.`);
}

// Any unexpected failure is printed and turned into a non-zero exit status.
main().catch((failure) => {
  console.error(failure);
  process.exit(1);
});
@@ -0,0 +1,115 @@
1
+ // eval-search.ts
2
+ // Run the golden dataset through search, compute metrics, print report.
3
+ //
4
+ // Run: npx tsx src/scripts/eval-search.ts
5
+ // This gives us a measurable score for search quality.
6
+ // Every future change gets compared against this baseline.
7
+ import 'dotenv/config';
8
+ import { createClient } from '@supabase/supabase-js';
9
+ import OpenAI from 'openai';
10
+ import { searchHybrid } from '../lib/search/ai-search.js';
11
+ import { scoreTestCase, computeMetrics, formatReport, compareRuns, formatComparison } from '../lib/eval/eval.js';
12
+ import { saveEvalRun, loadPreviousRun, CURRENT_SEARCH_CONFIG } from '../lib/eval/eval-store.js';
13
+ import { computeConfidenceIntervals, computeScoreCalibration, computeCoverageAnalysis, formatAdvancedReport } from '../lib/eval/eval-advanced.js';
14
+ // =============================================================================
15
+ // Setup
16
+ // =============================================================================
17
// Required settings; COHERE_API_KEY is optional and read further down.
const {
  SUPABASE_URL: supabaseUrl,
  SUPABASE_SERVICE_ROLE_KEY: supabaseKey,
  OPENAI_API_KEY: openaiKey,
} = process.env;
if (!(supabaseUrl && supabaseKey && openaiKey)) {
  console.error('Missing SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY, or OPENAI_API_KEY');
  process.exit(1);
}

// Client bundle handed to searchHybrid and the eval-store helpers.
const clients = {
  supabase: createClient(supabaseUrl, supabaseKey),
  openai: new OpenAI({ apiKey: openaiKey }),
  // An empty or unset key becomes undefined.
  cohereApiKey: process.env.COHERE_API_KEY || undefined,
};
29
+ // Search config imported from eval-store.ts (single source of truth)
30
+ // =============================================================================
31
+ // Run eval
32
+ // =============================================================================
33
// Run every golden-dataset query through hybrid search, score it, print the
// reports, store the run, and compare against the previously stored run when
// there is one.
//
// Queries run one at a time so each response time is measured on its own.
// NOTE(review): the select below reads expected_doc_ids, and this package also
// ships a migration named 008-drop-expected-doc-ids.sql — confirm the column
// still exists on eval_golden_dataset.
async function runEval() {
  const banner = '='.repeat(60);
  console.log(`\n${banner}`);
  console.log('Ledger Search Evaluation');
  console.log(`${banner}\n`);

  const previousRun = await loadPreviousRun(clients.supabase);
  console.log(
    previousRun
      ? `Previous run: ${previousRun.run_date} (id: ${previousRun.id})\n`
      : 'No previous run found — this will be the first stored run.\n',
  );

  const { data: testCases, error } = await clients.supabase
    .from('eval_golden_dataset')
    .select('id, query, expected_doc_ids, tags')
    .order('id');
  if (error || !testCases) {
    console.error('Failed to load golden dataset:', error?.message);
    process.exit(1);
  }
  console.log(`Loaded ${testCases.length} test cases.\n`);

  // One status line per query; out-of-scope cases (no expected docs) use
  // PASS/NOISE, all others use TOP/HIT/MISS plus the 1-indexed hit position.
  const progressLine = (testCase, result) => {
    if (testCase.expected_doc_ids.length === 0) {
      const verdict = result.hit ? 'PASS' : `NOISE (${result.returnedIds.length} results)`;
      return ` [${verdict}] "${testCase.query}" (out-of-scope)`;
    }
    let verdict = 'MISS';
    if (result.firstResultHit) {
      verdict = 'TOP';
    } else if (result.hit) {
      verdict = 'HIT';
    }
    const rank = result.position === null ? '' : `@${result.position + 1}`;
    return ` [${verdict}${rank}] "${testCase.query}" → found ${result.expectedFound}/${result.expectedTotal}`;
  };

  const results = [];
  for (const testCase of testCases) {
    const startedAt = Date.now();
    const searchResults = await searchHybrid(clients, {
      query: testCase.query,
      limit: CURRENT_SEARCH_CONFIG.limit,
      reranker: CURRENT_SEARCH_CONFIG.reranker,
    });
    const result = scoreTestCase(testCase, searchResults, Date.now() - startedAt);
    results.push(result);
    console.log(progressLine(testCase, result));
  }

  const metrics = computeMetrics(results);
  console.log(`\n${formatReport(metrics)}`);

  // Advanced analysis is computed before saving so it is stored with the run.
  const confidenceIntervals = computeConfidenceIntervals(results);
  const scoreCalibration = computeScoreCalibration(results);
  const coverageAnalysis = computeCoverageAnalysis(results);
  const runId = await saveEvalRun(clients.supabase, {
    metrics,
    config: CURRENT_SEARCH_CONFIG,
    results,
    confidenceIntervals,
    scoreCalibration,
    coverageAnalysis,
  });
  console.log(`\nRun saved to eval_runs (id: ${runId})`);

  if (previousRun) {
    const currentSummary = {
      hitRate: metrics.hitRate,
      firstResultAccuracy: metrics.firstResultAccuracy,
      recall: metrics.recall,
      zeroResultRate: metrics.zeroResultRate,
      meanReciprocalRank: metrics.meanReciprocalRank,
      normalizedDiscountedCumulativeGain: metrics.normalizedDiscountedCumulativeGain,
      avgResponseTimeMs: metrics.avgResponseTimeMs,
    };
    // Stored rows use snake_case; two of the metrics fall back to 0 when null.
    const previousSummary = {
      hitRate: previousRun.hit_rate,
      firstResultAccuracy: previousRun.first_result_accuracy,
      recall: previousRun.recall,
      zeroResultRate: previousRun.zero_result_rate,
      meanReciprocalRank: previousRun.mean_reciprocal_rank ?? 0,
      normalizedDiscountedCumulativeGain: previousRun.normalized_discounted_cumulative_gain ?? 0,
      avgResponseTimeMs: previousRun.avg_response_time_ms,
    };
    const comparison = compareRuns(currentSummary, previousSummary);
    console.log(`\n${formatComparison(comparison)}`);
  }

  console.log(`\n${formatAdvancedReport(confidenceIntervals, scoreCalibration, coverageAnalysis)}`);
}

// Any unexpected failure is printed and turned into a non-zero exit status.
runEval().catch((failure) => {
  console.error('Eval crashed:', failure);
  process.exit(1);
});
@@ -0,0 +1,138 @@
1
+ // grade-unjudged-top1.ts
2
+ // Targeted grading for queries where the top-1 document has no judgment.
3
+ // Shows the query, document name, and a content preview. You type 0-3.
4
+ //
5
+ // Run: npx tsx src/scripts/grade-unjudged-top1.ts
6
+ // Dry run: npx tsx src/scripts/grade-unjudged-top1.ts --dry-run
7
+ import 'dotenv/config';
8
+ import { resolve } from 'path';
9
+ import { config } from 'dotenv';
10
+ import { createClient } from '@supabase/supabase-js';
11
+ import { createInterface } from 'node:readline';
12
// Also load ~/.ledger/.env on top of the default dotenv lookup.
config({ path: resolve(process.env.HOME ?? '', '.ledger', '.env') });

const {
  SUPABASE_URL: supabaseUrl,
  SUPABASE_SERVICE_ROLE_KEY: supabaseKey,
} = process.env;
if (!(supabaseUrl && supabaseKey)) {
  console.error('Missing SUPABASE_URL or SUPABASE_SERVICE_ROLE_KEY');
  process.exit(1);
}
const supabase = createClient(supabaseUrl, supabaseKey);

// With --dry-run nothing is written; the intended save is only printed.
const dryRun = process.argv.includes('--dry-run');

// The unjudged pairs from the run 17 diagnostic (query text + top doc ID).
const UNJUDGED_PAIRS = [
  { query: 'how does Ledger process a search query end to end', docId: 152 },
  { query: 'how does chunking work for embeddings', docId: 160 },
  { query: "how does Ledger's audit log track document changes", docId: 152 },
  { query: "what is Ledger's query cache and how does it work", docId: 152 },
  { query: 'how does Ledger protect sensitive documents with access control', docId: 144 },
  { query: 'document_create RPC', docId: 152 },
  { query: 'Ledger text-embedding-3-small embedding model', docId: 152 },
  { query: 'Ledger keyword search websearch_to_tsquery GIN index', docId: 152 },
  { query: 'soft delete deleted_at document_purge', docId: 162 },
  { query: 'ledger architecture all sections', docId: 144 },
  { query: 'what is the right way to write a custom skill', docId: 163 },
];

// Grading scale shown to the grader; begins and ends with a line break.
const RUBRIC = [
  '',
  '0 = NOT RELEVANT Wrong topic, no useful info',
  "1 = RELATED Touches the topic but doesn't answer",
  '2 = RELEVANT Answers the query (user would be happy with this result)',
  '3 = HIGHLY RELEVANT The canonical, best-possible answer',
  '',
].join('\n');
41
// Interactive grading loop over UNJUDGED_PAIRS.
//
// For each pair: look up the golden-dataset row for the query, show the
// document name and a 300-character preview, then prompt for a grade (0-3),
// `s` to skip or `q` to quit. Unless --dry-run is set, the grade is saved via
// the judgment_create RPC, falling back to judgment_update if that fails.
//
// Progress counters come from UNJUDGED_PAIRS.length (they were hard-coded to
// 12 while the list holds 11 entries), and the readline interface is always
// closed, including when an exception is thrown.
// NOTE(review): the saved notes say "run-16 diagnostic" while the pair list is
// described as coming from run 17 — confirm which is correct.
async function main() {
  const total = UNJUDGED_PAIRS.length;
  const readline = createInterface({ input: process.stdin, output: process.stdout });
  const ask = (question) => new Promise((answer) => readline.question(question, answer));
  let graded = 0;
  let skipped = 0;

  try {
    console.log('='.repeat(60));
    console.log(`Targeted grading: ${total} unjudged top-1 documents`);
    console.log(dryRun ? '(DRY RUN: no writes)' : '(LIVE: writes to eval_golden_judgments)');
    console.log('='.repeat(60));
    console.log(RUBRIC);

    for (let i = 0; i < total; i++) {
      const pair = UNJUDGED_PAIRS[i];
      const label = `[${i + 1}/${total}]`;

      // Look up golden dataset ID for this query
      const { data: goldenRows, error: goldenError } = await supabase
        .from('eval_golden_dataset')
        .select('id')
        .eq('query', pair.query)
        .limit(1);
      if (goldenError) {
        // A failed lookup is not the same as a missing query; say so.
        console.log(`\n${label} SKIP: golden dataset lookup failed: ${goldenError.message}`);
        skipped++;
        continue;
      }
      const goldenId = goldenRows?.[0]?.id;
      if (goldenId === undefined || goldenId === null) {
        console.log(`\n${label} SKIP: query not found in golden dataset: "${pair.query}"`);
        skipped++;
        continue;
      }

      // Fetch document name and content preview (best effort: a missing row
      // just shows placeholders).
      const { data: docRow } = await supabase
        .from('documents')
        .select('name, content')
        .eq('id', pair.docId)
        .single();
      const docName = docRow?.name ?? '(unknown)';
      const preview = docRow?.content
        ? docRow.content.slice(0, 300).replace(/\n/g, ' ')
        : '(no content)';

      console.log('\n' + '-'.repeat(60));
      console.log(`${label} Query: "${pair.query}"`);
      console.log(` Doc #${pair.docId}: ${docName}`);
      console.log(` Preview: ${preview}...`);
      console.log('-'.repeat(60));

      let grade = null;
      while (grade === null) {
        const input = await ask('Grade (0/1/2/3) or s=skip, q=quit: ');
        const trimmed = input.trim();
        if (trimmed === 'q') {
          // Everything from the current pair onward counts as skipped.
          console.log(`\nQuit. Graded ${graded}, skipped ${skipped + (total - i)}.`);
          return;
        }
        if (trimmed === 's') {
          skipped++;
          break;
        }
        if (['0', '1', '2', '3'].includes(trimmed)) {
          grade = parseInt(trimmed, 10);
        }
        else {
          console.log(' Invalid. Type 0, 1, 2, 3, s, or q.');
        }
      }
      if (grade === null)
        continue; // skipped

      if (dryRun) {
        console.log(` [DRY RUN] Would save: golden_id=${goldenId}, doc_id=${pair.docId}, grade=${grade}`);
      }
      else {
        const { error } = await supabase.rpc('judgment_create', {
          p_golden_id: goldenId,
          p_document_id: pair.docId,
          p_grade: grade,
          p_judged_by: 'adrian',
          p_notes: 'unjudged top-1 from run-16 diagnostic',
        });
        if (error) {
          // Might already exist (race condition), try update
          const { error: updateError } = await supabase.rpc('judgment_update', {
            p_golden_id: goldenId,
            p_document_id: pair.docId,
            p_grade: grade,
            p_notes: 'unjudged top-1 from run-16 diagnostic',
          });
          if (updateError) {
            console.log(` [ERR] Failed to save: ${updateError.message}`);
            continue;
          }
        }
        console.log(` Saved: grade ${grade}`);
      }
      graded++;
    }

    console.log('\n' + '='.repeat(60));
    console.log(`Done. Graded: ${graded}, Skipped: ${skipped}`);
    console.log('='.repeat(60));
  }
  finally {
    // Release stdin on every exit path: normal end, quit, or a thrown error.
    readline.close();
  }
}

// Any unexpected failure is printed and turned into a non-zero exit status.
main().catch(err => {
  console.error(err);
  process.exit(1);
});
@@ -0,0 +1,38 @@
1
+ import { loadConfig, loadConfigFile } from '../lib/config.js';
2
+ import { getOpportunitiesForPeriod, insertAnalytics, buildAnalyticsRow } from '../lib/hunter-db.js';
3
+ import { DEFAULT_HUNTER_CONFIG } from '../lib/hunter-types.js';
4
// Aggregate opportunities found over the last `periodDays` days and store one
// analytics row.
//
// periodDays: optional window length in days; when null or undefined the
//             hunter config's analytics_interval_days is used (file config
//             overrides DEFAULT_HUNTER_CONFIG).
// Logs progress to stderr. Returns without writing anything when the period
// contains no opportunities.
export async function runAnalytics(periodDays) {
  const config = loadConfig();
  const fileConfig = loadConfigFile();
  const hunterConfig = Object.assign({}, DEFAULT_HUNTER_CONFIG, fileConfig.hunter);
  const days = periodDays ?? hunterConfig.analytics_interval_days;

  const end = new Date();
  const start = new Date();
  start.setDate(start.getDate() - days);

  // Date-only (YYYY-MM-DD) labels for the log line and the stored row.
  const [periodStart] = start.toISOString().split('T');
  const [periodEnd] = end.toISOString().split('T');
  console.error(`[hunter-analytics] Analyzing ${periodStart} to ${periodEnd}...`);

  const opportunities = await getOpportunitiesForPeriod(
    config.supabase,
    start.toISOString(),
    end.toISOString(),
  );
  if (opportunities.length === 0) {
    console.error('[hunter-analytics] No opportunities found in period.');
    return;
  }

  const row = buildAnalyticsRow(opportunities, periodStart, periodEnd);
  await insertAnalytics(config.supabase, row);

  const summary = [
    `[hunter-analytics] Saved. ${row.total_found} opportunities analyzed.`,
    ` Freelance: ${row.freelance_count} | Employment: ${row.employment_count}`,
    ` 80+: ${row.score_80_plus} | 60-79: ${row.score_60_79} | 40-59: ${row.score_40_59} | <40: ${row.score_below_40}`,
    ` Applied: ${row.applied_count} | Won: ${row.won_count} | Lost: ${row.lost_count}`,
  ];
  for (const line of summary) {
    console.error(line);
  }
}
30
// Run the analytics only when this file is the script being executed (compiled
// .js or source .ts), not when it is imported as a module.
const entryScript = process.argv[1];
const isDirectExecution = entryScript?.endsWith('hunter-analytics.js')
  || entryScript?.endsWith('hunter-analytics.ts');
if (isDirectExecution) {
  // Optional first CLI argument: the period length in days.
  const rawDays = process.argv[2];
  const days = rawDays ? parseInt(rawDays, 10) : undefined;
  runAnalytics(days).catch((failure) => {
    console.error(`[hunter-analytics] Error: ${failure.message}`);
    process.exit(1);
  });
}