@aperdomoll90/ledger-ai 1.3.0 → 1.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. package/dist/cli.js +177 -221
  2. package/dist/commands/add.js +51 -100
  3. package/dist/commands/backfill.js +55 -0
  4. package/dist/commands/backup.js +10 -10
  5. package/dist/commands/check.js +21 -29
  6. package/dist/commands/config.js +13 -12
  7. package/dist/commands/delete.js +22 -17
  8. package/dist/commands/eval-judge.js +11 -0
  9. package/dist/commands/eval.js +321 -0
  10. package/dist/commands/export.js +8 -10
  11. package/dist/commands/get.js +9 -0
  12. package/dist/commands/hunt.js +206 -0
  13. package/dist/commands/ingest.js +15 -14
  14. package/dist/commands/init.js +18 -20
  15. package/dist/commands/list.js +21 -7
  16. package/dist/commands/migrate.js +11 -11
  17. package/dist/commands/onboard.js +2 -2
  18. package/dist/commands/pull.js +3 -2
  19. package/dist/commands/push.js +8 -8
  20. package/dist/commands/restore.js +38 -38
  21. package/dist/commands/show.js +13 -16
  22. package/dist/commands/sync.js +58 -19
  23. package/dist/commands/tag.js +20 -14
  24. package/dist/commands/update.js +50 -18
  25. package/dist/commands/wizard.js +3 -3
  26. package/dist/lib/ai-search.js +163 -0
  27. package/dist/lib/audit.js +19 -0
  28. package/dist/lib/backfill.js +60 -0
  29. package/dist/lib/config.js +19 -2
  30. package/dist/lib/document-classification.js +5 -0
  31. package/dist/lib/document-fetching.js +77 -0
  32. package/dist/lib/document-operations.js +150 -0
  33. package/dist/lib/documents/classification.js +5 -0
  34. package/dist/lib/documents/fetching.js +89 -0
  35. package/dist/lib/documents/operations.js +304 -0
  36. package/dist/lib/domains.js +116 -0
  37. package/dist/lib/embeddings.js +190 -0
  38. package/dist/lib/errors.js +3 -1
  39. package/dist/lib/eval/eval-advanced.js +289 -0
  40. package/dist/lib/eval/eval-judge-session.js +233 -0
  41. package/dist/lib/eval/eval-store.js +105 -0
  42. package/dist/lib/eval/eval.js +303 -0
  43. package/dist/lib/file-writer.js +23 -0
  44. package/dist/lib/generators.js +44 -45
  45. package/dist/lib/hunter-db.js +235 -0
  46. package/dist/lib/hunter-rss.js +30 -0
  47. package/dist/lib/hunter-scoring.js +55 -0
  48. package/dist/lib/hunter-types.js +36 -0
  49. package/dist/lib/lint-configs.js +20 -0
  50. package/dist/lib/migrate.js +2 -2
  51. package/dist/lib/notes.js +173 -59
  52. package/dist/lib/observability.js +296 -0
  53. package/dist/lib/op-add-note-types.test.js +7 -6
  54. package/dist/lib/prompt.js +8 -8
  55. package/dist/lib/rate-limiter.js +103 -0
  56. package/dist/lib/search/ai-search.js +396 -0
  57. package/dist/lib/search/chunk-context-enrichment.js +155 -0
  58. package/dist/lib/search/embeddings.js +293 -0
  59. package/dist/lib/search/reranker.js +120 -0
  60. package/dist/lib/search/semantic-cache.js +53 -0
  61. package/dist/lib/type-registry.test.js +6 -6
  62. package/dist/mcp-server.js +553 -66
  63. package/dist/migrations/migrations/005-audit-log.sql +22 -0
  64. package/dist/migrations/migrations/005_opportunities.sql +48 -0
  65. package/dist/migrations/migrations/006-audited-operations.sql +235 -0
  66. package/dist/migrations/migrations/006_hunt_analytics.sql +38 -0
  67. package/dist/migrations/migrations/007-eval-golden-judgments.sql +119 -0
  68. package/dist/migrations/migrations/008-drop-expected-doc-ids.sql +9 -0
  69. package/dist/migrations/migrations/008-judge-helpers.sql +21 -0
  70. package/dist/migrations/migrations/009-semantic-cache.sql +216 -0
  71. package/dist/scripts/batch-grade.js +344 -0
  72. package/dist/scripts/benchmark-ingestion.js +376 -0
  73. package/dist/scripts/convert-judgments-to-graded.js +88 -0
  74. package/dist/scripts/diagnose-first-result.js +333 -0
  75. package/dist/scripts/drop-golden-query.js +53 -0
  76. package/dist/scripts/eval-search.js +115 -0
  77. package/dist/scripts/grade-unjudged-top1.js +138 -0
  78. package/dist/scripts/hunter-analytics.js +38 -0
  79. package/dist/scripts/hunter-cron.js +63 -0
  80. package/dist/scripts/hunter-purge.js +25 -0
  81. package/dist/scripts/migrate-v2.js +140 -0
  82. package/dist/scripts/reindex.js +74 -0
  83. package/dist/scripts/sync-local-docs.js +153 -0
  84. package/package.json +7 -1
@@ -0,0 +1,233 @@
1
+ // eval-judge-session.ts
2
+ // Session state, input parsing, progress rendering, and durable writes for
3
+ // the `ledger eval:judge` rejudging walkthrough.
4
+ import { createInterface } from 'node:readline';
5
+ import { searchHybrid } from '../search/ai-search.js';
6
+ import { CURRENT_SEARCH_CONFIG } from './eval-store.js';
7
+ // =============================================================================
8
+ // Pure helpers (unit-tested)
9
+ // =============================================================================
10
/**
 * Turn one line typed at the judging prompt into a tagged command object.
 *
 * Leading and trailing whitespace is ignored. The digits 0-3 yield a grade;
 * the single characters s, b, n, ? and q yield skip, back, note, rubric and
 * quit. Matching is case-sensitive. Anything else is reported as invalid,
 * carrying the trimmed text.
 *
 * @param {string} rawInput - Text exactly as entered by the user.
 * @returns {{kind: string, value?: number, raw?: string}} The parsed command.
 */
export function parseGradeInput(rawInput) {
    const gradeDigits = ['0', '1', '2', '3'];
    const commandKinds = new Map([
        ['s', 'skip'],
        ['b', 'back'],
        ['n', 'note'],
        ['?', 'rubric'],
        ['q', 'quit'],
    ]);
    const trimmed = rawInput.trim();
    if (gradeDigits.includes(trimmed)) {
        return { kind: 'grade', value: Number(trimmed) };
    }
    const commandKind = commandKinds.get(trimmed);
    if (commandKind !== undefined) {
        return { kind: commandKind };
    }
    return { kind: 'invalid', raw: trimmed };
}
27
/**
 * Return the first candidate whose `graded` flag is falsy.
 *
 * @param {Iterable<{graded?: boolean}>} candidates - Candidates in display order.
 * @returns {object|null} The first ungraded candidate, or null when every
 *   candidate is graded or the list is empty.
 */
export function pickNextUngraded(candidates) {
    const firstUngraded = Array.from(candidates).find(entry => !entry.graded);
    return firstUngraded ?? null;
}
34
/**
 * Render the one-line progress summary shown before each query.
 *
 * The percentage is the rounded share of complete queries; it is 0 when
 * there are no queries at all, which avoids dividing by zero.
 *
 * @param {{queriesComplete: number, queriesTotal: number, judgmentsTotal: number}} progress
 * @returns {string} Human-readable progress line.
 */
export function formatProgressLine(progress) {
    const { queriesComplete, queriesTotal, judgmentsTotal } = progress;
    let percentage = 0;
    if (queriesTotal > 0) {
        percentage = Math.round((queriesComplete / queriesTotal) * 100);
    }
    const querySummary = `Progress: ${queriesComplete} / ${queriesTotal} queries complete (${percentage}%).`;
    const judgmentSummary = `Judgments: ${judgmentsTotal}.`;
    return [querySummary, judgmentSummary].join(' ');
}
40
+ // =============================================================================
41
+ // Rubric
42
+ // =============================================================================
43
+ const RUBRIC_TEXT = `
44
+ TREC 4-level grading rubric:
45
+
46
+ 0 NOT RELEVANT No useful info for this query. Wrong topic.
47
+ 1 RELATED Touches the topic but doesn't answer.
48
+ 2 RELEVANT Answers the query, but not the ideal/canonical source.
49
+ 3 HIGHLY RELEVANT The canonical, complete answer.
50
+
51
+ Boundary heuristics:
52
+ 1 vs 2: "Would a user be happy if this was the top result?" Yes = 2, No = 1.
53
+ 2 vs 3: "Is there a better doc for this query that I know exists?" Yes = 2, No = 3.
54
+ `;
55
+ // =============================================================================
56
+ // Database I/O
57
+ // =============================================================================
58
/**
 * Load the golden query with the smallest id that is >= startId, together
 * with the grades already recorded for it.
 *
 * Only one row is ever consumed, so the query is limited to a single row
 * instead of pulling every remaining golden query (and its joined judgments)
 * on each call.
 *
 * @param {object} supabase - Supabase client.
 * @param {number} [startId=0] - Lowest golden id to consider.
 * @returns {Promise<{id: number, query: string, tags: string[], existing_grades: Map}|null>}
 *   The next golden query, or null when none is left or the query failed
 *   (failures are reported on stderr).
 */
async function loadNextGolden(supabase, startId = 0) {
    const { data, error } = await supabase
        .from('eval_golden_dataset')
        .select('id, query, tags, judgments:eval_golden_judgments(document_id, grade)')
        .gte('id', startId)
        .order('id', { ascending: true })
        .limit(1);
    if (error) {
        process.stderr.write(`[ledger] loadNextGolden failed: ${error.message}\n`);
        return null;
    }
    const row = data?.[0];
    if (!row) {
        return null;
    }
    // document_id -> grade for every judgment already stored for this query.
    const gradedMap = new Map();
    for (const judgment of row.judgments ?? []) {
        gradedMap.set(judgment.document_id, judgment.grade);
    }
    return {
        id: row.id,
        query: row.query,
        tags: row.tags ?? [],
        existing_grades: gradedMap,
    };
}
84
/**
 * Collect the counters shown in the progress line: total golden queries,
 * total judgments, and the number of queries the
 * `count_golden_with_min_judgments` RPC reports for a minimum of 10.
 *
 * The three requests do not depend on each other, so they are issued
 * together. A failed request is reported on stderr and its counter falls
 * back to 0, so a transient error does not abort the judging session.
 *
 * @param {object} supabase - Supabase client.
 * @returns {Promise<{queriesTotal: number, queriesComplete: number, judgmentsTotal: number}>}
 */
async function fetchProgress(supabase) {
    const countRows = (table) => supabase
        .from(table)
        .select('*', { count: 'exact', head: true });
    const [queriesResponse, judgmentsResponse, completeResponse] = await Promise.all([
        countRows('eval_golden_dataset'),
        countRows('eval_golden_judgments'),
        supabase.rpc('count_golden_with_min_judgments', { p_min: 10 }),
    ]);
    const labelledResponses = [
        ['eval_golden_dataset count', queriesResponse],
        ['eval_golden_judgments count', judgmentsResponse],
        ['count_golden_with_min_judgments', completeResponse],
    ];
    for (const [label, response] of labelledResponses) {
        if (response?.error) {
            process.stderr.write(`[ledger] fetchProgress (${label}) failed: ${response.error.message}\n`);
        }
    }
    const rpcData = completeResponse?.data;
    return {
        queriesTotal: queriesResponse?.count ?? 0,
        // The RPC is expected to return a bare number; anything else counts as 0.
        queriesComplete: typeof rpcData === 'number' ? rpcData : 0,
        judgmentsTotal: judgmentsResponse?.count ?? 0,
    };
}
100
+ // =============================================================================
101
+ // Prompt helper
102
+ // =============================================================================
103
/**
 * Promise wrapper around the callback-based `readline.question`.
 *
 * @param {{question: Function}} readline - readline interface to ask through.
 * @param {string} question - Prompt text to display.
 * @returns {Promise<string>} Resolves with the line the user entered.
 */
function promptUser(readline, question) {
    return new Promise((resolve) => {
        const deliverAnswer = (answer) => {
            resolve(answer);
        };
        readline.question(question, deliverAnswer);
    });
}
108
/**
 * Build a single-line preview of a document's content.
 *
 * Every run of whitespace is collapsed to one space and the result is cut
 * to at most `maxChars` characters. Missing or empty content yields ''.
 *
 * @param {string|null|undefined} content - Raw document content.
 * @param {number} [maxChars=200] - Maximum preview length.
 * @returns {string} The flattened, truncated preview.
 */
function snippet(content, maxChars = 200) {
    if (content) {
        const flattened = content.replace(/\s+/g, ' ');
        return flattened.slice(0, maxChars);
    }
    return '';
}
113
+ // =============================================================================
114
+ // Interactive session
115
+ // =============================================================================
116
/**
 * Interactive grading loop behind `ledger eval:judge`.
 *
 * For each golden query (starting at `startGoldenId`, or 0 when omitted) the
 * session prints overall progress, runs the hybrid search, and walks the
 * user through the ungraded entries among the first 10 results. Each grade
 * is written immediately through the `judgment_create` RPC; if that fails,
 * `judgment_update` is tried instead. The readline interface is always
 * closed on exit.
 *
 * @param {{supabase: object}} clients - Client bundle; passed through to the search call.
 * @param {number} [startGoldenId] - Golden id to start from.
 * @returns {Promise<void>} Resolves when no queries remain or the user quits.
 */
export async function runJudgeSession(clients, startGoldenId) {
    const { supabase } = clients;
    const readline = createInterface({ input: process.stdin, output: process.stdout });
    const divider = '='.repeat(60);
    try {
        let nextGoldenId = startGoldenId ?? 0;
        while (true) {
            const progress = await fetchProgress(supabase);
            console.log('');
            console.log(formatProgressLine(progress));
            console.log('');
            const golden = await loadNextGolden(supabase, nextGoldenId);
            if (!golden) {
                console.log('No more queries to judge. Done.');
                return;
            }
            // Whatever happens with this query, the next lookup starts after it.
            nextGoldenId = golden.id + 1;
            // Search with the same configuration the eval runs use.
            const searchResults = await searchHybrid(clients, {
                query: golden.query,
                limit: CURRENT_SEARCH_CONFIG.limit,
                reranker: CURRENT_SEARCH_CONFIG.reranker,
            });
            // Candidate list from the first 10 results, flagged by existing grade.
            const candidates = searchResults.slice(0, 10).map(result => ({
                id: result.id,
                name: result.name ?? '<unknown>',
                score: result.score ?? result.similarity ?? 0,
                content: snippet(result.content),
                graded: golden.existing_grades.has(result.id),
            }));
            const pendingCandidates = candidates.filter(entry => !entry.graded);
            if (pendingCandidates.length === 0) {
                // Everything shown for this query already has a grade.
                continue;
            }
            // Query header
            console.log(divider);
            console.log(`Query #${golden.id}: "${golden.query}"`);
            if (golden.tags.length > 0) {
                console.log(`Tags: ${golden.tags.join(', ')}`);
            }
            if (golden.existing_grades.size > 0) {
                console.log('Already graded:');
                golden.existing_grades.forEach((grade, documentId) => {
                    console.log(` #${documentId} -> ${grade}`);
                });
            }
            console.log(divider);
            let pendingNote = null;
            let cursor = 0;
            while (cursor < pendingCandidates.length) {
                const candidate = pendingCandidates[cursor];
                console.log('');
                console.log(`[${cursor + 1}/${pendingCandidates.length}] #${candidate.id} ${candidate.name} (score ${candidate.score.toFixed(3)})`);
                console.log(`"${candidate.content}..."`);
                const answer = await promptUser(readline, 'Grade [0/1/2/3] s=skip b=back n=notes ?=rubric q=save & quit: ');
                const parsed = parseGradeInput(answer);
                // Every non-grade command is handled here and re-prompts (or exits).
                switch (parsed.kind) {
                    case 'invalid':
                        console.log(`(unrecognized input "${parsed.raw}". Press ? for rubric.)`);
                        continue;
                    case 'rubric':
                        console.log(RUBRIC_TEXT);
                        continue;
                    case 'note':
                        // The note is attached to the next grade entered for this candidate.
                        pendingNote = await promptUser(readline, 'Note: ');
                        continue;
                    case 'quit':
                        console.log('Saving and exiting.');
                        return;
                    case 'skip':
                        cursor += 1;
                        pendingNote = null;
                        continue;
                    case 'back':
                        if (cursor > 0) {
                            cursor -= 1;
                        }
                        pendingNote = null;
                        continue;
                    default:
                        break;
                }
                // Grade: durable write via RPC
                const { error: createError } = await supabase.rpc('judgment_create', {
                    p_golden_id: golden.id,
                    p_document_id: candidate.id,
                    p_grade: parsed.value,
                    p_judged_by: 'adrian',
                    p_notes: pendingNote,
                });
                if (createError) {
                    // Any create failure is treated as a possible duplicate: retry as an update.
                    const { error: updateError } = await supabase.rpc('judgment_update', {
                        p_golden_id: golden.id,
                        p_document_id: candidate.id,
                        p_grade: parsed.value,
                        p_notes: pendingNote,
                    });
                    if (updateError) {
                        console.error(` [ERR] Could not save grade: ${updateError.message}`);
                        // Stay on the same candidate so the user can try again.
                        continue;
                    }
                }
                pendingNote = null;
                cursor += 1;
            }
            console.log(`Query #${golden.id} complete.`);
        }
    }
    finally {
        readline.close();
    }
}
@@ -0,0 +1,105 @@
1
+ // eval-store.ts
2
+ // Persistence layer for eval runs — save results to eval_runs table and load previous runs.
3
+ /**
4
+ * Current search configuration. Saved with each eval run for reproducibility.
5
+ * Single source of truth — used by both the eval script and CLI command.
6
+ * Update this when search parameters change (threshold, model, reranker, etc.).
7
+ */
8
+ export const CURRENT_SEARCH_CONFIG = {
9
+ threshold: 0.38,
10
+ reciprocalRankFusionK: 60,
11
+ embedding_model: 'openai/text-embedding-3-small',
12
+ limit: 10,
13
+ chunking: 'recursive',
14
+ chunk_max_size: 1000,
15
+ chunk_overlap: 200,
16
+ context_enrichment: true,
17
+ context_enrichment_model: 'gpt-4o-mini',
18
+ reranker: 'none',
19
+ hit_threshold: 2,
20
+ ndcg_gain_formula: '2^g - 1',
21
+ };
22
+ // =============================================================================
23
+ // Save
24
+ // =============================================================================
25
/**
 * Persist one eval run as a row in the `eval_runs` table.
 *
 * @param {object} supabase - Supabase client.
 * @param {object} props - Run payload.
 * @param {object} props.metrics - Aggregate metrics (including `missed` and `tagStats`).
 * @param {object} props.config - Search configuration stored with the run.
 * @param {object[]} props.results - Per-query scored results.
 * @param {object} [props.confidenceIntervals] - Stored as null when absent.
 * @param {object} [props.scoreCalibration] - Stored as null when absent.
 * @param {object} [props.coverageAnalysis] - Stored as null when absent.
 * @returns {Promise<*>} The id of the inserted row.
 * @throws {Error} When the insert fails.
 */
export async function saveEvalRun(supabase, props) {
    const { metrics, config, results, confidenceIntervals, scoreCalibration, coverageAnalysis } = props;
    // Compact record for a query that produced no hit.
    const toMissedEntry = ({ testCase, returnedIds, returnedScores }) => ({
        query: testCase.query,
        tags: testCase.tags,
        judgments: testCase.judgments,
        got: returnedIds,
        gotScores: returnedScores,
    });
    // Full per-query record.
    const toPerQueryEntry = (testResult) => {
        const { testCase } = testResult;
        return {
            query: testCase.query,
            tags: testCase.tags,
            judgments: testCase.judgments,
            hit: testResult.hit,
            firstResultHit: testResult.firstResultHit,
            position: testResult.position,
            expectedFound: testResult.expectedFound,
            expectedTotal: testResult.expectedTotal,
            responseTimeMs: testResult.responseTimeMs,
            reciprocalRank: testResult.reciprocalRank,
            normalizedDiscountedCumulativeGain: testResult.normalizedDiscountedCumulativeGain,
            returnedIds: testResult.returnedIds,
            returnedScores: testResult.returnedScores,
        };
    };
    const row = {
        config,
        test_case_count: metrics.totalCases,
        hit_rate: metrics.hitRate,
        first_result_accuracy: metrics.firstResultAccuracy,
        recall: metrics.recall,
        zero_result_rate: metrics.zeroResultRate,
        avg_response_time_ms: metrics.avgResponseTimeMs,
        mean_reciprocal_rank: metrics.meanReciprocalRank,
        normalized_discounted_cumulative_gain: metrics.normalizedDiscountedCumulativeGain,
        confidence_intervals: confidenceIntervals ?? null,
        score_calibration: scoreCalibration ?? null,
        coverage_analysis: coverageAnalysis ?? null,
        results_by_tag: metrics.tagStats,
        missed_queries: metrics.missed.map(toMissedEntry),
        per_query_results: results.map(toPerQueryEntry),
    };
    const insertResponse = await supabase
        .from('eval_runs')
        .insert(row)
        .select('id')
        .single();
    if (insertResponse.error) {
        throw new Error(`Failed to save eval run (${metrics.totalCases} cases): ${insertResponse.error.message}`);
    }
    return insertResponse.data.id;
}
74
+ // =============================================================================
75
+ // Load
76
+ // =============================================================================
77
/**
 * Load the most recent row of `eval_runs`, ordered by `run_date` descending.
 *
 * @param {object} supabase - Supabase client.
 * @returns {Promise<object|null>} The latest run, or null on any error.
 *   Errors other than code PGRST116 are also reported on stderr.
 */
export async function loadPreviousRun(supabase) {
    const latestRunQuery = supabase
        .from('eval_runs')
        .select('*')
        .order('run_date', { ascending: false })
        .limit(1)
        .single();
    const { data, error } = await latestRunQuery;
    if (!error) {
        return data;
    }
    // PGRST116 is deliberately not logged; every other error is.
    const isQuietError = error.code === 'PGRST116';
    if (!isQuietError) {
        process.stderr.write(`[ledger] loadPreviousRun failed: ${error.message}\n`);
    }
    return null;
}
92
/**
 * Load a single `eval_runs` row by id.
 *
 * @param {object} supabase - Supabase client.
 * @param {*} runId - Value matched against the `id` column.
 * @returns {Promise<object|null>} The run, or null on any error. Errors
 *   other than code PGRST116 are also reported on stderr.
 */
export async function loadEvalRun(supabase, runId) {
    const runQuery = supabase
        .from('eval_runs')
        .select('*')
        .eq('id', runId)
        .single();
    const { data, error } = await runQuery;
    if (!error) {
        return data;
    }
    // PGRST116 is deliberately not logged; every other error is.
    const isQuietError = error.code === 'PGRST116';
    if (!isQuietError) {
        process.stderr.write(`[ledger] loadEvalRun(${runId}) failed: ${error.message}\n`);
    }
    return null;
}
@@ -0,0 +1,303 @@
1
+ // eval.ts
2
+ // Types and metric computation for search evaluation.
3
+ // Pure functions — no I/O, no database calls.
4
+ /**
5
+ * Rate metrics (hit rate, first-result accuracy, recall, MRR) count a result
6
+ * as "good" when its grade is at or above this threshold. Only NDCG uses the
7
+ * full 2^g - 1 gain function across all grades.
8
+ */
9
+ export const HIT_THRESHOLD = 2;
10
+ // =============================================================================
11
+ // NDCG@k helper — graded (2^g - 1 gain, TREC 4-level)
12
+ // =============================================================================
13
/**
 * Gain of a graded judgment: 2^grade - 1, so grade 0 contributes nothing.
 *
 * @param {number} grade - Relevance grade.
 * @returns {number} The gain for that grade.
 */
function gradeGain(grade) {
    return 2 ** grade - 1;
}
/**
 * Graded NDCG of a returned ranking.
 *
 * DCG is accumulated over the returned ids in order, with unjudged ids
 * counted as grade 0. The ideal DCG is built from every judged grade >= 1,
 * sorted descending; it is not truncated to the number of returned ids.
 *
 * @param {Array} returnedIds - Document ids in returned order.
 * @param {Map} gradeByDocId - Map of document id to grade.
 * @returns {number} DCG / IDCG, or 0 when no judged document has grade >= 1.
 */
function computeNormalizedDiscountedCumulativeGain(returnedIds, gradeByDocId) {
    // Log-based discount for a zero-based rank position.
    const discountAt = (position) => Math.log2(position + 2);
    const idealGrades = [...gradeByDocId.values()]
        .filter(grade => grade >= 1)
        .sort((gradeA, gradeB) => gradeB - gradeA);
    if (idealGrades.length === 0) {
        return 0;
    }
    const actualGain = returnedIds.reduce((total, docId, position) => {
        const grade = gradeByDocId.get(docId) ?? 0;
        return total + gradeGain(grade) / discountAt(position);
    }, 0);
    const idealGain = idealGrades.reduce(
        (total, grade, position) => total + gradeGain(grade) / discountAt(position),
        0
    );
    return idealGain === 0 ? 0 : actualGain / idealGain;
}
41
+ // =============================================================================
42
+ // Scoring a single test case
43
+ // =============================================================================
44
/**
 * Score the search results returned for one test case.
 *
 * A judgment counts as relevant when its grade is >= HIT_THRESHOLD. A test
 * case with no relevant judgment is treated as out-of-scope: it is a hit
 * only when the search returned nothing, and its rank metrics are 0.
 *
 * @param {{judgments: Array<{document_id: *, grade: number}>}} testCase
 * @param {Array<{id: *, score?: number, similarity?: number}>} searchResults - In returned order.
 * @param {number} responseTimeMs - Measured response time, passed through.
 * @returns {object} Scored result with hit flags, counts, first hit position
 *   (zero-based, or null), reciprocal rank and NDCG.
 */
export function scoreTestCase(testCase, searchResults, responseTimeMs) {
    const returnedIds = searchResults.map(result => result.id);
    const returnedScores = searchResults.map(result => result.score ?? result.similarity ?? 0);
    // Grade lookup; ids without a judgment read as grade 0.
    const gradeByDocId = new Map(testCase.judgments.map(judgment => [judgment.document_id, judgment.grade]));
    const gradeOf = (docId) => gradeByDocId.get(docId) ?? 0;
    const meetsThreshold = (grade) => grade >= HIT_THRESHOLD;
    const relevantTotal = testCase.judgments.filter(judgment => meetsThreshold(judgment.grade)).length;
    if (relevantTotal === 0) {
        // Out-of-scope query: the correct outcome is an empty result list.
        const returnedNothing = searchResults.length === 0;
        return {
            testCase,
            returnedIds,
            returnedScores,
            hit: returnedNothing,
            firstResultHit: returnedNothing,
            expectedFound: 0,
            expectedTotal: 0,
            position: null,
            responseTimeMs,
            reciprocalRank: 0,
            normalizedDiscountedCumulativeGain: 0,
        };
    }
    const firstHitIndex = returnedIds.findIndex(docId => meetsThreshold(gradeOf(docId)));
    const firstHitPosition = firstHitIndex === -1 ? null : firstHitIndex;
    const topGrade = returnedIds.length > 0 ? gradeOf(returnedIds[0]) : 0;
    const foundCount = returnedIds.filter(docId => meetsThreshold(gradeOf(docId))).length;
    const reciprocalRank = firstHitPosition === null ? 0 : 1 / (firstHitPosition + 1);
    return {
        testCase,
        returnedIds,
        returnedScores,
        hit: firstHitPosition !== null,
        firstResultHit: meetsThreshold(topGrade),
        expectedFound: foundCount,
        expectedTotal: relevantTotal,
        position: firstHitPosition,
        responseTimeMs,
        reciprocalRank,
        normalizedDiscountedCumulativeGain: computeNormalizedDiscountedCumulativeGain(returnedIds, gradeByDocId),
    };
}
94
+ // =============================================================================
95
+ // Aggregate metrics from scored results
96
+ // =============================================================================
97
// True when the test case behind a result has at least one judgment at or above HIT_THRESHOLD.
const hasRelevantJudgment = ({ testCase }) => testCase.judgments.some(({ grade }) => grade >= HIT_THRESHOLD);
/**
 * Aggregate scored results into run-level metrics.
 *
 * Results are split into "normal" cases (at least one relevant judgment)
 * and out-of-scope cases. Rate metrics are percentages (0-100) over normal
 * cases; MRR and NDCG are means over normal cases; average response time
 * covers every result. Any ratio with a zero denominator is reported as 0.
 *
 * @param {object[]} results - Scored results, one per test case.
 * @returns {object} Counts, rates, per-tag stats and the list of missed results.
 */
export function computeMetrics(results) {
    const normalResults = [];
    const outOfScopeResults = [];
    for (const result of results) {
        const bucket = hasRelevantJudgment(result) ? normalResults : outOfScopeResults;
        bucket.push(result);
    }
    const countWhere = (items, predicate) => items.filter(predicate).length;
    const sumOf = (items, pick) => items.reduce((sum, item) => sum + pick(item), 0);
    const percentOf = (part, whole) => (whole > 0 ? (part / whole) * 100 : 0);
    const meanOf = (total, count) => (count > 0 ? total / count : 0);

    const totalNormal = normalResults.length;
    const hits = countWhere(normalResults, result => result.hit);
    const firstResultHits = countWhere(normalResults, result => result.firstResultHit);
    const totalExpected = sumOf(normalResults, result => result.expectedTotal);
    const totalFound = sumOf(normalResults, result => result.expectedFound);
    const zeroResults = countWhere(normalResults, result => result.returnedIds.length === 0);
    const outOfScopeCorrect = countWhere(outOfScopeResults, result => result.hit);
    const avgResponseTimeMs = meanOf(sumOf(results, result => result.responseTimeMs), results.length);

    // Per-tag tallies over normal cases; a result counts once for each of its tags.
    const tagStats = {};
    for (const result of normalResults) {
        for (const tag of result.testCase.tags) {
            if (!tagStats[tag]) {
                tagStats[tag] = { total: 0, hits: 0, firstHits: 0 };
            }
            const stats = tagStats[tag];
            stats.total++;
            if (result.hit) {
                stats.hits++;
            }
            if (result.firstResultHit) {
                stats.firstHits++;
            }
        }
    }
    return {
        totalCases: results.length,
        normalCases: totalNormal,
        outOfScopeCases: outOfScopeResults.length,
        hits,
        firstResultHits,
        totalExpected,
        totalFound,
        zeroResults,
        outOfScopeCorrect,
        avgResponseTimeMs,
        hitRate: percentOf(hits, totalNormal),
        firstResultAccuracy: percentOf(firstResultHits, totalNormal),
        recall: percentOf(totalFound, totalExpected),
        zeroResultRate: percentOf(zeroResults, totalNormal),
        outOfScopeAccuracy: percentOf(outOfScopeCorrect, outOfScopeResults.length),
        meanReciprocalRank: meanOf(sumOf(normalResults, result => result.reciprocalRank), totalNormal),
        normalizedDiscountedCumulativeGain: meanOf(sumOf(normalResults, result => result.normalizedDiscountedCumulativeGain), totalNormal),
        tagStats,
        missed: normalResults.filter(result => !result.hit),
    };
}
149
+ // =============================================================================
150
+ // Format report as string (no console.log — caller decides output)
151
+ // =============================================================================
152
/**
 * Render aggregate metrics as a multi-line text report.
 *
 * Nothing is printed here; the caller decides where the string goes. The
 * report lists the headline metrics, any missed queries (with their relevant
 * documents and the first five returned ids), and per-tag rates ordered by
 * query count, largest first.
 *
 * @param {object} metrics - Aggregate metrics as produced by computeMetrics.
 * @returns {string} The report, lines joined with '\n'.
 */
export function formatReport(metrics) {
    const divider = '='.repeat(60);
    const isRelevant = (judgment) => judgment.grade >= HIT_THRESHOLD;
    const report = [
        divider,
        `Results — hit_threshold=${HIT_THRESHOLD}, ndcg_gain=2^g-1`,
        divider,
        '',
        `Test cases: ${metrics.totalCases} total (${metrics.normalCases} normal, ${metrics.outOfScopeCases} out-of-scope)`,
        '',
        'METRICS:',
        ` Hit rate: ${metrics.hitRate.toFixed(1)}% (${metrics.hits}/${metrics.normalCases} queries found at least one expected doc)`,
        ` First-result accuracy: ${metrics.firstResultAccuracy.toFixed(1)}% (${metrics.firstResultHits}/${metrics.normalCases} queries had correct #1 result)`,
        ` Recall: ${metrics.recall.toFixed(1)}% (${metrics.totalFound}/${metrics.totalExpected} expected docs found across all queries)`,
        ` Zero-result rate: ${metrics.zeroResultRate.toFixed(1)}% (${metrics.zeroResults}/${metrics.normalCases} queries returned nothing)`,
        ` Out-of-scope accuracy: ${metrics.outOfScopeAccuracy.toFixed(1)}% (${metrics.outOfScopeCorrect}/${metrics.outOfScopeCases} correctly returned nothing)`,
        ` Avg response time: ${metrics.avgResponseTimeMs.toFixed(0)}ms`,
        ` MRR: ${metrics.meanReciprocalRank.toFixed(3)} (1.0 = perfect ranking, 0.5 = avg position 2)`,
        ` NDCG@k: ${metrics.normalizedDiscountedCumulativeGain.toFixed(3)} (1.0 = perfect ranking of all relevant docs)`,
        '',
    ];
    if (metrics.missed.length > 0) {
        report.push(`MISSED QUERIES (no returned doc at grade >= ${HIT_THRESHOLD}):`);
        metrics.missed.forEach((miss) => {
            const relevantDocs = miss.testCase.judgments
                .filter(isRelevant)
                .map(judgment => `${judgment.document_id}(g${judgment.grade})`)
                .join(', ');
            const firstReturned = miss.returnedIds.slice(0, 5).join(', ');
            report.push(` "${miss.testCase.query}" — relevant [${relevantDocs}], got [${firstReturned}]`);
        });
        report.push('');
    }
    report.push('BY TAG:');
    // Busiest tags first.
    const tagsByVolume = Object.entries(metrics.tagStats)
        .sort(([, statsA], [, statsB]) => statsB.total - statsA.total);
    for (const [tag, stats] of tagsByVolume) {
        const shareOfTotal = (count) => ((count / stats.total) * 100).toFixed(0);
        report.push(` ${tag}: ${shareOfTotal(stats.hits)}% hit rate, ${shareOfTotal(stats.firstHits)}% first-result (${stats.total} queries)`);
    }
    report.push('', divider);
    return report.join('\n');
}
191
+ // =============================================================================
192
+ // compareRuns — diff two eval runs and detect regressions
193
+ // =============================================================================
194
// Metrics where a lower value is better.
const INVERTED_METRICS = new Set(['zeroResultRate', 'avgResponseTimeMs']);
// Differences smaller than this are reported as unchanged.
const UNCHANGED_THRESHOLD = 0.01;
// Metrics reported on a 0-1 scale rather than as percentages (0-100).
const UNIT_INTERVAL_METRICS = new Set(['meanReciprocalRank', 'normalizedDiscountedCumulativeGain']);
/**
 * Diff the metrics of two eval runs and classify each as an improvement, a
 * regression, or unchanged.
 *
 * For most metrics a higher value is better; for zeroResultRate and
 * avgResponseTimeMs a lower value is better.
 *
 * @param {object} current - Metrics of the run being evaluated.
 * @param {object} previous - Metrics of the baseline run.
 * @returns {{improvements: object[], regressions: object[], unchanged: object[], severity: string}}
 *   Each entry is `{ metric, current, previous, diff }` with diff = current - previous.
 */
export function compareRuns(current, previous) {
    const metricKeys = [
        'hitRate',
        'firstResultAccuracy',
        'recall',
        'zeroResultRate',
        'meanReciprocalRank',
        'normalizedDiscountedCumulativeGain',
        'avgResponseTimeMs',
    ];
    const improvements = [];
    const regressions = [];
    const unchanged = [];
    for (const metricKey of metricKeys) {
        const currentValue = current[metricKey];
        const previousValue = previous[metricKey];
        const diff = currentValue - previousValue;
        const metricDiff = {
            metric: metricKey,
            current: currentValue,
            previous: previousValue,
            diff,
        };
        if (Math.abs(diff) < UNCHANGED_THRESHOLD) {
            unchanged.push(metricDiff);
            continue;
        }
        // Normal metrics improve when they rise; inverted metrics improve when they fall.
        const isImprovement = INVERTED_METRICS.has(metricKey) ? diff < 0 : diff > 0;
        (isImprovement ? improvements : regressions).push(metricDiff);
    }
    const severity = determineSeverity(current, regressions);
    return { improvements, regressions, unchanged, severity };
}
/**
 * Express a regression's size in percentage points so that metrics with
 * different units can share the same warning/block thresholds.
 *
 * - Percentage metrics (0-100) are used as-is.
 * - MRR and NDCG (0-1) are scaled by 100.
 * - Average response time (milliseconds) is expressed as the relative change
 *   against the previous run; when the previous value is not positive the
 *   absolute change is used instead.
 *
 * @param {{metric: string, previous: number, diff: number}} regression
 * @returns {number} Non-negative drop in percentage points.
 */
function regressionDropPoints(regression) {
    const magnitude = Math.abs(regression.diff);
    if (UNIT_INTERVAL_METRICS.has(regression.metric)) {
        return magnitude * 100;
    }
    if (regression.metric === 'avgResponseTimeMs') {
        return regression.previous > 0 ? (magnitude / regression.previous) * 100 : magnitude;
    }
    return magnitude;
}
/**
 * Decide how serious a comparison is.
 *
 * 'critical' depends only on the current run (hit rate below 80 or
 * zero-result rate above 10). Otherwise the worst regression, measured in
 * percentage points, selects 'block' (> 5), 'warning' (> 2) or 'ok'.
 *
 * @param {object} current - Metrics of the run being evaluated.
 * @param {object[]} regressions - Regression entries from compareRuns.
 * @returns {'critical'|'block'|'warning'|'ok'}
 */
function determineSeverity(current, regressions) {
    if (current.hitRate < 80 || current.zeroResultRate > 10) {
        return 'critical';
    }
    if (regressions.length === 0) {
        return 'ok';
    }
    const worstDrop = regressions.reduce((maxDrop, regression) => {
        const dropPoints = regressionDropPoints(regression);
        return dropPoints > maxDrop ? dropPoints : maxDrop;
    }, 0);
    if (worstDrop > 5) {
        return 'block';
    }
    if (worstDrop > 2) {
        return 'warning';
    }
    return 'ok';
}
256
+ // =============================================================================
257
+ // formatComparison — human-readable comparison report
258
+ // =============================================================================
259
// Metrics whose values are percentages and are printed with a trailing '%'.
const PERCENTAGE_METRICS = new Set(['hitRate', 'firstResultAccuracy', 'recall', 'zeroResultRate']);
/**
 * Format a metric value for the comparison report.
 *
 * MRR and NDCG are printed with three decimals; percentage metrics with one
 * decimal and a '%' suffix; everything else with one decimal.
 *
 * @param {string} metricKey - Metric name.
 * @param {number} value - Value to format.
 * @returns {string} Formatted value.
 */
function formatMetricValue(metricKey, value) {
    const isRankingMetric = metricKey === 'meanReciprocalRank'
        || metricKey === 'normalizedDiscountedCumulativeGain';
    if (isRankingMetric) {
        return value.toFixed(3);
    }
    const oneDecimal = value.toFixed(1);
    return PERCENTAGE_METRICS.has(metricKey) ? `${oneDecimal}%` : oneDecimal;
}
267
/**
 * Render a run comparison as a multi-line text report.
 *
 * Sections for improvements, regressions and unchanged metrics are included
 * only when they have entries. Changes are shown as "previous → current"
 * with the absolute difference, prefixed '+' for improvements and '-' for
 * regressions.
 *
 * @param {{severity: string, improvements: object[], regressions: object[], unchanged: object[]}} comparison
 * @returns {string} The report, lines joined with '\n'.
 */
export function formatComparison(comparison) {
    const divider = '='.repeat(60);
    const lines = [
        divider,
        `Run Comparison — severity: ${comparison.severity}`,
        divider,
        '',
    ];
    // Shared renderer for the two "changed" sections; they differ only in title and sign.
    const appendChangeSection = (title, metricDiffs, sign) => {
        if (metricDiffs.length === 0) {
            return;
        }
        lines.push(title);
        for (const { metric, previous, current, diff } of metricDiffs) {
            const previousFormatted = formatMetricValue(metric, previous);
            const currentFormatted = formatMetricValue(metric, current);
            const diffFormatted = formatMetricValue(metric, Math.abs(diff));
            lines.push(` ${metric}: ${previousFormatted} → ${currentFormatted} (${sign}${diffFormatted})`);
        }
        lines.push('');
    };
    appendChangeSection('IMPROVEMENTS:', comparison.improvements, '+');
    appendChangeSection('REGRESSIONS:', comparison.regressions, '-');
    if (comparison.unchanged.length > 0) {
        lines.push('UNCHANGED:');
        for (const { metric, current } of comparison.unchanged) {
            lines.push(` ${metric}: ${formatMetricValue(metric, current)}`);
        }
        lines.push('');
    }
    lines.push(divider);
    return lines.join('\n');
}
@@ -0,0 +1,23 @@
1
+ import { writeFileSync, existsSync, readFileSync, mkdirSync, chmodSync } from 'fs';
2
+ import { dirname } from 'path';
3
/**
 * Write a note's content to disk at the specified file_path.
 *
 * Parent directories are created as needed. The write is skipped when the
 * file already holds exactly this content, but the requested permissions
 * are applied in both cases, so a permissions-only change still takes
 * effect on an existing note.
 *
 * @param {string} content - Note content to write.
 * @param {string} filePath - Destination path.
 * @param {string} [permissions] - Octal permission string such as '600';
 *   defaults to 0o644 when omitted or empty.
 * @returns {{status: 'written'|'skipped', path: string}} Whether the content
 *   was written, and the path it refers to.
 */
export function writeNoteFile(content, filePath, permissions) {
    const mode = permissions ? parseInt(permissions, 8) : 0o644;
    // Recursive mkdir is a no-op for directories that already exist.
    mkdirSync(dirname(filePath), { recursive: true });
    const isUnchanged = existsSync(filePath) && readFileSync(filePath, 'utf-8') === content;
    if (isUnchanged) {
        // Content is identical, but the mode may differ from what is requested.
        chmodSync(filePath, mode);
        return { status: 'skipped', path: filePath };
    }
    writeFileSync(filePath, content, { mode });
    // writeFileSync's mode only applies to newly created files (and is subject
    // to the umask), so set the final mode explicitly.
    chmodSync(filePath, mode);
    return { status: 'written', path: filePath };
}