@aperdomoll90/ledger-ai 1.4.0 → 1.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +177 -221
- package/dist/commands/add.js +51 -100
- package/dist/commands/backfill.js +55 -0
- package/dist/commands/backup.js +10 -10
- package/dist/commands/check.js +21 -29
- package/dist/commands/config.js +13 -12
- package/dist/commands/delete.js +22 -17
- package/dist/commands/eval-judge.js +11 -0
- package/dist/commands/eval.js +321 -0
- package/dist/commands/export.js +8 -10
- package/dist/commands/get.js +9 -0
- package/dist/commands/hunt.js +206 -0
- package/dist/commands/ingest.js +15 -14
- package/dist/commands/init.js +18 -20
- package/dist/commands/list.js +21 -7
- package/dist/commands/migrate.js +11 -11
- package/dist/commands/onboard.js +2 -2
- package/dist/commands/pull.js +3 -2
- package/dist/commands/push.js +8 -8
- package/dist/commands/restore.js +38 -38
- package/dist/commands/show.js +13 -16
- package/dist/commands/sync.js +58 -19
- package/dist/commands/tag.js +20 -14
- package/dist/commands/update.js +50 -18
- package/dist/commands/wizard.js +3 -3
- package/dist/lib/ai-search.js +163 -0
- package/dist/lib/audit.js +19 -0
- package/dist/lib/backfill.js +60 -0
- package/dist/lib/config.js +19 -2
- package/dist/lib/document-classification.js +5 -0
- package/dist/lib/document-fetching.js +77 -0
- package/dist/lib/document-operations.js +150 -0
- package/dist/lib/documents/classification.js +5 -0
- package/dist/lib/documents/fetching.js +89 -0
- package/dist/lib/documents/operations.js +304 -0
- package/dist/lib/domains.js +116 -0
- package/dist/lib/embeddings.js +190 -0
- package/dist/lib/errors.js +3 -1
- package/dist/lib/eval/eval-advanced.js +289 -0
- package/dist/lib/eval/eval-judge-session.js +233 -0
- package/dist/lib/eval/eval-store.js +105 -0
- package/dist/lib/eval/eval.js +303 -0
- package/dist/lib/file-writer.js +23 -0
- package/dist/lib/generators.js +44 -45
- package/dist/lib/hunter-db.js +235 -0
- package/dist/lib/hunter-rss.js +30 -0
- package/dist/lib/hunter-scoring.js +55 -0
- package/dist/lib/hunter-types.js +36 -0
- package/dist/lib/lint-configs.js +20 -0
- package/dist/lib/migrate.js +2 -2
- package/dist/lib/notes.js +173 -59
- package/dist/lib/observability.js +296 -0
- package/dist/lib/op-add-note-types.test.js +7 -6
- package/dist/lib/prompt.js +8 -8
- package/dist/lib/rate-limiter.js +103 -0
- package/dist/lib/search/ai-search.js +396 -0
- package/dist/lib/search/chunk-context-enrichment.js +155 -0
- package/dist/lib/search/embeddings.js +293 -0
- package/dist/lib/search/reranker.js +120 -0
- package/dist/lib/search/semantic-cache.js +53 -0
- package/dist/lib/type-registry.test.js +6 -6
- package/dist/mcp-server.js +553 -66
- package/dist/migrations/migrations/005-audit-log.sql +22 -0
- package/dist/migrations/migrations/005_opportunities.sql +48 -0
- package/dist/migrations/migrations/006-audited-operations.sql +235 -0
- package/dist/migrations/migrations/006_hunt_analytics.sql +38 -0
- package/dist/migrations/migrations/007-eval-golden-judgments.sql +119 -0
- package/dist/migrations/migrations/008-drop-expected-doc-ids.sql +9 -0
- package/dist/migrations/migrations/008-judge-helpers.sql +21 -0
- package/dist/migrations/migrations/009-semantic-cache.sql +216 -0
- package/dist/scripts/batch-grade.js +344 -0
- package/dist/scripts/benchmark-ingestion.js +376 -0
- package/dist/scripts/convert-judgments-to-graded.js +88 -0
- package/dist/scripts/diagnose-first-result.js +333 -0
- package/dist/scripts/drop-golden-query.js +53 -0
- package/dist/scripts/eval-search.js +115 -0
- package/dist/scripts/grade-unjudged-top1.js +138 -0
- package/dist/scripts/hunter-analytics.js +38 -0
- package/dist/scripts/hunter-cron.js +63 -0
- package/dist/scripts/hunter-purge.js +25 -0
- package/dist/scripts/migrate-v2.js +140 -0
- package/dist/scripts/reindex.js +74 -0
- package/dist/scripts/sync-local-docs.js +153 -0
- package/package.json +7 -1
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
// eval-judge-session.ts
|
|
2
|
+
// Session state, input parsing, progress rendering, and durable writes for
|
|
3
|
+
// the `ledger eval:judge` rejudging walkthrough.
|
|
4
|
+
import { createInterface } from 'node:readline';
|
|
5
|
+
import { searchHybrid } from '../search/ai-search.js';
|
|
6
|
+
import { CURRENT_SEARCH_CONFIG } from './eval-store.js';
|
|
7
|
+
// =============================================================================
|
|
8
|
+
// Pure helpers (unit-tested)
|
|
9
|
+
// =============================================================================
|
|
10
|
+
/**
 * Interpret one line typed at the grading prompt.
 *
 * Leading and trailing whitespace is ignored; matching is case-sensitive.
 *
 * @param {string} rawInput - Raw text entered by the judge.
 * @returns {{kind: string, value?: number, raw?: string}} A tagged command:
 *   `grade` (with numeric `value` 0-3), `skip`, `back`, `note`, `rubric`,
 *   `quit`, or `invalid` (with the trimmed text in `raw`).
 */
export function parseGradeInput(rawInput) {
    const trimmed = rawInput.trim();
    switch (trimmed) {
        case '0':
        case '1':
        case '2':
        case '3':
            return { kind: 'grade', value: parseInt(trimmed, 10) };
        case 's':
            return { kind: 'skip' };
        case 'b':
            return { kind: 'back' };
        case 'n':
            return { kind: 'note' };
        case '?':
            return { kind: 'rubric' };
        case 'q':
            return { kind: 'quit' };
        default:
            return { kind: 'invalid', raw: trimmed };
    }
}
|
|
27
|
+
/**
 * Find the first candidate that has not been graded yet.
 *
 * @param {Iterable<{graded: boolean}>} candidates - Candidates in display order.
 * @returns {object|null} The first entry whose `graded` flag is falsy, or
 *   `null` when every entry is graded (or the input is empty).
 */
export function pickNextUngraded(candidates) {
    let firstUngraded = null;
    for (const entry of candidates) {
        if (entry.graded) {
            continue;
        }
        firstUngraded = entry;
        break;
    }
    return firstUngraded;
}
|
|
34
|
+
/**
 * Render the one-line progress summary printed before each query.
 *
 * @param {{queriesComplete: number, queriesTotal: number, judgmentsTotal: number}} progress
 *   Counters as returned by the progress query.
 * @returns {string} Summary with the completed share rounded to a whole
 *   percent; the percentage is 0 when there are no queries at all.
 */
export function formatProgressLine(progress) {
    let percentage = 0;
    if (progress.queriesTotal > 0) {
        const completedShare = progress.queriesComplete / progress.queriesTotal;
        percentage = Math.round(completedShare * 100);
    }
    return `Progress: ${progress.queriesComplete} / ${progress.queriesTotal} queries complete (${percentage}%). Judgments: ${progress.judgmentsTotal}.`;
}
|
|
40
|
+
// =============================================================================
|
|
41
|
+
// Rubric
|
|
42
|
+
// =============================================================================
|
|
43
|
+
const RUBRIC_TEXT = `
|
|
44
|
+
TREC 4-level grading rubric:
|
|
45
|
+
|
|
46
|
+
0 NOT RELEVANT No useful info for this query. Wrong topic.
|
|
47
|
+
1 RELATED Touches the topic but doesn't answer.
|
|
48
|
+
2 RELEVANT Answers the query, but not the ideal/canonical source.
|
|
49
|
+
3 HIGHLY RELEVANT The canonical, complete answer.
|
|
50
|
+
|
|
51
|
+
Boundary heuristics:
|
|
52
|
+
1 vs 2: "Would a user be happy if this was the top result?" Yes = 2, No = 1.
|
|
53
|
+
2 vs 3: "Is there a better doc for this query that I know exists?" Yes = 2, No = 3.
|
|
54
|
+
`;
|
|
55
|
+
// =============================================================================
|
|
56
|
+
// Database I/O
|
|
57
|
+
// =============================================================================
|
|
58
|
+
/**
 * Load the golden query with the smallest id that is >= `startId`, together
 * with the grades already recorded for it.
 *
 * Only one row is requested from the database: the caller consumes a single
 * query per loop iteration, so fetching the rest of the dataset (and every
 * embedded judgment) on each call would be wasted work.
 *
 * @param {object} supabase - Supabase client.
 * @param {number} [startId=0] - Lowest golden id to consider.
 * @returns {Promise<{id: *, query: *, tags: Array, existing_grades: Map}|null>}
 *   The next golden query, with `existing_grades` mapping document id to
 *   grade, or `null` when nothing is left or the query failed (failures are
 *   reported on stderr).
 */
async function loadNextGolden(supabase, startId = 0) {
    const { data, error } = await supabase
        .from('eval_golden_dataset')
        .select('id, query, tags, judgments:eval_golden_judgments(document_id, grade)')
        .gte('id', startId)
        .order('id', { ascending: true })
        .limit(1);
    if (error) {
        process.stderr.write(`[ledger] loadNextGolden failed: ${error.message}\n`);
        return null;
    }
    const row = data?.[0];
    if (!row) {
        return null;
    }
    const gradedMap = new Map();
    for (const judgment of row.judgments ?? []) {
        gradedMap.set(judgment.document_id, judgment.grade);
    }
    return {
        id: row.id,
        query: row.query,
        tags: row.tags ?? [],
        existing_grades: gradedMap,
    };
}
|
|
84
|
+
/**
 * Collect the counters shown in the progress line.
 *
 * The three lookups are independent, so they are issued in parallel. A failed
 * lookup is reported on stderr and its counter falls back to 0, so the
 * session keeps running with a visibly degraded progress line instead of
 * silently showing zeros.
 *
 * @param {object} supabase - Supabase client.
 * @returns {Promise<{queriesTotal: number, queriesComplete: number, judgmentsTotal: number}>}
 *   `queriesTotal` is the row count of `eval_golden_dataset`, `judgmentsTotal`
 *   the row count of `eval_golden_judgments`, and `queriesComplete` the value
 *   returned by the `count_golden_with_min_judgments` RPC with `p_min: 10`
 *   (0 when that value is not a number).
 */
async function fetchProgress(supabase) {
    const [datasetResponse, judgmentsResponse, completeResponse] = await Promise.all([
        supabase
            .from('eval_golden_dataset')
            .select('*', { count: 'exact', head: true }),
        supabase
            .from('eval_golden_judgments')
            .select('*', { count: 'exact', head: true }),
        supabase
            .rpc('count_golden_with_min_judgments', { p_min: 10 }),
    ]);
    const labelledResponses = [
        ['eval_golden_dataset count', datasetResponse],
        ['eval_golden_judgments count', judgmentsResponse],
        ['count_golden_with_min_judgments', completeResponse],
    ];
    for (const [label, response] of labelledResponses) {
        if (response.error) {
            process.stderr.write(`[ledger] fetchProgress ${label} failed: ${response.error.message}\n`);
        }
    }
    const rpcData = completeResponse.data;
    const queriesComplete = typeof rpcData === 'number' ? rpcData : 0;
    return {
        queriesTotal: datasetResponse.count ?? 0,
        queriesComplete,
        judgmentsTotal: judgmentsResponse.count ?? 0,
    };
}
|
|
100
|
+
// =============================================================================
|
|
101
|
+
// Prompt helper
|
|
102
|
+
// =============================================================================
|
|
103
|
+
/**
 * Promise adapter around the callback-style `readline.question`.
 *
 * @param {{question: Function}} readline - Readline interface (or any object
 *   exposing a compatible `question(text, callback)` method).
 * @param {string} question - Prompt text to display.
 * @returns {Promise<string>} Resolves with the answer passed to the callback.
 */
function promptUser(readline, question) {
    return new Promise((resolve) => {
        const deliverAnswer = (answer) => {
            resolve(answer);
        };
        readline.question(question, deliverAnswer);
    });
}
|
|
108
|
+
/**
 * Produce a short single-line preview of a document body.
 *
 * @param {string|null|undefined} content - Full document text.
 * @param {number} [maxChars=200] - Maximum length of the preview.
 * @returns {string} `content` with every whitespace run collapsed to one
 *   space and cut to `maxChars` characters; `''` for empty or missing content.
 */
function snippet(content, maxChars = 200) {
    if (content) {
        const singleSpaced = content.replace(/\s+/g, ' ');
        return singleSpaced.slice(0, maxChars);
    }
    return '';
}
|
|
113
|
+
// =============================================================================
|
|
114
|
+
// Interactive session
|
|
115
|
+
// =============================================================================
|
|
116
|
+
/**
 * Run the interactive `ledger eval:judge` grading walkthrough on stdin/stdout.
 *
 * Loop, until no golden query is left or the judge quits:
 *   1. print the progress line;
 *   2. load the next golden query at or after the current id;
 *   3. run hybrid search for it and keep the top 10 results;
 *   4. prompt for a grade on every result that has no judgment yet.
 * Each grade is written immediately through the `judgment_create` RPC; when
 * that call fails, `judgment_update` is tried instead (the same candidate can
 * be reached again with "back", in which case a judgment already exists).
 *
 * @param {object} clients - Client bundle. `clients.supabase` is used for all
 *   database calls; the whole object is handed to `searchHybrid`.
 * @param {number} [startGoldenId] - Golden id to start from; defaults to 0.
 * @param {string} [judgedBy='adrian'] - Judge identity sent as `p_judged_by`
 *   on create. The default is the value that used to be hard-coded.
 * @returns {Promise<void>} Resolves when the session ends. The readline
 *   interface is closed on every exit path, including thrown errors.
 */
export async function runJudgeSession(clients, startGoldenId, judgedBy = 'adrian') {
    const supabase = clients.supabase;
    const readline = createInterface({ input: process.stdin, output: process.stdout });
    try {
        let currentId = startGoldenId ?? 0;
        while (true) {
            const progress = await fetchProgress(supabase);
            console.log('');
            console.log(formatProgressLine(progress));
            console.log('');
            const golden = await loadNextGolden(supabase, currentId);
            if (!golden) {
                console.log('No more queries to judge. Done.');
                return;
            }
            // Run search for this query
            const searchResults = await searchHybrid(clients, {
                query: golden.query,
                limit: CURRENT_SEARCH_CONFIG.limit,
                reranker: CURRENT_SEARCH_CONFIG.reranker,
            });
            // Build candidate list from top 10
            const candidates = searchResults.slice(0, 10).map(result => ({
                id: result.id,
                name: result.name ?? '<unknown>',
                score: result.score ?? result.similarity ?? 0,
                content: snippet(result.content),
                graded: golden.existing_grades.has(result.id),
            }));
            const ungradedList = candidates.filter(candidate => !candidate.graded);
            if (ungradedList.length === 0) {
                // This query is fully graded. Advance.
                currentId = golden.id + 1;
                continue;
            }
            // Print header
            console.log('='.repeat(60));
            console.log(`Query #${golden.id}: "${golden.query}"`);
            if (golden.tags.length > 0) {
                console.log(`Tags: ${golden.tags.join(', ')}`);
            }
            if (golden.existing_grades.size > 0) {
                console.log('Already graded:');
                for (const [documentId, grade] of golden.existing_grades.entries()) {
                    console.log(` #${documentId} -> ${grade}`);
                }
            }
            console.log('='.repeat(60));
            // A note applies to the next grade only; it is dropped on skip/back.
            let pendingNote = null;
            let candidateIndex = 0;
            while (candidateIndex < ungradedList.length) {
                const candidate = ungradedList[candidateIndex];
                console.log('');
                console.log(`[${candidateIndex + 1}/${ungradedList.length}] #${candidate.id} ${candidate.name} (score ${candidate.score.toFixed(3)})`);
                console.log(`"${candidate.content}..."`);
                const answer = await promptUser(readline, 'Grade [0/1/2/3] s=skip b=back n=notes ?=rubric q=save & quit: ');
                const parsed = parseGradeInput(answer);
                if (parsed.kind === 'invalid') {
                    console.log(`(unrecognized input "${parsed.raw}". Press ? for rubric.)`);
                    continue;
                }
                if (parsed.kind === 'rubric') {
                    console.log(RUBRIC_TEXT);
                    continue;
                }
                if (parsed.kind === 'note') {
                    pendingNote = await promptUser(readline, 'Note: ');
                    continue;
                }
                if (parsed.kind === 'quit') {
                    // Grades are written as they are entered, so nothing is pending here.
                    console.log('Saving and exiting.');
                    return;
                }
                if (parsed.kind === 'skip') {
                    candidateIndex++;
                    pendingNote = null;
                    continue;
                }
                if (parsed.kind === 'back') {
                    if (candidateIndex > 0) {
                        candidateIndex--;
                    }
                    pendingNote = null;
                    continue;
                }
                // Grade: durable write via RPC
                const { error: rpcError } = await supabase.rpc('judgment_create', {
                    p_golden_id: golden.id,
                    p_document_id: candidate.id,
                    p_grade: parsed.value,
                    p_judged_by: judgedBy,
                    p_notes: pendingNote,
                });
                if (rpcError) {
                    // Duplicate? Try update instead.
                    const { error: updateError } = await supabase.rpc('judgment_update', {
                        p_golden_id: golden.id,
                        p_document_id: candidate.id,
                        p_grade: parsed.value,
                        p_notes: pendingNote,
                    });
                    if (updateError) {
                        // Report both failures: the create error is usually the root cause.
                        console.error(` [ERR] Could not save grade: ${updateError.message} (create failed: ${rpcError.message})`);
                        // Stay on this candidate (note kept) so the judge can retry.
                        continue;
                    }
                }
                pendingNote = null;
                candidateIndex++;
            }
            console.log(`Query #${golden.id} complete.`);
            currentId = golden.id + 1;
        }
    }
    finally {
        readline.close();
    }
}
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
// eval-store.ts
|
|
2
|
+
// Persistence layer for eval runs — save results to eval_runs table and load previous runs.
|
|
3
|
+
/**
 * Current search configuration. Saved with each eval run for reproducibility.
 * Single source of truth — used by both the eval script and CLI command.
 * Update this when search parameters change (threshold, model, reranker, etc.).
 *
 * The object is exported mutable; nothing in this file modifies it.
 */
export const CURRENT_SEARCH_CONFIG = {
    threshold: 0.38,
    reciprocalRankFusionK: 60,
    embedding_model: 'openai/text-embedding-3-small',
    // `limit` and `reranker` are the two fields the judge session passes to searchHybrid.
    limit: 10,
    chunking: 'recursive',
    chunk_max_size: 1000,
    chunk_overlap: 200,
    context_enrichment: true,
    context_enrichment_model: 'gpt-4o-mini',
    reranker: 'none',
    // NOTE(review): presumably mirrors HIT_THRESHOLD and the gain function in eval.js — keep in sync by hand.
    hit_threshold: 2,
    ndcg_gain_formula: '2^g - 1',
};
|
|
22
|
+
// =============================================================================
|
|
23
|
+
// Save
|
|
24
|
+
// =============================================================================
|
|
25
|
+
/**
 * Persist one evaluation run as a row in `eval_runs`.
 *
 * @param {object} supabase - Supabase client.
 * @param {object} props - Run payload.
 * @param {object} props.metrics - Aggregate metrics (includes `missed` and `tagStats`).
 * @param {object} props.config - Search configuration used for the run.
 * @param {Array<object>} props.results - Per-query scored results.
 * @param {object} [props.confidenceIntervals] - Stored as `null` when absent.
 * @param {object} [props.scoreCalibration] - Stored as `null` when absent.
 * @param {object} [props.coverageAnalysis] - Stored as `null` when absent.
 * @returns {Promise<*>} The `id` of the inserted row.
 * @throws {Error} When the insert fails; the message carries the case count
 *   and the database error text.
 */
export async function saveEvalRun(supabase, props) {
    const { metrics, config, results, confidenceIntervals, scoreCalibration, coverageAnalysis } = props;

    // Compact record for a query that returned no sufficiently graded document.
    const toMissedEntry = (missedResult) => {
        const { testCase } = missedResult;
        return {
            query: testCase.query,
            tags: testCase.tags,
            judgments: testCase.judgments,
            got: missedResult.returnedIds,
            gotScores: missedResult.returnedScores,
        };
    };

    // Full per-query record: test case fields followed by every score.
    const toPerQueryEntry = (testResult) => {
        const { testCase } = testResult;
        return {
            query: testCase.query,
            tags: testCase.tags,
            judgments: testCase.judgments,
            hit: testResult.hit,
            firstResultHit: testResult.firstResultHit,
            position: testResult.position,
            expectedFound: testResult.expectedFound,
            expectedTotal: testResult.expectedTotal,
            responseTimeMs: testResult.responseTimeMs,
            reciprocalRank: testResult.reciprocalRank,
            normalizedDiscountedCumulativeGain: testResult.normalizedDiscountedCumulativeGain,
            returnedIds: testResult.returnedIds,
            returnedScores: testResult.returnedScores,
        };
    };

    const row = {
        config,
        test_case_count: metrics.totalCases,
        hit_rate: metrics.hitRate,
        first_result_accuracy: metrics.firstResultAccuracy,
        recall: metrics.recall,
        zero_result_rate: metrics.zeroResultRate,
        avg_response_time_ms: metrics.avgResponseTimeMs,
        mean_reciprocal_rank: metrics.meanReciprocalRank,
        normalized_discounted_cumulative_gain: metrics.normalizedDiscountedCumulativeGain,
        confidence_intervals: confidenceIntervals ?? null,
        score_calibration: scoreCalibration ?? null,
        coverage_analysis: coverageAnalysis ?? null,
        results_by_tag: metrics.tagStats,
        missed_queries: metrics.missed.map(toMissedEntry),
        per_query_results: results.map(toPerQueryEntry),
    };

    const insertion = supabase
        .from('eval_runs')
        .insert(row)
        .select('id')
        .single();
    const { data, error } = await insertion;
    if (error) {
        throw new Error(`Failed to save eval run (${props.metrics.totalCases} cases): ${error.message}`);
    }
    return data.id;
}
|
|
74
|
+
// =============================================================================
|
|
75
|
+
// Load
|
|
76
|
+
// =============================================================================
|
|
77
|
+
/**
 * Load the most recent row of `eval_runs`, ordered by `run_date` descending.
 *
 * @param {object} supabase - Supabase client.
 * @returns {Promise<object|null>} The latest run row, or `null` when the
 *   lookup returns an error. Error code `PGRST116` is treated as "no run
 *   yet" and stays silent; any other error is reported on stderr.
 */
export async function loadPreviousRun(supabase) {
    const latestRunQuery = supabase
        .from('eval_runs')
        .select('*')
        .order('run_date', { ascending: false })
        .limit(1)
        .single();
    const { data, error } = await latestRunQuery;
    if (!error) {
        return data;
    }
    const isEmptyResult = error.code === 'PGRST116';
    if (!isEmptyResult) {
        process.stderr.write(`[ledger] loadPreviousRun failed: ${error.message}\n`);
    }
    return null;
}
|
|
92
|
+
/**
 * Load a single `eval_runs` row by id.
 *
 * @param {object} supabase - Supabase client.
 * @param {*} runId - Value matched against the `id` column.
 * @returns {Promise<object|null>} The matching row, or `null` when the lookup
 *   returns an error. Error code `PGRST116` is treated as "not found" and
 *   stays silent; any other error is reported on stderr.
 */
export async function loadEvalRun(supabase, runId) {
    const runQuery = supabase
        .from('eval_runs')
        .select('*')
        .eq('id', runId)
        .single();
    const { data, error } = await runQuery;
    if (!error) {
        return data;
    }
    const isNotFound = error.code === 'PGRST116';
    if (!isNotFound) {
        process.stderr.write(`[ledger] loadEvalRun(${runId}) failed: ${error.message}\n`);
    }
    return null;
}
|
|
@@ -0,0 +1,303 @@
|
|
|
1
|
+
// eval.ts
|
|
2
|
+
// Types and metric computation for search evaluation.
|
|
3
|
+
// Pure functions — no I/O, no database calls.
|
|
4
|
+
/**
 * Minimum grade for a returned document to count as a "good" result.
 *
 * Rate metrics (hit rate, first-result accuracy, recall, MRR) count a result
 * as "good" when its grade is at or above this threshold. Only NDCG uses the
 * full 2^g - 1 gain function across all grades.
 *
 * A test case with no judgment at or above this grade is scored as
 * out-of-scope.
 */
export const HIT_THRESHOLD = 2;
|
|
10
|
+
// =============================================================================
|
|
11
|
+
// NDCG@k helper — graded (2^g - 1 gain, TREC 4-level)
|
|
12
|
+
// =============================================================================
|
|
13
|
+
/**
 * Gain contributed by a document of the given grade: 2^grade - 1
 * (0 -> 0, 1 -> 1, 2 -> 3, 3 -> 7).
 *
 * @param {number} grade - Relevance grade.
 * @returns {number} Exponential gain.
 */
function gradeGain(grade) {
    return 2 ** grade - 1;
}

/**
 * Graded NDCG of a returned ranking against the judged grades.
 *
 * DCG sums gain / log2(position + 2) over the returned order (position is
 * 0-based). The ideal DCG uses every judged grade >= 1 sorted descending, so
 * the score is measured against all judged relevant documents and not only
 * the ones that were returned.
 *
 * A document id that appears more than once in `returnedIds` earns gain only
 * at its first position. Crediting repeats would let DCG exceed the ideal DCG
 * and produce a score above 1.0.
 *
 * @param {Array} returnedIds - Document ids in ranked order.
 * @param {Map<*, number>} gradeByDocId - Judged grade per document id;
 *   unjudged ids count as grade 0.
 * @returns {number} NDCG, or 0 when no judged grade is >= 1.
 */
function computeNormalizedDiscountedCumulativeGain(returnedIds, gradeByDocId) {
    // Ideal ordering: every non-zero grade, best first.
    const idealGrades = [...gradeByDocId.values()]
        .filter(grade => grade >= 1)
        .sort((gradeA, gradeB) => gradeB - gradeA);
    if (idealGrades.length === 0) {
        return 0;
    }
    const discountAt = (position) => Math.log2(position + 2);

    // DCG against the returned order, each document credited once.
    const creditedIds = new Set();
    let discountedCumulativeGain = 0;
    returnedIds.forEach((docId, position) => {
        if (creditedIds.has(docId)) {
            return;
        }
        creditedIds.add(docId);
        const grade = gradeByDocId.get(docId) ?? 0;
        discountedCumulativeGain += gradeGain(grade) / discountAt(position);
    });

    const idealDiscountedCumulativeGain = idealGrades.reduce(
        (sum, grade, position) => sum + gradeGain(grade) / discountAt(position),
        0,
    );
    if (idealDiscountedCumulativeGain === 0) {
        return 0;
    }
    return discountedCumulativeGain / idealDiscountedCumulativeGain;
}
|
|
41
|
+
// =============================================================================
|
|
42
|
+
// Scoring a single test case
|
|
43
|
+
// =============================================================================
|
|
44
|
+
/**
 * Score the search results returned for one golden test case.
 *
 * A test case with no judgment at grade >= HIT_THRESHOLD is out-of-scope: it
 * counts as a hit only when the search returned nothing, and its ranking
 * metrics are 0.
 *
 * For normal cases `expectedFound` counts distinct returned documents at or
 * above the threshold. Counting repeats of the same id would let it exceed
 * `expectedTotal` and push aggregate recall above 100%.
 *
 * @param {{judgments: Array<{document_id: *, grade: number}>}} testCase - Golden test case.
 * @param {Array<{id: *, score?: number, similarity?: number}>} searchResults - Results in ranked order.
 * @param {number} responseTimeMs - Measured search latency, passed through.
 * @returns {object} Scored result: `testCase`, `returnedIds`,
 *   `returnedScores`, `hit`, `firstResultHit`, `expectedFound`,
 *   `expectedTotal`, `position` (0-based index of the first hit or `null`),
 *   `responseTimeMs`, `reciprocalRank`, `normalizedDiscountedCumulativeGain`.
 */
export function scoreTestCase(testCase, searchResults, responseTimeMs) {
    const returnedIds = searchResults.map(result => result.id);
    const returnedScores = searchResults.map(result => result.score ?? result.similarity ?? 0);

    // Grade lookup. A document without a judgment is treated as grade 0.
    const gradeByDocId = new Map(
        testCase.judgments.map(judgment => [judgment.document_id, judgment.grade]),
    );
    const isRelevantDoc = (docId) => (gradeByDocId.get(docId) ?? 0) >= HIT_THRESHOLD;

    const relevantJudgments = testCase.judgments.filter(judgment => judgment.grade >= HIT_THRESHOLD);
    if (relevantJudgments.length === 0) {
        // Out-of-scope: the correct behaviour is to return nothing.
        const returnedNothing = searchResults.length === 0;
        return {
            testCase,
            returnedIds,
            returnedScores,
            hit: returnedNothing,
            firstResultHit: returnedNothing,
            expectedFound: 0,
            expectedTotal: 0,
            position: null,
            responseTimeMs,
            reciprocalRank: 0,
            normalizedDiscountedCumulativeGain: 0,
        };
    }

    // First returned doc with grade >= HIT_THRESHOLD
    const firstRelevantIndex = returnedIds.findIndex(isRelevantDoc);
    const firstHitPosition = firstRelevantIndex === -1 ? null : firstRelevantIndex;
    const firstResultHit = returnedIds.length > 0 && isRelevantDoc(returnedIds[0]);
    // Distinct ids only, so a repeated document is not counted twice.
    const foundCount = new Set(returnedIds.filter(isRelevantDoc)).size;

    return {
        testCase,
        returnedIds,
        returnedScores,
        hit: firstHitPosition !== null,
        firstResultHit,
        expectedFound: foundCount,
        expectedTotal: relevantJudgments.length,
        position: firstHitPosition,
        responseTimeMs,
        reciprocalRank: firstHitPosition !== null ? 1 / (firstHitPosition + 1) : 0,
        normalizedDiscountedCumulativeGain: computeNormalizedDiscountedCumulativeGain(returnedIds, gradeByDocId),
    };
}
|
|
94
|
+
// =============================================================================
|
|
95
|
+
// Aggregate metrics from scored results
|
|
96
|
+
// =============================================================================
|
|
97
|
+
// A result is "normal" (in scope) when its test case has at least one
// judgment at or above HIT_THRESHOLD; otherwise it is out-of-scope.
const hasRelevantJudgment = (result) => result.testCase.judgments.some(judgment => judgment.grade >= HIT_THRESHOLD);

/**
 * Aggregate per-query scored results into run-level metrics.
 *
 * Rate metrics are computed over normal (in-scope) results only and are
 * expressed as percentages (0-100). MRR and NDCG are plain means on a 0-1
 * scale. Average response time covers every result, including out-of-scope
 * ones. Every ratio with an empty denominator is reported as 0.
 *
 * Per-tag counters are accumulated in a Map and converted to a plain object
 * at the end. Indexing a plain `{}` by tag name would resolve a tag such as
 * "constructor" or "__proto__" to an inherited property, corrupting the
 * stats and mutating a built-in object.
 *
 * @param {Array<object>} results - Scored results as produced per test case.
 * @returns {object} Counts (`totalCases`, `normalCases`, `outOfScopeCases`,
 *   `hits`, `firstResultHits`, `totalExpected`, `totalFound`, `zeroResults`,
 *   `outOfScopeCorrect`), rates (`hitRate`, `firstResultAccuracy`, `recall`,
 *   `zeroResultRate`, `outOfScopeAccuracy`), `avgResponseTimeMs`,
 *   `meanReciprocalRank`, `normalizedDiscountedCumulativeGain`, `tagStats`
 *   (tag -> `{total, hits, firstHits}`), and `missed` (normal results without a hit).
 */
export function computeMetrics(results) {
    const normalResults = results.filter(result => hasRelevantJudgment(result));
    const outOfScopeResults = results.filter(result => !hasRelevantJudgment(result));

    const countWhere = (items, predicate) => items.filter(predicate).length;
    const sumOf = (items, pick) => items.reduce((sum, item) => sum + pick(item), 0);
    const percentOf = (part, whole) => (whole > 0 ? (part / whole) * 100 : 0);
    const meanOf = (items, pick) => (items.length > 0 ? sumOf(items, pick) / items.length : 0);

    const totalNormal = normalResults.length;
    const hits = countWhere(normalResults, result => result.hit);
    const firstResultHits = countWhere(normalResults, result => result.firstResultHit);
    const totalExpected = sumOf(normalResults, result => result.expectedTotal);
    const totalFound = sumOf(normalResults, result => result.expectedFound);
    const zeroResults = countWhere(normalResults, result => result.returnedIds.length === 0);
    const outOfScopeCorrect = countWhere(outOfScopeResults, result => result.hit);

    const statsByTag = new Map();
    for (const result of normalResults) {
        for (const tag of result.testCase.tags) {
            // String keys match how a plain object would have coerced the tag.
            const tagKey = String(tag);
            let stats = statsByTag.get(tagKey);
            if (!stats) {
                stats = { total: 0, hits: 0, firstHits: 0 };
                statsByTag.set(tagKey, stats);
            }
            stats.total++;
            if (result.hit) {
                stats.hits++;
            }
            if (result.firstResultHit) {
                stats.firstHits++;
            }
        }
    }
    // fromEntries defines own data properties, so even "__proto__" is stored as a normal key.
    const tagStats = Object.fromEntries(statsByTag);

    return {
        totalCases: results.length,
        normalCases: totalNormal,
        outOfScopeCases: outOfScopeResults.length,
        hits,
        firstResultHits,
        totalExpected,
        totalFound,
        zeroResults,
        outOfScopeCorrect,
        avgResponseTimeMs: meanOf(results, result => result.responseTimeMs),
        hitRate: percentOf(hits, totalNormal),
        firstResultAccuracy: percentOf(firstResultHits, totalNormal),
        recall: percentOf(totalFound, totalExpected),
        zeroResultRate: percentOf(zeroResults, totalNormal),
        outOfScopeAccuracy: percentOf(outOfScopeCorrect, outOfScopeResults.length),
        meanReciprocalRank: meanOf(normalResults, result => result.reciprocalRank),
        normalizedDiscountedCumulativeGain: meanOf(normalResults, result => result.normalizedDiscountedCumulativeGain),
        tagStats,
        missed: normalResults.filter(result => !result.hit),
    };
}
|
|
149
|
+
// =============================================================================
|
|
150
|
+
// Format report as string (no console.log — caller decides output)
|
|
151
|
+
// =============================================================================
|
|
152
|
+
/**
 * Render aggregate metrics as a multi-line text report.
 *
 * Sections, in order: banner, case counts, metric lines, the missed queries
 * (only when there are any; at most the first five returned ids are shown per
 * query), and a per-tag breakdown sorted by query count, largest first.
 * Nothing is printed; the caller decides where the text goes.
 *
 * @param {object} metrics - Aggregate metrics for one run.
 * @returns {string} Report lines joined with "\n".
 */
export function formatReport(metrics) {
    const rule = '='.repeat(60);

    const summarySection = [
        rule,
        `Results — hit_threshold=${HIT_THRESHOLD}, ndcg_gain=2^g-1`,
        rule,
        '',
        `Test cases: ${metrics.totalCases} total (${metrics.normalCases} normal, ${metrics.outOfScopeCases} out-of-scope)`,
        '',
        'METRICS:',
        ` Hit rate: ${metrics.hitRate.toFixed(1)}% (${metrics.hits}/${metrics.normalCases} queries found at least one expected doc)`,
        ` First-result accuracy: ${metrics.firstResultAccuracy.toFixed(1)}% (${metrics.firstResultHits}/${metrics.normalCases} queries had correct #1 result)`,
        ` Recall: ${metrics.recall.toFixed(1)}% (${metrics.totalFound}/${metrics.totalExpected} expected docs found across all queries)`,
        ` Zero-result rate: ${metrics.zeroResultRate.toFixed(1)}% (${metrics.zeroResults}/${metrics.normalCases} queries returned nothing)`,
        ` Out-of-scope accuracy: ${metrics.outOfScopeAccuracy.toFixed(1)}% (${metrics.outOfScopeCorrect}/${metrics.outOfScopeCases} correctly returned nothing)`,
        ` Avg response time: ${metrics.avgResponseTimeMs.toFixed(0)}ms`,
        ` MRR: ${metrics.meanReciprocalRank.toFixed(3)} (1.0 = perfect ranking, 0.5 = avg position 2)`,
        ` NDCG@k: ${metrics.normalizedDiscountedCumulativeGain.toFixed(3)} (1.0 = perfect ranking of all relevant docs)`,
        '',
    ];

    const missedSection = [];
    if (metrics.missed.length > 0) {
        missedSection.push(`MISSED QUERIES (no returned doc at grade >= ${HIT_THRESHOLD}):`);
        metrics.missed.forEach((miss) => {
            const relevantDocs = [];
            for (const judgment of miss.testCase.judgments) {
                if (judgment.grade >= HIT_THRESHOLD) {
                    relevantDocs.push(`${judgment.document_id}(g${judgment.grade})`);
                }
            }
            missedSection.push(` "${miss.testCase.query}" — relevant [${relevantDocs.join(', ')}], got [${miss.returnedIds.slice(0, 5).join(', ')}]`);
        });
        missedSection.push('');
    }

    const byQueryCountDescending = (entryA, entryB) => entryB[1].total - entryA[1].total;
    const tagLines = Object.entries(metrics.tagStats)
        .sort(byQueryCountDescending)
        .map(([tag, stats]) => {
            const hitPercentage = ((stats.hits / stats.total) * 100).toFixed(0);
            const firstResultPercentage = ((stats.firstHits / stats.total) * 100).toFixed(0);
            return ` ${tag}: ${hitPercentage}% hit rate, ${firstResultPercentage}% first-result (${stats.total} queries)`;
        });

    return [...summarySection, ...missedSection, 'BY TAG:', ...tagLines, '', rule].join('\n');
}
|
|
191
|
+
// =============================================================================
|
|
192
|
+
// compareRuns — diff two eval runs and detect regressions
|
|
193
|
+
// =============================================================================
|
|
194
|
+
// Metrics where a lower value is better.
const INVERTED_METRICS = new Set(['zeroResultRate', 'avgResponseTimeMs']);
// Absolute differences below this are reported as unchanged.
const UNCHANGED_THRESHOLD = 0.01;
// Metrics reported on a 0-1 scale; all other quality metrics are percentages (0-100).
const FRACTION_METRICS = new Set(['meanReciprocalRank', 'normalizedDiscountedCumulativeGain']);

/**
 * Diff the metrics of two eval runs and classify every tracked metric.
 *
 * @param {object} current - Metrics of the new run (camelCase keys).
 * @param {object} previous - Metrics of the baseline run, same keys.
 * @returns {{improvements: Array, regressions: Array, unchanged: Array, severity: string}}
 *   Each entry is `{metric, current, previous, diff}` with `diff = current -
 *   previous`. For zero-result rate and response time a decrease counts as an
 *   improvement. `severity` is one of 'critical', 'block', 'warning', 'ok'.
 */
export function compareRuns(current, previous) {
    const metricKeys = [
        'hitRate',
        'firstResultAccuracy',
        'recall',
        'zeroResultRate',
        'meanReciprocalRank',
        'normalizedDiscountedCumulativeGain',
        'avgResponseTimeMs',
    ];
    const improvements = [];
    const regressions = [];
    const unchanged = [];
    for (const metricKey of metricKeys) {
        const currentValue = current[metricKey];
        const previousValue = previous[metricKey];
        const diff = currentValue - previousValue;
        const metricDiff = {
            metric: metricKey,
            current: currentValue,
            previous: previousValue,
            diff,
        };
        if (Math.abs(diff) < UNCHANGED_THRESHOLD) {
            unchanged.push(metricDiff);
            continue;
        }
        // For normal metrics: positive diff = improvement. For inverted: negative diff = improvement.
        const isImprovement = INVERTED_METRICS.has(metricKey) ? diff < 0 : diff > 0;
        (isImprovement ? improvements : regressions).push(metricDiff);
    }
    const severity = determineSeverity(current, regressions);
    return { improvements, regressions, unchanged, severity };
}

/**
 * Express a regression on a common "points" scale so one pair of thresholds
 * can be applied to every metric.
 *
 * - Percentage metrics are already in points.
 * - 0-1 metrics (MRR, NDCG) are multiplied by 100.
 * - Response time is converted to a relative change in percent of the
 *   previous value, since a raw millisecond difference is not comparable to
 *   percentage points. With no usable baseline (previous <= 0) it counts as 0.
 *
 * @param {{metric: string, previous: number, diff: number}} regression
 * @returns {number} Non-negative magnitude in points.
 */
function regressionPoints(regression) {
    const magnitude = Math.abs(regression.diff);
    if (FRACTION_METRICS.has(regression.metric)) {
        return magnitude * 100;
    }
    if (regression.metric === 'avgResponseTimeMs') {
        return regression.previous > 0 ? (magnitude / regression.previous) * 100 : 0;
    }
    return magnitude;
}

/**
 * Grade a comparison.
 *
 * 'critical' depends only on the absolute state of the current run (hit rate
 * below 80% or zero-result rate above 10%). Otherwise the worst regression,
 * measured in points, decides: more than 5 is 'block', more than 2 is
 * 'warning', anything else 'ok'.
 *
 * @param {object} current - Metrics of the new run.
 * @param {Array<object>} regressions - Regressed metric diffs.
 * @returns {'critical'|'block'|'warning'|'ok'}
 */
function determineSeverity(current, regressions) {
    if (current.hitRate < 80 || current.zeroResultRate > 10) {
        return 'critical';
    }
    if (regressions.length === 0) {
        return 'ok';
    }
    const worstDrop = regressions.reduce(
        (maxDrop, regression) => Math.max(maxDrop, regressionPoints(regression)),
        0,
    );
    if (worstDrop > 5) {
        return 'block';
    }
    if (worstDrop > 2) {
        return 'warning';
    }
    return 'ok';
}
|
|
256
|
+
// =============================================================================
|
|
257
|
+
// formatComparison — human-readable comparison report
|
|
258
|
+
// =============================================================================
|
|
259
|
+
/** Metrics rendered with a trailing percent sign. */
const PERCENTAGE_METRICS = new Set(['hitRate', 'firstResultAccuracy', 'recall', 'zeroResultRate']);
/**
 * Render a single metric value for display.
 *
 * `meanReciprocalRank` and `normalizedDiscountedCumulativeGain` use three
 * decimals; percentage metrics use one decimal plus `%`; everything else uses
 * one decimal.
 *
 * @param {string} metricKey - Name of the metric the value belongs to.
 * @param {number} value - Numeric value to format.
 * @returns {string}
 */
function formatMetricValue(metricKey, value) {
    if (metricKey === 'meanReciprocalRank' || metricKey === 'normalizedDiscountedCumulativeGain') {
        return value.toFixed(3);
    }
    const oneDecimal = value.toFixed(1);
    return PERCENTAGE_METRICS.has(metricKey) ? `${oneDecimal}%` : oneDecimal;
}
/**
 * Build a human-readable report for a run comparison.
 *
 * The report has a header with the severity, followed by IMPROVEMENTS,
 * REGRESSIONS and UNCHANGED sections; empty sections are omitted. The delta in
 * parentheses carries the sign of the actual change (`current - previous`), so
 * a lower-is-better metric that improved is shown with `-` and one that
 * regressed with `+`, matching the values printed on either side of the arrow.
 *
 * @param {{severity: string, improvements: object[], regressions: object[], unchanged: object[]}} comparison
 *   Result object as returned by `compareRuns`.
 * @returns {string} Report lines joined with `\n`.
 */
export function formatComparison(comparison) {
    const rule = '='.repeat(60);
    // One line for a metric that moved: "previous → current (signed delta)".
    const describeChange = ({ metric, previous, current, diff }) => {
        const previousFormatted = formatMetricValue(metric, previous);
        const currentFormatted = formatMetricValue(metric, current);
        const sign = diff < 0 ? '-' : '+';
        const diffFormatted = formatMetricValue(metric, Math.abs(diff));
        return ` ${metric}: ${previousFormatted} → ${currentFormatted} (${sign}${diffFormatted})`;
    };
    const lines = [rule, `Run Comparison — severity: ${comparison.severity}`, rule, ''];
    if (comparison.improvements.length > 0) {
        lines.push('IMPROVEMENTS:');
        for (const metricDiff of comparison.improvements) {
            lines.push(describeChange(metricDiff));
        }
        lines.push('');
    }
    if (comparison.regressions.length > 0) {
        lines.push('REGRESSIONS:');
        for (const metricDiff of comparison.regressions) {
            lines.push(describeChange(metricDiff));
        }
        lines.push('');
    }
    if (comparison.unchanged.length > 0) {
        lines.push('UNCHANGED:');
        for (const metricDiff of comparison.unchanged) {
            lines.push(` ${metricDiff.metric}: ${formatMetricValue(metricDiff.metric, metricDiff.current)}`);
        }
        lines.push('');
    }
    lines.push(rule);
    return lines.join('\n');
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import { writeFileSync, existsSync, readFileSync, mkdirSync, chmodSync } from 'fs';
|
|
2
|
+
import { dirname } from 'path';
|
|
3
|
+
/**
 * Write a note's content to disk at the specified file path.
 *
 * Parent directories are created when missing. If the file already exists with
 * exactly the same UTF-8 content, nothing is written and the call reports
 * `'skipped'`; otherwise the file is (over)written and its mode is set.
 *
 * NOTE(review): on the `'skipped'` path the function returns before
 * `chmodSync`, so `permissions` are not re-applied to an unchanged file —
 * confirm that is the intended behavior.
 *
 * @param {string} content - Text to store in the file.
 * @param {string} filePath - Destination path of the note.
 * @param {string} [permissions] - File mode as an octal string such as `'600'`.
 *   When omitted or empty, `0o644` is used.
 * @returns {{status: 'written' | 'skipped', path: string}}
 * @throws {RangeError} If `permissions` does not parse to a non-negative octal
 *   number. Raised before anything is created on disk.
 */
export function writeNoteFile(content, filePath, permissions) {
    const mode = permissions ? parseInt(permissions, 8) : 0o644;
    // Validate up front: an unparsable mode would otherwise only be rejected by
    // fs after the parent directories had already been created.
    if (!Number.isInteger(mode) || mode < 0) {
        throw new RangeError(`Invalid file permissions "${permissions}": expected an octal string such as "644"`);
    }
    // With `recursive: true` an already existing directory is not an error, so
    // no existence check is needed first.
    mkdirSync(dirname(filePath), { recursive: true });
    if (existsSync(filePath) && readFileSync(filePath, 'utf-8') === content) {
        return { status: 'skipped', path: filePath };
    }
    writeFileSync(filePath, content, { mode });
    // The `mode` option above only takes effect when the file is created, so
    // set it explicitly to cover overwrites of existing files as well.
    chmodSync(filePath, mode);
    return { status: 'written', path: filePath };
}
|