@aperdomoll90/ledger-ai 1.3.0 → 1.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +177 -221
- package/dist/commands/add.js +51 -100
- package/dist/commands/backfill.js +55 -0
- package/dist/commands/backup.js +10 -10
- package/dist/commands/check.js +21 -29
- package/dist/commands/config.js +13 -12
- package/dist/commands/delete.js +22 -17
- package/dist/commands/eval-judge.js +11 -0
- package/dist/commands/eval.js +321 -0
- package/dist/commands/export.js +8 -10
- package/dist/commands/get.js +9 -0
- package/dist/commands/hunt.js +206 -0
- package/dist/commands/ingest.js +15 -14
- package/dist/commands/init.js +18 -20
- package/dist/commands/list.js +21 -7
- package/dist/commands/migrate.js +11 -11
- package/dist/commands/onboard.js +2 -2
- package/dist/commands/pull.js +3 -2
- package/dist/commands/push.js +8 -8
- package/dist/commands/restore.js +38 -38
- package/dist/commands/show.js +13 -16
- package/dist/commands/sync.js +58 -19
- package/dist/commands/tag.js +20 -14
- package/dist/commands/update.js +50 -18
- package/dist/commands/wizard.js +3 -3
- package/dist/lib/ai-search.js +163 -0
- package/dist/lib/audit.js +19 -0
- package/dist/lib/backfill.js +60 -0
- package/dist/lib/config.js +19 -2
- package/dist/lib/document-classification.js +5 -0
- package/dist/lib/document-fetching.js +77 -0
- package/dist/lib/document-operations.js +150 -0
- package/dist/lib/documents/classification.js +5 -0
- package/dist/lib/documents/fetching.js +89 -0
- package/dist/lib/documents/operations.js +304 -0
- package/dist/lib/domains.js +116 -0
- package/dist/lib/embeddings.js +190 -0
- package/dist/lib/errors.js +3 -1
- package/dist/lib/eval/eval-advanced.js +289 -0
- package/dist/lib/eval/eval-judge-session.js +233 -0
- package/dist/lib/eval/eval-store.js +105 -0
- package/dist/lib/eval/eval.js +303 -0
- package/dist/lib/file-writer.js +23 -0
- package/dist/lib/generators.js +44 -45
- package/dist/lib/hunter-db.js +235 -0
- package/dist/lib/hunter-rss.js +30 -0
- package/dist/lib/hunter-scoring.js +55 -0
- package/dist/lib/hunter-types.js +36 -0
- package/dist/lib/lint-configs.js +20 -0
- package/dist/lib/migrate.js +2 -2
- package/dist/lib/notes.js +173 -59
- package/dist/lib/observability.js +296 -0
- package/dist/lib/op-add-note-types.test.js +7 -6
- package/dist/lib/prompt.js +8 -8
- package/dist/lib/rate-limiter.js +103 -0
- package/dist/lib/search/ai-search.js +396 -0
- package/dist/lib/search/chunk-context-enrichment.js +155 -0
- package/dist/lib/search/embeddings.js +293 -0
- package/dist/lib/search/reranker.js +120 -0
- package/dist/lib/search/semantic-cache.js +53 -0
- package/dist/lib/type-registry.test.js +6 -6
- package/dist/mcp-server.js +553 -66
- package/dist/migrations/migrations/005-audit-log.sql +22 -0
- package/dist/migrations/migrations/005_opportunities.sql +48 -0
- package/dist/migrations/migrations/006-audited-operations.sql +235 -0
- package/dist/migrations/migrations/006_hunt_analytics.sql +38 -0
- package/dist/migrations/migrations/007-eval-golden-judgments.sql +119 -0
- package/dist/migrations/migrations/008-drop-expected-doc-ids.sql +9 -0
- package/dist/migrations/migrations/008-judge-helpers.sql +21 -0
- package/dist/migrations/migrations/009-semantic-cache.sql +216 -0
- package/dist/scripts/batch-grade.js +344 -0
- package/dist/scripts/benchmark-ingestion.js +376 -0
- package/dist/scripts/convert-judgments-to-graded.js +88 -0
- package/dist/scripts/diagnose-first-result.js +333 -0
- package/dist/scripts/drop-golden-query.js +53 -0
- package/dist/scripts/eval-search.js +115 -0
- package/dist/scripts/grade-unjudged-top1.js +138 -0
- package/dist/scripts/hunter-analytics.js +38 -0
- package/dist/scripts/hunter-cron.js +63 -0
- package/dist/scripts/hunter-purge.js +25 -0
- package/dist/scripts/migrate-v2.js +140 -0
- package/dist/scripts/reindex.js +74 -0
- package/dist/scripts/sync-local-docs.js +153 -0
- package/package.json +7 -1
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import { backfillMetadata } from '../lib/backfill.js';
|
|
2
|
+
/**
 * Run the v2 metadata backfill over the rows of the `notes` table.
 *
 * Loads `id` and `metadata` for the notes, passes each row's metadata through
 * `backfillMetadata`, and writes the result back when its JSON serialization
 * differs from what is stored. Rows whose serialized metadata is unchanged are
 * counted as already up-to-date and are not written.
 *
 * Progress and the final summary are written to stderr. A failure to fetch the
 * notes terminates the process with exit code 1; per-row update failures are
 * counted, reported, and surfaced through `process.exitCode`.
 *
 * @param {object} config - CLI configuration; `config.supabase` is the Supabase client.
 * @param {object} [options] - Command options.
 * @param {boolean} [options.dryRun] - When truthy, print the planned changes without writing.
 * @returns {Promise<void>}
 */
export async function backfill(config, options = {}) {
    const { dryRun } = options;
    console.error('Fetching all notes...');
    // NOTE(review): this is a single un-paginated select. If the API enforces a
    // max-rows cap (Supabase projects default to 1,000), rows beyond the cap are
    // never backfilled — confirm against the deployment's settings.
    const { data: notes, error } = await config.supabase
        .from('notes')
        .select('id, metadata')
        .order('id', { ascending: true });
    if (error) {
        console.error(`Error fetching notes: ${error.message}`);
        process.exit(1);
    }
    if (!notes || notes.length === 0) {
        console.error('No notes found.');
        return;
    }
    console.error(`Found ${notes.length} notes. Running v2 backfill...`);
    let migrated = 0;
    let skipped = 0;
    let errors = 0;
    for (const note of notes) {
        const oldMeta = note.metadata;
        const newMeta = backfillMetadata(oldMeta);
        // Check if anything changed (idempotent skip)
        if (JSON.stringify(oldMeta) === JSON.stringify(newMeta)) {
            skipped++;
            continue;
        }
        if (dryRun) {
            // Metadata may be null on a row; optional chaining keeps the preview
            // from throwing on exactly the rows that most need backfilling.
            const oldType = oldMeta?.type ?? '?';
            const newType = newMeta?.type ?? '?';
            const domain = newMeta?.domain ?? '?';
            const key = oldMeta?.upsert_key ?? `id-${note.id}`;
            console.error(` [${note.id}] ${key}: ${oldType} → ${domain}/${newType}`);
            migrated++;
            continue;
        }
        const { error: updateError } = await config.supabase
            .from('notes')
            .update({ metadata: newMeta })
            .eq('id', note.id);
        if (updateError) {
            console.error(` [${note.id}] ERROR: ${updateError.message}`);
            errors++;
        }
        else {
            migrated++;
        }
    }
    console.error(`\nBackfill ${dryRun ? '(dry run) ' : ''}complete:`);
    console.error(` ${migrated} migrated, ${skipped} already up-to-date, ${errors} errors`);
    if (dryRun && migrated > 0) {
        console.error('\nRun without --dry-run to apply changes.');
    }
    if (errors > 0) {
        // Mirror the fetch-failure path: a partially failed backfill must not
        // look like a success to scripts chaining on the exit status.
        process.exitCode = 1;
    }
}
|
package/dist/commands/backup.js
CHANGED
|
@@ -6,10 +6,10 @@ export async function backup(config, options) {
|
|
|
6
6
|
const { quiet } = options;
|
|
7
7
|
const backupDir = resolve(getLedgerDir(), 'backups');
|
|
8
8
|
mkdirSync(backupDir, { recursive: true });
|
|
9
|
-
// Fetch all
|
|
9
|
+
// Fetch all documents (not just cached)
|
|
10
10
|
const { data, error } = await config.supabase
|
|
11
|
-
.from('
|
|
12
|
-
.select('id, content,
|
|
11
|
+
.from('documents')
|
|
12
|
+
.select('id, name, domain, document_type, project, protection, content, description, status, created_at, updated_at')
|
|
13
13
|
.order('id', { ascending: true });
|
|
14
14
|
if (error) {
|
|
15
15
|
console.error(`Backup failed: ${error.message}`);
|
|
@@ -17,7 +17,7 @@ export async function backup(config, options) {
|
|
|
17
17
|
}
|
|
18
18
|
if (!data || data.length === 0) {
|
|
19
19
|
if (!quiet)
|
|
20
|
-
console.error('No
|
|
20
|
+
console.error('No documents to backup.');
|
|
21
21
|
return;
|
|
22
22
|
}
|
|
23
23
|
const date = new Date().toISOString().split('T')[0];
|
|
@@ -25,7 +25,7 @@ export async function backup(config, options) {
|
|
|
25
25
|
writeFileSync(filePath, JSON.stringify(data, null, 2) + '\n');
|
|
26
26
|
// Keep last 5 backups, delete older
|
|
27
27
|
const backups = readdirSync(backupDir)
|
|
28
|
-
.filter(
|
|
28
|
+
.filter(file => file.endsWith('.json'))
|
|
29
29
|
.sort()
|
|
30
30
|
.reverse();
|
|
31
31
|
for (const old of backups.slice(5)) {
|
|
@@ -34,7 +34,7 @@ export async function backup(config, options) {
|
|
|
34
34
|
console.error(` deleted old backup: ${old}`);
|
|
35
35
|
}
|
|
36
36
|
if (!quiet) {
|
|
37
|
-
console.error(`Backed up ${data.length}
|
|
37
|
+
console.error(`Backed up ${data.length} documents to ${filePath}`);
|
|
38
38
|
}
|
|
39
39
|
console.log(filePath);
|
|
40
40
|
}
|
|
@@ -59,8 +59,8 @@ export function enableBackupCron() {
|
|
|
59
59
|
throw new Error(result.stderr?.toString() || 'crontab failed');
|
|
60
60
|
console.error('Daily backup enabled (1am). View with `crontab -l`.');
|
|
61
61
|
}
|
|
62
|
-
catch (
|
|
63
|
-
console.error(`Failed to set cron: ${
|
|
62
|
+
catch (cronError) {
|
|
63
|
+
console.error(`Failed to set cron: ${cronError.message}`);
|
|
64
64
|
console.error(`Add manually: ${cronLine}`);
|
|
65
65
|
}
|
|
66
66
|
}
|
|
@@ -83,7 +83,7 @@ export function disableBackupCron() {
|
|
|
83
83
|
throw new Error(result.stderr?.toString() || 'crontab failed');
|
|
84
84
|
console.error('Backup cron disabled.');
|
|
85
85
|
}
|
|
86
|
-
catch (
|
|
87
|
-
console.error(`Failed to update cron: ${
|
|
86
|
+
catch (cronError) {
|
|
87
|
+
console.error(`Failed to update cron: ${cronError.message}`);
|
|
88
88
|
}
|
|
89
89
|
}
|
package/dist/commands/check.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { readFileSync, readdirSync, existsSync } from 'fs';
|
|
2
|
-
import { resolve } from 'path';
|
|
3
|
-
import {
|
|
2
|
+
import { resolve, basename } from 'path';
|
|
3
|
+
import { fetchSyncableDocuments } from '../lib/documents/fetching.js';
|
|
4
4
|
import { contentHash } from '../lib/hash.js';
|
|
5
5
|
export async function check(config) {
|
|
6
6
|
const result = {
|
|
@@ -16,51 +16,55 @@ export async function check(config) {
|
|
|
16
16
|
console.error('Memory directory not found. Run `ledger pull` first.');
|
|
17
17
|
return result;
|
|
18
18
|
}
|
|
19
|
-
|
|
20
|
-
const
|
|
19
|
+
// Fetch all auto-load documents — these are the ones that sync locally
|
|
20
|
+
const syncableDocuments = await fetchSyncableDocuments(config.supabase);
|
|
21
|
+
// Build a map of local filename → document for comparison
|
|
22
|
+
const documentsByFile = new Map(syncableDocuments
|
|
23
|
+
.filter(document => document.file_path)
|
|
24
|
+
.map(document => [basename(document.file_path), document]));
|
|
21
25
|
const localFiles = readdirSync(config.memoryDir)
|
|
22
|
-
.filter(
|
|
26
|
+
.filter(file => file.endsWith('.md') && file !== 'MEMORY.md');
|
|
23
27
|
for (const file of localFiles) {
|
|
24
28
|
const filePath = resolve(config.memoryDir, file);
|
|
25
29
|
const localContent = readFileSync(filePath, 'utf-8').trim();
|
|
26
30
|
const localHash = contentHash(localContent);
|
|
27
|
-
const
|
|
28
|
-
if (!
|
|
31
|
+
const document = documentsByFile.get(file);
|
|
32
|
+
if (!document) {
|
|
29
33
|
console.error(` ${file} — unknown (not in Ledger)`);
|
|
30
34
|
result.files.push({ file, state: 'unknown' });
|
|
31
35
|
result.unknown++;
|
|
32
|
-
|
|
36
|
+
documentsByFile.delete(file);
|
|
33
37
|
continue;
|
|
34
38
|
}
|
|
35
|
-
const ledgerHash = contentHash(
|
|
36
|
-
const storedHash =
|
|
39
|
+
const ledgerHash = contentHash(document.content);
|
|
40
|
+
const storedHash = document.content_hash;
|
|
37
41
|
const localChanged = localHash !== storedHash;
|
|
38
42
|
const ledgerChanged = ledgerHash !== storedHash;
|
|
39
43
|
if (!localChanged && !ledgerChanged) {
|
|
40
44
|
console.error(` ${file} — in sync`);
|
|
41
|
-
result.files.push({ file, state: 'clean',
|
|
45
|
+
result.files.push({ file, state: 'clean', documentId: document.id });
|
|
42
46
|
result.clean++;
|
|
43
47
|
}
|
|
44
48
|
else if (localChanged && !ledgerChanged) {
|
|
45
49
|
console.error(` ${file} — modified locally`);
|
|
46
|
-
result.files.push({ file, state: 'modified',
|
|
50
|
+
result.files.push({ file, state: 'modified', documentId: document.id });
|
|
47
51
|
result.modified++;
|
|
48
52
|
}
|
|
49
53
|
else if (!localChanged && ledgerChanged) {
|
|
50
54
|
console.error(` ${file} — updated in Ledger`);
|
|
51
|
-
result.files.push({ file, state: 'upstream',
|
|
55
|
+
result.files.push({ file, state: 'upstream', documentId: document.id });
|
|
52
56
|
result.upstream++;
|
|
53
57
|
}
|
|
54
58
|
else {
|
|
55
59
|
console.error(` ${file} — CONFLICT (both changed)`);
|
|
56
|
-
result.files.push({ file, state: 'conflict',
|
|
60
|
+
result.files.push({ file, state: 'conflict', documentId: document.id });
|
|
57
61
|
result.conflicts++;
|
|
58
62
|
}
|
|
59
|
-
|
|
63
|
+
documentsByFile.delete(file);
|
|
60
64
|
}
|
|
61
|
-
for (const [file,
|
|
65
|
+
for (const [file, document] of documentsByFile) {
|
|
62
66
|
console.error(` ${file} — missing locally (exists in Ledger)`);
|
|
63
|
-
result.files.push({ file, state: 'deleted',
|
|
67
|
+
result.files.push({ file, state: 'deleted', documentId: document.id });
|
|
64
68
|
result.deleted++;
|
|
65
69
|
}
|
|
66
70
|
const summary = [
|
|
@@ -77,15 +81,3 @@ export async function check(config) {
|
|
|
77
81
|
}
|
|
78
82
|
return result;
|
|
79
83
|
}
|
|
80
|
-
export async function checkChunks(config) {
|
|
81
|
-
console.error('Checking chunk integrity...');
|
|
82
|
-
const result = await checkChunkIntegrity(config.supabase);
|
|
83
|
-
if (result.incompleteGroups.length === 0) {
|
|
84
|
-
console.log('All chunk groups are complete.');
|
|
85
|
-
return;
|
|
86
|
-
}
|
|
87
|
-
console.error(`Found ${result.incompleteGroups.length} incomplete chunk group(s):`);
|
|
88
|
-
for (const group of result.incompleteGroups) {
|
|
89
|
-
console.error(` group ${group.groupId}: expected ${group.expected} chunks, found ${group.found}`);
|
|
90
|
-
}
|
|
91
|
-
}
|
package/dist/commands/config.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { getLedgerDir, saveConfigFile, loadConfigFile } from '../lib/config.js';
|
|
2
|
-
import { BUILTIN_TYPES, getTypeRegistry, opUpdateMetadata, validateTypeName } from '../lib/notes.js';
|
|
2
|
+
import { BUILTIN_TYPES, getTypeRegistry, opUpdateMetadata, validateTypeName, inferDomain } from '../lib/notes.js';
|
|
3
3
|
import { choose, confirm } from '../lib/prompt.js';
|
|
4
4
|
import { resolve } from 'path';
|
|
5
5
|
const CONFIG_PATH = resolve(getLedgerDir(), 'config.json');
|
|
@@ -156,7 +156,7 @@ export async function configSet(key, value, clients) {
|
|
|
156
156
|
const typeName = key.slice(6);
|
|
157
157
|
const delivery = value;
|
|
158
158
|
if (!['persona', 'project', 'knowledge', 'protected'].includes(delivery)) {
|
|
159
|
-
console.error(`Invalid delivery
|
|
159
|
+
console.error(`Invalid domain/delivery: "${value}". Must be: persona, project, knowledge, or protected.`);
|
|
160
160
|
process.exit(1);
|
|
161
161
|
}
|
|
162
162
|
const nameError = validateTypeName(typeName);
|
|
@@ -172,30 +172,31 @@ export async function configSet(key, value, clients) {
|
|
|
172
172
|
const isBuiltin = typeName in BUILTIN_TYPES;
|
|
173
173
|
const action = isBuiltin ? 'overridden' : 'registered';
|
|
174
174
|
console.error(`types.${typeName}: ${delivery} (${action})`);
|
|
175
|
-
//
|
|
175
|
+
// Domain change propagation — only if we have DB access and delivery actually changed
|
|
176
176
|
if (clients && oldDelivery && oldDelivery !== delivery) {
|
|
177
|
+
const newDomain = inferDomain(typeName);
|
|
177
178
|
const { data: notes } = await clients.supabase
|
|
178
179
|
.from('notes')
|
|
179
180
|
.select('id, metadata')
|
|
180
181
|
.eq('metadata->>type', typeName);
|
|
181
|
-
const affected = (notes ?? []).filter((n) => n.metadata.
|
|
182
|
+
const affected = (notes ?? []).filter((n) => n.metadata.domain !== newDomain);
|
|
182
183
|
if (affected.length > 0) {
|
|
183
|
-
console.error(`\n${affected.length} note(s) currently have a different
|
|
184
|
+
console.error(`\n${affected.length} note(s) currently have a different domain:`);
|
|
184
185
|
for (const note of affected) {
|
|
185
186
|
const meta = note.metadata;
|
|
186
187
|
const uKey = meta.upsert_key || `id-${note.id}`;
|
|
187
|
-
console.error(` [${note.id}] ${uKey} —
|
|
188
|
+
console.error(` [${note.id}] ${uKey} — domain: ${meta.domain}`);
|
|
188
189
|
}
|
|
189
|
-
const action = await choose('\nUpdate
|
|
190
|
+
const action = await choose('\nUpdate domain on these notes?', [
|
|
190
191
|
'all — update all notes',
|
|
191
192
|
'select — choose individually',
|
|
192
193
|
'none — only affect new notes',
|
|
193
194
|
]);
|
|
194
195
|
if (action.startsWith('all')) {
|
|
195
196
|
for (const note of affected) {
|
|
196
|
-
await opUpdateMetadata(clients, note.id, {
|
|
197
|
+
await opUpdateMetadata(clients, note.id, { domain: newDomain });
|
|
197
198
|
}
|
|
198
|
-
console.error(`Updated
|
|
199
|
+
console.error(`Updated domain to "${newDomain}" on ${affected.length} note(s).`);
|
|
199
200
|
}
|
|
200
201
|
else if (action.startsWith('select')) {
|
|
201
202
|
let updated = 0;
|
|
@@ -204,11 +205,11 @@ export async function configSet(key, value, clients) {
|
|
|
204
205
|
const uKey = meta.upsert_key || `id-${note.id}`;
|
|
205
206
|
const yes = await confirm(` Update [${note.id}] ${uKey}?`);
|
|
206
207
|
if (yes) {
|
|
207
|
-
await opUpdateMetadata(clients, note.id, {
|
|
208
|
+
await opUpdateMetadata(clients, note.id, { domain: newDomain });
|
|
208
209
|
updated++;
|
|
209
210
|
}
|
|
210
211
|
}
|
|
211
|
-
console.error(`Updated
|
|
212
|
+
console.error(`Updated domain on ${updated} note(s).`);
|
|
212
213
|
}
|
|
213
214
|
}
|
|
214
215
|
}
|
|
@@ -238,7 +239,7 @@ export async function configUnset(key, clients) {
|
|
|
238
239
|
.select('id')
|
|
239
240
|
.eq('metadata->>type', typeName);
|
|
240
241
|
if (notes && notes.length > 0) {
|
|
241
|
-
console.error(`\n${notes.length} note(s) use type "${typeName}". They will become unregistered (
|
|
242
|
+
console.error(`\n${notes.length} note(s) use type "${typeName}". They will become unregistered (domain defaults to "project").`);
|
|
242
243
|
const proceed = await confirm('Proceed?');
|
|
243
244
|
if (!proceed) {
|
|
244
245
|
console.error('Cancelled.');
|
package/dist/commands/delete.js
CHANGED
|
@@ -1,22 +1,27 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { getDocumentById } from '../lib/documents/fetching.js';
|
|
2
|
+
import { deleteDocument as deleteDocumentOperation } from '../lib/documents/operations.js';
|
|
2
3
|
import { confirm } from '../lib/prompt.js';
|
|
3
|
-
export async function
|
|
4
|
-
const
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
if (preview.status === 'error') {
|
|
8
|
-
console.error(preview.message);
|
|
4
|
+
export async function removeDocument(config, id, options = {}) {
|
|
5
|
+
const document = await getDocumentById(config.supabase, id);
|
|
6
|
+
if (!document) {
|
|
7
|
+
console.error(`Document ${id} not found.`);
|
|
9
8
|
process.exit(1);
|
|
10
9
|
}
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
if (!proceed) {
|
|
14
|
-
console.error('Cancelled.');
|
|
15
|
-
return;
|
|
16
|
-
}
|
|
17
|
-
// Second call: execute
|
|
18
|
-
const result = await opDeleteNote(clients, id, true);
|
|
19
|
-
console.error(result.message);
|
|
20
|
-
if (result.status === 'error')
|
|
10
|
+
if (document.protection === 'immutable') {
|
|
11
|
+
console.error(`Document "${document.name}" (id: ${id}) is immutable and cannot be deleted.`);
|
|
21
12
|
process.exit(1);
|
|
13
|
+
}
|
|
14
|
+
console.error(`Document: "${document.name}" (id: ${id})`);
|
|
15
|
+
console.error(`Domain: ${document.domain} | Type: ${document.document_type}`);
|
|
16
|
+
console.error(`Protection: ${document.protection}`);
|
|
17
|
+
console.error(`Content preview: ${document.content.slice(0, 200)}${document.content.length > 200 ? '...' : ''}`);
|
|
18
|
+
if (!options.yes) {
|
|
19
|
+
const proceed = await confirm('\nProceed with deletion?');
|
|
20
|
+
if (!proceed) {
|
|
21
|
+
console.error('Cancelled.');
|
|
22
|
+
return;
|
|
23
|
+
}
|
|
24
|
+
}
|
|
25
|
+
await deleteDocumentOperation({ supabase: config.supabase, openai: config.openai }, id, 'cli');
|
|
26
|
+
console.error(`Document ${id} soft-deleted. Can be restored within 30 days.`);
|
|
22
27
|
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
// eval-judge.ts
|
|
2
|
+
// CLI command for the graded-relevance rejudging walkthrough.
|
|
3
|
+
import { runJudgeSession } from '../lib/eval/eval-judge-session.js';
|
|
4
|
+
/**
 * CLI entry point for the graded-relevance rejudging walkthrough.
 *
 * Picks the service handles the judge session needs off the CLI config and
 * hands them, together with the requested query, to `runJudgeSession`.
 *
 * @param {object} config - CLI configuration providing `supabase`, `openai` and `cohereApiKey`.
 * @param {object} options - Command options; `options.query` is forwarded to the session.
 * @returns {Promise<void>}
 */
export async function evalJudge(config, options) {
    const { supabase, openai, cohereApiKey } = config;
    await runJudgeSession({ supabase, openai, cohereApiKey }, options.query);
}
|
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
import { randomUUID } from 'node:crypto';
|
|
2
|
+
import { searchHybrid } from '../lib/search/ai-search.js';
|
|
3
|
+
import { scoreTestCase, computeMetrics, formatReport, compareRuns, formatComparison } from '../lib/eval/eval.js';
|
|
4
|
+
import { saveEvalRun, loadPreviousRun, loadEvalRun, CURRENT_SEARCH_CONFIG } from '../lib/eval/eval-store.js';
|
|
5
|
+
import { computeConfidenceIntervals, computeScoreCalibration, computeCoverageAnalysis, formatAdvancedReport } from '../lib/eval/eval-advanced.js';
|
|
6
|
+
import { runEvalTrace, runEvalQuerySpan, startSpan } from '../lib/observability.js';
|
|
7
|
+
// Search config imported from eval-store.ts (single source of truth)
|
|
8
|
+
// =============================================================================
|
|
9
|
+
// Command
|
|
10
|
+
// =============================================================================
|
|
11
|
+
/**
 * Run the golden-dataset search evaluation and print a report to stdout.
 *
 * Loads every row of `eval_golden_dataset` (with its graded judgments), runs
 * `searchHybrid` for each query using `CURRENT_SEARCH_CONFIG`, scores the
 * results, prints per-query status lines plus the aggregate, comparison and
 * advanced reports, and — unless `options.dryRun` is set — stores the run via
 * `saveEvalRun`. If the golden dataset cannot be loaded the process exits
 * with code 1.
 *
 * @param {object} config - CLI configuration providing `supabase`, `openai` and `cohereApiKey`.
 * @param {object} options - Command options.
 * @param {boolean} [options.dryRun] - When truthy, the run is not saved.
 * @returns {Promise<void>}
 */
export async function evalSearch(config, options) {
    // Tag every search trace from this eval run with environment=eval and a
    // shared session ID so the Langfuse dashboard can filter eval traffic from
    // prod and group all queries in this run under one session.
    const evalRunUuid = randomUUID();
    const clients = {
        supabase: config.supabase,
        openai: config.openai,
        cohereApiKey: config.cohereApiKey,
        sessionId: `eval-${evalRunUuid}`,
        observabilityEnvironment: 'eval',
    };
    console.log('\n' + '='.repeat(60));
    console.log('Ledger Search Evaluation');
    if (options.dryRun)
        console.log('(dry run — results will not be saved)');
    console.log('='.repeat(60) + '\n');
    const previousRun = await loadPreviousRun(clients.supabase);
    if (previousRun) {
        console.log(`Previous run: ${previousRun.run_date} (id: ${previousRun.id})\n`);
    }
    else {
        console.log('No previous run found — this will be the first stored run.\n');
    }
    const { data: testCases, error } = await clients.supabase
        .from('eval_golden_dataset')
        .select('id, query, tags, judgments:eval_golden_judgments(document_id, grade)')
        .order('id');
    if (error || !testCases) {
        process.stderr.write(`Failed to load golden dataset: ${error?.message ?? 'no data'}\n`);
        process.exit(1);
    }
    console.log(`Loaded ${testCases.length} test cases.\n`);
    await runEvalTrace({
        sessionId: clients.sessionId,
        tags: ['eval', 'run'],
        config: CURRENT_SEARCH_CONFIG,
        dryRun: options.dryRun,
    }, async (evalTrace) => {
        const results = [];
        for (const testCase of testCases) {
            const scored = await runEvalQuerySpan({
                query: testCase.query,
                goldenId: testCase.id,
                tags: testCase.tags,
                expectedDocs: testCase.judgments
                    .filter(judgment => judgment.grade >= 2)
                    .map(judgment => judgment.document_id),
            }, async (querySpan) => {
                const startTime = Date.now();
                const searchResults = await searchHybrid(clients, {
                    query: testCase.query,
                    limit: CURRENT_SEARCH_CONFIG.limit,
                    reranker: CURRENT_SEARCH_CONFIG.reranker,
                });
                // Measure latency once so the scored result and the span report
                // the same number (and scoring time is not counted as search time).
                const responseTimeMs = Date.now() - startTime;
                const result = scoreTestCase(testCase, searchResults, responseTimeMs);
                querySpan.update({
                    output: {
                        hit: result.hit,
                        firstResultHit: result.firstResultHit,
                        position: result.position,
                        reciprocalRank: result.reciprocalRank,
                        normalizedDiscountedCumulativeGain: result.normalizedDiscountedCumulativeGain,
                        responseTimeMs,
                    },
                });
                return result;
            });
            results.push(scored);
            // A test case with no judgment graded >= 2 has no expected documents.
            const isOutOfScope = !testCase.judgments.some(judgment => judgment.grade >= 2);
            if (isOutOfScope) {
                const status = scored.hit ? 'PASS' : `NOISE (${scored.returnedIds.length} results)`;
                console.log(` [${status}] "${testCase.query}" (out-of-scope)`);
            }
            else {
                const status = scored.firstResultHit ? 'TOP' : scored.hit ? 'HIT' : 'MISS';
                const positionInfo = scored.position !== null ? `@${scored.position + 1}` : '';
                console.log(` [${status}${positionInfo}] "${testCase.query}" → found ${scored.expectedFound}/${scored.expectedTotal}`);
            }
        }
        const metrics = computeMetrics(results);
        console.log('\n' + formatReport(metrics));
        // Advanced analysis
        const confidenceIntervals = computeConfidenceIntervals(results);
        const scoreCalibration = computeScoreCalibration(results);
        const coverageAnalysis = computeCoverageAnalysis(results);
        // Eval analysis span
        const analysisSpan = startSpan('eval-analysis');
        let comparisonSeverity = 'none';
        try {
            analysisSpan.update({
                input: {
                    testCaseCount: results.length,
                    normalCount: metrics.normalCases,
                    outOfScopeCount: metrics.outOfScopeCases,
                },
            });
            if (!options.dryRun) {
                const runId = await saveEvalRun(clients.supabase, {
                    metrics,
                    config: CURRENT_SEARCH_CONFIG,
                    results,
                    confidenceIntervals,
                    scoreCalibration,
                    coverageAnalysis,
                });
                process.stderr.write(`\nRun saved to eval_runs (id: ${runId})\n`);
            }
            if (previousRun) {
                const comparison = compareRuns({
                    hitRate: metrics.hitRate,
                    firstResultAccuracy: metrics.firstResultAccuracy,
                    recall: metrics.recall,
                    zeroResultRate: metrics.zeroResultRate,
                    meanReciprocalRank: metrics.meanReciprocalRank,
                    normalizedDiscountedCumulativeGain: metrics.normalizedDiscountedCumulativeGain,
                    avgResponseTimeMs: metrics.avgResponseTimeMs,
                }, {
                    hitRate: previousRun.hit_rate,
                    firstResultAccuracy: previousRun.first_result_accuracy,
                    recall: previousRun.recall,
                    zeroResultRate: previousRun.zero_result_rate,
                    // Older stored runs may lack these columns; compare against 0.
                    meanReciprocalRank: previousRun.mean_reciprocal_rank ?? 0,
                    normalizedDiscountedCumulativeGain: previousRun.normalized_discounted_cumulative_gain ?? 0,
                    avgResponseTimeMs: previousRun.avg_response_time_ms,
                });
                console.log('\n' + formatComparison(comparison));
                comparisonSeverity = comparison.severity;
            }
            analysisSpan.update({
                output: { metrics, comparisonSeverity },
            });
        }
        finally {
            // Always close the span, even when saving or comparing throws.
            analysisSpan.end();
        }
        evalTrace.update({
            output: {
                hitRate: metrics.hitRate,
                firstResultAccuracy: metrics.firstResultAccuracy,
                recall: metrics.recall,
                meanReciprocalRank: metrics.meanReciprocalRank,
                normalizedDiscountedCumulativeGain: metrics.normalizedDiscountedCumulativeGain,
                comparisonSeverity,
            },
        });
        console.log('\n' + formatAdvancedReport(confidenceIntervals, scoreCalibration, coverageAnalysis));
    });
}
|
|
156
|
+
// =============================================================================
|
|
157
|
+
// Threshold sweep — test multiple thresholds to find optimal value
|
|
158
|
+
// =============================================================================
|
|
159
|
+
/**
 * Evaluate the golden dataset once per similarity threshold and print one
 * table row per threshold, so the effect of the threshold on each metric can
 * be compared side by side.
 *
 * Usage: ledger eval:sweep
 * ledger eval:sweep --thresholds 0.15,0.20,0.25,0.30,0.35,0.40
 *
 * @param {object} config - CLI configuration providing `supabase`, `openai` and `cohereApiKey`.
 * @param {object} options - Command options.
 * @param {string} options.thresholds - Comma-separated thresholds; only values strictly between 0 and 1 are kept.
 * @returns {Promise<void>}
 */
export async function sweepThreshold(config, options) {
    const { supabase, openai, cohereApiKey } = config;
    const clients = {
        supabase,
        openai,
        cohereApiKey,
        sessionId: `eval-sweep-${randomUUID()}`,
        observabilityEnvironment: 'eval',
    };
    // Keep only entries that parse to a number strictly inside (0, 1).
    const thresholds = [];
    for (const rawValue of options.thresholds.split(',')) {
        const candidate = parseFloat(rawValue.trim());
        if (Number.isNaN(candidate) || candidate <= 0 || candidate >= 1) {
            continue;
        }
        thresholds.push(candidate);
    }
    if (thresholds.length === 0) {
        console.error('No valid thresholds provided. Use comma-separated values like: 0.15,0.20,0.25');
        process.exit(1);
    }
    const datasetResponse = await clients.supabase
        .from('eval_golden_dataset')
        .select('id, query, tags, judgments:eval_golden_judgments(document_id, grade)')
        .order('id');
    if (datasetResponse.error || !datasetResponse.data) {
        console.error('Failed to load golden dataset:', datasetResponse.error?.message ?? 'no data');
        process.exit(1);
    }
    const goldenCases = datasetResponse.data;
    // "Normal" cases are those with at least one judgment graded 2 or higher.
    const normalCount = goldenCases.reduce((count, goldenCase) => {
        const hasRelevantJudgment = goldenCase.judgments.some(judgment => judgment.grade >= 2);
        return hasRelevantJudgment ? count + 1 : count;
    }, 0);
    console.log(`\nLoaded ${goldenCases.length} test cases (${normalCount} normal)\n`);
    console.log('threshold | hit_rate | first_result | recall | MRR | NDCG | avg_ms');
    console.log('----------|----------|--------------|----------|--------|--------|-------');
    for (const threshold of thresholds) {
        const scoredCases = [];
        // Queries run one at a time so each measured latency covers a single search.
        for (const goldenCase of goldenCases) {
            const startedAt = Date.now();
            const hits = await searchHybrid(clients, {
                query: goldenCase.query,
                limit: CURRENT_SEARCH_CONFIG.limit,
                threshold,
                reranker: CURRENT_SEARCH_CONFIG.reranker,
            });
            const elapsedMs = Date.now() - startedAt;
            scoredCases.push(scoreTestCase(goldenCase, hits, elapsedMs));
        }
        const metrics = computeMetrics(scoredCases);
        // metrics.hitRate etc are already percentages (0-100) from computeMetrics
        const cells = [
            threshold.toFixed(2).padStart(9),
            `${metrics.hitRate.toFixed(1).padStart(6)}%`,
            `${metrics.firstResultAccuracy.toFixed(1).padStart(10)}%`,
            `${metrics.recall.toFixed(1).padStart(6)}%`,
            metrics.meanReciprocalRank.toFixed(3).padStart(6),
            metrics.normalizedDiscountedCumulativeGain.toFixed(3).padStart(6),
            metrics.avgResponseTimeMs.toFixed(0).padStart(5),
        ];
        console.log(cells.join(' | '));
    }
    console.log(`\nCurrent threshold: ${CURRENT_SEARCH_CONFIG.threshold}`);
}
|
|
220
|
+
/**
 * Build an id → `{ id, name, snippet }` lookup for the given document ids.
 *
 * The snippet is the document content with every whitespace run collapsed to
 * a single space, cut to the first 140 characters. An empty id list, a query
 * error, or a missing result all yield an empty Map.
 *
 * @param {object} supabase - Supabase client used to query the `documents` table.
 * @param {Array} docIds - Ids of the documents to look up.
 * @returns {Promise<Map>} Lookup keyed by document id.
 */
async function fetchDocLookup(supabase, docIds) {
    // Nothing requested: skip the round-trip entirely.
    if (docIds.length === 0) {
        return new Map();
    }
    const response = await supabase
        .from('documents')
        .select('id, name, content')
        .in('id', docIds);
    if (response.error || !response.data) {
        return new Map();
    }
    const entries = response.data.map(({ id, name, content }) => {
        const flattened = (content ?? '').replace(/\s+/g, ' ');
        return [id, { id, name, snippet: flattened.slice(0, 140) }];
    });
    return new Map(entries);
}
|
|
237
|
+
/**
 * Print a human-readable report for one stored eval run.
 *
 * Loads the run via `loadEvalRun`, prints its summary metrics, then lists each
 * missed query with its expected documents and up to three returned
 * documents. When `options.full` is set and the run carries
 * `per_query_results`, every per-query result is additionally dumped as one
 * JSON line each — including when the run has no missed queries.
 *
 * If the run cannot be loaded, an error is written to stderr and the process
 * exits with status 1.
 *
 * @param {object} config - Must expose the database client as `config.supabase`.
 * @param {*} runId - Identifier passed to `loadEvalRun`.
 * @param {object} [options] - Optional flags; only `full` is read here.
 * @returns {Promise<void>}
 */
export async function showEvalRun(config, runId, options) {
    const supabase = config.supabase;
    const run = await loadEvalRun(supabase, runId);
    if (!run) {
        process.stderr.write(`Eval run ${runId} not found\n`);
        process.exit(1);
        // Only reached if process.exit is stubbed; never continue with no run.
        return;
    }
    console.log('\n' + '='.repeat(60));
    console.log(`Eval Run ${run.id} — ${run.run_date}`);
    console.log('='.repeat(60));
    console.log(`Test cases: ${run.test_case_count}`);
    console.log(`Hit rate: ${run.hit_rate.toFixed(1)}%`);
    console.log(`First-result acc: ${run.first_result_accuracy.toFixed(1)}%`);
    console.log(`Recall: ${run.recall.toFixed(1)}%`);
    console.log(`MRR: ${(run.mean_reciprocal_rank ?? 0).toFixed(3)}`);
    console.log(`NDCG: ${(run.normalized_discounted_cumulative_gain ?? 0).toFixed(3)}`);
    console.log(`Zero-result rate: ${run.zero_result_rate.toFixed(1)}%`);
    console.log(`Avg response (ms): ${run.avg_response_time_ms.toFixed(0)}`);
    const missedQueries = run.missed_queries ?? [];
    console.log(`\nMissed queries: ${missedQueries.length}\n`);
    if (missedQueries.length === 0) {
        // No early return here: the --full dump below must still run.
        console.log(' (none)');
    }
    else {
        // Support both the new graded shape (judgments) and the legacy binary shape
        // (expected) so eval:show still works against historical runs.
        const expectedDocsFor = (missedQuery) => {
            const judgments = missedQuery.judgments;
            if (judgments && judgments.length > 0) {
                return judgments
                    .filter(judgment => judgment.grade >= 2)
                    .map(judgment => ({ id: judgment.document_id, grade: judgment.grade }));
            }
            return (missedQuery.expected ?? []).map(id => ({ id }));
        };
        // Stored entries may lack these arrays; treat absence as empty
        // rather than throwing halfway through the report.
        const returnedIdsFor = (missedQuery) => missedQuery.got ?? [];
        // Resolve doc ids → names + snippets in one batch
        const allDocIds = new Set();
        for (const missedQuery of missedQueries) {
            for (const expected of expectedDocsFor(missedQuery)) {
                allDocIds.add(expected.id);
            }
            for (const returnedId of returnedIdsFor(missedQuery).slice(0, 3)) {
                allDocIds.add(returnedId);
            }
        }
        const lookup = await fetchDocLookup(supabase, Array.from(allDocIds));
        const formatDoc = (docId, score) => {
            const documentName = lookup.get(docId)?.name ?? '<unknown>';
            // A score can be missing or null; only format real numbers.
            const scoreLabel = typeof score === 'number' ? ` (${score.toFixed(3)})` : '';
            return `#${docId} ${documentName}${scoreLabel}`;
        };
        for (const [missedIndex, missedQuery] of missedQueries.entries()) {
            console.log(`[${missedIndex + 1}] "${missedQuery.query}"`);
            const tags = missedQuery.tags ?? [];
            if (tags.length > 0) {
                console.log(` tags: ${tags.join(', ')}`);
            }
            console.log(` expected:`);
            for (const expected of expectedDocsFor(missedQuery)) {
                const gradeLabel = expected.grade !== undefined ? ` (grade ${expected.grade})` : '';
                console.log(` - ${formatDoc(expected.id)}${gradeLabel}`);
            }
            const returnedIds = returnedIdsFor(missedQuery);
            const returnedScores = missedQuery.gotScores ?? [];
            if (returnedIds.length === 0) {
                console.log(` got: (none — zero results)`);
            }
            else {
                console.log(` got (top 3):`);
                const topReturned = Math.min(3, returnedIds.length);
                for (let position = 0; position < topReturned; position++) {
                    console.log(` ${position + 1}. ${formatDoc(returnedIds[position], returnedScores[position])}`);
                }
                const topDoc = lookup.get(returnedIds[0]);
                if (topDoc?.snippet) {
                    console.log(` top1 snippet: "${topDoc.snippet}…"`);
                }
            }
            console.log('');
        }
    }
    if (options?.full && run.per_query_results) {
        console.log('='.repeat(60));
        console.log('Per-query results (full)');
        console.log('='.repeat(60));
        for (const queryResult of run.per_query_results) {
            console.log(JSON.stringify(queryResult));
        }
    }
}
|