@aperdomoll90/ledger-ai 1.4.0 → 1.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +177 -221
- package/dist/commands/add.js +51 -100
- package/dist/commands/backfill.js +55 -0
- package/dist/commands/backup.js +10 -10
- package/dist/commands/check.js +21 -29
- package/dist/commands/config.js +13 -12
- package/dist/commands/delete.js +22 -17
- package/dist/commands/eval-judge.js +11 -0
- package/dist/commands/eval.js +321 -0
- package/dist/commands/export.js +8 -10
- package/dist/commands/get.js +9 -0
- package/dist/commands/hunt.js +206 -0
- package/dist/commands/ingest.js +15 -14
- package/dist/commands/init.js +18 -20
- package/dist/commands/list.js +21 -7
- package/dist/commands/migrate.js +11 -11
- package/dist/commands/onboard.js +2 -2
- package/dist/commands/pull.js +3 -2
- package/dist/commands/push.js +8 -8
- package/dist/commands/restore.js +38 -38
- package/dist/commands/show.js +13 -16
- package/dist/commands/sync.js +58 -19
- package/dist/commands/tag.js +20 -14
- package/dist/commands/update.js +50 -18
- package/dist/commands/wizard.js +3 -3
- package/dist/lib/ai-search.js +163 -0
- package/dist/lib/audit.js +19 -0
- package/dist/lib/backfill.js +60 -0
- package/dist/lib/config.js +19 -2
- package/dist/lib/document-classification.js +5 -0
- package/dist/lib/document-fetching.js +77 -0
- package/dist/lib/document-operations.js +150 -0
- package/dist/lib/documents/classification.js +5 -0
- package/dist/lib/documents/fetching.js +89 -0
- package/dist/lib/documents/operations.js +304 -0
- package/dist/lib/domains.js +116 -0
- package/dist/lib/embeddings.js +190 -0
- package/dist/lib/errors.js +3 -1
- package/dist/lib/eval/eval-advanced.js +289 -0
- package/dist/lib/eval/eval-judge-session.js +233 -0
- package/dist/lib/eval/eval-store.js +105 -0
- package/dist/lib/eval/eval.js +303 -0
- package/dist/lib/file-writer.js +23 -0
- package/dist/lib/generators.js +44 -45
- package/dist/lib/hunter-db.js +235 -0
- package/dist/lib/hunter-rss.js +30 -0
- package/dist/lib/hunter-scoring.js +55 -0
- package/dist/lib/hunter-types.js +36 -0
- package/dist/lib/lint-configs.js +20 -0
- package/dist/lib/migrate.js +2 -2
- package/dist/lib/notes.js +173 -59
- package/dist/lib/observability.js +296 -0
- package/dist/lib/op-add-note-types.test.js +7 -6
- package/dist/lib/prompt.js +8 -8
- package/dist/lib/rate-limiter.js +103 -0
- package/dist/lib/search/ai-search.js +396 -0
- package/dist/lib/search/chunk-context-enrichment.js +155 -0
- package/dist/lib/search/embeddings.js +293 -0
- package/dist/lib/search/reranker.js +120 -0
- package/dist/lib/search/semantic-cache.js +53 -0
- package/dist/lib/type-registry.test.js +6 -6
- package/dist/mcp-server.js +553 -66
- package/dist/migrations/migrations/005-audit-log.sql +22 -0
- package/dist/migrations/migrations/005_opportunities.sql +48 -0
- package/dist/migrations/migrations/006-audited-operations.sql +235 -0
- package/dist/migrations/migrations/006_hunt_analytics.sql +38 -0
- package/dist/migrations/migrations/007-eval-golden-judgments.sql +119 -0
- package/dist/migrations/migrations/008-drop-expected-doc-ids.sql +9 -0
- package/dist/migrations/migrations/008-judge-helpers.sql +21 -0
- package/dist/migrations/migrations/009-semantic-cache.sql +216 -0
- package/dist/scripts/batch-grade.js +344 -0
- package/dist/scripts/benchmark-ingestion.js +376 -0
- package/dist/scripts/convert-judgments-to-graded.js +88 -0
- package/dist/scripts/diagnose-first-result.js +333 -0
- package/dist/scripts/drop-golden-query.js +53 -0
- package/dist/scripts/eval-search.js +115 -0
- package/dist/scripts/grade-unjudged-top1.js +138 -0
- package/dist/scripts/hunter-analytics.js +38 -0
- package/dist/scripts/hunter-cron.js +63 -0
- package/dist/scripts/hunter-purge.js +25 -0
- package/dist/scripts/migrate-v2.js +140 -0
- package/dist/scripts/reindex.js +74 -0
- package/dist/scripts/sync-local-docs.js +153 -0
- package/package.json +7 -1
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
// --- Domain Model ---
|
|
2
|
+
// Pure functions for domain, protection, type validation, and v1→v2 migration.
|
|
3
|
+
// No Supabase dependency — fully unit testable.
|
|
4
|
+
// --- Domain → Types mapping ---
|
|
5
|
+
/**
 * Allowed document types for each domain.
 *
 * A type may be listed under several domains. Key order is significant for
 * any consumer that walks the entries in insertion order and stops at the
 * first domain containing a type.
 */
export const DOMAIN_TYPES = {
    persona: [
        'personality',
        'behavioral-rule',
        'preference',
        'skill',
        'claude-md',
        'hook',
        'plugin-config',
    ],
    system: [
        'hook',
        'plugin-config',
        'type-registry',
        'sync-rule',
        'skill',
    ],
    workspace: [
        'dashboard',
        'device-registry',
        'environment',
        'eval-result',
        'skill',
        'hook',
        'plugin-config',
    ],
    project: [
        'architecture',
        'project-status',
        'event',
        'error',
        'reference',
        'knowledge',
        'skill',
        'eval-result',
    ],
    general: [
        'reference',
        'knowledge',
        'general',
    ],
};
|
|
12
|
+
// --- Protection defaults per type ---
|
|
13
|
+
/**
 * Per-type defaults: the protection level and the autoLoad flag.
 * 'skill' and 'general' deliberately have no entry here.
 */
export const TYPE_DEFAULTS = {
    // --- persona types ---
    personality:       { protection: 'protected', autoLoad: true },
    'behavioral-rule': { protection: 'protected', autoLoad: true },
    preference:        { protection: 'guarded', autoLoad: true },
    'claude-md':       { protection: 'protected', autoLoad: true },

    // --- system types ---
    hook:              { protection: 'protected', autoLoad: false },
    'plugin-config':   { protection: 'guarded', autoLoad: false },
    'type-registry':   { protection: 'immutable', autoLoad: false },
    'sync-rule':       { protection: 'immutable', autoLoad: false },

    // --- workspace types ---
    dashboard:         { protection: 'guarded', autoLoad: false },
    'device-registry': { protection: 'guarded', autoLoad: false },
    environment:       { protection: 'guarded', autoLoad: false },

    // --- project types ---
    architecture:      { protection: 'guarded', autoLoad: false },
    'project-status':  { protection: 'open', autoLoad: false },
    event:             { protection: 'open', autoLoad: false },
    error:             { protection: 'open', autoLoad: false },
    reference:         { protection: 'open', autoLoad: false },
    knowledge:         { protection: 'open', autoLoad: false },
    'eval-result':     { protection: 'open', autoLoad: false },
};
|
|
37
|
+
// Note: 'skill' has domain-dependent defaults handled by getProtectionDefault/getAutoLoadDefault
|
|
38
|
+
// --- v1 → v2 type migration map ---
|
|
39
|
+
/**
 * Maps each legacy (v1) type name to the v2 `{ domain, type }` pair that
 * replaces it. Note that 'project-status' is both a v1 key and a v2 type.
 */
export const TYPE_MIGRATION = {
    'user-preference': {
        domain: 'persona',
        type: 'preference',
    },
    'persona-rule': {
        domain: 'persona',
        type: 'behavioral-rule',
    },
    'code-craft': {
        domain: 'persona',
        type: 'preference',
    },
    'system-rule': {
        domain: 'system',
        type: 'sync-rule',
    },
    'architecture-decision': {
        domain: 'project',
        type: 'architecture',
    },
    'project-status': {
        domain: 'project',
        type: 'project-status',
    },
    'skill-reference': {
        domain: 'persona',
        type: 'skill',
    },
    'knowledge-guide': {
        domain: 'general',
        type: 'knowledge',
    },
};
|
|
49
|
+
// --- Inference functions ---
|
|
50
|
+
/**
 * Infer the domain for a v2 type name.
 *
 * Resolution order:
 *   1. Explicit defaults for three multi-domain types:
 *      'hook' and 'plugin-config' resolve to 'system', 'skill' to 'persona'.
 *   2. Otherwise, the first domain (in DOMAIN_TYPES insertion order) whose
 *      list contains the type. For types listed under several domains this
 *      means 'reference' and 'knowledge' resolve to 'project' (listed before
 *      'general') and 'eval-result' resolves to 'workspace' (listed before
 *      'project').
 *   3. 'general' when no domain lists the type.
 *
 * Callers that want a different domain for a multi-domain type (for example
 * a project-less note that should live in 'general') must set it explicitly
 * instead of relying on this inference.
 *
 * @param {string} type - v2 type name.
 * @returns {string} The inferred domain name.
 */
export function inferDomain(type) {
    switch (type) {
        case 'hook':
        case 'plugin-config':
            // Mostly infrastructure; persona/workspace must be chosen explicitly.
            return 'system';
        case 'skill':
            // Mostly personal; system/workspace/project must be chosen explicitly.
            return 'persona';
        default:
            break;
    }
    const firstMatch = Object.entries(DOMAIN_TYPES).find(([, domainTypes]) => domainTypes.includes(type));
    return firstMatch ? firstMatch[0] : 'general';
}
|
|
71
|
+
/**
 * Look up the default protection level for a type.
 *
 * 'skill' has no TYPE_DEFAULTS entry and always yields 'protected'.
 * Every other type uses its TYPE_DEFAULTS protection value, falling back to
 * 'open' when the type has no entry.
 *
 * @param {string} type - Type name.
 * @returns {string} The default protection level.
 */
export function getProtectionDefault(type) {
    const isSkill = type === 'skill';
    return isSkill ? 'protected' : (TYPE_DEFAULTS[type]?.protection ?? 'open');
}
|
|
78
|
+
/**
 * Default auto_load value for a document, decided by domain alone.
 *
 * 'system' and 'persona' documents auto-load; every other domain does not.
 * The second parameter is accepted for signature compatibility but is not
 * consulted.
 *
 * @param {string} domain - Domain name.
 * @param {string} [_type] - Type name (unused).
 * @returns {boolean} true when documents in the domain auto-load by default.
 */
export function getAutoLoadDefault(domain, _type) {
    return domain === 'system' || domain === 'persona';
}
|
|
86
|
+
/**
 * Validate that a type belongs to the specified domain.
 *
 * Only domains that are own keys of DOMAIN_TYPES are recognised. Inherited
 * property names such as "constructor" or "toString" are reported as unknown
 * domains rather than being looked up on the object prototype (which used to
 * make `.includes` throw a TypeError).
 *
 * @param {string} domain - Domain name to validate against.
 * @param {string} type - Type name to check.
 * @returns {string|null} null when the pair is valid, otherwise a
 *   human-readable error message.
 */
export function validateDomainType(domain, type) {
    const isKnownDomain = Object.prototype.hasOwnProperty.call(DOMAIN_TYPES, domain);
    if (!isKnownDomain)
        return `Unknown domain: ${domain}`;
    const validTypes = DOMAIN_TYPES[domain];
    if (!validTypes.includes(type)) {
        return `Type "${type}" is not valid for domain "${domain}". Valid types: ${validTypes.join(', ')}`;
    }
    return null;
}
|
|
96
|
+
/**
 * Resolve a v1 type name to its v2 domain + type.
 *
 * Only own keys of TYPE_MIGRATION count as v1 types, so inherited property
 * names ("constructor", "toString", ...) yield null instead of leaking a
 * member of Object.prototype.
 *
 * @param {string} oldType - Legacy (v1) type name.
 * @returns {{domain: string, type: string}|null} The migration target, or
 *   null when the name has no migration entry.
 */
export function resolveV1Type(oldType) {
    if (!Object.prototype.hasOwnProperty.call(TYPE_MIGRATION, oldType))
        return null;
    return TYPE_MIGRATION[oldType] ?? null;
}
|
|
100
|
+
/**
 * Collect every v2 type name across all domains.
 *
 * Types listed under several domains appear once, at the position of their
 * first occurrence when DOMAIN_TYPES is walked in insertion order.
 *
 * @returns {string[]} Flat, deduplicated list of v2 type names.
 */
export function getAllV2Types() {
    const everyListedType = Object.values(DOMAIN_TYPES).flat();
    return Array.from(new Set(everyListedType));
}
|
|
109
|
+
/**
 * Tell whether a name is a valid v2 type, i.e. listed under at least one
 * domain in DOMAIN_TYPES.
 *
 * @param {string} type - Type name to test.
 * @returns {boolean} true when some domain lists the type.
 */
export function isV2Type(type) {
    return Object.values(DOMAIN_TYPES).some((domainTypes) => domainTypes.includes(type));
}
|
|
113
|
+
/**
 * Check if a type name is a v1 type that needs migration.
 *
 * Uses an own-property check rather than the `in` operator, which also
 * matches names inherited from Object.prototype (e.g. "constructor",
 * "toString") and would wrongly report them as v1 types.
 *
 * @param {string} type - Type name to test.
 * @returns {boolean} true when TYPE_MIGRATION has an entry for the name.
 */
export function isV1Type(type) {
    return Object.prototype.hasOwnProperty.call(TYPE_MIGRATION, type);
}
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
// embeddings.ts
|
|
2
|
+
// Prepares data for the database: generate embeddings, chunk text, format vectors.
|
|
3
|
+
// The database can't call OpenAI or split text — that's TypeScript's job.
|
|
4
|
+
import { createHash } from 'crypto';
|
|
5
|
+
// =============================================================================
|
|
6
|
+
// Constants
|
|
7
|
+
// =============================================================================
|
|
8
|
+
const EMBEDDING_MODEL = 'text-embedding-3-small';
|
|
9
|
+
const DEFAULT_MAX_CHUNK_CHARS = 2000;
|
|
10
|
+
const DEFAULT_OVERLAP_CHARS = 200;
|
|
11
|
+
// =============================================================================
|
|
12
|
+
// Pure functions — no API calls, no database, fully testable
|
|
13
|
+
// =============================================================================
|
|
14
|
+
/**
 * Hex-encoded SHA-256 digest of a text, hashed as UTF-8.
 *
 * Intended for change detection: equal content always yields an equal hash,
 * so a differing hash means the content changed.
 *
 * @param {string} text - Content to hash.
 * @returns {string} 64-character lowercase hexadecimal digest.
 */
export function contentHash(text) {
    const hasher = createHash('sha256');
    hasher.update(text, 'utf-8');
    return hasher.digest('hex');
}
|
|
22
|
+
/**
 * Serialise an embedding as a bracketed, comma-separated vector literal.
 *
 * Example: [0.021, -0.007, 0.045] → "[0.021,-0.007,0.045]"
 *
 * @param {number[]} embedding - Vector components.
 * @returns {string} The components joined with "," and wrapped in "[" "]".
 */
export function toVectorString(embedding) {
    const components = embedding.join(',');
    return '[' + components + ']';
}
|
|
30
|
+
/**
 * Parse a vector value back into an array.
 *
 * - An array is returned unchanged (same reference), e.g. values supplied by
 *   test mocks.
 * - A string is parsed as JSON, e.g. "[0.021,-0.007,0.045]". The parsed value
 *   must itself be an array; strings such as "5", "null" or "{}" are rejected
 *   instead of being handed back as if they were vectors.
 *
 * @param {string|number[]} raw - Vector as a JSON array string or an array.
 * @returns {number[]} The vector components.
 * @throws {SyntaxError} When `raw` is a string that is not valid JSON.
 * @throws {Error} When `raw` is neither a string nor an array, or when the
 *   string does not encode a JSON array.
 */
export function parseVector(raw) {
    if (Array.isArray(raw))
        return raw;
    if (typeof raw !== 'string') {
        throw new Error(`Cannot parse vector: expected string or number[], got ${typeof raw}`);
    }
    const parsed = JSON.parse(raw);
    if (!Array.isArray(parsed)) {
        const actual = parsed === null ? 'null' : typeof parsed;
        throw new Error(`Cannot parse vector: expected a JSON array, got ${actual}`);
    }
    return parsed;
}
|
|
42
|
+
/**
 * Split text into smaller pieces for embedding.
 *
 * How it works:
 *   1. Text no longer than maxChars is returned as a single chunk.
 *   2. Otherwise the text is split on blank lines (\n\n+) and paragraphs are
 *      packed greedily: a paragraph is appended to the current chunk while
 *      `current + "\n\n" + paragraph` still fits in maxChars.
 *   3. On overflow the current chunk is flushed and the next chunk starts
 *      with the last `overlapChars` characters of the flushed one, followed
 *      by the paragraph that did not fit.
 *   4. Any chunk still longer than maxChars (a single huge paragraph, or
 *      overlap + paragraph exceeding the limit) is cut at character positions
 *      with a stride of `maxChars - overlapChars` and tagged strategy 'forced'.
 *
 * Edge cases handled explicitly:
 *   - overlapChars <= 0 means "no overlap". (`str.slice(-0)` returns the whole
 *     string, so a zero overlap must never reach slice.)
 *   - The forced split stops as soon as a piece reaches the end of the chunk,
 *     so no trailing piece consisting only of already-covered text is emitted.
 *   - Chunks that are empty after trimming are never emitted by the packing
 *     stage.
 *
 * @param {string} text - Text to split.
 * @param {string} [strategy='paragraph'] - Label copied onto non-forced
 *   chunks; it does not alter how the text is split.
 * @param {number} [maxChars] - Maximum characters per chunk.
 * @param {number} [overlapChars] - Characters carried over between chunks.
 * @returns {Array<{content: string, chunk_index: number, content_type: string,
 *   strategy: string, overlap_chars: number}>} Chunks with consecutive
 *   chunk_index values starting at 0.
 */
export function chunkText(text, strategy = 'paragraph', maxChars = DEFAULT_MAX_CHUNK_CHARS, overlapChars = DEFAULT_OVERLAP_CHARS) {
    // Short text = one chunk
    if (text.length <= maxChars) {
        return [{
                content: text,
                chunk_index: 0,
                content_type: 'text',
                strategy,
                overlap_chars: 0,
            }];
    }
    // Negative overlap is meaningless and would make the forced-split stride
    // larger than maxChars (dropping text), so treat it as zero.
    const overlap = Math.max(0, overlapChars);
    const paragraphs = text.split(/\n\n+/);
    const rawChunks = [];
    let current = '';
    for (const paragraph of paragraphs) {
        // "+ 2" accounts for the blank-line separator added when appending.
        const overflows = current.length + paragraph.length + 2 > maxChars;
        // The length guard keeps an oversized first paragraph from flushing an
        // empty chunk; the forced split below deals with oversized content.
        if (overflows && current.length > 0) {
            const finished = current.trim();
            if (finished) {
                rawChunks.push(finished);
            }
            // Carry the tail of the flushed chunk into the next one. With no
            // overlap the next chunk starts with the paragraph alone.
            current = overlap > 0
                ? `${current.slice(-overlap)}\n\n${paragraph}`
                : paragraph;
        }
        else {
            current = current ? `${current}\n\n${paragraph}` : paragraph;
        }
    }
    // The loop only flushes on overflow, so the last partially filled chunk
    // is still pending here.
    const remainder = current.trim();
    if (remainder) {
        rawChunks.push(remainder);
    }
    // Stride for the forced split; must stay positive even when
    // overlap >= maxChars.
    const step = Math.max(1, maxChars - overlap);
    const result = [];
    let finalIndex = 0;
    for (const chunk of rawChunks) {
        if (chunk.length <= maxChars) {
            result.push({
                content: chunk,
                chunk_index: finalIndex,
                content_type: 'text',
                strategy,
                overlap_chars: finalIndex > 0 ? overlap : 0,
            });
            finalIndex++;
            continue;
        }
        // Last resort for text without usable paragraph breaks: cut at
        // character positions.
        for (let start = 0; start < chunk.length; start += step) {
            result.push({
                content: chunk.slice(start, start + maxChars),
                chunk_index: finalIndex,
                content_type: 'text',
                strategy: 'forced',
                overlap_chars: start > 0 ? overlap : 0,
            });
            finalIndex++;
            // This piece already reaches the end of the chunk; another
            // iteration would only repeat its tail.
            if (start + maxChars >= chunk.length) {
                break;
            }
        }
    }
    return result;
}
|
|
133
|
+
// =============================================================================
|
|
134
|
+
// API functions — call OpenAI and/or database
|
|
135
|
+
// =============================================================================
|
|
136
|
+
/**
 * Request an embedding vector for a text from the embeddings API.
 *
 * Sends one `embeddings.create` request using the module's EMBEDDING_MODEL
 * and returns the embedding of the first item in the response.
 *
 * @param {object} openai - Client exposing `embeddings.create(...)`.
 * @param {string} text - Text to embed.
 * @returns {Promise<number[]>} The embedding of the first response item.
 */
export async function generateEmbedding(openai, text) {
    const request = {
        model: EMBEDDING_MODEL,
        input: text,
    };
    const { data } = await openai.embeddings.create(request);
    return data[0].embedding;
}
|
|
148
|
+
/**
 * Get an embedding for a search query, using the query_cache table to avoid
 * repeated embedding API calls.
 *
 * Flow:
 *   1. Look up the normalized (lower-cased, trimmed) query text in query_cache.
 *   2. Cache hit: bump hit_count, refresh last_used_at, return the cached
 *      embedding (parsed back into an array).
 *   3. Cache miss: generate the embedding from the ORIGINAL query text, store
 *      it under the normalized key, return it.
 *
 * The cache is best-effort: errors reported by the lookup, the stats update
 * and the insert are not inspected, so a failing cache never blocks a search.
 * The hit_count update is a read-then-write and may under-count when the same
 * query is served concurrently.
 *
 * @param {{supabase: object, openai: object}} clients - Database and
 *   embedding API clients.
 * @param {string} query - Search query as typed by the user.
 * @returns {Promise<number[]>} Embedding vector for the query.
 */
export async function getOrCacheQueryEmbedding(clients, query) {
    // Normalize so capitalization/whitespace differences share one cache entry.
    const normalizedQuery = query.toLowerCase().trim();
    const { data: cached } = await clients.supabase
        .from('query_cache')
        .select('embedding, hit_count')
        .eq('query_text', normalizedQuery)
        .single();
    if (cached?.embedding) {
        // Update cache stats
        await clients.supabase
            .from('query_cache')
            .update({
            hit_count: cached.hit_count + 1,
            last_used_at: new Date().toISOString(),
        })
            .eq('query_text', normalizedQuery);
        return parseVector(cached.embedding);
    }
    // Embed the original text (preserves meaning) but key the cache entry by
    // the normalized text.
    const embedding = await generateEmbedding(clients.openai, query);
    await clients.supabase
        .from('query_cache')
        .insert({
        query_text: normalizedQuery,
        embedding: toVectorString(embedding),
        // Derived from the model actually used, so the recorded id cannot
        // drift from EMBEDDING_MODEL.
        embedding_model_id: `openai/${EMBEDDING_MODEL}`,
    });
    return embedding;
}
|
package/dist/lib/errors.js
CHANGED
|
@@ -11,11 +11,13 @@ export var ExitCode;
|
|
|
11
11
|
ExitCode[ExitCode["SUCCESS"] = 0] = "SUCCESS";
|
|
12
12
|
ExitCode[ExitCode["GENERAL_ERROR"] = 1] = "GENERAL_ERROR";
|
|
13
13
|
ExitCode[ExitCode["FILE_NOT_FOUND"] = 2] = "FILE_NOT_FOUND";
|
|
14
|
-
ExitCode[ExitCode["
|
|
14
|
+
ExitCode[ExitCode["DOCUMENT_NOT_FOUND"] = 3] = "DOCUMENT_NOT_FOUND";
|
|
15
15
|
ExitCode[ExitCode["SUPABASE_ERROR"] = 4] = "SUPABASE_ERROR";
|
|
16
16
|
ExitCode[ExitCode["EMBEDDING_ERROR"] = 5] = "EMBEDDING_ERROR";
|
|
17
17
|
ExitCode[ExitCode["CONFLICT"] = 6] = "CONFLICT";
|
|
18
18
|
ExitCode[ExitCode["INVALID_INPUT"] = 7] = "INVALID_INPUT";
|
|
19
|
+
ExitCode[ExitCode["PROTECTED"] = 8] = "PROTECTED";
|
|
20
|
+
ExitCode[ExitCode["VERIFY_MISMATCH"] = 9] = "VERIFY_MISMATCH";
|
|
19
21
|
})(ExitCode || (ExitCode = {}));
|
|
20
22
|
export function fatal(message, code = ExitCode.GENERAL_ERROR) {
|
|
21
23
|
console.error(message);
|
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
// eval-advanced.ts
|
|
2
|
+
// Advanced eval utilities — confidence intervals via bootstrap resampling,
|
|
3
|
+
// score calibration for relevant vs irrelevant result distributions.
|
|
4
|
+
// Pure functions — no I/O, no database calls.
|
|
5
|
+
import { computeMetrics, HIT_THRESHOLD } from './eval.js';
|
|
6
|
+
// =============================================================================
|
|
7
|
+
// resampleWithReplacement
|
|
8
|
+
// =============================================================================
|
|
9
|
+
/**
 * Draw a bootstrap sample: a new array with as many entries as the input,
 * each picked independently and uniformly at random from the input. The same
 * item may be picked several times; the input array is not modified.
 *
 * @param {Array} results - Items to sample from.
 * @returns {Array} A new array of the same length as `results`.
 */
export function resampleWithReplacement(results) {
    const size = results.length;
    return Array.from({ length: size }, () => results[Math.floor(Math.random() * size)]);
}
|
|
22
|
+
// =============================================================================
|
|
23
|
+
// percentile
|
|
24
|
+
// =============================================================================
|
|
25
|
+
/**
 * Computes a percentile of an ascending-sorted array using linear
 * interpolation between the two nearest ranks.
 *
 * fraction=0.025 gives the 2.5th percentile; fraction=0.975 gives the 97.5th.
 * Fractions outside [0, 1] are clamped to that range, so the result is always
 * an element of the array or a value between two neighbours (previously an
 * out-of-range fraction indexed past the array and produced undefined/NaN).
 *
 * @param {number[]} sortedValues - Values already sorted ascending.
 * @param {number} fraction - Percentile as a fraction in [0, 1].
 * @returns {number} The interpolated percentile; 0 for an empty array.
 */
export function percentile(sortedValues, fraction) {
    const count = sortedValues.length;
    if (count === 0)
        return 0;
    if (count === 1)
        return sortedValues[0];
    const clampedFraction = Math.min(1, Math.max(0, fraction));
    // Fractional rank into the array: 0 → first element, 1 → last element.
    const position = clampedFraction * (count - 1);
    const lowerIndex = Math.floor(position);
    const upperIndex = Math.ceil(position);
    if (lowerIndex === upperIndex)
        return sortedValues[lowerIndex];
    const lowerValue = sortedValues[lowerIndex];
    const upperValue = sortedValues[upperIndex];
    const weight = position - lowerIndex;
    return lowerValue + weight * (upperValue - lowerValue);
}
|
|
46
|
+
// =============================================================================
|
|
47
|
+
// buildInterval
|
|
48
|
+
// =============================================================================
|
|
49
|
+
/**
 * Turn a set of bootstrap values into a 95% interval.
 *
 * The values are copied and sorted ascending (the input is left untouched);
 * the 2.5th and 97.5th percentiles become the lower and upper bounds.
 *
 * @param {number[]} bootstrapValues - Metric values from bootstrap rounds.
 * @param {number} pointEstimate - Metric value computed on the real data.
 * @returns {{point: number, lower: number, upper: number, width: number}}
 *   The point estimate, both bounds, and `upper - lower` as width.
 */
export function buildInterval(bootstrapValues, pointEstimate) {
    const ascending = Array.from(bootstrapValues);
    ascending.sort((left, right) => left - right);
    const lowerBound = percentile(ascending, 0.025);
    const upperBound = percentile(ascending, 0.975);
    const interval = {
        point: pointEstimate,
        lower: lowerBound,
        upper: upperBound,
        width: upperBound - lowerBound,
    };
    return interval;
}
|
|
65
|
+
// =============================================================================
|
|
66
|
+
// computeConfidenceIntervals
|
|
67
|
+
// =============================================================================
|
|
68
|
+
/**
 * Bootstrap 95% confidence intervals for every eval metric.
 *
 * Steps:
 *   1. Compute the point estimates on the real results with computeMetrics().
 *   2. Run `iterations` rounds; each round resamples the results with
 *      replacement and recomputes the metrics on the resample.
 *   3. For each metric, build an interval from the values collected across
 *      the rounds together with its point estimate.
 *
 * @param {Array} results - Eval results accepted by computeMetrics().
 * @param {number} [iterations=1000] - Number of bootstrap rounds.
 * @returns {object} One interval per metric, keyed hitRate,
 *   firstResultAccuracy, recall, zeroResultRate, meanReciprocalRank and
 *   normalizedDiscountedCumulativeGain.
 */
export function computeConfidenceIntervals(results, iterations = 1000) {
    const metricNames = [
        'hitRate',
        'firstResultAccuracy',
        'recall',
        'zeroResultRate',
        'meanReciprocalRank',
        'normalizedDiscountedCumulativeGain',
    ];
    const pointMetrics = computeMetrics(results);
    // One bucket of bootstrap values per metric.
    const samplesByMetric = new Map(metricNames.map((metricName) => [metricName, []]));
    for (let round = 0; round < iterations; round++) {
        const roundMetrics = computeMetrics(resampleWithReplacement(results));
        for (const metricName of metricNames) {
            samplesByMetric.get(metricName).push(roundMetrics[metricName]);
        }
    }
    const intervals = {};
    for (const metricName of metricNames) {
        intervals[metricName] = buildInterval(samplesByMetric.get(metricName), pointMetrics[metricName]);
    }
    return intervals;
}
|
|
104
|
+
// =============================================================================
|
|
105
|
+
// computeDistribution
|
|
106
|
+
// =============================================================================
|
|
107
|
+
/**
 * Summarise a list of numeric scores.
 *
 * An empty list yields all zeros. Otherwise the scores are copied and sorted
 * ascending (the input is not modified); the median is the middle value for
 * an odd count and the mean of the two middle values for an even count.
 *
 * @param {number[]} scores - Scores to summarise.
 * @returns {{count: number, mean: number, median: number, min: number,
 *   max: number}} Summary statistics.
 */
export function computeDistribution(scores) {
    if (scores.length === 0) {
        return { count: 0, mean: 0, median: 0, min: 0, max: 0 };
    }
    const ascending = [...scores].sort((left, right) => left - right);
    const count = ascending.length;
    let total = 0;
    for (const value of ascending) {
        total = total + value;
    }
    const half = Math.floor(count / 2);
    let median;
    if (count % 2 === 1) {
        median = ascending[half];
    }
    else {
        median = (ascending[half - 1] + ascending[half]) / 2;
    }
    return {
        count,
        mean: total / count,
        median,
        min: ascending[0],
        max: ascending[count - 1],
    };
}
|
|
133
|
+
// =============================================================================
|
|
134
|
+
// computeScoreCalibration
|
|
135
|
+
// =============================================================================
|
|
136
|
+
/**
 * Compare the scores of relevant and irrelevant returned documents.
 *
 * For each result, a returned document counts as relevant when the test case
 * has a judgment for it with grade >= HIT_THRESHOLD; every other returned
 * document counts as irrelevant. Results whose test case has no judgment at
 * that grade (out-of-scope queries) contribute nothing to either bucket.
 *
 * @param {Array} results - Eval results; each provides testCase.judgments,
 *   returnedIds and returnedScores (scores are read by the same index as ids).
 * @returns {{relevantScores: object, irrelevantScores: object,
 *   separation: number}} Distribution stats for both buckets, and
 *   separation = relevant mean − irrelevant mean.
 */
export function computeScoreCalibration(results) {
    const relevantBucket = [];
    const irrelevantBucket = [];
    for (const { testCase, returnedIds, returnedScores } of results) {
        const relevantIds = new Set();
        for (const judgment of testCase.judgments) {
            if (judgment.grade >= HIT_THRESHOLD) {
                relevantIds.add(judgment.document_id);
            }
        }
        // Out-of-scope query: nothing could have been relevant, skip it.
        if (relevantIds.size === 0) {
            continue;
        }
        for (const [position, docId] of returnedIds.entries()) {
            const bucket = relevantIds.has(docId) ? relevantBucket : irrelevantBucket;
            bucket.push(returnedScores[position]);
        }
    }
    const relevantScores = computeDistribution(relevantBucket);
    const irrelevantScores = computeDistribution(irrelevantBucket);
    const separation = relevantScores.mean - irrelevantScores.mean;
    return { relevantScores, irrelevantScores, separation };
}
|
|
171
|
+
// =============================================================================
|
|
172
|
+
// computeCoverageAnalysis
|
|
173
|
+
// =============================================================================
|
|
174
|
+
/**
 * Analyses golden set coverage — which parts of the knowledge base are
 * well-tested vs blind spots.
 *
 * - Normal queries: at least one judgment at grade >= HIT_THRESHOLD
 * - Out-of-scope queries: no judgments at grade >= HIT_THRESHOLD
 * - queriesPerTag: how many queries carry each tag
 * - expectedDocumentIds: deduplicated union of all grade>=HIT_THRESHOLD doc
 *   ids, sorted with a numeric comparator
 *
 * Tag counts are accumulated in a Map and converted to a plain object at the
 * end, so tags that collide with Object.prototype members ("constructor",
 * "toString", "__proto__") are counted like any other tag instead of
 * producing garbage values or being dropped.
 *
 * @param {Array} results - Eval results; each provides testCase.judgments
 *   and testCase.tags.
 * @returns {{totalQueries: number, normalCount: number,
 *   outOfScopeCount: number, totalTags: number, queriesPerTag: object,
 *   uniqueExpectedDocuments: number, expectedDocumentIds: Array}}
 */
export function computeCoverageAnalysis(results) {
    let normalCount = 0;
    let outOfScopeCount = 0;
    const tagCounts = new Map();
    const seenDocumentIds = new Set();
    for (const result of results) {
        const relevantJudgments = result.testCase.judgments.filter(judgment => judgment.grade >= HIT_THRESHOLD);
        if (relevantJudgments.length === 0) {
            outOfScopeCount++;
        }
        else {
            normalCount++;
        }
        for (const tag of result.testCase.tags) {
            // Key by string, matching how object property keys coerce.
            const tagKey = String(tag);
            tagCounts.set(tagKey, (tagCounts.get(tagKey) ?? 0) + 1);
        }
        for (const judgment of relevantJudgments) {
            seenDocumentIds.add(judgment.document_id);
        }
    }
    // Object.fromEntries defines own properties, so even "__proto__" becomes
    // a regular key on the returned plain object.
    const queriesPerTag = Object.fromEntries(tagCounts);
    // NOTE(review): the numeric comparator assumes document ids are numbers —
    // confirm; string ids would not be ordered by it.
    const expectedDocumentIds = [...seenDocumentIds].sort((documentIdA, documentIdB) => documentIdA - documentIdB);
    return {
        totalQueries: results.length,
        normalCount,
        outOfScopeCount,
        totalTags: Object.keys(queriesPerTag).length,
        queriesPerTag,
        uniqueExpectedDocuments: expectedDocumentIds.length,
        expectedDocumentIds,
    };
}
|
|
214
|
+
// =============================================================================
|
|
215
|
+
// formatAdvancedReport
|
|
216
|
+
// =============================================================================
|
|
217
|
+
/**
 * Renders the advanced evaluation results as a plain-text report with three
 * banner-delimited sections: confidence intervals, score calibration, and
 * coverage analysis.
 *
 * - Percentage metrics (hitRate, firstResultAccuracy, recall, zeroResultRate)
 *   are printed with one decimal, e.g. 88.5% (±4.2%, 95% CI: 84.3–92.7%)
 * - Ratio metrics (meanReciprocalRank, normalizedDiscountedCumulativeGain)
 *   are printed with three decimals, e.g. 0.601 (±0.052, 95% CI: 0.549–0.653)
 * - Tags are listed by descending query count; a tag with fewer than 3
 *   queries is flagged as undertested.
 *
 * @param {object} intervals - Per-metric intervals, each with `point`, `lower`, `upper`, `width`.
 * @param {object} calibration - `relevantScores` / `irrelevantScores` (each with
 *   `count`, `mean`, `median`, `min`, `max`) plus a numeric `separation`.
 * @param {object} coverage - Coverage summary: `totalQueries`, `normalCount`,
 *   `outOfScopeCount`, `uniqueExpectedDocuments`, `totalTags`, `queriesPerTag`.
 * @returns {string} The report, lines joined with '\n' (ends with a trailing newline).
 */
export function formatAdvancedReport(intervals, calibration, coverage) {
    const rule = '='.repeat(60);
    // Every section opens with the same rule / title / rule / blank-line header.
    const sectionHeader = (title) => [rule, title, rule, ''];
    // The "±" figure is half of the full interval width.
    const describePercent = (interval) => {
        const margin = interval.width / 2;
        return `${interval.point.toFixed(1)}% (±${margin.toFixed(1)}%, 95% CI: ${interval.lower.toFixed(1)}–${interval.upper.toFixed(1)}%)`;
    };
    const describeRatio = (interval) => {
        const margin = interval.width / 2;
        return `${interval.point.toFixed(3)} (±${margin.toFixed(3)}, 95% CI: ${interval.lower.toFixed(3)}–${interval.upper.toFixed(3)})`;
    };
    const describeScores = (stats) => ` mean: ${stats.mean.toFixed(3)}, median: ${stats.median.toFixed(3)}, range: [${stats.min.toFixed(3)}–${stats.max.toFixed(3)}]`;
    // One line per tag, most-queried first; falls back to a placeholder line.
    const renderTagLines = () => {
        const ranked = Object.entries(coverage.queriesPerTag).sort((entryA, entryB) => entryB[1] - entryA[1]);
        if (ranked.length === 0) {
            return [' (no tags)'];
        }
        return ranked.map(([tag, count]) => {
            const marker = count < 3 ? ' ← undertested' : '';
            return ` ${tag}: ${count}${marker}`;
        });
    };
    const report = [
        // Section 1 — CONFIDENCE INTERVALS
        ...sectionHeader('CONFIDENCE INTERVALS (95%, bootstrap)'),
        ` Hit rate: ${describePercent(intervals.hitRate)}`,
        ` First-result accuracy: ${describePercent(intervals.firstResultAccuracy)}`,
        ` Recall: ${describePercent(intervals.recall)}`,
        ` Zero-result rate: ${describePercent(intervals.zeroResultRate)}`,
        ` MRR: ${describeRatio(intervals.meanReciprocalRank)}`,
        ` NDCG@k: ${describeRatio(intervals.normalizedDiscountedCumulativeGain)}`,
        '',
        // Section 2 — SCORE CALIBRATION
        ...sectionHeader('SCORE CALIBRATION'),
        ` Relevant scores (n=${calibration.relevantScores.count}):`,
        describeScores(calibration.relevantScores),
        '',
        ` Irrelevant scores (n=${calibration.irrelevantScores.count}):`,
        describeScores(calibration.irrelevantScores),
        '',
        ` separation: ${calibration.separation.toFixed(3)} (higher = better distinction between relevant and irrelevant)`,
        '',
        // Section 3 — COVERAGE ANALYSIS
        ...sectionHeader('COVERAGE ANALYSIS'),
        ` Total queries: ${coverage.totalQueries} (${coverage.normalCount} normal, ${coverage.outOfScopeCount} out-of-scope)`,
        ` Unique docs: ${coverage.uniqueExpectedDocuments}`,
        ` Tags covered: ${coverage.totalTags}`,
        '',
        ...renderTagLines(),
        '',
    ];
    return report.join('\n');
}
|