@aperdomoll90/ledger-ai 1.4.0 → 1.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. package/dist/cli.js +177 -221
  2. package/dist/commands/add.js +51 -100
  3. package/dist/commands/backfill.js +55 -0
  4. package/dist/commands/backup.js +10 -10
  5. package/dist/commands/check.js +21 -29
  6. package/dist/commands/config.js +13 -12
  7. package/dist/commands/delete.js +22 -17
  8. package/dist/commands/eval-judge.js +11 -0
  9. package/dist/commands/eval.js +321 -0
  10. package/dist/commands/export.js +8 -10
  11. package/dist/commands/get.js +9 -0
  12. package/dist/commands/hunt.js +206 -0
  13. package/dist/commands/ingest.js +15 -14
  14. package/dist/commands/init.js +18 -20
  15. package/dist/commands/list.js +21 -7
  16. package/dist/commands/migrate.js +11 -11
  17. package/dist/commands/onboard.js +2 -2
  18. package/dist/commands/pull.js +3 -2
  19. package/dist/commands/push.js +8 -8
  20. package/dist/commands/restore.js +38 -38
  21. package/dist/commands/show.js +13 -16
  22. package/dist/commands/sync.js +58 -19
  23. package/dist/commands/tag.js +20 -14
  24. package/dist/commands/update.js +50 -18
  25. package/dist/commands/wizard.js +3 -3
  26. package/dist/lib/ai-search.js +163 -0
  27. package/dist/lib/audit.js +19 -0
  28. package/dist/lib/backfill.js +60 -0
  29. package/dist/lib/config.js +19 -2
  30. package/dist/lib/document-classification.js +5 -0
  31. package/dist/lib/document-fetching.js +77 -0
  32. package/dist/lib/document-operations.js +150 -0
  33. package/dist/lib/documents/classification.js +5 -0
  34. package/dist/lib/documents/fetching.js +89 -0
  35. package/dist/lib/documents/operations.js +304 -0
  36. package/dist/lib/domains.js +116 -0
  37. package/dist/lib/embeddings.js +190 -0
  38. package/dist/lib/errors.js +3 -1
  39. package/dist/lib/eval/eval-advanced.js +289 -0
  40. package/dist/lib/eval/eval-judge-session.js +233 -0
  41. package/dist/lib/eval/eval-store.js +105 -0
  42. package/dist/lib/eval/eval.js +303 -0
  43. package/dist/lib/file-writer.js +23 -0
  44. package/dist/lib/generators.js +44 -45
  45. package/dist/lib/hunter-db.js +235 -0
  46. package/dist/lib/hunter-rss.js +30 -0
  47. package/dist/lib/hunter-scoring.js +55 -0
  48. package/dist/lib/hunter-types.js +36 -0
  49. package/dist/lib/lint-configs.js +20 -0
  50. package/dist/lib/migrate.js +2 -2
  51. package/dist/lib/notes.js +173 -59
  52. package/dist/lib/observability.js +296 -0
  53. package/dist/lib/op-add-note-types.test.js +7 -6
  54. package/dist/lib/prompt.js +8 -8
  55. package/dist/lib/rate-limiter.js +103 -0
  56. package/dist/lib/search/ai-search.js +396 -0
  57. package/dist/lib/search/chunk-context-enrichment.js +155 -0
  58. package/dist/lib/search/embeddings.js +293 -0
  59. package/dist/lib/search/reranker.js +120 -0
  60. package/dist/lib/search/semantic-cache.js +53 -0
  61. package/dist/lib/type-registry.test.js +6 -6
  62. package/dist/mcp-server.js +553 -66
  63. package/dist/migrations/migrations/005-audit-log.sql +22 -0
  64. package/dist/migrations/migrations/005_opportunities.sql +48 -0
  65. package/dist/migrations/migrations/006-audited-operations.sql +235 -0
  66. package/dist/migrations/migrations/006_hunt_analytics.sql +38 -0
  67. package/dist/migrations/migrations/007-eval-golden-judgments.sql +119 -0
  68. package/dist/migrations/migrations/008-drop-expected-doc-ids.sql +9 -0
  69. package/dist/migrations/migrations/008-judge-helpers.sql +21 -0
  70. package/dist/migrations/migrations/009-semantic-cache.sql +216 -0
  71. package/dist/scripts/batch-grade.js +344 -0
  72. package/dist/scripts/benchmark-ingestion.js +376 -0
  73. package/dist/scripts/convert-judgments-to-graded.js +88 -0
  74. package/dist/scripts/diagnose-first-result.js +333 -0
  75. package/dist/scripts/drop-golden-query.js +53 -0
  76. package/dist/scripts/eval-search.js +115 -0
  77. package/dist/scripts/grade-unjudged-top1.js +138 -0
  78. package/dist/scripts/hunter-analytics.js +38 -0
  79. package/dist/scripts/hunter-cron.js +63 -0
  80. package/dist/scripts/hunter-purge.js +25 -0
  81. package/dist/scripts/migrate-v2.js +140 -0
  82. package/dist/scripts/reindex.js +74 -0
  83. package/dist/scripts/sync-local-docs.js +153 -0
  84. package/package.json +7 -1
@@ -0,0 +1,216 @@
1
+ -- Migration 009: Semantic Cache
2
+ -- Layer 2 cache: stores full search results keyed by query embedding.
3
+ -- Skips the full search pipeline for semantically similar queries.
4
+ --
5
+ -- Components:
6
+ -- 1. semantic_cache table with HNSW, GIN, and BTREE indexes
7
+ -- 2. semantic_cache_lookup: find cached results by vector similarity
8
+ -- 3. semantic_cache_store: save search results to cache
9
+ -- 4. semantic_cache_cleanup: purge expired entries
10
+ -- 5. Invalidation added to document_update and document_delete
11
+
12
+ -- =============================================================================
13
+ -- 1. Table
14
+ -- =============================================================================
15
+
16
-- One row per cached search: the query, its embedding, the exact search
-- settings used, the serialized results, and the ids of the documents that
-- contributed to those results.
CREATE TABLE semantic_cache (
  id                 bigserial    PRIMARY KEY,
  query_text         text         NOT NULL,
  query_embedding    vector(1536) NOT NULL,
  search_mode        text         NOT NULL
                     CHECK (search_mode IN ('vector', 'keyword', 'hybrid')),
  search_params      jsonb        NOT NULL,
  cached_results     jsonb        NOT NULL,
  source_doc_ids     int[]        NOT NULL,
  embedding_model_id text         NOT NULL,
  created_at         timestamptz  NOT NULL DEFAULT now(),
  -- Entries expire seven days after insertion unless a later expiry is supplied.
  expires_at         timestamptz  NOT NULL DEFAULT (now() + interval '7 days')
);

-- Row level security: the only policy on this table admits the service_role.
ALTER TABLE semantic_cache ENABLE ROW LEVEL SECURITY;

CREATE POLICY semantic_cache_service_role
  ON semantic_cache
  FOR ALL
  USING (auth.role() = 'service_role');
33
+
34
+ -- =============================================================================
35
+ -- 2. Indexes
36
+ -- =============================================================================
37
+
38
-- Approximate nearest-neighbour search over cached query embeddings,
-- using the cosine operator class.
CREATE INDEX idx_semantic_cache_embedding ON semantic_cache
  USING hnsw (query_embedding vector_cosine_ops)
  WITH (m = 16, ef_construction = 128);

-- Containment lookups (source_doc_ids @> ARRAY[doc_id]) used to invalidate
-- cache entries when a contributing document changes.
CREATE INDEX idx_semantic_cache_source_doc_ids ON semantic_cache
  USING gin (source_doc_ids);

-- Range scans on expiry time for TTL cleanup (expires_at < now()).
CREATE INDEX idx_semantic_cache_expires_at ON semantic_cache
  USING btree (expires_at);
50
+
51
+ -- =============================================================================
52
+ -- 3. semantic_cache_lookup
53
+ -- =============================================================================
54
+
55
CREATE OR REPLACE FUNCTION semantic_cache_lookup(
  p_query_embedding vector(1536),
  p_search_mode text,
  p_search_params jsonb,
  p_embedding_model_id text,
  p_similarity_threshold float DEFAULT 0.90
) RETURNS jsonb LANGUAGE plpgsql AS $$
-- Returns cached_results from the closest unexpired cache entry that was
-- stored with the same search mode, search params and embedding model, and
-- whose similarity (1 - distance) to p_query_embedding reaches
-- p_similarity_threshold. Returns NULL when no entry qualifies.
BEGIN
  RETURN (
    SELECT sc.cached_results
    FROM semantic_cache AS sc
    WHERE sc.expires_at > now()
      AND sc.embedding_model_id = p_embedding_model_id
      AND sc.search_mode = p_search_mode
      AND sc.search_params = p_search_params
      AND 1 - (sc.query_embedding <=> p_query_embedding) >= p_similarity_threshold
    -- Smallest distance first, so the single row kept is the best match.
    ORDER BY sc.query_embedding <=> p_query_embedding
    LIMIT 1
  );
END;
$$;
78
+
79
+ -- =============================================================================
80
+ -- 4. semantic_cache_store
81
+ -- =============================================================================
82
+
83
CREATE OR REPLACE FUNCTION semantic_cache_store(
  p_query_text text,
  p_query_embedding vector(1536),
  p_search_mode text,
  p_search_params jsonb,
  p_cached_results jsonb,
  p_source_doc_ids int[],
  p_embedding_model_id text
) RETURNS void LANGUAGE plpgsql AS $$
-- Appends one cache entry. id, created_at and expires_at are not supplied
-- here, so they take the table's column defaults.
BEGIN
  INSERT INTO semantic_cache (
    query_text,
    query_embedding,
    search_mode,
    search_params,
    cached_results,
    source_doc_ids,
    embedding_model_id
  )
  SELECT
    p_query_text,
    p_query_embedding,
    p_search_mode,
    p_search_params,
    p_cached_results,
    p_source_doc_ids,
    p_embedding_model_id;
END;
$$;
102
+
103
+ -- =============================================================================
104
+ -- 5. semantic_cache_cleanup
105
+ -- =============================================================================
106
+
107
CREATE OR REPLACE FUNCTION semantic_cache_cleanup()
RETURNS int LANGUAGE plpgsql AS $$
-- Deletes every cache entry whose expires_at lies in the past and returns
-- the number of rows removed.
DECLARE
  v_removed int;
BEGIN
  WITH purged AS (
    DELETE FROM semantic_cache
    WHERE expires_at < now()
    RETURNING 1
  )
  SELECT count(*)::int INTO v_removed FROM purged;

  RETURN v_removed;
END;
$$;
117
+
118
+ -- =============================================================================
119
+ -- 6. Invalidation: add cache clearing to document_update
120
+ -- =============================================================================
121
+
122
CREATE OR REPLACE FUNCTION public.document_update(
  p_id bigint, p_content text, p_content_hash text,
  p_agent text DEFAULT NULL, p_description text DEFAULT NULL,
  p_status text DEFAULT NULL, p_embedding_model_id text DEFAULT NULL,
  p_chunk_contents text[] DEFAULT NULL, p_chunk_embeddings vector[] DEFAULT NULL,
  p_chunk_strategy text DEFAULT 'recursive',
  p_chunk_summaries text[] DEFAULT NULL,
  p_chunk_token_counts int[] DEFAULT NULL,
  p_chunk_overlap int DEFAULT 0
) RETURNS void LANGUAGE plpgsql AS $$
-- Replaces the content of a live (not soft-deleted) document.
--
-- Steps, in order:
--   1. Load the current content/domain; raise if the document does not exist.
--   2. Drop semantic cache entries whose source_doc_ids contain this document.
--   3. Snapshot the previous content as the next row in document_versions.
--   4. Update the documents row (NULL optional parameters keep the old value).
--   5. When p_chunk_contents is not NULL, replace all chunks and chunk_count.
--   6. Write an 'update' row to audit_log carrying the previous content.
--
-- Raises: 'Document % not found' when no live document has id = p_id.
DECLARE
  v_old_content text;
  v_old_domain text;
  v_version_num int;
  v_chunk_total int;
BEGIN
  SELECT content, domain INTO v_old_content, v_old_domain
  FROM documents WHERE id = p_id AND deleted_at IS NULL;
  IF NOT FOUND THEN RAISE EXCEPTION 'Document % not found', p_id; END IF;

  -- Invalidate semantic cache entries that included this document.
  -- source_doc_ids is int[], so an id outside the int4 range can never be
  -- stored there; skipping the delete in that case avoids the
  -- "integer out of range" error the p_id::int cast would otherwise raise.
  IF p_id BETWEEN -2147483648 AND 2147483647 THEN
    DELETE FROM semantic_cache WHERE source_doc_ids @> ARRAY[p_id::int];
  END IF;

  SELECT COALESCE(MAX(version_number), 0) + 1 INTO v_version_num
  FROM document_versions WHERE document_id = p_id;

  -- The version row stores the content being replaced, hashed here rather
  -- than trusting a stored hash.
  INSERT INTO document_versions (document_id, version_number, content, content_hash, agent)
  VALUES (p_id, v_version_num, v_old_content, encode(digest(v_old_content, 'sha256'), 'hex'), COALESCE(p_agent, 'unknown'));

  UPDATE documents SET
    content = p_content, content_hash = p_content_hash,
    agent = COALESCE(p_agent, agent), description = COALESCE(p_description, description),
    status = COALESCE(p_status, status), embedding_model_id = COALESCE(p_embedding_model_id, embedding_model_id)
  WHERE id = p_id;

  IF p_chunk_contents IS NOT NULL THEN
    -- array_length() yields NULL (not 0) for an empty array; a NULL loop
    -- bound is a runtime error in PL/pgSQL, so normalise it to 0. An empty
    -- array then simply clears the chunks and records chunk_count = 0.
    v_chunk_total := COALESCE(array_length(p_chunk_contents, 1), 0);

    DELETE FROM document_chunks WHERE document_id = p_id;
    FOR i IN 1..v_chunk_total LOOP
      INSERT INTO document_chunks (
        document_id, chunk_index, content, domain, embedding,
        embedding_model_id, chunk_strategy, context_summary, token_count, overlap_chars
      )
      VALUES (
        -- chunk_index is zero-based while the parameter arrays are one-based.
        p_id, i - 1, p_chunk_contents[i], v_old_domain, p_chunk_embeddings[i],
        p_embedding_model_id, p_chunk_strategy,
        CASE WHEN p_chunk_summaries IS NOT NULL THEN p_chunk_summaries[i] ELSE NULL END,
        CASE WHEN p_chunk_token_counts IS NOT NULL THEN p_chunk_token_counts[i] ELSE NULL END,
        p_chunk_overlap
      );
    END LOOP;
    UPDATE documents SET chunk_count = v_chunk_total WHERE id = p_id;
  END IF;

  INSERT INTO audit_log (document_id, domain, operation, agent, diff, created_at)
  VALUES (p_id, v_old_domain, 'update', COALESCE(p_agent, 'unknown'), jsonb_build_object('content', v_old_content), now());
END;
$$;
179
+
180
+ -- =============================================================================
181
+ -- 7. Invalidation: add cache clearing to document_delete
182
+ -- =============================================================================
183
+
184
CREATE OR REPLACE FUNCTION public.document_delete(p_id bigint, p_agent text)
RETURNS void LANGUAGE plpgsql AS $$
-- Soft-deletes a live document.
--
-- Steps, in order:
--   1. Load the content, domain and a JSON snapshot of the document's
--      metadata columns; raise if no live document has id = p_id.
--   2. Drop semantic cache entries whose source_doc_ids contain this document.
--   3. Write a 'delete' row to audit_log holding the content and snapshot.
--   4. Set deleted_at on the documents row and remove its chunks.
--
-- Raises: 'Document % not found' when the document is missing or already
-- soft-deleted.
DECLARE
  v_content text;
  v_domain text;
  v_fields jsonb;
BEGIN
  SELECT content, domain,
    jsonb_build_object(
      'name', name, 'domain', domain, 'document_type', document_type,
      'project', project, 'protection', protection,
      'description', description, 'agent', agent, 'status', status,
      'file_path', file_path, 'file_permissions', file_permissions,
      'skill_ref', skill_ref, 'owner_type', owner_type, 'owner_id', owner_id,
      'is_auto_load', is_auto_load, 'source_type', source_type,
      'source_url', source_url, 'embedding_model_id', embedding_model_id,
      'content_hash', content_hash, 'schema_version', schema_version,
      'created_at', created_at
    )
  INTO v_content, v_domain, v_fields
  FROM documents WHERE id = p_id AND deleted_at IS NULL;
  IF NOT FOUND THEN RAISE EXCEPTION 'Document % not found', p_id; END IF;

  -- Invalidate semantic cache entries that included this document.
  -- source_doc_ids is int[], so an id outside the int4 range can never be
  -- stored there; skipping the delete in that case avoids the
  -- "integer out of range" error the p_id::int cast would otherwise raise.
  IF p_id BETWEEN -2147483648 AND 2147483647 THEN
    DELETE FROM semantic_cache WHERE source_doc_ids @> ARRAY[p_id::int];
  END IF;

  INSERT INTO audit_log (document_id, domain, operation, agent, diff, created_at)
  VALUES (p_id, v_domain, 'delete', p_agent, jsonb_build_object('content', v_content, 'fields', v_fields), now());

  UPDATE documents SET deleted_at = now() WHERE id = p_id;
  DELETE FROM document_chunks WHERE document_id = p_id;
END;
$$;
@@ -0,0 +1,344 @@
1
+ // batch-grade.ts
2
+ // Phase 4.6.2 — Batch grading of top-10 search results for all golden queries.
3
+ // Uses Charlie's corpus knowledge to assign TREC 0-3 grades.
4
+ //
5
+ // Run: npx tsx src/scripts/batch-grade.ts
6
+ // Dry run (print only): npx tsx src/scripts/batch-grade.ts --dry-run
7
+ import 'dotenv/config';
8
+ import { createClient } from '@supabase/supabase-js';
9
+ import OpenAI from 'openai';
10
+ import { searchHybrid } from '../lib/search/ai-search.js';
11
+ import { CURRENT_SEARCH_CONFIG } from '../lib/eval/eval-store.js';
12
+ // =============================================================================
13
+ // Setup
14
+ // =============================================================================
15
// Credentials come from the environment; the script cannot run without all three.
const {
    SUPABASE_URL: supabaseUrl,
    SUPABASE_SERVICE_ROLE_KEY: supabaseKey,
    OPENAI_API_KEY: openaiKey,
} = process.env;
if (!(supabaseUrl && supabaseKey && openaiKey)) {
    console.error('Missing SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY, or OPENAI_API_KEY');
    process.exit(1);
}
const supabase = createClient(supabaseUrl, supabaseKey);
const openai = new OpenAI({ apiKey: openaiKey });
// With --dry-run the grades are printed instead of written to the database.
const dryRun = process.argv.includes('--dry-run');
// Client bundle handed to searchHybrid; no Cohere key is configured here.
const clients = { supabase, openai, cohereApiKey: undefined };
30
+ // =============================================================================
31
+ // Grading logic — maps (query topic, doc identity) to a grade
32
+ // =============================================================================
33
+ // Topic extraction from query text
34
/**
 * Derives a coarse topic descriptor from the text of a golden query.
 *
 * @param {string} query - Raw query text.
 * @returns {{project: (string|null), subject: string, queryType: string}}
 *   `project` is the project scope detected by keyword (or null), `subject`
 *   is the lower-cased query, and `queryType` is one of 'conceptual',
 *   'multi-doc' or 'simple'.
 */
function extractQueryTopic(query) {
    const lowerQuery = query.toLowerCase();
    // Ordered from lowest to highest priority: when a query mentions several
    // keywords, the last matching entry wins.
    const projectKeywords = [
        [['ledger'], 'ledger'],
        [['atelier'], 'atelier'],
        [['starbrite'], 'starbrite'],
        [['css-forge', 'css forge'], 'css-forge'],
        [['adrian'], 'persona'],
    ];
    let project = null;
    for (const [keywords, projectName] of projectKeywords) {
        if (keywords.some((keyword) => lowerQuery.includes(keyword))) {
            project = projectName;
        }
    }
    // Leading whitespace must not hide an interrogative opener, so the prefix
    // test runs on the left-trimmed text.
    const leadingText = lowerQuery.trimStart();
    const interrogatives = ['how', 'what', 'why', 'when'];
    let queryType;
    if (interrogatives.some((word) => leadingText.startsWith(word))) {
        queryType = 'conceptual';
    }
    else if (lowerQuery.includes(' and ') || lowerQuery.includes('all ')) {
        queryType = 'multi-doc';
    }
    else {
        queryType = 'simple';
    }
    return { project, subject: lowerQuery, queryType };
}
61
+ // Core grading function
62
// Filler words ignored when comparing a query against a document name.
const GRADE_QUERY_STOP_WORDS = new Set(['the', 'a', 'an', 'in', 'of', 'for', 'how', 'does', 'do', 'is', 'what', 'are', 'to', 'my', 'i', 'should', 'can', 'when', 'where', 'which', 'about']);

/**
 * Assigns a relevance grade from 0 to 3 to one search result using
 * name-based heuristics.
 *
 * @param {string} query - Original query text (unused; kept for signature compatibility).
 * @param {{project: (string|null), subject: string}} queryTopic - Topic
 *   descriptor; `subject` is the lower-cased query text.
 * @param {*} docId - Document id (unused; kept for signature compatibility).
 * @param {string} docName - Document name; split on '-' into words.
 * @param {string} docDomain - Document domain.
 * @param {string|null} docProject - Project the document belongs to.
 * @returns {number} 0, 1, 2 or 3.
 */
function gradeResult(query, queryTopic, docId, docName, docDomain, docProject) {
    const lowerQuery = queryTopic.subject;
    const lowerName = docName.toLowerCase();
    const queryHas = (needle) => lowerQuery.includes(needle);
    const nameHas = (needle) => lowerName.includes(needle);
    // ==========================================================================
    // Rule 1: Canonical match — doc name closely matches the query subject
    // ==========================================================================
    // Empty tokens are dropped on both sides: ''.includes and includes('') are
    // always true, so a doubled hyphen in a name or padded whitespace in a
    // query would otherwise count as a match for every word.
    const queryWords = lowerQuery
        .replace(/['\u2018\u2019]/g, '')
        .split(/\s+/)
        .filter((word) => word.length > 0 && !GRADE_QUERY_STOP_WORDS.has(word));
    const nameWords = lowerName.split('-').filter((word) => word.length > 0);
    // Share of meaningful query words that overlap (in either direction) with a name word.
    const nameMatchCount = queryWords
        .filter((queryWord) => nameWords.some((nameWord) => nameWord.includes(queryWord) || queryWord.includes(nameWord)))
        .length;
    const nameMatchRatio = queryWords.length > 0 ? nameMatchCount / queryWords.length : 0;
    // ==========================================================================
    // Rule 2: Project scope matching
    // ==========================================================================
    // Unscoped and persona-scoped queries accept a document from any project.
    // NOTE(review): the original expression also carried a clause requiring
    // docDomain === 'persona' for persona queries, but it could never change
    // the result; confirm whether persona queries were meant to be restricted.
    const projectMatches = queryTopic.project === null ||
        queryTopic.project === 'persona' ||
        docProject === queryTopic.project;
    // ==========================================================================
    // Rule 3: Known doc-type patterns
    // ==========================================================================
    const isDevlog = nameHas('devlog');
    const isErrorlog = nameHas('errorlog') || nameHas('error-log');
    const isPhaseSpec = nameHas('-phase-') || nameHas('-v2-phase');
    const isSessionEvent = nameHas('session-');
    const isClaudeMd = nameHas('claude-md');
    const isMemoryMd = nameHas('memory-md');
    const isFeedback = nameHas('feedback-');
    const isLintConfig = nameHas('lint-');
    const isSkillDoc = nameHas('custom-skills-');
    const isAgentSpec = nameHas('atelier-agent-');
    const isCodeCraft = nameHas('code-craft-');
    const isReference = docDomain === 'general' && nameHas('reference-');
    const isExploration = nameHas('exploration-complete');
    const isStatusDashboard = nameHas('status-dashboard') || nameHas('project-status');
    // ==========================================================================
    // Grading decision tree (first matching rule wins)
    // ==========================================================================
    // Strong canonical match: at least 70% of query words match the doc name.
    if (nameMatchRatio >= 0.7 && projectMatches) {
        return 3;
    }
    // Moderate match: at least 50% of query words match; log-like and
    // internal-config documents are capped lower.
    if (nameMatchRatio >= 0.5 && projectMatches) {
        if (isDevlog || isSessionEvent || isErrorlog) {
            return 1;
        }
        if (isClaudeMd || isMemoryMd) {
            return 0;
        }
        return 2;
    }
    // Devlogs count only when the query asks about the devlog itself.
    if (isDevlog && !queryHas('devlog') && !queryHas('development log') && !queryHas('session history')) {
        return 0;
    }
    // claude-md / memory-md documents count only for queries that name them.
    if (isClaudeMd && !queryHas('claude.md') && !queryHas('claude md') && !queryHas('identity') && !queryHas('orchestrator')) {
        return 0;
    }
    if (isMemoryMd && !queryHas('memory')) {
        return 0;
    }
    // Feedback rules count only for behavioral/feedback queries.
    if (isFeedback && !queryHas('feedback') && !queryHas('behavioral') && !queryHas('rule')) {
        return 0;
    }
    // Lint configs count only for linting queries ('eslint' and 'stylelint'
    // both contain 'lint', so one substring test covers all three).
    if (isLintConfig && !queryHas('lint')) {
        return 0;
    }
    // Phase specs: relevant only when the query shares the phase subject.
    if (isPhaseSpec) {
        for (const subject of ['sync', 'access', 'observability']) {
            if (queryHas(subject) && nameHas(subject)) {
                return 2;
            }
        }
        if (queryHas('security') && nameHas('access')) {
            return 1;
        }
        if (queryHas('roadmap') || queryHas('phase') || queryHas('plan')) {
            return 1;
        }
        return 0;
    }
    // Session events: rarely relevant.
    if (isSessionEvent && !queryHas('session')) {
        return 0;
    }
    // Skill docs: relevant only to skill/eval/review queries.
    if (isSkillDoc) {
        return queryHas('skill') || queryHas('eval') || queryHas('review') ? 1 : 0;
    }
    // Agent specs: relevant to agent/atelier queries, or to a query naming the agent's role.
    if (isAgentSpec) {
        if (queryTopic.project === 'atelier' || queryHas('agent')) {
            return 2;
        }
        // 'accessibility' is part of the guard so that the accessibility → ada
        // pairing below is reachable for accessibility-only queries.
        const roleKeywords = ['developer', 'design', 'qa', 'accessibility', 'security'];
        if (roleKeywords.some((keyword) => queryHas(keyword))) {
            if (queryHas('developer') && nameHas('cody')) {
                return 2;
            }
            if (queryHas('design') && nameHas('ross')) {
                return 2;
            }
            if (queryHas('qa') && nameHas('stan')) {
                return 2;
            }
            if (queryHas('accessibility') && nameHas('ada')) {
                return 2;
            }
            if (queryHas('security') && (nameHas('marshall') || nameHas('chase'))) {
                return 2;
            }
            return 1;
        }
        return 0;
    }
    // Code-craft docs: relevant to coding convention/style queries.
    if (isCodeCraft) {
        if (queryHas('convention') || queryHas('coding') || queryHas('style') || queryHas('pattern')) {
            const subjectPairs = [
                ['css', 'css'],
                ['react', 'react'],
                ['clean code', 'clean-code'],
                ['naming', 'naming'],
            ];
            if (subjectPairs.some(([queryTerm, nameTerm]) => queryHas(queryTerm) && nameHas(nameTerm))) {
                return 3;
            }
            // 'design system' contains 'design', so this also covers it.
            if (queryHas('design') && nameHas('ds-')) {
                return 2;
            }
            return 1;
        }
        if (queryHas('design system') && nameHas('ds-')) {
            return 2;
        }
        if (queryHas('design') && nameHas('design')) {
            return 1;
        }
        return 0;
    }
    // Reference docs: graded by shared subject keyword.
    if (isReference) {
        const referenceGrades = [
            ['rag', 2],
            ['database', 2],
            ['eval', 2],
            ['color', 3],
            ['portfolio', 3],
        ];
        for (const [subject, grade] of referenceGrades) {
            if (queryHas(subject) && nameHas(subject)) {
                return grade;
            }
        }
        return 0;
    }
    // System exploration: useful for broad Ledger queries.
    if (isExploration && queryTopic.project === 'ledger') {
        return 2;
    }
    // Status dashboards: relevant to project status queries.
    if (isStatusDashboard) {
        return queryHas('status') || queryHas('progress') || queryHas('dashboard') ? 2 : 0;
    }
    // ==========================================================================
    // Weak match: some query words match, same project
    // ==========================================================================
    if (nameMatchRatio >= 0.3 && projectMatches) {
        return 1;
    }
    // Project-scoped query with any name overlap at all: tangentially related.
    if (projectMatches && queryTopic.project !== null && nameMatchRatio > 0) {
        return 1;
    }
    // ==========================================================================
    // Default: not relevant
    // ==========================================================================
    return 0;
}
246
+ // =============================================================================
247
+ // Main
248
+ // =============================================================================
249
/**
 * Grades the ungraded top-10 search results of every golden query.
 *
 * Loads all rows of `eval_golden_dataset` with their existing judgments, runs
 * `searchHybrid` for each query, grades each top-10 result that has no
 * judgment yet with `gradeResult`, and (unless `--dry-run` was given) stores
 * the grade through the `judgment_create` RPC. A summary is printed at the
 * end; the process exits with status 1 when the dataset cannot be loaded or
 * when any write fails for a reason other than a duplicate.
 *
 * @returns {Promise<void>}
 */
async function main() {
    console.log(dryRun ? '\n[DRY RUN] Grading without writing to database.\n' : '\nBatch grading starting.\n');
    // Load all golden queries with existing judgments
    const { data: goldenRows, error: loadError } = await supabase
        .from('eval_golden_dataset')
        .select('id, query, tags, judgments:eval_golden_judgments(document_id, grade)')
        .order('id');
    if (loadError || !goldenRows) {
        console.error('Failed to load golden dataset:', loadError?.message ?? 'no data');
        process.exit(1);
    }
    const queries = goldenRows;
    let totalGraded = 0;
    let totalSkipped = 0;
    let totalErrors = 0;
    // Counts every grade assigned in this run, whether or not the write succeeded.
    const gradeCounts = { 0: 0, 1: 0, 2: 0, 3: 0 };
    for (const golden of queries) {
        // Documents that already carry a judgment for this query are never re-graded.
        const judgedDocumentIds = new Set((golden.judgments ?? []).map((judgment) => judgment.document_id));
        // Every query is searched; queries without prior judgments are graded too.
        const searchResults = await searchHybrid(clients, {
            query: golden.query,
            limit: CURRENT_SEARCH_CONFIG.limit,
            reranker: CURRENT_SEARCH_CONFIG.reranker,
        });
        const queryTopic = extractQueryTopic(golden.query);
        const ungradedResults = searchResults
            .slice(0, 10)
            .filter((result) => !judgedDocumentIds.has(result.id));
        if (ungradedResults.length === 0) {
            continue;
        }
        if (dryRun) {
            console.log(`\nQuery #${golden.id}: "${golden.query}"`);
        }
        for (const result of ungradedResults) {
            const grade = gradeResult(golden.query, queryTopic, result.id, result.name ?? '<unknown>', result.domain ?? 'general', result.project ?? null);
            gradeCounts[grade]++;
            if (dryRun) {
                console.log(` #${result.id} ${result.name ?? '<unknown>'} → grade ${grade}`);
                totalGraded++;
                continue;
            }
            // Write to database
            const { error: rpcError } = await supabase.rpc('judgment_create', {
                p_golden_id: golden.id,
                p_document_id: result.id,
                p_grade: grade,
                p_judged_by: 'charlie-batch-4.6.2',
                p_notes: null,
            });
            if (!rpcError) {
                totalGraded++;
                continue;
            }
            const message = rpcError.message ?? '';
            // 23505 is PostgreSQL's unique_violation SQLSTATE; the message
            // checks remain as a fallback when no code is reported.
            const isDuplicate = rpcError.code === '23505' ||
                message.includes('duplicate') ||
                message.includes('unique');
            if (isDuplicate) {
                totalSkipped++;
            }
            else {
                totalErrors++;
                console.error(` [ERR] golden_id=${golden.id} doc_id=${result.id}: ${message}`);
            }
        }
    }
    console.log('');
    console.log('='.repeat(60));
    console.log('Batch grading summary');
    console.log('='.repeat(60));
    console.log(` Queries processed: ${queries.length}`);
    console.log(` Judgments created: ${totalGraded}`);
    console.log(` Skipped (duplicate): ${totalSkipped}`);
    console.log(` Errors: ${totalErrors}`);
    console.log('');
    console.log(' Grade distribution:');
    console.log(` 0 (not relevant): ${gradeCounts[0]}`);
    console.log(` 1 (related): ${gradeCounts[1]}`);
    console.log(` 2 (relevant): ${gradeCounts[2]}`);
    console.log(` 3 (highly relevant): ${gradeCounts[3]}`);
    console.log('');
    if (totalErrors > 0) {
        console.error('Completed with errors.');
        process.exit(1);
    }
    console.log(dryRun ? '[DRY RUN] No writes performed.' : 'Batch grading complete.');
}
341
// Entry point: any rejection escaping main() is logged and the process exits
// with status 1.
main().catch((fatalError) => {
    console.error(fatalError);
    process.exit(1);
});