@aperdomoll90/ledger-ai 1.4.0 → 1.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. package/dist/cli.js +177 -221
  2. package/dist/commands/add.js +51 -100
  3. package/dist/commands/backfill.js +55 -0
  4. package/dist/commands/backup.js +10 -10
  5. package/dist/commands/check.js +21 -29
  6. package/dist/commands/config.js +13 -12
  7. package/dist/commands/delete.js +22 -17
  8. package/dist/commands/eval-judge.js +11 -0
  9. package/dist/commands/eval.js +321 -0
  10. package/dist/commands/export.js +8 -10
  11. package/dist/commands/get.js +9 -0
  12. package/dist/commands/hunt.js +206 -0
  13. package/dist/commands/ingest.js +15 -14
  14. package/dist/commands/init.js +18 -20
  15. package/dist/commands/list.js +21 -7
  16. package/dist/commands/migrate.js +11 -11
  17. package/dist/commands/onboard.js +2 -2
  18. package/dist/commands/pull.js +3 -2
  19. package/dist/commands/push.js +8 -8
  20. package/dist/commands/restore.js +38 -38
  21. package/dist/commands/show.js +13 -16
  22. package/dist/commands/sync.js +58 -19
  23. package/dist/commands/tag.js +20 -14
  24. package/dist/commands/update.js +50 -18
  25. package/dist/commands/wizard.js +3 -3
  26. package/dist/lib/ai-search.js +163 -0
  27. package/dist/lib/audit.js +19 -0
  28. package/dist/lib/backfill.js +60 -0
  29. package/dist/lib/config.js +19 -2
  30. package/dist/lib/document-classification.js +5 -0
  31. package/dist/lib/document-fetching.js +77 -0
  32. package/dist/lib/document-operations.js +150 -0
  33. package/dist/lib/documents/classification.js +5 -0
  34. package/dist/lib/documents/fetching.js +89 -0
  35. package/dist/lib/documents/operations.js +304 -0
  36. package/dist/lib/domains.js +116 -0
  37. package/dist/lib/embeddings.js +190 -0
  38. package/dist/lib/errors.js +3 -1
  39. package/dist/lib/eval/eval-advanced.js +289 -0
  40. package/dist/lib/eval/eval-judge-session.js +233 -0
  41. package/dist/lib/eval/eval-store.js +105 -0
  42. package/dist/lib/eval/eval.js +303 -0
  43. package/dist/lib/file-writer.js +23 -0
  44. package/dist/lib/generators.js +44 -45
  45. package/dist/lib/hunter-db.js +235 -0
  46. package/dist/lib/hunter-rss.js +30 -0
  47. package/dist/lib/hunter-scoring.js +55 -0
  48. package/dist/lib/hunter-types.js +36 -0
  49. package/dist/lib/lint-configs.js +20 -0
  50. package/dist/lib/migrate.js +2 -2
  51. package/dist/lib/notes.js +173 -59
  52. package/dist/lib/observability.js +296 -0
  53. package/dist/lib/op-add-note-types.test.js +7 -6
  54. package/dist/lib/prompt.js +8 -8
  55. package/dist/lib/rate-limiter.js +103 -0
  56. package/dist/lib/search/ai-search.js +396 -0
  57. package/dist/lib/search/chunk-context-enrichment.js +155 -0
  58. package/dist/lib/search/embeddings.js +293 -0
  59. package/dist/lib/search/reranker.js +120 -0
  60. package/dist/lib/search/semantic-cache.js +53 -0
  61. package/dist/lib/type-registry.test.js +6 -6
  62. package/dist/mcp-server.js +553 -66
  63. package/dist/migrations/migrations/005-audit-log.sql +22 -0
  64. package/dist/migrations/migrations/005_opportunities.sql +48 -0
  65. package/dist/migrations/migrations/006-audited-operations.sql +235 -0
  66. package/dist/migrations/migrations/006_hunt_analytics.sql +38 -0
  67. package/dist/migrations/migrations/007-eval-golden-judgments.sql +119 -0
  68. package/dist/migrations/migrations/008-drop-expected-doc-ids.sql +9 -0
  69. package/dist/migrations/migrations/008-judge-helpers.sql +21 -0
  70. package/dist/migrations/migrations/009-semantic-cache.sql +216 -0
  71. package/dist/scripts/batch-grade.js +344 -0
  72. package/dist/scripts/benchmark-ingestion.js +376 -0
  73. package/dist/scripts/convert-judgments-to-graded.js +88 -0
  74. package/dist/scripts/diagnose-first-result.js +333 -0
  75. package/dist/scripts/drop-golden-query.js +53 -0
  76. package/dist/scripts/eval-search.js +115 -0
  77. package/dist/scripts/grade-unjudged-top1.js +138 -0
  78. package/dist/scripts/hunter-analytics.js +38 -0
  79. package/dist/scripts/hunter-cron.js +63 -0
  80. package/dist/scripts/hunter-purge.js +25 -0
  81. package/dist/scripts/migrate-v2.js +140 -0
  82. package/dist/scripts/reindex.js +74 -0
  83. package/dist/scripts/sync-local-docs.js +153 -0
  84. package/package.json +7 -1
@@ -0,0 +1,396 @@
1
+ // ai-search.ts
2
+ // AI-powered search — vector (meaning), keyword (exact words), hybrid (both combined).
3
+ // Each function calls a Postgres RPC function that does the actual search.
4
+ // TypeScript's job: generate the query embedding, then call the right function.
5
+ import { getOrCacheQueryEmbedding, toVectorString } from './embeddings.js';
6
+ import { rerankResults } from './reranker.js';
7
+ import { buildSearchParams, extractSourceDocIds, SEMANTIC_CACHE_MODEL_ID, SEMANTIC_CACHE_THRESHOLD, } from './semantic-cache.js';
8
+ import { runSearchTrace, startSpan, recordChildSpan, withActiveSpan } from '../observability.js';
9
+ // =============================================================================
10
+ // Search evaluation logging
11
+ // =============================================================================
12
/**
 * Log a search to the search_evaluations table.
 *
 * Called after every search to record what was searched, a compact summary
 * of what came back (ids, scores, document types — never full content), and
 * how long it took.
 *
 * Fire-and-forget: the insert is never awaited and this function never
 * throws. Any failure is reported on stderr only, so the caller's search
 * result is unaffected.
 *
 * @param {object} supabase - Supabase client; only `.from(table).insert(row)` is used.
 * @param {object} params
 * @param {string} params.query - Raw query text.
 * @param {string} params.searchMode - Search mode label stored with the row.
 * @param {Array<object>} params.results - Result rows; `id`, `document_type` and the
 *   first defined of `similarity` / `rank` / `score` are read from each row.
 * @param {number} params.responseTimeMs - Elapsed time of the search in milliseconds.
 * @returns {void}
 */
function logSearchEvaluation(supabase, params) {
    const reportFailure = (reason) => {
        process.stderr.write(`[ledger] search evaluation logging failed: ${reason?.message ?? 'unknown error'}\n`);
    };
    try {
        // Unique document types present in the results.
        const documentTypes = [...new Set(params.results.map(result => result.document_type))];
        // Compact JSON summary — just ids and scores, not full content.
        const resultsSummary = params.results.map(result => ({
            id: result.id,
            score: result.similarity ?? result.rank ?? result.score ?? null,
            document_type: result.document_type,
        }));
        // Deliberately not awaited: logging must never delay the search response.
        Promise.resolve(supabase
            .from('search_evaluations')
            .insert({
                query_text: params.query,
                search_mode: params.searchMode,
                result_count: params.results.length,
                results: resultsSummary,
                document_types: documentTypes,
                response_time_ms: params.responseTimeMs,
            }))
            .then((response) => {
                // Supabase reports query failures by resolving with `{ error }`,
                // not by rejecting, so the resolved value has to be inspected.
                if (response?.error) {
                    reportFailure(response.error);
                }
            })
            .catch(reportFailure);
    }
    catch (syncError) {
        // Building or dispatching the insert failed synchronously; still best-effort.
        reportFailure(syncError);
    }
}
47
+ // =============================================================================
48
+ // Search functions
49
+ // =============================================================================
50
/**
 * Search by meaning — "how does auth work?" finds documents about OAuth.
 *
 * Flow (all inside a `runSearchTrace` trace):
 *   1. Get the query embedding via `getOrCacheQueryEmbedding` and serialise it
 *      with `toVectorString`.
 *   2. Ask the `semantic_cache_lookup` RPC for previously stored results for a
 *      similar query with the same search parameters. A non-empty answer is
 *      returned as-is.
 *   3. Otherwise call the `match_documents` RPC with the embedding and filters.
 *   4. Store non-empty results with `semantic_cache_store` (not awaited),
 *      log the search, and return the rows.
 *
 * Cache lookup/store failures never fail the search; they are recorded on
 * their span and written to stderr.
 *
 * @param {object} clients - Reads `supabase`, `observabilityEnvironment`, `sessionId`;
 *   the whole object is also handed to `getOrCacheQueryEmbedding`.
 * @param {object} props
 * @param {string} props.query - Query text.
 * @param {number} [props.threshold=0.38] - Passed to the RPC as `p_threshold`.
 * @param {number} [props.limit=10] - Passed to the RPC as `p_max_results`.
 * @param {string} [props.domain] - Optional filter.
 * @param {string} [props.document_type] - Optional filter.
 * @param {string} [props.project] - Optional filter.
 * @returns {Promise<Array<object>>} Whatever `runSearchTrace` resolves with —
 *   presumably the rows returned by the callback; confirm in observability.js.
 * @throws {Error} When `match_documents` reports an error (original error in `cause`).
 */
export async function searchByVector(clients, props) {
    const startTime = Date.now();
    // Resolve defaults once so trace metadata, cache key and RPC cannot drift apart.
    const threshold = props.threshold ?? 0.38;
    const limit = props.limit ?? 10;
    return runSearchTrace({
        mode: 'vector',
        query: props.query,
        environment: clients.observabilityEnvironment,
        sessionId: clients.sessionId,
        input: {
            query: props.query,
            filters: { domain: props.domain, project: props.project, document_type: props.document_type },
        },
        metadata: { threshold, limit },
    }, async (trace) => {
        const queryEmbedding = await getOrCacheQueryEmbedding(clients, props.query);
        const embeddingString = toVectorString(queryEmbedding);
        // Semantic cache lookup (layer 2)
        const searchParams = buildSearchParams({
            threshold,
            limit,
            domain: props.domain,
            document_type: props.document_type,
            project: props.project,
        });
        const cacheSpan = startSpan('semantic-cache-lookup');
        const { data: cachedResults, error: cacheLookupError } = await clients.supabase.rpc('semantic_cache_lookup', {
            p_query_embedding: embeddingString,
            p_search_mode: 'vector',
            p_search_params: searchParams,
            p_embedding_model_id: SEMANTIC_CACHE_MODEL_ID,
            p_similarity_threshold: SEMANTIC_CACHE_THRESHOLD,
        });
        const cacheHit = Boolean(cachedResults && cachedResults.length > 0);
        cacheSpan.update({
            output: cacheLookupError
                ? { hit: cacheHit, error: cacheLookupError.message }
                : { hit: cacheHit },
        });
        cacheSpan.end();
        if (cacheLookupError) {
            // A broken cache is treated as a miss, but it should not be invisible.
            process.stderr.write(`[ledger] semantic cache lookup failed: ${cacheLookupError.message ?? 'unknown'}\n`);
        }
        if (cacheHit) {
            const results = cachedResults;
            trace.update({
                output: {
                    resultCount: results.length,
                    topResultIds: results.slice(0, 3).map(result => result.id),
                    cacheHit: true,
                },
            });
            logSearchEvaluation(clients.supabase, {
                query: props.query,
                searchMode: 'vector',
                results,
                responseTimeMs: Date.now() - startTime,
            });
            return results;
        }
        // Cache miss: run full search pipeline
        const retrieveSpan = startSpan('retrieve');
        const { data, error } = await clients.supabase.rpc('match_documents', {
            q_emb: embeddingString,
            p_threshold: threshold,
            p_max_results: limit,
            p_domain: props.domain ?? null,
            p_document_type: props.document_type ?? null,
            p_project: props.project ?? null,
        });
        if (error) {
            retrieveSpan.update({ output: { error: error.message } });
            retrieveSpan.end();
            trace.update({ output: { error: error.message } });
            throw new Error(`Vector search failed for "${props.query}": ${error.message}`, { cause: error });
        }
        const results = (data ?? []);
        retrieveSpan.update({ output: { rowCount: results.length } });
        retrieveSpan.end();
        // Store in semantic cache (non-blocking)
        if (results.length > 0) {
            const sourceDocIds = extractSourceDocIds(results);
            const storeSpan = startSpan('semantic-cache-store');
            const reportStoreFailure = (cacheStoreError) => {
                const reason = cacheStoreError?.message ?? 'unknown';
                storeSpan.update({ output: { error: reason } });
                storeSpan.end();
                process.stderr.write(`[ledger] semantic cache store failed: ${reason}\n`);
            };
            Promise.resolve(clients.supabase.rpc('semantic_cache_store', {
                p_query_text: props.query,
                p_query_embedding: embeddingString,
                p_search_mode: 'vector',
                p_search_params: searchParams,
                p_cached_results: results,
                p_source_doc_ids: sourceDocIds,
                p_embedding_model_id: SEMANTIC_CACHE_MODEL_ID,
            })).then((response) => {
                // Supabase resolves with `{ error }` on failure instead of rejecting.
                if (response?.error) {
                    reportStoreFailure(response.error);
                    return;
                }
                storeSpan.end();
            }).catch(reportStoreFailure);
        }
        trace.update({
            output: {
                resultCount: results.length,
                topResultIds: results.slice(0, 3).map(result => result.id),
                cacheHit: false,
            },
        });
        logSearchEvaluation(clients.supabase, {
            query: props.query,
            searchMode: 'vector',
            results,
            responseTimeMs: Date.now() - startTime,
        });
        return results;
    });
}
166
/**
 * Search by exact words — "pgvector HNSW" finds documents containing those words.
 *
 * No embedding is generated here: the query text is sent straight to the
 * `match_documents_keyword` RPC together with the optional filters. Good for
 * code identifiers, proper nouns, error messages. This mode does not use the
 * semantic cache.
 *
 * @param {object} clients - Reads `supabase`, `observabilityEnvironment`, `sessionId`.
 * @param {object} props
 * @param {string} props.query - Query text, passed as `p_query`.
 * @param {number} [props.limit=10] - Passed as `p_max_results`.
 * @param {string} [props.domain] - Optional filter.
 * @param {string} [props.document_type] - Optional filter.
 * @param {string} [props.project] - Optional filter.
 * @returns {Promise<Array<object>>} Whatever `runSearchTrace` resolves with —
 *   presumably the rows returned by the callback; confirm in observability.js.
 * @throws {Error} When the RPC reports an error (original error in `cause`).
 */
export async function searchByKeyword(clients, props) {
    const startTime = Date.now();
    // Resolve the default once so trace metadata and the RPC always agree.
    const limit = props.limit ?? 10;
    return runSearchTrace({
        mode: 'keyword',
        query: props.query,
        environment: clients.observabilityEnvironment,
        sessionId: clients.sessionId,
        input: {
            query: props.query,
            filters: { domain: props.domain, project: props.project, document_type: props.document_type },
        },
        metadata: { limit },
    }, async (trace) => {
        const { data, error } = await clients.supabase.rpc('match_documents_keyword', {
            p_query: props.query,
            p_max_results: limit,
            p_domain: props.domain ?? null,
            p_document_type: props.document_type ?? null,
            p_project: props.project ?? null,
        });
        if (error) {
            trace.update({ output: { error: error.message } });
            // Keep the original error reachable for callers that need its details.
            throw new Error(`Keyword search failed for "${props.query}": ${error.message}`, { cause: error });
        }
        const results = (data ?? []);
        trace.update({
            output: {
                resultCount: results.length,
                topResultIds: results.slice(0, 3).map(result => result.id),
                cacheHit: false,
            },
        });
        logSearchEvaluation(clients.supabase, {
            query: props.query,
            searchMode: 'keyword',
            results,
            responseTimeMs: Date.now() - startTime,
        });
        return results;
    });
}
213
/**
 * Combined search — sends both the query embedding and the query text to the
 * `match_documents_hybrid` RPC, which fuses vector and keyword rankings.
 *
 * This handles both meaning-based queries ("how does auth work?") and
 * exact-term queries ("pgvector HNSW").
 *
 * RRF (Reciprocal Rank Fusion) formula, per the original notes for this module:
 *   score = 1/(k + vector_rank) + 1/(k + keyword_rank)
 * `k` is forwarded to the RPC as `p_rrf_k` (default 60).
 *
 * Reranking: when `props.reranker === 'cohere'` and `clients.cohereApiKey` is
 * set, twice the requested limit is fetched, the candidates are passed to
 * `rerankResults`, and the semantic cache is bypassed for both lookup and store.
 *
 * @param {object} clients - Reads `supabase`, `cohereApiKey`, `observabilityEnvironment`,
 *   `sessionId`; the whole object is also handed to `getOrCacheQueryEmbedding`.
 * @param {object} props
 * @param {string} props.query - Query text.
 * @param {number} [props.threshold=0.38] - Passed as `p_threshold`.
 * @param {number} [props.limit=10] - Maximum number of rows returned to the caller.
 * @param {number} [props.reciprocalRankFusionK=60] - Passed as `p_rrf_k`.
 * @param {string} [props.reranker] - `'cohere'` enables reranking.
 * @param {string} [props.domain] - Optional filter.
 * @param {string} [props.document_type] - Optional filter.
 * @param {string} [props.project] - Optional filter.
 * @returns {Promise<Array<object>>} Whatever `runSearchTrace` resolves with —
 *   presumably the rows returned by the callback; confirm in observability.js.
 * @throws {Error} When the hybrid RPC reports an error (original error in `cause`).
 */
export async function searchHybrid(clients, props) {
    const startTime = Date.now();
    // When reranking, fetch more candidates so the reranker has a bigger pool.
    const useReranker = props.reranker === 'cohere' && Boolean(clients.cohereApiKey);
    const desiredLimit = props.limit ?? 10;
    const requestLimit = useReranker ? desiredLimit * 2 : desiredLimit;
    const threshold = props.threshold ?? 0.38;
    const reciprocalRankFusionK = props.reciprocalRankFusionK ?? 60;
    const searchMode = useReranker ? 'hybrid+rerank' : 'hybrid';
    return runSearchTrace({
        mode: searchMode,
        query: props.query,
        environment: clients.observabilityEnvironment,
        sessionId: clients.sessionId,
        input: {
            query: props.query,
            filters: { domain: props.domain, project: props.project, document_type: props.document_type },
        },
        metadata: {
            threshold,
            limit: desiredLimit,
            rerankerEnabled: useReranker,
            reciprocalRankFusionK,
        },
    }, async (trace) => {
        const queryEmbedding = await getOrCacheQueryEmbedding(clients, props.query);
        const embeddingString = toVectorString(queryEmbedding);
        // Semantic cache (layer 2) parameters; also reused for the store call below.
        const searchParams = buildSearchParams({
            threshold,
            limit: requestLimit,
            domain: props.domain,
            document_type: props.document_type,
            project: props.project,
        });
        // Skip the cache when the reranker is enabled (it produces a different ordering).
        if (!useReranker) {
            const cacheSpan = startSpan('semantic-cache-lookup');
            const { data: cachedResults, error: cacheLookupError } = await clients.supabase.rpc('semantic_cache_lookup', {
                p_query_embedding: embeddingString,
                p_search_mode: 'hybrid',
                p_search_params: searchParams,
                p_embedding_model_id: SEMANTIC_CACHE_MODEL_ID,
                p_similarity_threshold: SEMANTIC_CACHE_THRESHOLD,
            });
            const cacheHit = Boolean(cachedResults && cachedResults.length > 0);
            cacheSpan.update({
                output: cacheLookupError
                    ? { hit: cacheHit, error: cacheLookupError.message }
                    : { hit: cacheHit },
            });
            cacheSpan.end();
            if (cacheLookupError) {
                // A broken cache is treated as a miss, but it should not be invisible.
                process.stderr.write(`[ledger] semantic cache lookup failed: ${cacheLookupError.message ?? 'unknown'}\n`);
            }
            if (cacheHit) {
                const cachedRows = cachedResults;
                trace.update({
                    output: {
                        resultCount: cachedRows.length,
                        topResultIds: cachedRows.slice(0, 3).map(result => result.id),
                        cacheHit: true,
                    },
                });
                logSearchEvaluation(clients.supabase, {
                    query: props.query,
                    searchMode: 'hybrid',
                    results: cachedRows,
                    responseTimeMs: Date.now() - startTime,
                });
                return cachedRows;
            }
        }
        // Cache miss: run full search pipeline
        const retrieveSpan = startSpan('retrieve');
        const retrieveStart = Date.now();
        const { data, error } = await clients.supabase.rpc('match_documents_hybrid', {
            q_emb: embeddingString,
            q_text: props.query,
            p_threshold: threshold,
            p_max_results: requestLimit,
            p_domain: props.domain ?? null,
            p_document_type: props.document_type ?? null,
            p_project: props.project ?? null,
            p_rrf_k: reciprocalRankFusionK,
        });
        if (error) {
            retrieveSpan.update({ output: { error: error.message } });
            retrieveSpan.end();
            trace.update({ output: { error: error.message } });
            throw new Error(`Hybrid search failed for "${props.query}": ${error.message}`, { cause: error });
        }
        const rows = (data ?? []);
        // The timing sidecar is read from the first row only.
        const timing = rows[0]?.timing;
        retrieveSpan.update({ output: { rowCount: rows.length, timing } });
        retrieveSpan.end();
        // Emit three child spans from the timing sidecar, laid end to end and
        // backdated from retrieveStart. Skipped unless every duration is a finite
        // number, so a partial or stringly-typed sidecar cannot produce NaN or
        // string-concatenated timestamps.
        if (timing) {
            const stages = [
                ['retrieve.vector', timing.vector_ms],
                ['retrieve.keyword', timing.keyword_ms],
                ['retrieve.fusion', timing.fusion_ms],
            ];
            if (stages.every(([, durationMs]) => Number.isFinite(durationMs))) {
                let cursor = retrieveStart;
                for (const [spanName, durationMs] of stages) {
                    recordChildSpan(spanName, cursor, cursor + durationMs, { durationMs });
                    cursor += durationMs;
                }
            }
        }
        // Strip timing from rows before exposing to callers (internal sidecar only).
        let results = rows.map(({ timing: _timing, ...rest }) => rest);
        // Rerank: hand the candidates to rerankResults for re-scoring.
        if (useReranker && results.length > 0) {
            const rerankSpan = startSpan('rerank');
            const candidates = results;
            try {
                results = await withActiveSpan(rerankSpan, async () => {
                    return rerankResults(props.query, candidates, {
                        apiKey: clients.cohereApiKey,
                        topN: desiredLimit,
                    });
                });
                // If reranking degraded and handed back the full candidate pool
                // (2x the limit), still honour the caller's requested limit.
                if (results.length > desiredLimit) {
                    results = results.slice(0, desiredLimit);
                }
                rerankSpan.update({ output: { inputCount: candidates.length, outputCount: results.length } });
            }
            finally {
                // End the span even if reranking throws.
                rerankSpan.end();
            }
        }
        // Store in semantic cache (non-blocking, skip if reranker was used)
        if (results.length > 0 && !useReranker) {
            const sourceDocIds = extractSourceDocIds(results);
            const storeSpan = startSpan('semantic-cache-store');
            const reportStoreFailure = (cacheStoreError) => {
                const reason = cacheStoreError?.message ?? 'unknown';
                storeSpan.update({ output: { error: reason } });
                storeSpan.end();
                process.stderr.write(`[ledger] semantic cache store failed: ${reason}\n`);
            };
            Promise.resolve(clients.supabase.rpc('semantic_cache_store', {
                p_query_text: props.query,
                p_query_embedding: embeddingString,
                p_search_mode: 'hybrid',
                p_search_params: searchParams,
                p_cached_results: results,
                p_source_doc_ids: sourceDocIds,
                p_embedding_model_id: SEMANTIC_CACHE_MODEL_ID,
            })).then((response) => {
                // Supabase resolves with `{ error }` on failure instead of rejecting.
                if (response?.error) {
                    reportStoreFailure(response.error);
                    return;
                }
                storeSpan.end();
            }).catch(reportStoreFailure);
        }
        trace.update({
            output: {
                resultCount: results.length,
                topResultIds: results.slice(0, 3).map(result => result.id),
                cacheHit: false,
            },
        });
        logSearchEvaluation(clients.supabase, {
            query: props.query,
            searchMode,
            results,
            responseTimeMs: Date.now() - startTime,
        });
        return results;
    });
}
373
/**
 * Smart retrieval — decide how much content to send to the LLM.
 *
 * Delegates to the `retrieve_context` RPC, passing the document id, the index
 * of the matched chunk, a context window size and a neighbor count. Per the
 * original notes for this module, the RPC returns the full content for small
 * documents and the matched chunk plus neighbors for large ones; that decision
 * is made server-side and is not visible here.
 *
 * @param {object} supabase - Supabase client; only `.rpc()` is used.
 * @param {object} props
 * @param {number|string} props.document_id - Document to retrieve from.
 * @param {number} props.matched_chunk_index - Index of the chunk that matched the search.
 * @param {number} [props.context_window=4000] - Forwarded as `p_context_window`.
 * @param {number} [props.neighbor_count=1] - Forwarded as `p_neighbor_count`.
 * @returns {Promise<object|null>} The first row when the RPC returns an array, the
 *   value itself when it returns a single object, or `null` when nothing came back.
 * @throws {Error} When the RPC reports an error (original error in `cause`).
 */
export async function retrieveContext(supabase, props) {
    const { data, error } = await supabase.rpc('retrieve_context', {
        p_document_id: props.document_id,
        p_matched_chunk_index: props.matched_chunk_index,
        p_context_window: props.context_window ?? 4000,
        p_neighbor_count: props.neighbor_count ?? 1,
    });
    if (error) {
        // Keep the original error reachable for callers that need its details.
        throw new Error(`Context retrieval failed for document #${props.document_id}, chunk ${props.matched_chunk_index}: ${error.message}`, { cause: error });
    }
    if (!data) {
        return null;
    }
    if (Array.isArray(data)) {
        return data.length === 0 ? null : data[0];
    }
    return data;
}
@@ -0,0 +1,155 @@
1
+ // chunk-context-enrichment.ts
2
+ // Pre-embedding enrichment — generates context summaries per chunk using an LLM.
3
+ //
4
+ // Implements the "Contextual Retrieval" technique (Anthropic, 2024).
5
+ // We call it "chunk context enrichment" because the code operates at ingestion
6
+ // time, enriching chunks with document context before embedding — not at
7
+ // retrieval time. The industry name describes the goal (better retrieval),
8
+ // not the action.
9
+ //
10
+ // Optimized pipeline (S38):
11
+ // 1. Generate a document summary (one LLM call, processes full document once)
12
+ // 2. For each chunk, build context from: summary + header path + neighbor chunks
13
+ // 3. Fire all LLM calls in parallel (rate limiter controls concurrency)
14
+ // 4. Each call processes ~1K tokens instead of ~18K (95% token reduction)
15
+ //
16
+ // This reduces ingestion of a 73K document from ~12 minutes to ~30 seconds.
17
+ // The key insight: truncated context (summary + neighbors instead of full doc)
18
+ // reduces per-call tokens enough to unblock parallelism without hitting
19
+ // the TPM (Tokens Per Minute) limit.
20
+ import { openaiLimiter } from '../rate-limiter.js';
21
/** Chat model used for both the document-summary call and the per-chunk context calls. */
const CONTEXT_ENRICHMENT_MODEL = 'gpt-4o-mini';
/**
 * Prompt for the one-off document summary.
 * Placeholder: {DOCUMENT_CONTENT} — the full document text.
 */
const SUMMARY_PROMPT = [
    'Summarize this document in 150-200 words. Focus on: what the document is about, its structure, and the key topics it covers. Be factual and concise.',
    '',
    '<document>',
    '{DOCUMENT_CONTENT}',
    '</document>',
].join('\n');
/**
 * Prompt for a single chunk's context summary.
 * Placeholders: {DOCUMENT_SUMMARY}, {HEADER_PATH}, {PREV_CHUNK},
 * {CHUNK_CONTENT}, {NEXT_CHUNK}.
 */
const CONTEXT_PROMPT = [
    'Here is a summary of the document:',
    '<document_summary>',
    '{DOCUMENT_SUMMARY}',
    '</document_summary>',
    '',
    'Here is the section this chunk belongs to (header path):',
    '<section>',
    '{HEADER_PATH}',
    '</section>',
    '',
    'Here are the neighboring chunks for context:',
    '<previous_chunk>',
    '{PREV_CHUNK}',
    '</previous_chunk>',
    '',
    '<chunk>',
    '{CHUNK_CONTENT}',
    '</chunk>',
    '',
    '<next_chunk>',
    '{NEXT_CHUNK}',
    '</next_chunk>',
    '',
    "Write a short context (2-3 sentences) that situates this chunk within the document. Include the document's topic and what specific information this chunk covers. Be concise and factual.",
].join('\n');
51
+ // =============================================================================
52
+ // Pure functions
53
+ // =============================================================================
54
/**
 * Rough token estimate derived purely from character length.
 *
 * Uses the common rule of thumb of about four characters per token for
 * English text, rounding up so any non-empty text counts as at least one token.
 *
 * @param {string} text - Text to measure; only its `length` is read.
 * @returns {number} Estimated token count (0 for an empty string).
 */
export function estimateTokenCount(text) {
    const CHARS_PER_TOKEN = 4;
    const characterCount = text.length;
    return Math.ceil(characterCount / CHARS_PER_TOKEN);
}
62
/**
 * Find the markdown header hierarchy for a chunk's position in the document.
 * Returns a path like "Database > Caching > semantic_cache".
 * Uses string matching only, no LLM call.
 *
 * The chunk is located by the first occurrence of its first 50 characters in
 * the document. Headers (`#` to `######` followed by whitespace) on every line
 * up to and including the line where that occurrence starts are folded into a
 * stack: a new header replaces any open header of the same or deeper level.
 *
 * @param {string} documentContent - Full markdown document.
 * @param {string} chunkContent - Chunk text; only its first 50 characters are matched.
 * @returns {string} Header titles joined with " > ", or '' when the chunk is not
 *   found or no header precedes it.
 */
export function findHeaderPath(documentContent, chunkContent) {
    // Search the whole document rather than line by line, so a chunk whose
    // first 50 characters span a line break can still be located.
    const chunkOffset = documentContent.indexOf(chunkContent.slice(0, 50));
    if (chunkOffset === -1) {
        return '';
    }
    // Each entry remembers its own level. Using the stack length as the level
    // breaks when a document skips levels or has no H1.
    const openHeaders = [];
    let lineStart = 0;
    for (const line of documentContent.split('\n')) {
        if (lineStart > chunkOffset) {
            break;
        }
        const marker = /^(#{1,6})\s/.exec(line);
        if (marker) {
            const level = marker[1].length;
            while (openHeaders.length > 0 && openHeaders[openHeaders.length - 1].level >= level) {
                openHeaders.pop();
            }
            openHeaders.push({ level, title: line.slice(marker[0].length).trim() });
        }
        // +1 accounts for the '\n' removed by split.
        lineStart += line.length + 1;
    }
    return openHeaders.map(header => header.title).join(' > ');
}
85
+ // =============================================================================
86
+ // LLM functions
87
+ // =============================================================================
88
/**
 * Generate a context summary for each chunk using an LLM.
 *
 * Pipeline:
 *   1. One call summarises the full document.
 *   2. For each chunk, a prompt is built from that summary, the chunk's header
 *      path, and its previous/next neighbor chunks (not the full document).
 *   3. All chunk calls are submitted to `openaiLimiter` at once and awaited
 *      together; how many run concurrently is up to the limiter.
 *
 * Any failed chunk call rejects the whole operation (`Promise.all`).
 *
 * @param {object} openai - OpenAI client; `chat.completions.create` is used.
 * @param {Array<{content: string, chunk_index?: number}>} chunks - Chunks in document order.
 * @param {string} documentContent - Full document text.
 * @returns {Promise<Array<{summary: string, tokenCount: number}>>} One entry per chunk,
 *   in the same order as `chunks`; `tokenCount` is the estimate for the chunk content.
 * @throws {Error} When the summary call fails, or when a chunk call fails (wrapped
 *   with the chunk index and a content preview; original error in `cause`).
 */
export async function generateContextSummaries(openai, chunks, documentContent) {
    if (chunks.length === 0)
        return [];
    // Fill every {PLACEHOLDER} in one pass with a function replacer:
    //  - a function's return value is inserted literally, so `$&`, `$'`, `$$`
    //    etc. inside document text are not interpreted as replacement patterns;
    //  - inserted text is never re-scanned, so a chunk that happens to contain
    //    "{NEXT_CHUNK}" cannot hijack a later substitution.
    const fillTemplate = (template, values) => template.replace(/\{([A-Z_]+)\}/g, (placeholder, key) => (Object.prototype.hasOwnProperty.call(values, key) ? values[key] : placeholder));
    // Step 1: Generate document summary (one LLM call, full document)
    const summaryPrompt = fillTemplate(SUMMARY_PROMPT, { DOCUMENT_CONTENT: documentContent });
    const summaryResponse = await openaiLimiter.schedule(() => openai.chat.completions.create({
        model: CONTEXT_ENRICHMENT_MODEL,
        messages: [
            { role: 'system', content: 'You are a precise technical writer. Output only the summary, nothing else.' },
            { role: 'user', content: summaryPrompt },
        ],
        max_tokens: 300,
        temperature: 0,
    }));
    const docSummary = (summaryResponse.choices[0].message.content ?? '').trim();
    // Step 2: Parallel enrichment with truncated context
    const enrichmentJobs = chunks.map((chunk, chunkIndex) => {
        const prevChunk = chunkIndex > 0 ? chunks[chunkIndex - 1].content : '(start of document)';
        const nextChunk = chunkIndex < chunks.length - 1 ? chunks[chunkIndex + 1].content : '(end of document)';
        const headerPath = findHeaderPath(documentContent, chunk.content);
        const prompt = fillTemplate(CONTEXT_PROMPT, {
            DOCUMENT_SUMMARY: docSummary,
            HEADER_PATH: headerPath || '(unknown section)',
            PREV_CHUNK: prevChunk,
            CHUNK_CONTENT: chunk.content,
            NEXT_CHUNK: nextChunk,
        });
        return openaiLimiter.schedule({ id: `enrich-${chunkIndex}` }, async () => {
            try {
                const response = await openai.chat.completions.create({
                    model: CONTEXT_ENRICHMENT_MODEL,
                    messages: [
                        { role: 'system', content: 'You are a precise technical writer. Output only the context summary, nothing else.' },
                        { role: 'user', content: prompt },
                    ],
                    max_tokens: 150,
                    temperature: 0,
                });
                return {
                    summary: (response.choices[0].message.content ?? '').trim(),
                    tokenCount: estimateTokenCount(chunk.content),
                };
            }
            catch (error) {
                const preview = chunk.content.slice(0, 60).replace(/\n/g, ' ');
                throw new Error(`Context summary failed for chunk ${chunk.chunk_index} ("${preview}..."): ${error instanceof Error ? error.message : String(error)}`, { cause: error });
            }
        });
    });
    // Promise.all resolves in input order regardless of completion order,
    // so the results already line up with `chunks`.
    return Promise.all(enrichmentJobs);
}