@aperdomoll90/ledger-ai 1.4.0 → 1.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. package/dist/cli.js +177 -221
  2. package/dist/commands/add.js +51 -100
  3. package/dist/commands/backfill.js +55 -0
  4. package/dist/commands/backup.js +10 -10
  5. package/dist/commands/check.js +21 -29
  6. package/dist/commands/config.js +13 -12
  7. package/dist/commands/delete.js +22 -17
  8. package/dist/commands/eval-judge.js +11 -0
  9. package/dist/commands/eval.js +321 -0
  10. package/dist/commands/export.js +8 -10
  11. package/dist/commands/get.js +9 -0
  12. package/dist/commands/hunt.js +206 -0
  13. package/dist/commands/ingest.js +15 -14
  14. package/dist/commands/init.js +18 -20
  15. package/dist/commands/list.js +21 -7
  16. package/dist/commands/migrate.js +11 -11
  17. package/dist/commands/onboard.js +2 -2
  18. package/dist/commands/pull.js +3 -2
  19. package/dist/commands/push.js +8 -8
  20. package/dist/commands/restore.js +38 -38
  21. package/dist/commands/show.js +13 -16
  22. package/dist/commands/sync.js +58 -19
  23. package/dist/commands/tag.js +20 -14
  24. package/dist/commands/update.js +50 -18
  25. package/dist/commands/wizard.js +3 -3
  26. package/dist/lib/ai-search.js +163 -0
  27. package/dist/lib/audit.js +19 -0
  28. package/dist/lib/backfill.js +60 -0
  29. package/dist/lib/config.js +19 -2
  30. package/dist/lib/document-classification.js +5 -0
  31. package/dist/lib/document-fetching.js +77 -0
  32. package/dist/lib/document-operations.js +150 -0
  33. package/dist/lib/documents/classification.js +5 -0
  34. package/dist/lib/documents/fetching.js +89 -0
  35. package/dist/lib/documents/operations.js +304 -0
  36. package/dist/lib/domains.js +116 -0
  37. package/dist/lib/embeddings.js +190 -0
  38. package/dist/lib/errors.js +3 -1
  39. package/dist/lib/eval/eval-advanced.js +289 -0
  40. package/dist/lib/eval/eval-judge-session.js +233 -0
  41. package/dist/lib/eval/eval-store.js +105 -0
  42. package/dist/lib/eval/eval.js +303 -0
  43. package/dist/lib/file-writer.js +23 -0
  44. package/dist/lib/generators.js +44 -45
  45. package/dist/lib/hunter-db.js +235 -0
  46. package/dist/lib/hunter-rss.js +30 -0
  47. package/dist/lib/hunter-scoring.js +55 -0
  48. package/dist/lib/hunter-types.js +36 -0
  49. package/dist/lib/lint-configs.js +20 -0
  50. package/dist/lib/migrate.js +2 -2
  51. package/dist/lib/notes.js +173 -59
  52. package/dist/lib/observability.js +296 -0
  53. package/dist/lib/op-add-note-types.test.js +7 -6
  54. package/dist/lib/prompt.js +8 -8
  55. package/dist/lib/rate-limiter.js +103 -0
  56. package/dist/lib/search/ai-search.js +396 -0
  57. package/dist/lib/search/chunk-context-enrichment.js +155 -0
  58. package/dist/lib/search/embeddings.js +293 -0
  59. package/dist/lib/search/reranker.js +120 -0
  60. package/dist/lib/search/semantic-cache.js +53 -0
  61. package/dist/lib/type-registry.test.js +6 -6
  62. package/dist/mcp-server.js +553 -66
  63. package/dist/migrations/migrations/005-audit-log.sql +22 -0
  64. package/dist/migrations/migrations/005_opportunities.sql +48 -0
  65. package/dist/migrations/migrations/006-audited-operations.sql +235 -0
  66. package/dist/migrations/migrations/006_hunt_analytics.sql +38 -0
  67. package/dist/migrations/migrations/007-eval-golden-judgments.sql +119 -0
  68. package/dist/migrations/migrations/008-drop-expected-doc-ids.sql +9 -0
  69. package/dist/migrations/migrations/008-judge-helpers.sql +21 -0
  70. package/dist/migrations/migrations/009-semantic-cache.sql +216 -0
  71. package/dist/scripts/batch-grade.js +344 -0
  72. package/dist/scripts/benchmark-ingestion.js +376 -0
  73. package/dist/scripts/convert-judgments-to-graded.js +88 -0
  74. package/dist/scripts/diagnose-first-result.js +333 -0
  75. package/dist/scripts/drop-golden-query.js +53 -0
  76. package/dist/scripts/eval-search.js +115 -0
  77. package/dist/scripts/grade-unjudged-top1.js +138 -0
  78. package/dist/scripts/hunter-analytics.js +38 -0
  79. package/dist/scripts/hunter-cron.js +63 -0
  80. package/dist/scripts/hunter-purge.js +25 -0
  81. package/dist/scripts/migrate-v2.js +140 -0
  82. package/dist/scripts/reindex.js +74 -0
  83. package/dist/scripts/sync-local-docs.js +153 -0
  84. package/package.json +7 -1
@@ -1,10 +1,13 @@
1
1
  import dotenv from 'dotenv';
2
2
  import { createClient } from '@supabase/supabase-js';
3
3
  import OpenAI from 'openai';
4
+ import { observeOpenAI } from '@langfuse/openai';
4
5
  import { resolve } from 'path';
5
6
  import { homedir } from 'os';
6
7
  import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
7
8
  import { fatal, ExitCode } from './errors.js';
9
+ import { openaiLimiter, updateLimitsFromHeaders } from './rate-limiter.js';
10
+ import { initObservability } from './observability.js';
8
11
  // --- Defaults ---
9
12
  const LEDGER_DIR = resolve(homedir(), '.ledger');
10
13
  const LEDGER_DOTENV = resolve(LEDGER_DIR, '.env');
@@ -45,13 +48,24 @@ export function getDefaultConfig() {
45
48
  },
46
49
  };
47
50
  }
51
// --- Custom fetch for rate limit header interception ---
/**
 * Fetch implementation handed to the OpenAI client.
 *
 * Performs the request with the global fetch, passes the response headers
 * (together with the shared openaiLimiter) to updateLimitsFromHeaders, and
 * then returns the response itself to the caller. Because this sits at the
 * fetch layer, beneath both the OpenAI SDK and the Langfuse wrapper, header
 * reading keeps working however the client is wrapped.
 *
 * @param input First argument forwarded to fetch.
 * @param init  Second argument forwarded to fetch.
 * @returns The response object produced by fetch.
 */
const openaiHeaderFetch = async (input, init) => {
    const upstreamResponse = await fetch(input, init);
    const { headers } = upstreamResponse;
    await updateLimitsFromHeaders(openaiLimiter, headers);
    return upstreamResponse;
};
48
60
  // --- Load Config ---
49
- export function loadConfig() {
61
+ export function loadConfig(options) {
50
62
  // Priority: env vars > DOTENV_CONFIG_PATH > ~/.ledger/.env
51
63
  const dotenvPath = process.env.DOTENV_CONFIG_PATH
52
64
  || (existsSync(LEDGER_DOTENV) ? LEDGER_DOTENV : undefined);
53
65
  if (dotenvPath)
54
66
  dotenv.config({ path: dotenvPath, quiet: true });
67
+ // Init observability after dotenv loads (Langfuse env vars are now available)
68
+ initObservability();
55
69
  if (!process.env.SUPABASE_URL || !process.env.SUPABASE_SERVICE_ROLE_KEY) {
56
70
  fatal('Missing SUPABASE_URL or SUPABASE_SERVICE_ROLE_KEY. Run `ledger init` or check your .env file.', ExitCode.GENERAL_ERROR);
57
71
  }
@@ -63,6 +77,9 @@ export function loadConfig() {
63
77
  memoryDir: process.env.LEDGER_MEMORY_DIR || fileConfig.memoryDir || DEFAULT_MEMORY_DIR,
64
78
  claudeMdPath: process.env.LEDGER_CLAUDE_MD_PATH || fileConfig.claudeMdPath || DEFAULT_CLAUDE_MD_PATH,
65
79
  supabase: createClient(process.env.SUPABASE_URL, process.env.SUPABASE_SERVICE_ROLE_KEY),
66
- openai: new OpenAI({ apiKey: process.env.OPENAI_API_KEY }),
80
+ openai: observeOpenAI(new OpenAI({ apiKey: process.env.OPENAI_API_KEY, maxRetries: 5, fetch: openaiHeaderFetch })),
81
+ cohereApiKey: process.env.COHERE_API_KEY || undefined,
82
+ sessionId: options?.sessionId,
83
+ observabilityEnvironment: options?.observabilityEnvironment,
67
84
  };
68
85
  }
@@ -0,0 +1,5 @@
1
+ // document-classification.ts
2
+ // Types and interfaces matching the database schema.
3
+ // Pure data definitions — no logic, no I/O, no dependencies.
4
+ // Every other file imports from here.
5
+ export {};
@@ -0,0 +1,77 @@
1
+ // document-fetching.ts
2
+ // Read documents from the database. No writes, no search — just SELECT queries.
3
+ // Every query filters deleted_at IS NULL so soft-deleted documents are invisible.
4
/**
 * Get a single document by its database ID.
 *
 * @param supabase Supabase client used to query the `documents` table.
 * @param id       Database ID of the document.
 * @returns The document row, or null if the document doesn't exist, is
 *          soft-deleted, or the query failed.
 */
export async function getDocumentById(supabase, id) {
    const { data, error } = await supabase
        .from('documents')
        .select('*')
        .eq('id', id)
        .is('deleted_at', null)
        .single();
    if (error) {
        // PGRST116 is the code reported when .single() did not get exactly one
        // row — the expected "not found" case here. Any other error is a real
        // failure, so report it on stderr instead of hiding it behind null.
        if (error.code !== 'PGRST116') {
            process.stderr.write(`[ledger] getDocumentById(${id}) failed: ${error.message}\n`);
        }
        return null;
    }
    return data ?? null;
}
19
/**
 * Get a single document by its unique name.
 *
 * @param supabase Supabase client used to query the `documents` table.
 * @param name     Name of the document to look up.
 * @returns The document row, or null if no document has this name, it is
 *          soft-deleted, or the query failed.
 */
export async function getDocumentByName(supabase, name) {
    const { data, error } = await supabase
        .from('documents')
        .select('*')
        .eq('name', name)
        .is('deleted_at', null)
        .single();
    if (error) {
        // PGRST116 is the code reported when .single() did not get exactly one
        // row — the expected "not found" case here. Any other error is a real
        // failure, so report it on stderr instead of hiding it behind null.
        if (error.code !== 'PGRST116') {
            process.stderr.write(`[ledger] getDocumentByName("${name}") failed: ${error.message}\n`);
        }
        return null;
    }
    return data ?? null;
}
34
/**
 * List documents with optional filters. Returns newest first.
 * All filters are optional — no filters = list all active documents
 * (capped at `filters.limit`, default 20).
 *
 * @param supabase Supabase client used to query the `documents` table.
 * @param filters  Optional `{ limit, domain, document_type, project }`.
 * @returns Array of matching document rows; an empty array when nothing
 *          matches or the query failed (failures are reported on stderr).
 */
export async function listDocuments(supabase, filters = {}) {
    let query = supabase
        .from('documents')
        .select('*')
        .is('deleted_at', null)
        .order('created_at', { ascending: false })
        .limit(filters.limit ?? 20);
    if (filters.domain)
        query = query.eq('domain', filters.domain);
    if (filters.document_type)
        query = query.eq('document_type', filters.document_type);
    if (filters.project)
        query = query.eq('project', filters.project);
    const { data, error } = await query;
    if (error) {
        // An empty list is still returned so callers keep working, but the
        // failure is surfaced rather than being indistinguishable from "no rows".
        process.stderr.write(`[ledger] listDocuments failed: ${error.message}\n`);
        return [];
    }
    return data ?? [];
}
59
/**
 * Fetch all documents that should sync to every machine.
 * Sync is driven by is_auto_load, not domain: the query selects every
 * non-deleted row whose is_auto_load flag is true.
 *
 * @param supabase Supabase client used to query the `documents` table.
 * @returns Array of auto-load document rows; an empty array when there are
 *          none or the query failed (failures are reported on stderr).
 */
export async function fetchSyncableDocuments(supabase) {
    const { data, error } = await supabase
        .from('documents')
        .select('*')
        .eq('is_auto_load', true)
        .is('deleted_at', null);
    if (error) {
        // Surface the failure: a silent [] would look like "nothing to sync".
        process.stderr.write(`[ledger] fetchSyncableDocuments failed: ${error.message}\n`);
        return [];
    }
    return data ?? [];
}
@@ -0,0 +1,150 @@
1
+ // document-operations.ts
2
+ // Write operations — create, update, delete, restore documents.
3
+ // Each function prepares data (chunk, embed, hash) then calls a Postgres RPC function.
4
+ // The database handles transactions (document + chunks + audit = atomic).
5
+ import { contentHash, chunkText, generateEmbedding, toVectorString } from './embeddings.js';
6
+ const DEFAULT_EMBEDDING_MODEL = 'openai/text-embedding-3-small';
7
/**
 * Create a new document.
 *
 * Steps:
 *   1. Hash the content with contentHash (always from the real content).
 *   2. Split the content into chunks with chunkText.
 *   3. Request one embedding per chunk, sequentially, via generateEmbedding.
 *   4. Convert each embedding with toVectorString.
 *   5. Call the `document_create` RPC with the document fields and chunk data.
 *
 * @param clients `{ openai, supabase }` client bundle.
 * @param props   Document fields; optional ones fall back to the defaults below.
 * @returns The value returned by the `document_create` RPC.
 * @throws Error when the RPC reports an error.
 */
export async function createDocument(clients, props) {
    // A caller-supplied hash is never trusted; it is derived here.
    const digest = contentHash(props.content);
    const pieces = chunkText(props.content);
    const pieceTexts = pieces.map(piece => piece.content);
    // One embedding request per chunk, awaited in chunk order.
    const vectorStrings = [];
    for (const piece of pieces) {
        const vector = await generateEmbedding(clients.openai, piece.content);
        vectorStrings.push(toVectorString(vector));
    }
    const rpcArgs = {
        p_name: props.name,
        p_domain: props.domain,
        p_document_type: props.document_type,
        p_project: props.project ?? null,
        p_protection: props.protection ?? 'open',
        p_owner_type: props.owner_type ?? 'user',
        p_owner_id: props.owner_id ?? null,
        p_is_auto_load: props.is_auto_load ?? false,
        p_content: props.content,
        p_description: props.description ?? null,
        p_content_hash: digest,
        p_source_type: props.source_type ?? 'text',
        p_source_url: props.source_url ?? null,
        p_file_path: props.file_path ?? null,
        p_file_permissions: props.file_permissions ?? null,
        p_agent: props.agent ?? null,
        p_status: props.status ?? null,
        p_skill_ref: props.skill_ref ?? null,
        p_embedding_model_id: props.embedding_model_id ?? DEFAULT_EMBEDDING_MODEL,
        p_chunk_contents: pieceTexts,
        p_chunk_embeddings: vectorStrings,
        // Strategy is taken from the first chunk; 'paragraph' when there are none.
        p_chunk_strategy: pieces[0]?.strategy ?? 'paragraph',
    };
    const outcome = await clients.supabase.rpc('document_create', rpcArgs);
    if (outcome.error) {
        throw new Error(`Failed to create document: ${outcome.error.message}`);
    }
    return outcome.data;
}
57
/**
 * Update a document's content, re-chunking and re-embedding it.
 *
 * Steps:
 *   1. Hash the new content with contentHash.
 *   2. Split it into chunks with chunkText.
 *   3. Request one embedding per chunk, sequentially, via generateEmbedding.
 *   4. Call the `document_update` RPC with the new content, hash and chunk data.
 *
 * @param clients `{ openai, supabase }` client bundle.
 * @param props   `{ id, content, agent?, description?, status?, embedding_model_id? }`.
 * @returns Nothing (resolves to undefined).
 * @throws Error when the RPC reports an error.
 */
export async function updateDocument(clients, props) {
    const digest = contentHash(props.content);
    const pieces = chunkText(props.content);
    const pieceTexts = pieces.map(piece => piece.content);
    // One embedding request per chunk, awaited in chunk order.
    const vectorStrings = [];
    for (const piece of pieces) {
        const vector = await generateEmbedding(clients.openai, piece.content);
        vectorStrings.push(toVectorString(vector));
    }
    const rpcArgs = {
        p_id: props.id,
        p_content: props.content,
        p_content_hash: digest,
        p_agent: props.agent ?? null,
        p_description: props.description ?? null,
        p_status: props.status ?? null,
        p_embedding_model_id: props.embedding_model_id ?? DEFAULT_EMBEDDING_MODEL,
        p_chunk_contents: pieceTexts,
        p_chunk_embeddings: vectorStrings,
        // Strategy is taken from the first chunk; 'paragraph' when there are none.
        p_chunk_strategy: pieces[0]?.strategy ?? 'paragraph',
    };
    const outcome = await clients.supabase.rpc('document_update', rpcArgs);
    if (outcome.error) {
        throw new Error(`Failed to update document: ${outcome.error.message}`);
    }
}
94
/**
 * Update document fields without touching content, so nothing is re-embedded.
 *
 * Every optional field is forwarded to the `document_update_fields` RPC as
 * `p_<field>`; a field that is missing (null or undefined) is sent as null.
 *
 * @param clients `{ supabase }` client bundle (no OpenAI call is made here).
 * @param props   `{ id, ...optional fields }`.
 * @returns Nothing (resolves to undefined).
 * @throws Error when the RPC reports an error.
 */
export async function updateDocumentFields(clients, props) {
    // Order mirrors the RPC parameter list.
    const optionalFields = [
        'agent',
        'name',
        'domain',
        'document_type',
        'project',
        'protection',
        'owner_type',
        'owner_id',
        'is_auto_load',
        'description',
        'source_type',
        'source_url',
        'file_path',
        'file_permissions',
        'status',
        'skill_ref',
        'embedding_model_id',
    ];
    const rpcArgs = { p_id: props.id };
    for (const field of optionalFields) {
        rpcArgs[`p_${field}`] = props[field] ?? null;
    }
    const outcome = await clients.supabase.rpc('document_update_fields', rpcArgs);
    if (outcome.error) {
        throw new Error(`Failed to update document fields: ${outcome.error.message}`);
    }
}
124
/**
 * Soft delete a document by calling the `document_delete` RPC.
 * Per the module notes: the row stays in the database with deleted_at set,
 * its chunks are removed, and it can be brought back with restoreDocument()
 * within 30 days, after which document_purge() removes it permanently.
 *
 * @param clients `{ supabase }` client bundle.
 * @param id      ID of the document to delete.
 * @param agent   Agent identifier forwarded to the RPC as `p_agent`.
 * @returns Nothing (resolves to undefined).
 * @throws Error when the RPC reports an error.
 */
export async function deleteDocument(clients, id, agent) {
    const rpcArgs = { p_id: id, p_agent: agent };
    const outcome = await clients.supabase.rpc('document_delete', rpcArgs);
    if (outcome.error) {
        throw new Error(`Failed to delete document: ${outcome.error.message}`);
    }
}
138
/**
 * Undo a soft delete by calling the `document_restore` RPC.
 * Per the module notes: chunks were removed by the delete, so they have to be
 * regenerated afterwards by calling updateDocument() with the same content.
 *
 * @param clients `{ supabase }` client bundle.
 * @param id      ID of the document to restore.
 * @param agent   Agent identifier forwarded to the RPC as `p_agent`.
 * @returns Nothing (resolves to undefined).
 * @throws Error when the RPC reports an error.
 */
export async function restoreDocument(clients, id, agent) {
    const rpcArgs = { p_id: id, p_agent: agent };
    const outcome = await clients.supabase.rpc('document_restore', rpcArgs);
    if (outcome.error) {
        throw new Error(`Failed to restore document: ${outcome.error.message}`);
    }
}
@@ -0,0 +1,5 @@
1
+ // document-classification.ts
2
+ // Types and interfaces matching the database schema.
3
+ // Pure data definitions — no logic, no I/O, no dependencies.
4
+ // Every other file imports from here.
5
+ export {};
@@ -0,0 +1,89 @@
1
+ // document-fetching.ts
2
+ // Read documents from the database. No writes, no search — just SELECT queries.
3
+ // Every query filters deleted_at IS NULL so soft-deleted documents are invisible.
4
/**
 * Look up one active document by database ID.
 *
 * @param supabase Supabase client used to query the `documents` table.
 * @param id       Database ID of the document.
 * @returns The document row, or null when there is no such active document
 *          or the query failed. Failures other than PGRST116 are written to
 *          stderr before null is returned.
 */
export async function getDocumentById(supabase, id) {
    const lookup = supabase
        .from('documents')
        .select('*')
        .eq('id', id)
        .is('deleted_at', null)
        .single();
    const { data: row, error: failure } = await lookup;
    if (!failure) {
        return row ?? null;
    }
    // PGRST116 is treated as a plain "not found" and stays quiet.
    const isNotFound = failure.code === 'PGRST116';
    if (!isNotFound) {
        process.stderr.write(`[ledger] getDocumentById(${id}) failed: ${failure.message}\n`);
    }
    return null;
}
23
/**
 * Look up one active document by its unique name.
 *
 * @param supabase Supabase client used to query the `documents` table.
 * @param name     Name of the document.
 * @returns The document row, or null when there is no such active document
 *          or the query failed. Failures other than PGRST116 are written to
 *          stderr before null is returned.
 */
export async function getDocumentByName(supabase, name) {
    const lookup = supabase
        .from('documents')
        .select('*')
        .eq('name', name)
        .is('deleted_at', null)
        .single();
    const { data: row, error: failure } = await lookup;
    if (!failure) {
        return row ?? null;
    }
    // PGRST116 is treated as a plain "not found" and stays quiet.
    const isNotFound = failure.code === 'PGRST116';
    if (!isNotFound) {
        process.stderr.write(`[ledger] getDocumentByName("${name}") failed: ${failure.message}\n`);
    }
    return null;
}
42
/**
 * List active documents, newest first, optionally narrowed by filters.
 *
 * The base query excludes soft-deleted rows, orders by created_at descending
 * and caps the result at `filters.limit` (20 when not given). Each of
 * `domain`, `document_type` and `project` adds an equality filter when set
 * to a truthy value.
 *
 * @param supabase Supabase client used to query the `documents` table.
 * @param filters  Optional `{ limit, domain, document_type, project }`.
 * @returns Array of document rows; empty when nothing matches or the query
 *          failed (failures are written to stderr).
 */
export async function listDocuments(supabase, filters = {}) {
    const pageSize = filters.limit ?? 20;
    let query = supabase
        .from('documents')
        .select('*')
        .is('deleted_at', null)
        .order('created_at', { ascending: false })
        .limit(pageSize);
    // Equality filters are applied in a fixed order, skipping unset ones.
    for (const column of ['domain', 'document_type', 'project']) {
        const wanted = filters[column];
        if (wanted) {
            query = query.eq(column, wanted);
        }
    }
    const { data: rows, error: failure } = await query;
    if (!failure) {
        return rows ?? [];
    }
    process.stderr.write(`[ledger] listDocuments failed: ${failure.message}\n`);
    return [];
}
69
/**
 * Fetch every active document flagged for local sync.
 * Selection is by the is_auto_load flag (true) and deleted_at IS NULL;
 * domain plays no part in the query.
 *
 * @param supabase Supabase client used to query the `documents` table.
 * @returns Array of auto-load document rows; empty when there are none or
 *          the query failed (failures are written to stderr).
 */
export async function fetchSyncableDocuments(supabase) {
    const autoLoadQuery = supabase
        .from('documents')
        .select('*')
        .eq('is_auto_load', true)
        .is('deleted_at', null);
    const { data: rows, error: failure } = await autoLoadQuery;
    if (!failure) {
        return rows ?? [];
    }
    process.stderr.write(`[ledger] fetchSyncableDocuments failed: ${failure.message}\n`);
    return [];
}
@@ -0,0 +1,304 @@
1
+ // document-operations.ts
2
+ // Write operations — create, update, delete, restore documents.
3
+ // Each function prepares data (chunk, embed, hash) then calls a Postgres RPC function.
4
+ // The database handles transactions (document + chunks + audit = atomic).
5
+ import { readFileSync } from 'fs';
6
+ import { contentHash, chunkText, generateEmbeddingsBatch, toVectorString } from '../search/embeddings.js';
7
+ import { generateContextSummaries } from '../search/chunk-context-enrichment.js';
8
+ import { getDocumentById } from './fetching.js';
9
+ import { startTrace, startSpan } from '../observability.js';
10
+ const DEFAULT_EMBEDDING_MODEL = 'openai/text-embedding-3-small';
11
+ const DEFAULT_CHUNK_CONFIG = {
12
+ maxChunkSize: 1000,
13
+ overlapChars: 200,
14
+ strategy: 'recursive',
15
+ };
16
/**
 * Create a new document.
 *
 * Pipeline:
 *   1. Hash the content (contentHash).
 *   2. Chunk it (chunkText) using DEFAULT_CHUNK_CONFIG overlaid with `chunkConfig`.
 *   3. Generate a context summary and token count per chunk (generateContextSummaries).
 *   4. Embed `summary + "\n\n" + chunk content` for every chunk (generateEmbeddingsBatch).
 *   5. Call the `document_create` RPC with the document fields and chunk data.
 *
 * Each stage runs inside its own observability span and the whole call inside
 * one trace. Spans and the trace are closed in `finally` blocks so that a
 * failing stage does not leave them open.
 *
 * @param clients     `{ openai, supabase }` client bundle.
 * @param props       Document fields; optional ones fall back to the defaults below.
 * @param chunkConfig Optional overrides for DEFAULT_CHUNK_CONFIG.
 * @returns The value returned by the `document_create` RPC (recorded as `documentId`).
 * @throws Error when the RPC reports an error; errors from the earlier stages propagate as-is.
 */
export async function createDocument(clients, props, chunkConfig) {
    const config = { ...DEFAULT_CHUNK_CONFIG, ...chunkConfig };
    const hash = contentHash(props.content);
    const trace = startTrace('document-ingestion', {
        tags: ['ingestion', 'create'],
        metadata: { documentName: props.name, domain: props.domain, documentType: props.document_type },
        input: { contentLength: props.content.length },
    });
    try {
        // Chunk
        const chunkSpan = startSpan('chunking', { input: { contentLength: props.content.length } });
        let chunks;
        try {
            chunks = chunkText(props.content, config);
            // Guard the average so zero chunks reports 0 instead of NaN/Infinity.
            const avgChunkSize = chunks.length > 0 ? Math.round(props.content.length / chunks.length) : 0;
            chunkSpan.update({ output: { chunkCount: chunks.length, avgChunkSize } });
        }
        finally {
            chunkSpan.end();
        }
        const chunkContents = chunks.map(chunk => chunk.content);
        // Enrich — one context summary (and token count) per chunk
        const enrichSpan = startSpan('context-enrichment', { metadata: { chunkCount: chunks.length, model: 'gpt-4o-mini' } });
        let chunkSummaries;
        let chunkTokenCounts;
        try {
            const enrichmentResults = await generateContextSummaries(clients.openai, chunks, props.content);
            chunkSummaries = enrichmentResults.map(result => result.summary);
            chunkTokenCounts = enrichmentResults.map(result => result.tokenCount);
        }
        finally {
            enrichSpan.end();
        }
        // Embed — summary + "\n\n" + chunk content
        const embedSpan = startSpan('batch-embedding', { metadata: { chunkCount: chunks.length, model: 'text-embedding-3-small' } });
        let chunkEmbeddings;
        try {
            const embeddingInputs = chunks.map((chunk, index) => chunkSummaries[index] + '\n\n' + chunk.content);
            const embeddings = await generateEmbeddingsBatch(clients.openai, embeddingInputs);
            chunkEmbeddings = embeddings.map(toVectorString);
        }
        finally {
            embedSpan.end();
        }
        // DB write
        const dbSpan = startSpan('db-write', { input: { chunkCount: chunks.length } });
        let data;
        let error;
        try {
            ({ data, error } = await clients.supabase.rpc('document_create', {
                p_name: props.name,
                p_domain: props.domain,
                p_document_type: props.document_type,
                p_project: props.project ?? null,
                p_protection: props.protection ?? 'open',
                p_owner_type: props.owner_type ?? 'user',
                p_owner_id: props.owner_id ?? null,
                p_is_auto_load: props.is_auto_load ?? false,
                p_content: props.content,
                p_description: props.description ?? null,
                p_content_hash: hash,
                p_source_type: props.source_type ?? 'text',
                p_source_url: props.source_url ?? null,
                p_file_path: props.file_path ?? null,
                p_file_permissions: props.file_permissions ?? null,
                p_agent: props.agent ?? null,
                p_status: props.status ?? null,
                p_skill_ref: props.skill_ref ?? null,
                p_embedding_model_id: props.embedding_model_id ?? DEFAULT_EMBEDDING_MODEL,
                p_chunk_contents: chunkContents,
                p_chunk_embeddings: chunkEmbeddings,
                // Strategy comes from the first chunk; the configured one when there are none.
                p_chunk_strategy: chunks[0]?.strategy ?? config.strategy,
                p_chunk_summaries: chunkSummaries,
                p_chunk_token_counts: chunkTokenCounts,
                p_chunk_overlap: config.overlapChars,
            }));
            dbSpan.update({ output: { documentId: data } });
        }
        finally {
            dbSpan.end();
        }
        if (error)
            throw new Error(`Failed to create document "${props.name}" (${props.domain}/${props.document_type}): ${error.message}`);
        return data;
    }
    finally {
        trace.end();
    }
}
88
/**
 * Update a document's content. Triggers re-chunking, re-enrichment, and re-embedding.
 *
 * Pipeline:
 *   1. Hash the new content (contentHash).
 *   2. Chunk it (chunkText) using DEFAULT_CHUNK_CONFIG overlaid with `chunkConfig`.
 *   3. Generate a context summary and token count per chunk (generateContextSummaries).
 *   4. Embed `summary + "\n\n" + chunk content` for every chunk (generateEmbeddingsBatch).
 *   5. Call the `document_update` RPC with the new content, hash and chunk data.
 *
 * Each stage runs inside its own observability span and the whole call inside
 * one trace. Spans and the trace are closed in `finally` blocks so that a
 * failing stage does not leave them open.
 *
 * @param clients     `{ openai, supabase }` client bundle.
 * @param props       `{ id, content, agent?, description?, status?, embedding_model_id? }`.
 * @param chunkConfig Optional overrides for DEFAULT_CHUNK_CONFIG.
 * @returns Nothing (resolves to undefined).
 * @throws Error when the RPC reports an error; errors from the earlier stages propagate as-is.
 */
export async function updateDocument(clients, props, chunkConfig) {
    const config = { ...DEFAULT_CHUNK_CONFIG, ...chunkConfig };
    const hash = contentHash(props.content);
    const trace = startTrace('document-ingestion', {
        tags: ['ingestion', 'update'],
        metadata: { documentId: props.id },
        input: { contentLength: props.content.length },
    });
    try {
        // Chunk
        const chunkSpan = startSpan('chunking', { input: { contentLength: props.content.length } });
        let chunks;
        try {
            chunks = chunkText(props.content, config);
            // Guard the average so zero chunks reports 0 instead of NaN/Infinity.
            const avgChunkSize = chunks.length > 0 ? Math.round(props.content.length / chunks.length) : 0;
            chunkSpan.update({ output: { chunkCount: chunks.length, avgChunkSize } });
        }
        finally {
            chunkSpan.end();
        }
        const chunkContents = chunks.map(chunk => chunk.content);
        // Enrich — one context summary (and token count) per chunk
        const enrichSpan = startSpan('context-enrichment', { metadata: { chunkCount: chunks.length, model: 'gpt-4o-mini' } });
        let chunkSummaries;
        let chunkTokenCounts;
        try {
            const enrichmentResults = await generateContextSummaries(clients.openai, chunks, props.content);
            chunkSummaries = enrichmentResults.map(result => result.summary);
            chunkTokenCounts = enrichmentResults.map(result => result.tokenCount);
        }
        finally {
            enrichSpan.end();
        }
        // Embed — summary + "\n\n" + chunk content
        const embedSpan = startSpan('batch-embedding', { metadata: { chunkCount: chunks.length, model: 'text-embedding-3-small' } });
        let chunkEmbeddings;
        try {
            const embeddingInputs = chunks.map((chunk, index) => chunkSummaries[index] + '\n\n' + chunk.content);
            const embeddings = await generateEmbeddingsBatch(clients.openai, embeddingInputs);
            chunkEmbeddings = embeddings.map(toVectorString);
        }
        finally {
            embedSpan.end();
        }
        // DB write
        const dbSpan = startSpan('db-write', { input: { chunkCount: chunks.length } });
        let error;
        try {
            ({ error } = await clients.supabase.rpc('document_update', {
                p_id: props.id,
                p_content: props.content,
                p_content_hash: hash,
                p_agent: props.agent ?? null,
                p_description: props.description ?? null,
                p_status: props.status ?? null,
                p_embedding_model_id: props.embedding_model_id ?? DEFAULT_EMBEDDING_MODEL,
                p_chunk_contents: chunkContents,
                p_chunk_embeddings: chunkEmbeddings,
                // Strategy comes from the first chunk; the configured one when there are none.
                p_chunk_strategy: chunks[0]?.strategy ?? config.strategy,
                p_chunk_summaries: chunkSummaries,
                p_chunk_token_counts: chunkTokenCounts,
                p_chunk_overlap: config.overlapChars,
            }));
        }
        finally {
            dbSpan.end();
        }
        if (error)
            throw new Error(`Failed to update document #${props.id}: ${error.message}`);
    }
    finally {
        trace.end();
    }
}
142
/**
 * Change document fields while leaving content alone, so nothing is
 * re-chunked or re-embedded (no OpenAI call is made here).
 *
 * Every optional field is forwarded to the `document_update_fields` RPC;
 * a field that is missing (null or undefined) is sent as null.
 *
 * @param clients `{ supabase }` client bundle.
 * @param props   `{ id, ...optional fields }`.
 * @returns Nothing (resolves to undefined).
 * @throws Error when the RPC reports an error.
 */
export async function updateDocumentFields(clients, props) {
    const orNull = (value) => value ?? null;
    const rpcArgs = {
        p_id: props.id,
        p_agent: orNull(props.agent),
        p_name: orNull(props.name),
        p_domain: orNull(props.domain),
        p_document_type: orNull(props.document_type),
        p_project: orNull(props.project),
        p_protection: orNull(props.protection),
        p_owner_type: orNull(props.owner_type),
        p_owner_id: orNull(props.owner_id),
        p_is_auto_load: orNull(props.is_auto_load),
        p_description: orNull(props.description),
        p_source_type: orNull(props.source_type),
        p_source_url: orNull(props.source_url),
        p_file_path: orNull(props.file_path),
        p_file_permissions: orNull(props.file_permissions),
        p_status: orNull(props.status),
        p_skill_ref: orNull(props.skill_ref),
        p_embedding_model_id: orNull(props.embedding_model_id),
    };
    const outcome = await clients.supabase.rpc('document_update_fields', rpcArgs);
    if (outcome.error) {
        throw new Error(`Failed to update fields on document #${props.id}: ${outcome.error.message}`);
    }
}
172
/**
 * Soft delete a document by calling the `document_delete` RPC.
 * Per the module notes: the row stays in the database with deleted_at set,
 * its chunks are removed, and it can be brought back with restoreDocument()
 * within 30 days, after which document_purge() removes it permanently.
 *
 * @param clients `{ supabase }` client bundle.
 * @param id      ID of the document to delete.
 * @param agent   Agent identifier forwarded to the RPC as `p_agent`.
 * @returns Nothing (resolves to undefined).
 * @throws Error when the RPC reports an error.
 */
export async function deleteDocument(clients, id, agent) {
    const rpcArgs = { p_id: id, p_agent: agent };
    const outcome = await clients.supabase.rpc('document_delete', rpcArgs);
    if (outcome.error) {
        throw new Error(`Failed to delete document #${id}: ${outcome.error.message}`);
    }
}
186
/**
 * Undo a soft delete by calling the `document_restore` RPC.
 * Per the module notes: chunks were removed by the delete, so they have to be
 * regenerated afterwards by calling updateDocument() with the same content.
 *
 * @param clients `{ supabase }` client bundle.
 * @param id      ID of the document to restore.
 * @param agent   Agent identifier forwarded to the RPC as `p_agent`.
 * @returns Nothing (resolves to undefined).
 * @throws Error when the RPC reports an error.
 */
export async function restoreDocument(clients, id, agent) {
    const rpcArgs = { p_id: id, p_agent: agent };
    const outcome = await clients.supabase.rpc('document_restore', rpcArgs);
    if (outcome.error) {
        throw new Error(`Failed to restore document #${id}: ${outcome.error.message}`);
    }
}
199
/**
 * Error raised when the content pulled back after a write differs from the
 * content that was sent.
 *
 * Exposes the document id, the two lengths that were compared, and a one-line
 * preview describing the first divergence. How the error is presented is left
 * to the caller; nothing here attempts a rollback.
 */
export class VerifyMismatchError extends Error {
    id;
    expectedLength;
    actualLength;
    diffPreview;
    /**
     * @param id             ID of the document that failed verification.
     * @param expectedLength Length of the content that was pushed.
     * @param actualLength   Length of the content that was pulled back.
     * @param diffPreview    One-line description of the first divergence.
     */
    constructor(id, expectedLength, actualLength, diffPreview) {
        const summary = `Verify mismatch on document ${id}: pushed ${expectedLength} bytes, pulled ${actualLength} bytes. ${diffPreview}`;
        super(summary);
        Object.assign(this, { id, expectedLength, actualLength, diffPreview });
        this.name = 'VerifyMismatchError';
    }
}
221
/**
 * Locate the first position at which two strings differ and describe it on
 * one line: `line L, col C: expected '<snippet>' but got '<snippet>'`.
 *
 * Line and column are 1-based and counted in `expected`, with '\n' as the
 * line separator. Positions are string indices (UTF-16 code units). Each
 * snippet is up to 40 characters starting at the divergence; carriage
 * returns, newlines and tabs in it are shown as `\r`, `\n` and `\t`, so the
 * preview stays on one line even when the difference is a line ending.
 *
 * @param expected The content that was written.
 * @param actual   The content that was read back.
 * @returns The preview string, or null when the strings are identical.
 */
function buildDiffPreview(expected, actual) {
    if (expected === actual)
        return null;
    // If one string is a prefix of the other, the divergence is at the end of the shorter one.
    const sharedLength = Math.min(expected.length, actual.length);
    let diffIndex = 0;
    while (diffIndex < sharedLength && expected[diffIndex] === actual[diffIndex]) {
        diffIndex++;
    }
    const before = expected.slice(0, diffIndex);
    const line = before.split('\n').length;
    // lastIndexOf returns -1 when there is no newline, which yields the 1-based column directly.
    const col = diffIndex - before.lastIndexOf('\n');
    const SNIPPET_LENGTH = 40;
    const escapeControlChars = (snippet) => snippet
        .replace(/\r/g, '\\r')
        .replace(/\n/g, '\\n')
        .replace(/\t/g, '\\t');
    const expectedSnippet = escapeControlChars(expected.slice(diffIndex, diffIndex + SNIPPET_LENGTH));
    const actualSnippet = escapeControlChars(actual.slice(diffIndex, diffIndex + SNIPPET_LENGTH));
    return `line ${line}, col ${col}: expected '${expectedSnippet}' but got '${actualSnippet}'`;
}
252
/**
 * Read the document back and compare its content with what was just written.
 *
 * @param clients      `{ supabase }` client bundle.
 * @param id           ID of the document to re-read.
 * @param expectedBody The content that was sent in the write.
 * @returns Nothing when the stored content is identical to `expectedBody`.
 * @throws VerifyMismatchError when the document cannot be fetched, or when
 *         its content differs from `expectedBody`.
 */
async function verifyAfterWrite(clients, id, expectedBody) {
    const stored = await getDocumentById(clients.supabase, id);
    if (stored) {
        const mismatch = buildDiffPreview(expectedBody, stored.content);
        if (mismatch === null) {
            return;
        }
        throw new VerifyMismatchError(id, expectedBody.length, stored.content.length, mismatch);
    }
    // Nothing came back at all: report it as a mismatch against zero length.
    throw new VerifyMismatchError(id, expectedBody.length, 0, 'document not found during verify (deleted between write and verify, or wrong id)');
}
265
/**
 * Replace a document's content with the contents of a file, then verify the write.
 *
 * Steps:
 *   1. Read `props.filePath` as utf8; filesystem errors propagate unchanged.
 *   2. Call updateDocument() with the file contents as the new content.
 *   3. Re-read the document and compare it with what was sent.
 *
 * The file contents are passed through as read — no trimming or normalization.
 *
 * @param clients `{ openai, supabase }` client bundle.
 * @param props   `{ id, filePath, agent? }`.
 * @returns `{ id, verified: true, bytes }`. `bytes` is the length of the
 *          decoded string, which can differ from the on-disk byte count for
 *          non-ASCII content.
 * @throws VerifyMismatchError when the read-back content differs.
 */
export async function updateDocumentFromFile(clients, props) {
    const { id, filePath, agent } = props;
    const body = readFileSync(filePath, 'utf8');
    const update = { id, content: body, agent };
    await updateDocument(clients, update);
    await verifyAfterWrite(clients, id, body);
    return { id, verified: true, bytes: body.length };
}
282
/**
 * Create a document whose content comes from a file, then verify the write.
 *
 * Steps:
 *   1. Read `props.filePath` as utf8; filesystem errors propagate unchanged.
 *   2. Call createDocument() with the file contents and the given fields.
 *   3. Re-read the new document and compare it with what was sent.
 *
 * If verification fails the document has already been created; this function
 * does not delete it, so cleanup is up to the caller.
 *
 * @param clients `{ openai, supabase }` client bundle.
 * @param props   `{ filePath, name, domain, document_type, description?,
 *                project?, agent?, status?, protection? }`.
 * @returns `{ id, verified: true, bytes }`. `bytes` is the length of the
 *          decoded string, which can differ from the on-disk byte count for
 *          non-ASCII content.
 * @throws VerifyMismatchError when the read-back content differs.
 */
export async function createDocumentFromFile(clients, props) {
    const { filePath, name, domain, document_type, description, project, agent, status, protection } = props;
    const body = readFileSync(filePath, 'utf8');
    const newDocument = {
        name,
        domain,
        document_type,
        content: body,
        description,
        project,
        agent,
        status,
        protection,
    };
    const id = await createDocument(clients, newDocument);
    await verifyAfterWrite(clients, id, body);
    return { id, verified: true, bytes: body.length };
}