@aperdomoll90/ledger-ai 1.3.0 → 1.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +177 -221
- package/dist/commands/add.js +51 -100
- package/dist/commands/backfill.js +55 -0
- package/dist/commands/backup.js +10 -10
- package/dist/commands/check.js +21 -29
- package/dist/commands/config.js +13 -12
- package/dist/commands/delete.js +22 -17
- package/dist/commands/eval-judge.js +11 -0
- package/dist/commands/eval.js +321 -0
- package/dist/commands/export.js +8 -10
- package/dist/commands/get.js +9 -0
- package/dist/commands/hunt.js +206 -0
- package/dist/commands/ingest.js +15 -14
- package/dist/commands/init.js +18 -20
- package/dist/commands/list.js +21 -7
- package/dist/commands/migrate.js +11 -11
- package/dist/commands/onboard.js +2 -2
- package/dist/commands/pull.js +3 -2
- package/dist/commands/push.js +8 -8
- package/dist/commands/restore.js +38 -38
- package/dist/commands/show.js +13 -16
- package/dist/commands/sync.js +58 -19
- package/dist/commands/tag.js +20 -14
- package/dist/commands/update.js +50 -18
- package/dist/commands/wizard.js +3 -3
- package/dist/lib/ai-search.js +163 -0
- package/dist/lib/audit.js +19 -0
- package/dist/lib/backfill.js +60 -0
- package/dist/lib/config.js +19 -2
- package/dist/lib/document-classification.js +5 -0
- package/dist/lib/document-fetching.js +77 -0
- package/dist/lib/document-operations.js +150 -0
- package/dist/lib/documents/classification.js +5 -0
- package/dist/lib/documents/fetching.js +89 -0
- package/dist/lib/documents/operations.js +304 -0
- package/dist/lib/domains.js +116 -0
- package/dist/lib/embeddings.js +190 -0
- package/dist/lib/errors.js +3 -1
- package/dist/lib/eval/eval-advanced.js +289 -0
- package/dist/lib/eval/eval-judge-session.js +233 -0
- package/dist/lib/eval/eval-store.js +105 -0
- package/dist/lib/eval/eval.js +303 -0
- package/dist/lib/file-writer.js +23 -0
- package/dist/lib/generators.js +44 -45
- package/dist/lib/hunter-db.js +235 -0
- package/dist/lib/hunter-rss.js +30 -0
- package/dist/lib/hunter-scoring.js +55 -0
- package/dist/lib/hunter-types.js +36 -0
- package/dist/lib/lint-configs.js +20 -0
- package/dist/lib/migrate.js +2 -2
- package/dist/lib/notes.js +173 -59
- package/dist/lib/observability.js +296 -0
- package/dist/lib/op-add-note-types.test.js +7 -6
- package/dist/lib/prompt.js +8 -8
- package/dist/lib/rate-limiter.js +103 -0
- package/dist/lib/search/ai-search.js +396 -0
- package/dist/lib/search/chunk-context-enrichment.js +155 -0
- package/dist/lib/search/embeddings.js +293 -0
- package/dist/lib/search/reranker.js +120 -0
- package/dist/lib/search/semantic-cache.js +53 -0
- package/dist/lib/type-registry.test.js +6 -6
- package/dist/mcp-server.js +553 -66
- package/dist/migrations/migrations/005-audit-log.sql +22 -0
- package/dist/migrations/migrations/005_opportunities.sql +48 -0
- package/dist/migrations/migrations/006-audited-operations.sql +235 -0
- package/dist/migrations/migrations/006_hunt_analytics.sql +38 -0
- package/dist/migrations/migrations/007-eval-golden-judgments.sql +119 -0
- package/dist/migrations/migrations/008-drop-expected-doc-ids.sql +9 -0
- package/dist/migrations/migrations/008-judge-helpers.sql +21 -0
- package/dist/migrations/migrations/009-semantic-cache.sql +216 -0
- package/dist/scripts/batch-grade.js +344 -0
- package/dist/scripts/benchmark-ingestion.js +376 -0
- package/dist/scripts/convert-judgments-to-graded.js +88 -0
- package/dist/scripts/diagnose-first-result.js +333 -0
- package/dist/scripts/drop-golden-query.js +53 -0
- package/dist/scripts/eval-search.js +115 -0
- package/dist/scripts/grade-unjudged-top1.js +138 -0
- package/dist/scripts/hunter-analytics.js +38 -0
- package/dist/scripts/hunter-cron.js +63 -0
- package/dist/scripts/hunter-purge.js +25 -0
- package/dist/scripts/migrate-v2.js +140 -0
- package/dist/scripts/reindex.js +74 -0
- package/dist/scripts/sync-local-docs.js +153 -0
- package/package.json +7 -1
package/dist/lib/config.js
CHANGED
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
import dotenv from 'dotenv';
|
|
2
2
|
import { createClient } from '@supabase/supabase-js';
|
|
3
3
|
import OpenAI from 'openai';
|
|
4
|
+
import { observeOpenAI } from '@langfuse/openai';
|
|
4
5
|
import { resolve } from 'path';
|
|
5
6
|
import { homedir } from 'os';
|
|
6
7
|
import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
|
|
7
8
|
import { fatal, ExitCode } from './errors.js';
|
|
9
|
+
import { openaiLimiter, updateLimitsFromHeaders } from './rate-limiter.js';
|
|
10
|
+
import { initObservability } from './observability.js';
|
|
8
11
|
// --- Defaults ---
|
|
9
12
|
const LEDGER_DIR = resolve(homedir(), '.ledger');
|
|
10
13
|
const LEDGER_DOTENV = resolve(LEDGER_DIR, '.env');
|
|
@@ -45,13 +48,24 @@ export function getDefaultConfig() {
|
|
|
45
48
|
},
|
|
46
49
|
};
|
|
47
50
|
}
|
|
51
|
+
// --- Custom fetch for rate limit header interception ---
// Wraps the global fetch to read OpenAI's rate limit headers on every response.
// It is handed to the OpenAI client through its `fetch` option, so header
// reading keeps working regardless of how the client object is wrapped.
/**
 * Fetch replacement that forwards the request unchanged and then feeds the
 * response headers to the shared OpenAI rate limiter.
 *
 * Header bookkeeping is best-effort: if updating the limiter throws, the
 * failure is reported on stderr and the response is still returned, so a
 * completed API call is never discarded because of a bookkeeping problem.
 *
 * @param input - Request URL or Request object, passed straight to fetch.
 * @param init - Optional fetch init options, passed straight to fetch.
 * @returns The untouched fetch Response.
 */
const openaiHeaderFetch = async (input, init) => {
    const response = await fetch(input, init);
    try {
        await updateLimitsFromHeaders(openaiLimiter, response.headers);
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        process.stderr.write(`[ledger] rate limit header update failed: ${reason}\n`);
    }
    return response;
};
|
|
48
60
|
// --- Load Config ---
|
|
49
|
-
export function loadConfig() {
|
|
61
|
+
export function loadConfig(options) {
|
|
50
62
|
// Priority: env vars > DOTENV_CONFIG_PATH > ~/.ledger/.env
|
|
51
63
|
const dotenvPath = process.env.DOTENV_CONFIG_PATH
|
|
52
64
|
|| (existsSync(LEDGER_DOTENV) ? LEDGER_DOTENV : undefined);
|
|
53
65
|
if (dotenvPath)
|
|
54
66
|
dotenv.config({ path: dotenvPath, quiet: true });
|
|
67
|
+
// Init observability after dotenv loads (Langfuse env vars are now available)
|
|
68
|
+
initObservability();
|
|
55
69
|
if (!process.env.SUPABASE_URL || !process.env.SUPABASE_SERVICE_ROLE_KEY) {
|
|
56
70
|
fatal('Missing SUPABASE_URL or SUPABASE_SERVICE_ROLE_KEY. Run `ledger init` or check your .env file.', ExitCode.GENERAL_ERROR);
|
|
57
71
|
}
|
|
@@ -63,6 +77,9 @@ export function loadConfig() {
|
|
|
63
77
|
memoryDir: process.env.LEDGER_MEMORY_DIR || fileConfig.memoryDir || DEFAULT_MEMORY_DIR,
|
|
64
78
|
claudeMdPath: process.env.LEDGER_CLAUDE_MD_PATH || fileConfig.claudeMdPath || DEFAULT_CLAUDE_MD_PATH,
|
|
65
79
|
supabase: createClient(process.env.SUPABASE_URL, process.env.SUPABASE_SERVICE_ROLE_KEY),
|
|
66
|
-
openai: new OpenAI({ apiKey: process.env.OPENAI_API_KEY }),
|
|
80
|
+
openai: observeOpenAI(new OpenAI({ apiKey: process.env.OPENAI_API_KEY, maxRetries: 5, fetch: openaiHeaderFetch })),
|
|
81
|
+
cohereApiKey: process.env.COHERE_API_KEY || undefined,
|
|
82
|
+
sessionId: options?.sessionId,
|
|
83
|
+
observabilityEnvironment: options?.observabilityEnvironment,
|
|
67
84
|
};
|
|
68
85
|
}
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
// document-fetching.ts
|
|
2
|
+
// Read documents from the database. No writes, no search — just SELECT queries.
|
|
3
|
+
// Every query filters deleted_at IS NULL so soft-deleted documents are invisible.
|
|
4
|
+
/**
 * Get a single document by its database ID.
 *
 * Soft-deleted rows are excluded by the `deleted_at IS NULL` filter.
 * Unexpected query failures are written to stderr instead of being silently
 * reported as "not found"; the return value is still null, so callers are
 * unaffected.
 *
 * @param supabase - Supabase client used to query the `documents` table.
 * @param id - Value matched against the `id` column.
 * @returns The document row, or null if no active document matches or the query failed.
 */
export async function getDocumentById(supabase, id) {
    const { data, error } = await supabase
        .from('documents')
        .select('*')
        .eq('id', id)
        .is('deleted_at', null)
        .single();
    if (error) {
        // PGRST116 is the code .single() yields when the result is not exactly
        // one row — the ordinary "not found" case, so it is not logged.
        if (error.code !== 'PGRST116') {
            process.stderr.write(`[ledger] getDocumentById(${id}) failed: ${error.message}\n`);
        }
        return null;
    }
    return data || null;
}
|
|
19
|
+
/**
 * Get a single document by its unique name.
 *
 * Soft-deleted rows are excluded by the `deleted_at IS NULL` filter.
 * Unexpected query failures are written to stderr instead of being silently
 * reported as "not found"; the return value is still null, so callers are
 * unaffected.
 *
 * @param supabase - Supabase client used to query the `documents` table.
 * @param name - Value matched against the `name` column.
 * @returns The document row, or null if no active document matches or the query failed.
 */
export async function getDocumentByName(supabase, name) {
    const { data, error } = await supabase
        .from('documents')
        .select('*')
        .eq('name', name)
        .is('deleted_at', null)
        .single();
    if (error) {
        // PGRST116 is the code .single() yields when the result is not exactly
        // one row — the ordinary "not found" case, so it is not logged.
        if (error.code !== 'PGRST116') {
            process.stderr.write(`[ledger] getDocumentByName("${name}") failed: ${error.message}\n`);
        }
        return null;
    }
    return data || null;
}
|
|
34
|
+
/**
 * List documents with optional filters. Returns newest first.
 * All filters are optional — no filters = list active documents up to the limit.
 *
 * Only rows with `deleted_at IS NULL` are returned, ordered by `created_at`
 * descending. A query failure is written to stderr (instead of being silently
 * hidden) and an empty list is returned, so callers are unaffected.
 *
 * @param supabase - Supabase client used to query the `documents` table.
 * @param filters - Optional `domain`, `document_type`, `project` equality
 *   filters and a `limit` (defaults to 20 when null/undefined).
 * @returns Array of matching document rows; empty on no match or on error.
 */
export async function listDocuments(supabase, filters = {}) {
    let query = supabase
        .from('documents')
        .select('*')
        .is('deleted_at', null)
        .order('created_at', { ascending: false })
        .limit(filters.limit ?? 20);
    if (filters.domain) {
        query = query.eq('domain', filters.domain);
    }
    if (filters.document_type) {
        query = query.eq('document_type', filters.document_type);
    }
    if (filters.project) {
        query = query.eq('project', filters.project);
    }
    const { data, error } = await query;
    if (error) {
        process.stderr.write(`[ledger] listDocuments failed: ${error.message}\n`);
        return [];
    }
    return data || [];
}
|
|
59
|
+
/**
 * Fetch all documents that should sync to every machine.
 * Sync is driven by is_auto_load, not domain — a document syncs locally because
 * it needs to be in the AI's context every session (CLAUDE.md, MEMORY.md,
 * personality, behavioral rules). Everything else stays in the database and is
 * accessed via search on demand, regardless of domain.
 *
 * Selects rows where `is_auto_load = true` and `deleted_at IS NULL`. A query
 * failure is written to stderr (instead of being silently hidden) and an empty
 * list is returned, so callers are unaffected.
 *
 * @param supabase - Supabase client used to query the `documents` table.
 * @returns Array of auto-load document rows; empty on no match or on error.
 */
export async function fetchSyncableDocuments(supabase) {
    const { data, error } = await supabase
        .from('documents')
        .select('*')
        .eq('is_auto_load', true)
        .is('deleted_at', null);
    if (error) {
        process.stderr.write(`[ledger] fetchSyncableDocuments failed: ${error.message}\n`);
        return [];
    }
    return data || [];
}
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
// document-operations.ts
|
|
2
|
+
// Write operations — create, update, delete, restore documents.
|
|
3
|
+
// Each function prepares data (chunk, embed, hash) then calls a Postgres RPC function.
|
|
4
|
+
// The database handles transactions (document + chunks + audit = atomic).
|
|
5
|
+
import { contentHash, chunkText, generateEmbedding, toVectorString } from './embeddings.js';
|
|
6
|
+
// Embedding model id sent to the database RPCs as p_embedding_model_id when the
// caller does not supply props.embedding_model_id.
const DEFAULT_EMBEDDING_MODEL = 'openai/text-embedding-3-small';
|
|
7
|
+
/**
 * Create a new document.
 *
 * Steps:
 *   1. Hash the content (always derived from props.content, never supplied by the caller)
 *   2. Split the content into chunks
 *   3. Generate one embedding per chunk, sequentially (one OpenAI call each)
 *   4. Convert every embedding to a Postgres vector string
 *   5. Call the document_create RPC with the document fields and chunk arrays
 *   6. Return whatever the RPC returns as data (the new document's ID)
 *
 * @param clients - Object providing `openai` and `supabase` clients.
 * @param props - Document fields; `name`, `domain`, `document_type` and `content`
 *   are passed through as-is, the remaining fields fall back to defaults.
 * @returns The `data` value returned by the document_create RPC.
 * @throws Error when the RPC reports an error.
 */
export async function createDocument(clients, props) {
    const { content } = props;
    const hash = contentHash(content);
    const chunks = chunkText(content);
    // Walk the chunks in order, awaiting each embedding before requesting the next.
    const chunkContents = [];
    const chunkEmbeddings = [];
    for (let index = 0; index < chunks.length; index += 1) {
        chunkContents.push(chunks[index].content);
    }
    for (let index = 0; index < chunks.length; index += 1) {
        const vector = await generateEmbedding(clients.openai, chunks[index].content);
        chunkEmbeddings.push(toVectorString(vector));
    }
    const rpcArgs = {
        p_name: props.name,
        p_domain: props.domain,
        p_document_type: props.document_type,
        p_project: props.project ?? null,
        p_protection: props.protection ?? 'open',
        p_owner_type: props.owner_type ?? 'user',
        p_owner_id: props.owner_id ?? null,
        p_is_auto_load: props.is_auto_load ?? false,
        p_content: content,
        p_description: props.description ?? null,
        p_content_hash: hash,
        p_source_type: props.source_type ?? 'text',
        p_source_url: props.source_url ?? null,
        p_file_path: props.file_path ?? null,
        p_file_permissions: props.file_permissions ?? null,
        p_agent: props.agent ?? null,
        p_status: props.status ?? null,
        p_skill_ref: props.skill_ref ?? null,
        p_embedding_model_id: props.embedding_model_id ?? DEFAULT_EMBEDDING_MODEL,
        p_chunk_contents: chunkContents,
        p_chunk_embeddings: chunkEmbeddings,
        // Strategy is taken from the first chunk; 'paragraph' when there are no chunks.
        p_chunk_strategy: chunks[0]?.strategy ?? 'paragraph',
    };
    const outcome = await clients.supabase.rpc('document_create', rpcArgs);
    if (outcome.error) {
        throw new Error(`Failed to create document: ${outcome.error.message}`);
    }
    return outcome.data;
}
|
|
57
|
+
/**
 * Update a document's content. Triggers re-chunking and re-embedding.
 *
 * Steps:
 *   1. Hash the new content
 *   2. Split the new content into chunks
 *   3. Generate one embedding per chunk, sequentially (one OpenAI call each)
 *   4. Call the document_update RPC. Per the module notes, Postgres then saves
 *      the old content as a version, updates the row, replaces the chunks and
 *      writes an audit entry (not visible from this file).
 *
 * @param clients - Object providing `openai` and `supabase` clients.
 * @param props - `id` and `content` plus optional `agent`, `description`,
 *   `status` and `embedding_model_id`.
 * @throws Error when the RPC reports an error.
 */
export async function updateDocument(clients, props) {
    const { content } = props;
    const hash = contentHash(content);
    const chunks = chunkText(content);
    // Walk the chunks in order, awaiting each embedding before requesting the next.
    const chunkContents = [];
    const chunkEmbeddings = [];
    for (let index = 0; index < chunks.length; index += 1) {
        chunkContents.push(chunks[index].content);
    }
    for (let index = 0; index < chunks.length; index += 1) {
        const vector = await generateEmbedding(clients.openai, chunks[index].content);
        chunkEmbeddings.push(toVectorString(vector));
    }
    const rpcArgs = {
        p_id: props.id,
        p_content: content,
        p_content_hash: hash,
        p_agent: props.agent ?? null,
        p_description: props.description ?? null,
        p_status: props.status ?? null,
        p_embedding_model_id: props.embedding_model_id ?? DEFAULT_EMBEDDING_MODEL,
        p_chunk_contents: chunkContents,
        p_chunk_embeddings: chunkEmbeddings,
        // Strategy is taken from the first chunk; 'paragraph' when there are no chunks.
        p_chunk_strategy: chunks[0]?.strategy ?? 'paragraph',
    };
    const outcome = await clients.supabase.rpc('document_update', rpcArgs);
    if (outcome.error) {
        throw new Error(`Failed to update document: ${outcome.error.message}`);
    }
}
|
|
94
|
+
/**
 * Update document fields without changing content. No re-embedding needed.
 *
 * Makes no OpenAI calls: every field is forwarded to the document_update_fields
 * RPC, with any field that is null/undefined on props sent as null.
 * Per the module notes, Postgres updates the columns, syncs the domain to
 * chunks if it changed, and writes the audit entry.
 *
 * @param clients - Object providing the `supabase` client.
 * @param props - `id` plus any subset of the updatable fields.
 * @throws Error when the RPC reports an error.
 */
export async function updateDocumentFields(clients, props) {
    const orNull = (value) => value ?? null;
    const rpcArgs = {
        p_id: props.id,
        p_agent: orNull(props.agent),
        p_name: orNull(props.name),
        p_domain: orNull(props.domain),
        p_document_type: orNull(props.document_type),
        p_project: orNull(props.project),
        p_protection: orNull(props.protection),
        p_owner_type: orNull(props.owner_type),
        p_owner_id: orNull(props.owner_id),
        p_is_auto_load: orNull(props.is_auto_load),
        p_description: orNull(props.description),
        p_source_type: orNull(props.source_type),
        p_source_url: orNull(props.source_url),
        p_file_path: orNull(props.file_path),
        p_file_permissions: orNull(props.file_permissions),
        p_status: orNull(props.status),
        p_skill_ref: orNull(props.skill_ref),
        p_embedding_model_id: orNull(props.embedding_model_id),
    };
    const outcome = await clients.supabase.rpc('document_update_fields', rpcArgs);
    if (outcome.error) {
        throw new Error(`Failed to update document fields: ${outcome.error.message}`);
    }
}
|
|
124
|
+
/**
 * Soft delete a document by calling the document_delete RPC.
 *
 * Per the module notes: the row stays in the database with deleted_at set,
 * its chunks are removed (search shouldn't find deleted documents), it can be
 * restored within 30 days via restoreDocument(), and after 30 days
 * document_purge() permanently removes it. That logic lives in the database.
 *
 * @param clients - Object providing the `supabase` client.
 * @param id - Document id, sent as p_id.
 * @param agent - Acting agent, sent as p_agent.
 * @throws Error when the RPC reports an error.
 */
export async function deleteDocument(clients, id, agent) {
    const rpcArgs = { p_id: id, p_agent: agent };
    const outcome = await clients.supabase.rpc('document_delete', rpcArgs);
    if (outcome.error) {
        throw new Error(`Failed to delete document: ${outcome.error.message}`);
    }
}
|
|
138
|
+
/**
 * Undo a soft delete by calling the document_restore RPC, making the document
 * active again.
 *
 * Note from the module: chunks were removed during delete — they need to be
 * regenerated by calling updateDocument() with the same content (which
 * re-chunks and re-embeds). This function does not do that itself.
 *
 * @param clients - Object providing the `supabase` client.
 * @param id - Document id, sent as p_id.
 * @param agent - Acting agent, sent as p_agent.
 * @throws Error when the RPC reports an error.
 */
export async function restoreDocument(clients, id, agent) {
    const rpcArgs = { p_id: id, p_agent: agent };
    const outcome = await clients.supabase.rpc('document_restore', rpcArgs);
    if (outcome.error) {
        throw new Error(`Failed to restore document: ${outcome.error.message}`);
    }
}
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
// document-fetching.ts
|
|
2
|
+
// Read documents from the database. No writes, no search — just SELECT queries.
|
|
3
|
+
// Every query filters deleted_at IS NULL so soft-deleted documents are invisible.
|
|
4
|
+
/**
 * Look up one active document by database ID.
 *
 * Rows with a non-null deleted_at are filtered out. A PGRST116 error (the
 * result was not exactly one row) is treated as a plain "not found"; any other
 * error is written to stderr. Both cases return null.
 *
 * @param supabase - Supabase client used to query the `documents` table.
 * @param id - Value matched against the `id` column.
 * @returns The document row, or null when nothing matched or the query failed.
 */
export async function getDocumentById(supabase, id) {
    const lookup = supabase
        .from('documents')
        .select('*')
        .eq('id', id)
        .is('deleted_at', null);
    const { data, error } = await lookup.single();
    if (!error) {
        return data ?? null;
    }
    const isNotFound = error.code === 'PGRST116';
    if (!isNotFound) {
        process.stderr.write(`[ledger] getDocumentById(${id}) failed: ${error.message}\n`);
    }
    return null;
}
|
|
23
|
+
/**
 * Look up one active document by its unique name.
 *
 * Rows with a non-null deleted_at are filtered out. A PGRST116 error (the
 * result was not exactly one row) is treated as a plain "not found"; any other
 * error is written to stderr. Both cases return null.
 *
 * @param supabase - Supabase client used to query the `documents` table.
 * @param name - Value matched against the `name` column.
 * @returns The document row, or null when nothing matched or the query failed.
 */
export async function getDocumentByName(supabase, name) {
    const lookup = supabase
        .from('documents')
        .select('*')
        .eq('name', name)
        .is('deleted_at', null);
    const { data, error } = await lookup.single();
    if (!error) {
        return data ?? null;
    }
    const isNotFound = error.code === 'PGRST116';
    if (!isNotFound) {
        process.stderr.write(`[ledger] getDocumentByName("${name}") failed: ${error.message}\n`);
    }
    return null;
}
|
|
42
|
+
/**
 * List active documents, newest first, with optional equality filters.
 * With no filters, active documents are listed up to the limit.
 *
 * The base query selects rows with `deleted_at IS NULL`, orders by
 * `created_at` descending and applies `filters.limit` (20 when null/undefined).
 * `domain`, `document_type` and `project` are each added as an equality filter
 * only when truthy. Query errors are written to stderr and yield an empty list.
 * The module notes say these columns are indexed and that the deleted_at filter
 * uses the index_documents_active partial index (database side, not visible here).
 *
 * @param supabase - Supabase client used to query the `documents` table.
 * @param filters - Optional `domain`, `document_type`, `project`, `limit`.
 * @returns Array of document rows; empty on no match or on error.
 */
export async function listDocuments(supabase, filters = {}) {
    const pageSize = filters.limit ?? 20;
    let query = supabase
        .from('documents')
        .select('*')
        .is('deleted_at', null)
        .order('created_at', { ascending: false })
        .limit(pageSize);
    // Apply the equality filters in a fixed order: domain, document_type, project.
    for (const column of ['domain', 'document_type', 'project']) {
        const wanted = filters[column];
        if (wanted) {
            query = query.eq(column, wanted);
        }
    }
    const { data, error } = await query;
    if (!error) {
        return data ?? [];
    }
    process.stderr.write(`[ledger] listDocuments failed: ${error.message}\n`);
    return [];
}
|
|
69
|
+
/**
 * Fetch every active document flagged for sync to all machines.
 *
 * Sync is driven by is_auto_load, not domain — a document syncs locally because
 * it needs to be in the AI's context every session (CLAUDE.md, MEMORY.md,
 * personality, behavioral rules). Everything else stays in the database and is
 * accessed via search on demand, regardless of domain.
 *
 * Selects rows where `is_auto_load = true` and `deleted_at IS NULL`. Query
 * errors are written to stderr and yield an empty list. The module notes say
 * the index_documents_is_auto_load partial index serves this query (database
 * side, not visible here).
 *
 * @param supabase - Supabase client used to query the `documents` table.
 * @returns Array of auto-load document rows; empty on no match or on error.
 */
export async function fetchSyncableDocuments(supabase) {
    const query = supabase
        .from('documents')
        .select('*')
        .eq('is_auto_load', true)
        .is('deleted_at', null);
    const { data, error } = await query;
    if (!error) {
        return data ?? [];
    }
    process.stderr.write(`[ledger] fetchSyncableDocuments failed: ${error.message}\n`);
    return [];
}
|
|
@@ -0,0 +1,304 @@
|
|
|
1
|
+
// document-operations.ts
|
|
2
|
+
// Write operations — create, update, delete, restore documents.
|
|
3
|
+
// Each function prepares data (chunk, embed, hash) then calls a Postgres RPC function.
|
|
4
|
+
// The database handles transactions (document + chunks + audit = atomic).
|
|
5
|
+
import { readFileSync } from 'fs';
|
|
6
|
+
import { contentHash, chunkText, generateEmbeddingsBatch, toVectorString } from '../search/embeddings.js';
|
|
7
|
+
import { generateContextSummaries } from '../search/chunk-context-enrichment.js';
|
|
8
|
+
import { getDocumentById } from './fetching.js';
|
|
9
|
+
import { startTrace, startSpan } from '../observability.js';
|
|
10
|
+
// Embedding model id sent to the database RPCs as p_embedding_model_id when the
// caller does not supply props.embedding_model_id.
const DEFAULT_EMBEDDING_MODEL = 'openai/text-embedding-3-small';
// Baseline chunking options. createDocument/updateDocument spread the caller's
// chunkConfig over these, so any key the caller provides wins.
const DEFAULT_CHUNK_CONFIG = {
    // Passed to chunkText as part of the config object.
    maxChunkSize: 1000,
    // Also reported to the database as p_chunk_overlap.
    overlapChars: 200,
    // Fallback for p_chunk_strategy when the first chunk carries no strategy.
    strategy: 'recursive',
};
|
|
16
|
+
/**
 * Create a new document.
 *
 * Pipeline:
 *   1. Hash the content (change detection)
 *   2. Chunk the content using DEFAULT_CHUNK_CONFIG overlaid with chunkConfig
 *   3. Generate a context summary per chunk (LLM call — chunk context enrichment)
 *   4. Embed `summary + "\n\n" + chunk content` for every chunk in one batch call
 *   5. Call document_create RPC (atomic: document + chunks + audit)
 *
 * Each stage runs inside an observability span under one 'document-ingestion'
 * trace. Spans and the trace are closed in `finally` blocks so that a failing
 * stage does not leave them open.
 *
 * @param clients - Object providing `openai` and `supabase` clients.
 * @param props - Document fields; `name`, `domain`, `document_type` and `content`
 *   are passed through as-is, the remaining fields fall back to defaults.
 * @param chunkConfig - Optional overrides for DEFAULT_CHUNK_CONFIG.
 * @returns The `data` value returned by the document_create RPC (the new document's ID).
 * @throws Error when the RPC reports an error; errors from chunking, enrichment
 *   or embedding propagate unchanged.
 */
export async function createDocument(clients, props, chunkConfig) {
    const config = { ...DEFAULT_CHUNK_CONFIG, ...chunkConfig };
    const hash = contentHash(props.content);
    const trace = startTrace('document-ingestion', {
        tags: ['ingestion', 'create'],
        metadata: { documentName: props.name, domain: props.domain, documentType: props.document_type },
        input: { contentLength: props.content.length },
    });
    try {
        // Chunk
        let chunks;
        const chunkSpan = startSpan('chunking', { input: { contentLength: props.content.length } });
        try {
            chunks = chunkText(props.content, config);
            // Guard the average so an empty chunk list reports 0 instead of NaN/Infinity.
            const avgChunkSize = chunks.length > 0 ? Math.round(props.content.length / chunks.length) : 0;
            chunkSpan.update({ output: { chunkCount: chunks.length, avgChunkSize } });
        }
        finally {
            chunkSpan.end();
        }
        const chunkContents = chunks.map(chunk => chunk.content);
        // Enrich — generate context summaries per chunk (LLM calls auto-traced by wrapped client)
        let enrichmentResults;
        const enrichSpan = startSpan('context-enrichment', { metadata: { chunkCount: chunks.length, model: 'gpt-4o-mini' } });
        try {
            enrichmentResults = await generateContextSummaries(clients.openai, chunks, props.content);
        }
        finally {
            enrichSpan.end();
        }
        const chunkSummaries = enrichmentResults.map(result => result.summary);
        const chunkTokenCounts = enrichmentResults.map(result => result.tokenCount);
        // Embed — summary + "\n\n" + chunk content (batched, auto-traced)
        let chunkEmbeddings;
        const embedSpan = startSpan('batch-embedding', { metadata: { chunkCount: chunks.length, model: 'text-embedding-3-small' } });
        try {
            const embeddingInputs = chunks.map((chunk, index) => chunkSummaries[index] + '\n\n' + chunk.content);
            const embeddings = await generateEmbeddingsBatch(clients.openai, embeddingInputs);
            chunkEmbeddings = embeddings.map(toVectorString);
        }
        finally {
            embedSpan.end();
        }
        // DB write
        let data;
        let error;
        const dbSpan = startSpan('db-write', { input: { chunkCount: chunks.length } });
        try {
            ({ data, error } = await clients.supabase.rpc('document_create', {
                p_name: props.name,
                p_domain: props.domain,
                p_document_type: props.document_type,
                p_project: props.project ?? null,
                p_protection: props.protection ?? 'open',
                p_owner_type: props.owner_type ?? 'user',
                p_owner_id: props.owner_id ?? null,
                p_is_auto_load: props.is_auto_load ?? false,
                p_content: props.content,
                p_description: props.description ?? null,
                p_content_hash: hash,
                p_source_type: props.source_type ?? 'text',
                p_source_url: props.source_url ?? null,
                p_file_path: props.file_path ?? null,
                p_file_permissions: props.file_permissions ?? null,
                p_agent: props.agent ?? null,
                p_status: props.status ?? null,
                p_skill_ref: props.skill_ref ?? null,
                p_embedding_model_id: props.embedding_model_id ?? DEFAULT_EMBEDDING_MODEL,
                p_chunk_contents: chunkContents,
                p_chunk_embeddings: chunkEmbeddings,
                p_chunk_strategy: chunks[0]?.strategy ?? config.strategy,
                p_chunk_summaries: chunkSummaries,
                p_chunk_token_counts: chunkTokenCounts,
                p_chunk_overlap: config.overlapChars,
            }));
            dbSpan.update({ output: { documentId: data } });
        }
        finally {
            dbSpan.end();
        }
        if (error) {
            throw new Error(`Failed to create document "${props.name}" (${props.domain}/${props.document_type}): ${error.message}`);
        }
        return data;
    }
    finally {
        // Runs on success and on any failure above, so the trace is always closed.
        trace.end();
    }
}
|
|
88
|
+
/**
 * Update a document's content. Triggers re-chunking, re-enrichment, and re-embedding.
 *
 * Same pipeline as createDocument — hash, chunk, enrich, embed — then calls the
 * document_update RPC which, per the module notes, versions the old content
 * before overwriting.
 *
 * Each stage runs inside an observability span under one 'document-ingestion'
 * trace. Spans and the trace are closed in `finally` blocks so that a failing
 * stage does not leave them open.
 *
 * @param clients - Object providing `openai` and `supabase` clients.
 * @param props - `id` and `content` plus optional `agent`, `description`,
 *   `status` and `embedding_model_id`.
 * @param chunkConfig - Optional overrides for DEFAULT_CHUNK_CONFIG.
 * @throws Error when the RPC reports an error; errors from chunking, enrichment
 *   or embedding propagate unchanged.
 */
export async function updateDocument(clients, props, chunkConfig) {
    const config = { ...DEFAULT_CHUNK_CONFIG, ...chunkConfig };
    const hash = contentHash(props.content);
    const trace = startTrace('document-ingestion', {
        tags: ['ingestion', 'update'],
        metadata: { documentId: props.id },
        input: { contentLength: props.content.length },
    });
    try {
        // Chunk
        let chunks;
        const chunkSpan = startSpan('chunking', { input: { contentLength: props.content.length } });
        try {
            chunks = chunkText(props.content, config);
            // Guard the average so an empty chunk list reports 0 instead of NaN/Infinity.
            const avgChunkSize = chunks.length > 0 ? Math.round(props.content.length / chunks.length) : 0;
            chunkSpan.update({ output: { chunkCount: chunks.length, avgChunkSize } });
        }
        finally {
            chunkSpan.end();
        }
        const chunkContents = chunks.map(chunk => chunk.content);
        // Enrich (LLM calls auto-traced)
        let enrichmentResults;
        const enrichSpan = startSpan('context-enrichment', { metadata: { chunkCount: chunks.length, model: 'gpt-4o-mini' } });
        try {
            enrichmentResults = await generateContextSummaries(clients.openai, chunks, props.content);
        }
        finally {
            enrichSpan.end();
        }
        const chunkSummaries = enrichmentResults.map(result => result.summary);
        const chunkTokenCounts = enrichmentResults.map(result => result.tokenCount);
        // Embed (auto-traced)
        let chunkEmbeddings;
        const embedSpan = startSpan('batch-embedding', { metadata: { chunkCount: chunks.length, model: 'text-embedding-3-small' } });
        try {
            const embeddingInputs = chunks.map((chunk, index) => chunkSummaries[index] + '\n\n' + chunk.content);
            const embeddings = await generateEmbeddingsBatch(clients.openai, embeddingInputs);
            chunkEmbeddings = embeddings.map(toVectorString);
        }
        finally {
            embedSpan.end();
        }
        // DB write
        let error;
        const dbSpan = startSpan('db-write', { input: { chunkCount: chunks.length } });
        try {
            ({ error } = await clients.supabase.rpc('document_update', {
                p_id: props.id,
                p_content: props.content,
                p_content_hash: hash,
                p_agent: props.agent ?? null,
                p_description: props.description ?? null,
                p_status: props.status ?? null,
                p_embedding_model_id: props.embedding_model_id ?? DEFAULT_EMBEDDING_MODEL,
                p_chunk_contents: chunkContents,
                p_chunk_embeddings: chunkEmbeddings,
                p_chunk_strategy: chunks[0]?.strategy ?? config.strategy,
                p_chunk_summaries: chunkSummaries,
                p_chunk_token_counts: chunkTokenCounts,
                p_chunk_overlap: config.overlapChars,
            }));
        }
        finally {
            dbSpan.end();
        }
        if (error) {
            throw new Error(`Failed to update document #${props.id}: ${error.message}`);
        }
    }
    finally {
        // Runs on success and on any failure above, so the trace is always closed.
        trace.end();
    }
}
|
|
142
|
+
/**
 * Update document fields without changing content. No re-embedding needed.
 *
 * Makes no OpenAI calls: every field is forwarded to the document_update_fields
 * RPC, with any field that is null/undefined on props sent as null.
 * Per the module notes, Postgres updates the columns, syncs the domain to
 * chunks if it changed, and writes the audit entry.
 *
 * @param clients - Object providing the `supabase` client.
 * @param props - `id` plus any subset of the updatable fields.
 * @throws Error when the RPC reports an error.
 */
export async function updateDocumentFields(clients, props) {
    const orNull = (value) => value ?? null;
    const rpcArgs = {
        p_id: props.id,
        p_agent: orNull(props.agent),
        p_name: orNull(props.name),
        p_domain: orNull(props.domain),
        p_document_type: orNull(props.document_type),
        p_project: orNull(props.project),
        p_protection: orNull(props.protection),
        p_owner_type: orNull(props.owner_type),
        p_owner_id: orNull(props.owner_id),
        p_is_auto_load: orNull(props.is_auto_load),
        p_description: orNull(props.description),
        p_source_type: orNull(props.source_type),
        p_source_url: orNull(props.source_url),
        p_file_path: orNull(props.file_path),
        p_file_permissions: orNull(props.file_permissions),
        p_status: orNull(props.status),
        p_skill_ref: orNull(props.skill_ref),
        p_embedding_model_id: orNull(props.embedding_model_id),
    };
    const outcome = await clients.supabase.rpc('document_update_fields', rpcArgs);
    if (outcome.error) {
        throw new Error(`Failed to update fields on document #${props.id}: ${outcome.error.message}`);
    }
}
|
|
172
|
+
/**
 * Soft delete a document by calling the document_delete RPC.
 *
 * Per the module notes: the row stays in the database with deleted_at set,
 * its chunks are removed (search shouldn't find deleted documents), it can be
 * restored within 30 days via restoreDocument(), and after 30 days
 * document_purge() permanently removes it. That logic lives in the database.
 *
 * @param clients - Object providing the `supabase` client.
 * @param id - Document id, sent as p_id.
 * @param agent - Acting agent, sent as p_agent.
 * @throws Error when the RPC reports an error.
 */
export async function deleteDocument(clients, id, agent) {
    const rpcArgs = { p_id: id, p_agent: agent };
    const outcome = await clients.supabase.rpc('document_delete', rpcArgs);
    if (outcome.error) {
        throw new Error(`Failed to delete document #${id}: ${outcome.error.message}`);
    }
}
|
|
186
|
+
/**
 * Undo a soft delete by calling the document_restore RPC, making the document
 * active again.
 *
 * Note from the module: chunks were removed during delete — they need to be
 * regenerated by calling updateDocument() with the same content (which
 * re-chunks and re-embeds). This function does not do that itself.
 *
 * @param clients - Object providing the `supabase` client.
 * @param id - Document id, sent as p_id.
 * @param agent - Acting agent, sent as p_agent.
 * @throws Error when the RPC reports an error.
 */
export async function restoreDocument(clients, id, agent) {
    const rpcArgs = { p_id: id, p_agent: agent };
    const outcome = await clients.supabase.rpc('document_restore', rpcArgs);
    if (outcome.error) {
        throw new Error(`Failed to restore document #${id}: ${outcome.error.message}`);
    }
}
|
|
199
|
+
/**
 * Error raised when the content pulled back after a push does not match the
 * content that was sent.
 *
 * Exposes the document id, the expected and actual lengths, and a one-line
 * diff preview pointing at the first divergence (line / col / expected
 * snippet / actual snippet). The same values are baked into `message`.
 *
 * How to surface it is up to the caller: the CLI prints to stderr and exits
 * non-zero, MCP returns it as an error result. Neither path attempts a rollback.
 */
export class VerifyMismatchError extends Error {
    id;
    expectedLength;
    actualLength;
    diffPreview;
    /**
     * @param id - Id of the document that failed verification.
     * @param expectedLength - Size of the content that was pushed.
     * @param actualLength - Size of the content that was pulled back.
     * @param diffPreview - Human-readable description of the first difference.
     */
    constructor(id, expectedLength, actualLength, diffPreview) {
        const summary = `Verify mismatch on document ${id}: pushed ${expectedLength} bytes, pulled ${actualLength} bytes. ${diffPreview}`;
        super(summary);
        // Fields first, `name` last, so own-property order stays id → … → name.
        Object.assign(this, {
            id,
            expectedLength,
            actualLength,
            diffPreview,
            name: 'VerifyMismatchError',
        });
    }
}
|
|
221
|
+
// Locate the first position (UTF-16 code unit index) at which two strings differ
// and produce a one-line preview in the form:
//   `line L, col C: expected '<snippet>' but got '<snippet>'`.
// Line and column are 1-based and counted along `expected`; a '\n' starts a new line.
// Each snippet is at most 40 code units starting at the divergence; it is empty
// when that string ended before the other one did.
// Newlines, carriage returns and tabs are escaped so the preview really stays on
// one line — CRLF-vs-LF drift would otherwise inject a raw '\r' into the output.
// Returns null when the strings are identical.
function buildDiffPreview(expected, actual) {
    if (expected === actual)
        return null;
    const SNIPPET_LENGTH = 40;
    const sharedLength = Math.min(expected.length, actual.length);
    let diffIndex = 0;
    let line = 1;
    let col = 1;
    // Single pass: advance over the common prefix while tracking line/col.
    // If one string is a prefix of the other, diffIndex stops at sharedLength.
    while (diffIndex < sharedLength && expected[diffIndex] === actual[diffIndex]) {
        if (expected[diffIndex] === '\n') {
            line++;
            col = 1;
        }
        else {
            col++;
        }
        diffIndex++;
    }
    const escape = (snippet) => snippet
        .replace(/\r/g, '\\r')
        .replace(/\n/g, '\\n')
        .replace(/\t/g, '\\t');
    const expectedSnippet = escape(expected.slice(diffIndex, diffIndex + SNIPPET_LENGTH));
    const actualSnippet = escape(actual.slice(diffIndex, diffIndex + SNIPPET_LENGTH));
    return `line ${line}, col ${col}: expected '${expectedSnippet}' but got '${actualSnippet}'`;
}
|
|
252
|
+
// Pull the document back and compare its content against what we just wrote.
// Throws VerifyMismatchError if the DB returned different content than we sent
// (drift / pipeline transformation / concurrent write all manifest the same way),
// or if the document cannot be found at all. Resolves with no value on a match.
//
// The error message words its sizes in bytes, so the sizes passed to it are
// UTF-8 byte counts. String#length counts UTF-16 code units and would misreport
// any content containing non-ASCII characters.
async function verifyAfterWrite(clients, id, expectedBody) {
    const expectedBytes = Buffer.byteLength(expectedBody, 'utf8');
    const pulled = await getDocumentById(clients.supabase, id);
    if (!pulled) {
        throw new VerifyMismatchError(id, expectedBytes, 0, 'document not found during verify (deleted between write and verify, or wrong id)');
    }
    const diffPreview = buildDiffPreview(expectedBody, pulled.content);
    if (diffPreview !== null) {
        const actualBytes = Buffer.byteLength(pulled.content, 'utf8');
        throw new VerifyMismatchError(id, expectedBytes, actualBytes, diffPreview);
    }
}
|
|
265
|
+
/**
 * Update a document by reading its new content from a file on disk, then verify the write.
 *
 * Pipeline:
 *   1. Read the file at `props.filePath` as utf8. fs errors (ENOENT, EACCES) are not
 *      caught here and propagate as-is.
 *   2. Call updateDocument() with the file text as content.
 *   3. Pull the document back via verifyAfterWrite() and compare against what we wrote.
 *   4. On match: return { id, verified, bytes }. On mismatch: VerifyMismatchError propagates.
 *
 * The file text is passed through untouched — no trimming or normalization is applied here.
 *
 * @param clients - Client bundle forwarded to updateDocument() and verifyAfterWrite().
 * @param props - `{ id, filePath, agent }`.
 * @returns `{ id, verified: true, bytes }` where `bytes` is the UTF-8 byte size of the
 *   content that was written.
 */
export async function updateDocumentFromFile(clients, props) {
    const body = readFileSync(props.filePath, 'utf8');
    await updateDocument(clients, { id: props.id, content: body, agent: props.agent });
    await verifyAfterWrite(clients, props.id, body);
    // `bytes` must be a byte count: String#length is UTF-16 code units and
    // undercounts any non-ASCII content.
    return { id: props.id, verified: true, bytes: Buffer.byteLength(body, 'utf8') };
}
|
|
282
|
+
/**
 * Create a new document by reading its content from a file on disk, then verify the write.
 *
 * Same shape as updateDocumentFromFile: read the file as utf8, call createDocument(),
 * pull the document back via verifyAfterWrite(), and compare. fs errors are not caught
 * here and propagate as-is.
 *
 * On mismatch the new document still exists; the caller decides whether to delete it
 * (audit_log preserves the create event for manual cleanup).
 *
 * @param clients - Client bundle forwarded to createDocument() and verifyAfterWrite().
 * @param props - `{ filePath, name, domain, document_type, description, project, agent,
 *   status, protection }`; everything except `filePath` is forwarded to createDocument().
 * @returns `{ id, verified: true, bytes }` where `id` is the value returned by
 *   createDocument() and `bytes` is the UTF-8 byte size of the content that was written.
 */
export async function createDocumentFromFile(clients, props) {
    const body = readFileSync(props.filePath, 'utf8');
    const id = await createDocument(clients, {
        name: props.name,
        domain: props.domain,
        document_type: props.document_type,
        content: body,
        description: props.description,
        project: props.project,
        agent: props.agent,
        status: props.status,
        protection: props.protection,
    });
    await verifyAfterWrite(clients, id, body);
    // `bytes` must be a byte count: String#length is UTF-16 code units and
    // undercounts any non-ASCII content.
    return { id, verified: true, bytes: Buffer.byteLength(body, 'utf8') };
}
|