@hasna/knowledge 0.2.3 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,247 @@
1
+ import { Database } from 'bun:sqlite';
2
+ import { ensureParentDir } from './workspace';
3
+
4
// Schema version number exported by this module. It equals the version row
// written by MIGRATION_2. NOTE(review): presumably meant as the migration
// target for callers — confirm how it is consumed outside this file.
+ export const CURRENT_SCHEMA_VERSION = 2;
5
+
6
/**
 * Summary returned by `getKnowledgeDbStats`: the recorded schema version plus
 * one `COUNT(*)` per listed table.
 */
export interface KnowledgeDbStats {
  /** Highest `version` found in `schema_versions` (0 when no row exists). */
  schema_version: number;
  /** Number of rows in `sources`. */
  sources: number;
  /** Number of rows in `source_revisions`. */
  source_revisions: number;
  /** Number of rows in `chunks`. */
  chunks: number;
  /** Number of rows in `wiki_pages`. */
  wiki_pages: number;
  /** Number of rows in `citations`. */
  citations: number;
  /** Number of rows in `knowledge_indexes`. */
  indexes: number;
  /** Number of rows in `runs`. */
  runs: number;
  /** Number of rows in `run_events`. */
  run_events: number;
}
17
+
18
// Schema version 1. Sets WAL journaling and foreign-key enforcement, creates
// the bookkeeping table (schema_versions), the content tables, the run/usage
// tables and a contentless FTS5 table (content=''), then records version 1.
// Every CREATE uses IF NOT EXISTS and the version row uses INSERT OR IGNORE,
// so the script can be executed against a database that already has it applied
// (migrateKnowledgeDb runs it on every call).
+ const MIGRATION_1 = `
19
+ PRAGMA journal_mode = WAL;
20
+ PRAGMA foreign_keys = ON;
21
+
22
+ CREATE TABLE IF NOT EXISTS schema_versions (
23
+ version INTEGER PRIMARY KEY,
24
+ applied_at TEXT NOT NULL
25
+ );
26
+
27
+ CREATE TABLE IF NOT EXISTS sources (
28
+ id TEXT PRIMARY KEY,
29
+ uri TEXT NOT NULL UNIQUE,
30
+ kind TEXT NOT NULL,
31
+ title TEXT,
32
+ metadata_json TEXT NOT NULL DEFAULT '{}',
33
+ acl_json TEXT NOT NULL DEFAULT '{}',
34
+ created_at TEXT NOT NULL,
35
+ updated_at TEXT NOT NULL
36
+ );
37
+
38
+ CREATE TABLE IF NOT EXISTS source_revisions (
39
+ id TEXT PRIMARY KEY,
40
+ source_id TEXT NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
41
+ revision TEXT NOT NULL,
42
+ hash TEXT,
43
+ extracted_text_uri TEXT,
44
+ metadata_json TEXT NOT NULL DEFAULT '{}',
45
+ created_at TEXT NOT NULL,
46
+ UNIQUE(source_id, revision)
47
+ );
48
+
49
+ CREATE TABLE IF NOT EXISTS chunks (
50
+ id TEXT PRIMARY KEY,
51
+ source_revision_id TEXT REFERENCES source_revisions(id) ON DELETE CASCADE,
52
+ wiki_page_id TEXT,
53
+ kind TEXT NOT NULL,
54
+ ordinal INTEGER NOT NULL,
55
+ text TEXT NOT NULL,
56
+ token_count INTEGER,
57
+ start_offset INTEGER,
58
+ end_offset INTEGER,
59
+ metadata_json TEXT NOT NULL DEFAULT '{}',
60
+ created_at TEXT NOT NULL
61
+ );
62
+
63
+ CREATE TABLE IF NOT EXISTS chunk_embeddings (
64
+ id TEXT PRIMARY KEY,
65
+ chunk_id TEXT NOT NULL REFERENCES chunks(id) ON DELETE CASCADE,
66
+ provider TEXT NOT NULL,
67
+ model TEXT NOT NULL,
68
+ dimensions INTEGER NOT NULL,
69
+ vector_json TEXT NOT NULL,
70
+ created_at TEXT NOT NULL,
71
+ UNIQUE(chunk_id, provider, model)
72
+ );
73
+
74
+ CREATE TABLE IF NOT EXISTS wiki_pages (
75
+ id TEXT PRIMARY KEY,
76
+ path TEXT NOT NULL UNIQUE,
77
+ title TEXT NOT NULL,
78
+ artifact_uri TEXT,
79
+ content_hash TEXT,
80
+ status TEXT NOT NULL DEFAULT 'active',
81
+ metadata_json TEXT NOT NULL DEFAULT '{}',
82
+ created_at TEXT NOT NULL,
83
+ updated_at TEXT NOT NULL
84
+ );
85
+
86
+ CREATE TABLE IF NOT EXISTS wiki_backlinks (
87
+ from_page_id TEXT NOT NULL REFERENCES wiki_pages(id) ON DELETE CASCADE,
88
+ to_page_id TEXT NOT NULL REFERENCES wiki_pages(id) ON DELETE CASCADE,
89
+ label TEXT,
90
+ created_at TEXT NOT NULL,
91
+ PRIMARY KEY(from_page_id, to_page_id)
92
+ );
93
+
94
+ CREATE TABLE IF NOT EXISTS citations (
95
+ id TEXT PRIMARY KEY,
96
+ wiki_page_id TEXT REFERENCES wiki_pages(id) ON DELETE CASCADE,
97
+ chunk_id TEXT REFERENCES chunks(id) ON DELETE SET NULL,
98
+ source_uri TEXT NOT NULL,
99
+ quote TEXT,
100
+ start_offset INTEGER,
101
+ end_offset INTEGER,
102
+ metadata_json TEXT NOT NULL DEFAULT '{}',
103
+ created_at TEXT NOT NULL
104
+ );
105
+
106
+ CREATE TABLE IF NOT EXISTS knowledge_indexes (
107
+ id TEXT PRIMARY KEY,
108
+ kind TEXT NOT NULL,
109
+ name TEXT NOT NULL,
110
+ artifact_uri TEXT,
111
+ shard_key TEXT,
112
+ metadata_json TEXT NOT NULL DEFAULT '{}',
113
+ created_at TEXT NOT NULL,
114
+ updated_at TEXT NOT NULL,
115
+ UNIQUE(kind, name, shard_key)
116
+ );
117
+
118
+ CREATE TABLE IF NOT EXISTS runs (
119
+ id TEXT PRIMARY KEY,
120
+ type TEXT NOT NULL,
121
+ prompt TEXT,
122
+ status TEXT NOT NULL,
123
+ provider TEXT,
124
+ model TEXT,
125
+ cost_tokens INTEGER NOT NULL DEFAULT 0,
126
+ cost_usd REAL NOT NULL DEFAULT 0,
127
+ metadata_json TEXT NOT NULL DEFAULT '{}',
128
+ created_at TEXT NOT NULL,
129
+ updated_at TEXT NOT NULL
130
+ );
131
+
132
+ CREATE TABLE IF NOT EXISTS run_events (
133
+ id TEXT PRIMARY KEY,
134
+ run_id TEXT NOT NULL REFERENCES runs(id) ON DELETE CASCADE,
135
+ level TEXT NOT NULL,
136
+ event TEXT NOT NULL,
137
+ metadata_json TEXT NOT NULL DEFAULT '{}',
138
+ created_at TEXT NOT NULL
139
+ );
140
+
141
+ CREATE TABLE IF NOT EXISTS provider_usage (
142
+ id TEXT PRIMARY KEY,
143
+ run_id TEXT REFERENCES runs(id) ON DELETE SET NULL,
144
+ provider TEXT NOT NULL,
145
+ model TEXT NOT NULL,
146
+ input_tokens INTEGER NOT NULL DEFAULT 0,
147
+ output_tokens INTEGER NOT NULL DEFAULT 0,
148
+ cost_usd REAL NOT NULL DEFAULT 0,
149
+ metadata_json TEXT NOT NULL DEFAULT '{}',
150
+ created_at TEXT NOT NULL
151
+ );
152
+
153
+ CREATE TABLE IF NOT EXISTS redaction_findings (
154
+ id TEXT PRIMARY KEY,
155
+ source_uri TEXT,
156
+ run_id TEXT REFERENCES runs(id) ON DELETE SET NULL,
157
+ severity TEXT NOT NULL,
158
+ finding_type TEXT NOT NULL,
159
+ metadata_json TEXT NOT NULL DEFAULT '{}',
160
+ created_at TEXT NOT NULL
161
+ );
162
+
163
+ CREATE TABLE IF NOT EXISTS storage_objects (
164
+ id TEXT PRIMARY KEY,
165
+ artifact_uri TEXT NOT NULL UNIQUE,
166
+ kind TEXT NOT NULL,
167
+ content_type TEXT,
168
+ hash TEXT,
169
+ size_bytes INTEGER,
170
+ metadata_json TEXT NOT NULL DEFAULT '{}',
171
+ created_at TEXT NOT NULL,
172
+ updated_at TEXT NOT NULL
173
+ );
174
+
175
+ CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
176
+ text,
177
+ title,
178
+ source_uri,
179
+ content='',
180
+ tokenize='porter unicode61'
181
+ );
182
+
183
+ INSERT OR IGNORE INTO schema_versions(version, applied_at)
184
+ VALUES (1, datetime('now'));
185
+ `;
186
+
187
// Schema version 2. Drops chunks_fts and recreates it without content='' and
// with an extra UNINDEXED chunk_id column, then records version 2.
// The script itself does not copy any rows into the recreated table.
+ const MIGRATION_2 = `
188
+ DROP TABLE IF EXISTS chunks_fts;
189
+
190
+ CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
191
+ chunk_id UNINDEXED,
192
+ text,
193
+ title,
194
+ source_uri,
195
+ tokenize='porter unicode61'
196
+ );
197
+
198
+ INSERT OR IGNORE INTO schema_versions(version, applied_at)
199
+ VALUES (2, datetime('now'));
200
+ `;
201
+
202
/**
 * Opens the SQLite database file at `path` and enables foreign-key
 * enforcement on the returned connection.
 *
 * `ensureParentDir(path)` runs first (presumably it creates the containing
 * directory — see `./workspace`). The caller is responsible for closing the
 * returned handle.
 *
 * @param path Location of the database file.
 * @returns An open `Database` handle.
 */
export function openKnowledgeDb(path: string): Database {
  ensureParentDir(path);
  const connection = new Database(path);
  connection.exec('PRAGMA foreign_keys = ON;');
  return connection;
}
208
+
209
/**
 * Brings the database at `path` up to the newest schema defined in this file
 * and reports the version that is recorded afterwards.
 *
 * MIGRATION_1 is executed on every call (its statements are idempotent).
 * MIGRATION_2 only runs while the recorded version is below 2, and it runs
 * inside a transaction so that a failure cannot leave the database with the
 * full-text table dropped but not recreated.
 *
 * @param path Location of the database file.
 * @returns The same `path` and the schema version read back after migrating.
 */
export function migrateKnowledgeDb(path: string): { path: string; schema_version: number } {
  const db = openKnowledgeDb(path);
  try {
    // Not wrapped in a transaction: MIGRATION_1 sets PRAGMA journal_mode,
    // which SQLite does not allow to change while a transaction is active.
    db.exec(MIGRATION_1);
    if (getSchemaVersion(db) < 2) {
      const applyVersion2 = db.transaction(() => {
        db.exec(MIGRATION_2);
      });
      applyVersion2();
    }
    return { path, schema_version: getSchemaVersion(db) };
  } finally {
    db.close();
  }
}
219
+
220
/**
 * Reads the highest version recorded in `schema_versions`.
 *
 * @param db Open database handle; the `schema_versions` table must exist.
 * @returns The maximum `version`, or 0 when the query yields no value.
 */
export function getSchemaVersion(db: Database): number {
  const statement = db.query<{ version: number }, []>('SELECT MAX(version) AS version FROM schema_versions');
  const latest = statement.get();
  return latest?.version ?? 0;
}
224
+
225
/**
 * Counts the rows of one table.
 *
 * `table` is interpolated into the SQL text as-is, so it must be a trusted
 * identifier; the visible callers only pass string literals.
 *
 * @param db Open database handle.
 * @param table Name of the table to count.
 * @returns The row count, or 0 when the query yields no row.
 */
function count(db: Database, table: string): number {
  const sql = `SELECT COUNT(*) AS n FROM ${table}`;
  const result = db.query<{ n: number }, []>(sql).get();
  return result?.n ?? 0;
}
229
+
230
/**
 * Opens the database at `path`, reads the schema version and the row counts
 * of the main tables, and closes the database again.
 *
 * No migration is performed here, so the tables must already exist.
 *
 * @param path Location of the database file.
 * @returns Schema version and per-table row counts.
 */
export function getKnowledgeDbStats(path: string): KnowledgeDbStats {
  const db = openKnowledgeDb(path);
  const rowsIn = (table: string): number => count(db, table);
  try {
    const stats: KnowledgeDbStats = {
      schema_version: getSchemaVersion(db),
      sources: rowsIn('sources'),
      source_revisions: rowsIn('source_revisions'),
      chunks: rowsIn('chunks'),
      wiki_pages: rowsIn('wiki_pages'),
      citations: rowsIn('citations'),
      indexes: rowsIn('knowledge_indexes'),
      runs: rowsIn('runs'),
      run_events: rowsIn('run_events'),
    };
    return stats;
  } finally {
    db.close();
  }
}
@@ -0,0 +1,423 @@
1
+ import { createHash } from 'node:crypto';
2
+ import { existsSync, readFileSync } from 'node:fs';
3
+ import { basename } from 'node:path';
4
+ import type { Database } from 'bun:sqlite';
5
+ import { migrateKnowledgeDb, openKnowledgeDb } from './knowledge-db';
6
+ import { parseSourceRef, type SourceRef } from './source-ref';
7
+ import type { KnowledgeConfig } from './workspace';
8
+
9
/** Inputs accepted by `ingestOpenFilesManifest`. */
export interface ManifestIngestOptions {
  /** Path of the SQLite knowledge database to migrate and write to. */
  dbPath: string;
  /** Manifest location: an `s3://bucket/key` URI or a local file path. */
  input: string;
  /** Workspace configuration; only consulted for S3 client settings when `input` is an S3 URI. */
  config?: KnowledgeConfig;
  /** Timestamp used for created/updated columns. Defaults to the current time. */
  now?: Date;
  /** Upper bound on chunk length in characters. Defaults to 4000; values below 500 are rejected. */
  maxChunkChars?: number;
  /** Characters shared between consecutive chunks. Defaults to 200; must be >= 0 and < `maxChunkChars`. */
  chunkOverlapChars?: number;
}
17
+
18
/** Summary produced by `ingestOpenFilesManifest`. */
export interface ManifestIngestResult {
  /** The manifest location that was ingested (the `input` option, unchanged). */
  path: string;
  /** The database path that was written to (the `dbPath` option, unchanged). */
  db_path: string;
  /** Number of entries parsed from the manifest. */
  items_seen: number;
  /** Number of distinct source ids touched. */
  sources_upserted: number;
  /** Number of distinct source-revision ids touched. */
  revisions_upserted: number;
  /** Total chunk rows inserted. */
  chunks_inserted: number;
  /** Total chunk rows removed before re-inserting or because an item was deleted. */
  chunks_deleted: number;
  /** Number of skipped entries. NOTE(review): the visible ingest loop never increments this. */
  skipped: number;
}
28
+
29
// A manifest entry as parsed from JSON: a plain object whose values have not
// been validated yet, so every property is read as `unknown`.
+ type ManifestObject = Record<string, unknown>;
30
+
31
/** A manifest entry after `normalizeManifestItem` has resolved its fields. */
interface NormalizedManifestItem {
  /** The manifest entry exactly as parsed. */
  raw: ManifestObject;
  /** Reference string for this entry, explicit or built from its ids. */
  sourceRef: string;
  /** `sourceRef` with a trailing `/revision/<id>` removed for open-files file refs. */
  sourceUri: string;
  /** Kind reported by `parseSourceRef` for `sourceRef`. */
  kind: SourceRef['kind'];
  /** Title, name, or the base name of `path`; null when none is present. */
  title: string | null;
  /** Revision label; falls back to the hash, `updated_at`, then `'current'`. */
  revision: string;
  /** Value of `hash`, `checksum` or `sha256`; null when none is present. */
  hash: string | null;
  /** Reference to externally stored extracted text, when the entry names one. */
  extractedTextUri: string | null;
  /** Inline text of the entry, when it carries any. */
  text: string | null;
  /** Entry properties minus inline text, plus `source_ref`, `source_uri` and `status`. */
  metadata: ManifestObject;
  /** `permissions` or `acl` from the entry, or `{}`. */
  acl: unknown;
  /** `status` from the entry, or `'active'`. */
  status: string;
  /** `updated_at` from the entry, or the ingest timestamp. */
  updatedAt: string;
}
46
+
47
/**
 * Derives a deterministic identifier from `value`.
 *
 * @param prefix Label placed before the underscore.
 * @param value Text to hash with SHA-256.
 * @returns `<prefix>_` followed by the first 20 hex characters of the digest.
 */
function stableId(prefix: string, value: string): string {
  const digest = createHash('sha256').update(value).digest('hex');
  const shortDigest = digest.slice(0, 20);
  return prefix + '_' + shortDigest;
}
50
+
51
/**
 * Narrows `value` to a manifest object.
 *
 * @returns `value` itself when it is a non-null, non-array object; otherwise `undefined`.
 */
function asObject(value: unknown): ManifestObject | undefined {
  if (typeof value !== 'object' || value === null) return undefined;
  if (Array.isArray(value)) return undefined;
  return value as ManifestObject;
}
54
+
55
/**
 * Narrows `value` to a non-empty string.
 *
 * @returns `value` when it is a string with at least one character; otherwise `undefined`.
 */
function asString(value: unknown): string | undefined {
  if (typeof value !== 'string') return undefined;
  return value.length > 0 ? value : undefined;
}
58
+
59
/**
 * Narrows `value` to a finite number.
 *
 * @returns `value` when it is a number other than NaN or ±Infinity; otherwise `undefined`.
 */
function asNumber(value: unknown): number | undefined {
  if (typeof value !== 'number') return undefined;
  if (!Number.isFinite(value)) return undefined;
  return value;
}
62
+
63
/**
 * Determines the source reference string for a manifest entry.
 *
 * Resolution order:
 * 1. the first non-empty string among `source_ref`, `source_uri`, `uri`;
 * 2. `open-files://file/<file_id>`, with `/revision/<revision>` appended when
 *    `revision_id` or `revision` is present;
 * 3. `open-files://source/<source_id>/path/<path>`.
 * All interpolated ids are percent-encoded with `encodeURIComponent`.
 *
 * @throws Error when none of the identifying properties is available.
 */
function buildSourceRefFromItem(item: ManifestObject): string {
  for (const candidate of [item.source_ref, item.source_uri, item.uri]) {
    const explicitRef = asString(candidate);
    if (explicitRef) return explicitRef;
  }

  const fileId = asString(item.file_id);
  if (fileId) {
    const fileRef = `open-files://file/${encodeURIComponent(fileId)}`;
    const revision = asString(item.revision_id) ?? asString(item.revision);
    if (!revision) return fileRef;
    return `${fileRef}/revision/${encodeURIComponent(revision)}`;
  }

  const sourceId = asString(item.source_id);
  const path = asString(item.path);
  if (!sourceId || !path) {
    throw new Error('Manifest item is missing source_ref, file_id, or source_id/path.');
  }
  return `open-files://source/${encodeURIComponent(sourceId)}/path/${encodeURIComponent(path)}`;
}
82
+
83
/**
 * Returns the revision-independent URI for a source reference.
 *
 * Only open-files file references that carry a revision id are changed: a
 * trailing `/revision/<id>` segment is stripped. Every other reference is
 * returned untouched.
 */
function baseSourceUri(sourceRef: string, parsed: SourceRef): string {
  const isRevisionedFile =
    parsed.kind === 'open-files' && parsed.entity === 'file' && Boolean(parsed.revision_id);
  return isRevisionedFile ? sourceRef.replace(/\/revision\/[^/]+$/, '') : sourceRef;
}
89
+
90
/**
 * Extracts the inline text of a manifest entry.
 *
 * The first non-empty string among `extracted_text`, `text`, `content_text`
 * and `markdown` wins. Otherwise `content` is returned when it is a string
 * (an empty string is allowed there).
 *
 * @returns The text, or null when the entry carries none.
 */
function textFromItem(item: ManifestObject): string | null {
  for (const key of ['extracted_text', 'text', 'content_text', 'markdown']) {
    const candidate = asString(item[key]);
    if (candidate !== undefined) return candidate;
  }
  const { content } = item;
  if (typeof content === 'string') return content;
  return null;
}
100
+
101
/**
 * Finds a reference to externally stored extracted text.
 *
 * Top-level `extracted_text_ref`, `extracted_text_uri` and `text_ref` are
 * checked first; after that the same first two keys are looked up inside an
 * object-valued `content`.
 *
 * @returns The reference, or null when none is present.
 */
function extractedTextUriFromItem(item: ManifestObject): string | null {
  const topLevel =
    asString(item.extracted_text_ref) ?? asString(item.extracted_text_uri) ?? asString(item.text_ref);
  if (topLevel) return topLevel;

  const nested = asObject(item.content);
  if (!nested) return null;
  return asString(nested.extracted_text_ref) ?? asString(nested.extracted_text_uri) ?? null;
}
107
+
108
/**
 * Chooses a display title for a manifest entry: `title`, then `name`, then
 * the base name of `path`.
 *
 * @returns The title, or null when none of the three is a non-empty string.
 */
function titleFromItem(item: ManifestObject): string | null {
  const explicit = asString(item.title) ?? asString(item.name);
  if (explicit !== undefined) return explicit;
  const path = asString(item.path);
  return path ? basename(path) : null;
}
112
+
113
/**
 * Reads the content hash of a manifest entry from `hash`, `checksum` or
 * `sha256`, in that order.
 *
 * @returns The first non-empty string found, or null.
 */
function hashFromItem(item: ManifestObject): string | null {
  for (const key of ['hash', 'checksum', 'sha256']) {
    const candidate = asString(item[key]);
    if (candidate !== undefined) return candidate;
  }
  return null;
}
116
+
117
/**
 * Picks the revision label for a manifest entry.
 *
 * Precedence: `revision_id`, `revision`, `version_id` on the entry; the
 * revision id embedded in an open-files reference; the content hash; the
 * entry's `updated_at`; and finally the literal `'current'`.
 */
function revisionFromItem(item: ManifestObject, parsed: SourceRef, hash: string | null): string {
  const fromItem = asString(item.revision_id) ?? asString(item.revision) ?? asString(item.version_id);
  if (fromItem !== undefined) return fromItem;

  const fromRef = parsed.kind === 'open-files' ? parsed.revision_id : undefined;
  return fromRef ?? hash ?? asString(item.updated_at) ?? 'current';
}
127
+
128
/**
 * Builds the metadata object stored alongside a source or revision.
 *
 * Every property of the entry is copied except the inline-text properties
 * (`text`, `content`, `content_text`, `extracted_text`, `markdown`). The
 * normalized `source_ref`, `source_uri` and `status` are then written on top,
 * replacing same-named properties from the entry.
 */
function metadataFromItem(item: ManifestObject, normalized: {
  sourceRef: string;
  sourceUri: string;
  status: string;
}): ManifestObject {
  const textKeys = new Set(['text', 'content', 'content_text', 'extracted_text', 'markdown']);
  const metadata: ManifestObject = {};
  for (const key of Object.keys(item)) {
    if (!textKeys.has(key)) metadata[key] = item[key];
  }
  const { sourceRef, sourceUri, status } = normalized;
  metadata.source_ref = sourceRef;
  metadata.source_uri = sourceUri;
  metadata.status = status;
  return metadata;
}
143
+
144
/**
 * Converts a raw manifest entry into the shape used by the ingest pipeline.
 *
 * @param item Manifest entry as parsed from JSON.
 * @param now ISO timestamp used as `updatedAt` when the entry has no `updated_at`.
 * @throws Error when no source reference can be derived from the entry
 *   (raised by `buildSourceRefFromItem`); `parseSourceRef` may also throw.
 */
function normalizeManifestItem(item: ManifestObject, now: string): NormalizedManifestItem {
  const sourceRef = buildSourceRefFromItem(item);
  const parsed = parseSourceRef(sourceRef);
  const sourceUri = baseSourceUri(sourceRef, parsed);
  const hash = hashFromItem(item);
  const status = asString(item.status) ?? 'active';

  const title = titleFromItem(item);
  const revision = revisionFromItem(item, parsed, hash);
  const extractedTextUri = extractedTextUriFromItem(item);
  const text = textFromItem(item);
  const metadata = metadataFromItem(item, { sourceRef, sourceUri, status });
  // `permissions` takes precedence over `acl`; an empty object is the default.
  const acl = item.permissions ?? item.acl ?? {};
  const updatedAt = asString(item.updated_at) ?? now;

  return {
    raw: item,
    sourceRef,
    sourceUri,
    kind: parsed.kind,
    title,
    revision,
    hash,
    extractedTextUri,
    text,
    metadata,
    acl,
    status,
    updatedAt,
  };
}
166
+
167
/**
 * Parses manifest text into a list of entry objects.
 *
 * Accepted layouts:
 * - a JSON array of objects;
 * - a JSON object with an `items` array of objects;
 * - a single JSON object, treated as one entry (it may span several lines);
 * - JSON Lines: one JSON object per non-blank line.
 *
 * Blank input yields an empty list.
 *
 * @throws SyntaxError when the text is not valid JSON / JSON Lines.
 * @throws Error when an entry is not a plain object.
 */
function parseManifestText(text: string): ManifestObject[] {
  // Accept plain (non-null, non-array) objects only; fail with the given message otherwise.
  const toItem = (entry: unknown, message: string): ManifestObject => {
    if (typeof entry !== 'object' || entry === null || Array.isArray(entry)) throw new Error(message);
    return entry as ManifestObject;
  };
  const nonBlankLines = (body: string): string[] =>
    body.split(/\r?\n/).filter((line) => line.trim().length > 0);
  const parseJsonLines = (lines: string[]): ManifestObject[] =>
    lines.map((line) => toItem(JSON.parse(line), 'Manifest JSONL entries must be objects.'));

  const trimmed = text.trim();
  if (!trimmed) return [];

  if (trimmed.startsWith('[')) {
    const parsed: unknown = JSON.parse(trimmed);
    if (!Array.isArray(parsed)) throw new Error('Manifest array parse failed.');
    return parsed.map((entry) => toItem(entry, 'Manifest array entries must be objects.'));
  }

  if (trimmed.startsWith('{')) {
    let parsed: unknown;
    try {
      parsed = JSON.parse(trimmed);
    } catch (error) {
      // Not a single JSON document: fall back to JSON Lines when there is
      // more than one line. Only the parse itself is guarded, so validation
      // errors below are never mistaken for a JSON Lines manifest.
      const lines = nonBlankLines(trimmed);
      if (lines.length <= 1) throw error;
      return parseJsonLines(lines);
    }
    const object = toItem(parsed, 'Manifest object parse failed.');
    if (Array.isArray(object.items)) {
      return object.items.map((entry) => toItem(entry, 'Manifest items entries must be objects.'));
    }
    // Any other object is one entry; missing identifying fields are reported
    // later, when the entry is normalized.
    return [object];
  }

  return parseJsonLines(nonBlankLines(trimmed));
}
211
+
212
/**
 * Downloads an S3 object and returns its body as text.
 *
 * The bucket is taken from the URI host and the key from the percent-decoded
 * path. Region, credentials profile and retry limit come from
 * `config.storage.s3`, but only when the configured storage type is `s3` and
 * its bucket equals the bucket in the URI; otherwise the client is created
 * with those settings left undefined. The AWS SDK modules are loaded lazily.
 *
 * @param uri Location in the form `s3://<bucket>/<key>`.
 * @param config Optional workspace configuration.
 * @returns The object body, or an empty string when the response has no body.
 * @throws Error when the URI has no bucket or no key.
 */
async function readS3Text(uri: string, config?: KnowledgeConfig): Promise<string> {
  const location = new URL(uri);
  const bucket = location.hostname;
  const key = decodeURIComponent(location.pathname.replace(/^\/+/, ''));
  if (!bucket || !key) throw new Error(`Invalid S3 manifest URI: ${uri}`);

  const [s3Module, credentialsModule] = await Promise.all([
    import('@aws-sdk/client-s3'),
    import('@aws-sdk/credential-providers'),
  ]);
  const { S3Client, GetObjectCommand } = s3Module;
  const { fromIni } = credentialsModule;

  const s3Config =
    config?.storage.type === 's3' && config.storage.s3?.bucket === bucket ? config.storage.s3 : undefined;
  const credentials = s3Config?.profile ? fromIni({ profile: s3Config.profile }) : undefined;
  const client = new S3Client({
    region: s3Config?.region,
    credentials,
    maxAttempts: s3Config?.max_attempts,
  });

  const { Body } = await client.send(new GetObjectCommand({ Bucket: bucket, Key: key }));
  return Body ? await Body.transformToString() : '';
}
231
+
232
/**
 * Loads the manifest text from S3 (`s3://` inputs) or from a local file read
 * as UTF-8.
 *
 * @throws Error when a local path does not exist.
 */
async function readManifestInput(input: string, config?: KnowledgeConfig): Promise<string> {
  const isS3Uri = input.startsWith('s3://');
  if (isS3Uri) return readS3Text(input, config);
  if (existsSync(input)) return readFileSync(input, 'utf8');
  throw new Error(`Manifest not found: ${input}`);
}
237
+
238
/** One slice of a document produced by `chunkText`. */
interface TextChunk {
  /** Zero-based position of the chunk in the output list. */
  ordinal: number;
  /** Slice content with surrounding whitespace trimmed. */
  text: string;
  /** Start of the untrimmed slice, as an index into the newline-normalized text. */
  startOffset: number;
  /** End (exclusive) of the untrimmed slice, as an index into the newline-normalized text. */
  endOffset: number;
}

/**
 * Splits text into chunks of at most `maxChars` characters.
 *
 * CRLF line endings are normalized to LF first. Each chunk preferably ends at
 * the last paragraph break (`\n\n`) or sentence break (`. `) inside the
 * window, provided that break lies past the first half of the window;
 * otherwise the window is cut at `maxChars`. Consecutive chunks share
 * `overlapChars` characters. Slices that are empty after trimming are dropped.
 *
 * @param text Text to split.
 * @param maxChars Maximum slice length; must be greater than 0.
 * @param overlapChars Characters repeated at the start of the next chunk;
 *   negative values are treated as 0.
 * @returns Chunks in document order; empty for blank input.
 * @throws RangeError when `maxChars` is not greater than 0.
 */
function chunkText(text: string, maxChars: number, overlapChars: number): TextChunk[] {
  // A non-positive (or NaN) window could never advance through the text.
  if (!(maxChars > 0)) throw new RangeError('maxChars must be greater than 0.');
  const overlap = Math.max(0, overlapChars);

  const normalized = text.replace(/\r\n/g, '\n');
  if (!normalized.trim()) return [];

  const chunks: TextChunk[] = [];
  let start = 0;
  while (start < normalized.length) {
    const hardEnd = Math.min(normalized.length, start + maxChars);
    let end = hardEnd;
    if (hardEnd < normalized.length) {
      // Search bounds keep the separator inside the window, so a chunk that
      // ends on a break never grows past `maxChars`.
      const paragraphBreak = normalized.lastIndexOf('\n\n', hardEnd - 2);
      const sentenceBreak = normalized.lastIndexOf('. ', hardEnd - 1);
      const candidate = Math.max(paragraphBreak, sentenceBreak);
      // Keep both newlines of a paragraph break, but only the period of a sentence break.
      if (candidate > start + Math.floor(maxChars * 0.5)) end = candidate + (candidate === paragraphBreak ? 2 : 1);
    }
    const chunk = normalized.slice(start, end).trim();
    if (chunk) {
      chunks.push({
        ordinal: chunks.length,
        text: chunk,
        startOffset: start,
        endOffset: end,
      });
    }
    if (end >= normalized.length) break;
    // Step back by the overlap, but never to or before the current start:
    // when the overlap is at least as long as the slice just taken, continue
    // from the slice end so the loop always makes progress.
    const overlappedStart = end - overlap;
    start = overlappedStart > start ? overlappedStart : end;
  }
  return chunks;
}
273
+
274
/**
 * Rough token estimate: 1.25 tokens per whitespace-separated word, rounded
 * up, with a minimum of 1.
 */
function estimateTokenCount(text: string): number {
  const wordCount = text
    .trim()
    .split(/\s+/)
    .filter((word) => word.length > 0).length;
  const estimate = Math.ceil(wordCount * 1.25);
  return estimate < 1 ? 1 : estimate;
}
278
+
279
/**
 * Removes every chunk of one source revision, together with the matching
 * full-text rows in `chunks_fts`.
 *
 * The full-text rows are deleted with a single set-based statement rather
 * than one statement per chunk: `chunk_id` is an UNINDEXED column, so each
 * per-chunk delete would have to search the full-text table again.
 *
 * @param db Open database handle.
 * @param sourceRevisionId Id of the revision whose chunks are removed.
 * @returns Number of chunk rows that belonged to the revision.
 */
function deleteChunksForRevision(db: Database, sourceRevisionId: string): number {
  const counted = db
    .query<{ n: number }, [string]>('SELECT COUNT(*) AS n FROM chunks WHERE source_revision_id = ?')
    .get(sourceRevisionId);
  // Full-text rows go first, while the chunk rows that identify them still exist.
  db.run(
    'DELETE FROM chunks_fts WHERE chunk_id IN (SELECT id FROM chunks WHERE source_revision_id = ?)',
    [sourceRevisionId],
  );
  db.run('DELETE FROM chunks WHERE source_revision_id = ?', [sourceRevisionId]);
  return counted?.n ?? 0;
}
287
+
288
/**
 * Inserts the `sources` row for `item.sourceUri`, or refreshes it when a row
 * with that URI already exists.
 *
 * On conflict every column except `id`, `uri` and `created_at` is overwritten.
 * The id is read back from the table after the write, so a pre-existing row
 * keeps its id.
 *
 * @param db Open database handle.
 * @param item Normalized manifest entry.
 * @param now ISO timestamp stored as `created_at` for new rows.
 * @returns The id of the source row.
 * @throws Error when the row cannot be found after the write.
 */
function upsertSource(db: Database, item: NormalizedManifestItem, now: string): string {
  const upsertSql = `INSERT INTO sources (id, uri, kind, title, metadata_json, acl_json, created_at, updated_at)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    ON CONFLICT(uri) DO UPDATE SET
    kind = excluded.kind,
    title = excluded.title,
    metadata_json = excluded.metadata_json,
    acl_json = excluded.acl_json,
    updated_at = excluded.updated_at`;
  const metadataJson = JSON.stringify(item.metadata);
  const aclJson = JSON.stringify(item.acl ?? {});
  db.run(upsertSql, [
    stableId('src', item.sourceUri),
    item.sourceUri,
    item.kind,
    item.title,
    metadataJson,
    aclJson,
    now,
    item.updatedAt,
  ]);

  const stored = db.query<{ id: string }, [string]>('SELECT id FROM sources WHERE uri = ?').get(item.sourceUri);
  if (stored) return stored.id;
  throw new Error(`Failed to upsert source: ${item.sourceUri}`);
}
314
+
315
/**
 * Inserts the `source_revisions` row for `(sourceId, item.revision)`, or
 * refreshes its hash, extracted-text reference and metadata when it exists.
 *
 * The id is read back from the table after the write, so a pre-existing row
 * keeps its id.
 *
 * @param db Open database handle.
 * @param sourceId Id of the owning `sources` row.
 * @param item Normalized manifest entry.
 * @param now ISO timestamp stored as `created_at` for new rows.
 * @returns The id of the revision row.
 * @throws Error when the row cannot be found after the write.
 */
function upsertRevision(db: Database, sourceId: string, item: NormalizedManifestItem, now: string): string {
  const upsertSql = `INSERT INTO source_revisions (id, source_id, revision, hash, extracted_text_uri, metadata_json, created_at)
    VALUES (?, ?, ?, ?, ?, ?, ?)
    ON CONFLICT(source_id, revision) DO UPDATE SET
    hash = excluded.hash,
    extracted_text_uri = excluded.extracted_text_uri,
    metadata_json = excluded.metadata_json`;
  // A NUL separator keeps (sourceId, revision) pairs from colliding when concatenated.
  const candidateId = stableId('rev', `${sourceId}\u0000${item.revision}`);
  db.run(upsertSql, [
    candidateId,
    sourceId,
    item.revision,
    item.hash,
    item.extractedTextUri,
    JSON.stringify(item.metadata),
    now,
  ]);

  const lookup = db.query<{ id: string }, [string, string]>(
    'SELECT id FROM source_revisions WHERE source_id = ? AND revision = ?',
  );
  const stored = lookup.get(sourceId, item.revision);
  if (stored) return stored.id;
  throw new Error(`Failed to upsert source revision: ${item.sourceRef}`);
}
340
+
341
/**
 * Splits the text of a manifest entry into chunks and stores each one in
 * `chunks` and in the `chunks_fts` full-text table.
 *
 * Nothing is written when the entry has no text or its status is `deleted`
 * (case-insensitive). Existing chunks of the revision are not removed here.
 *
 * @param db Open database handle.
 * @param sourceRevisionId Id of the owning revision row.
 * @param item Normalized manifest entry.
 * @param now ISO timestamp stored as `created_at`.
 * @param maxChars Maximum chunk length passed to `chunkText`.
 * @param overlapChars Chunk overlap passed to `chunkText`.
 * @returns Number of chunks written.
 */
function insertChunks(db: Database, sourceRevisionId: string, item: NormalizedManifestItem, now: string, maxChars: number, overlapChars: number): number {
  const isDeleted = item.status.toLowerCase() === 'deleted';
  if (!item.text || isDeleted) return 0;

  const pieces = chunkText(item.text, maxChars, overlapChars);
  // The metadata is the same for every chunk of the entry, so serialize it once.
  const metadataJson = JSON.stringify({
    source_ref: item.sourceRef,
    source_uri: item.sourceUri,
    hash: item.hash,
    status: item.status,
    path: asString(item.raw.path) ?? null,
    mime: asString(item.raw.mime) ?? asString(item.raw.content_type) ?? null,
    size: asNumber(item.raw.size) ?? null,
  });
  const insertChunkSql = `INSERT INTO chunks (id, source_revision_id, kind, ordinal, text, token_count, start_offset, end_offset, metadata_json, created_at)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`;
  const ftsTitle = item.title ?? '';

  pieces.forEach((piece) => {
    const chunkId = stableId('chk', `${sourceRevisionId}\u0000${piece.ordinal}\u0000${piece.text}`);
    db.run(insertChunkSql, [
      chunkId,
      sourceRevisionId,
      'source',
      piece.ordinal,
      piece.text,
      estimateTokenCount(piece.text),
      piece.startOffset,
      piece.endOffset,
      metadataJson,
      now,
    ]);
    db.run(
      'INSERT INTO chunks_fts (chunk_id, text, title, source_uri) VALUES (?, ?, ?, ?)',
      [chunkId, piece.text, ftsTitle, item.sourceUri],
    );
  });
  return pieces.length;
}
378
+
379
/**
 * Ingests a manifest into the knowledge database.
 *
 * Steps: validate the chunking options, migrate the database, read and parse
 * the manifest, then — inside one transaction — upsert a source and a
 * revision per entry and rebuild the revision's chunks. Chunks of a revision
 * are removed first whenever the entry carries text or is marked `deleted`;
 * new chunks are only inserted for entries with text that are not deleted.
 * An error while processing any entry propagates out of the transaction
 * callback.
 *
 * @param options Database path, manifest location and chunking settings.
 * @returns Counters describing what was written.
 * @throws Error when `maxChunkChars` is below 500, when `chunkOverlapChars`
 *   is negative or not smaller than `maxChunkChars`, when the manifest cannot
 *   be read or parsed, or when an entry cannot be normalized.
 */
export async function ingestOpenFilesManifest(options: ManifestIngestOptions): Promise<ManifestIngestResult> {
  const now = (options.now ?? new Date()).toISOString();
  const maxChunkChars = options.maxChunkChars ?? 4000;
  const chunkOverlapChars = options.chunkOverlapChars ?? 200;
  if (maxChunkChars < 500) throw new Error('maxChunkChars must be at least 500.');
  const overlapOutOfRange = chunkOverlapChars < 0 || chunkOverlapChars >= maxChunkChars;
  if (overlapOutOfRange) throw new Error('chunkOverlapChars must be less than maxChunkChars.');

  migrateKnowledgeDb(options.dbPath);
  const manifestText = await readManifestInput(options.input, options.config);
  const items = parseManifestText(manifestText);

  const db = openKnowledgeDb(options.dbPath);
  try {
    const ingestAll = db.transaction(() => {
      const sourceIds = new Set<string>();
      const revisionIds = new Set<string>();
      const tally = { inserted: 0, deleted: 0 };

      for (const raw of items) {
        const item = normalizeManifestItem(raw, now);
        const sourceId = upsertSource(db, item, now);
        const revisionId = upsertRevision(db, sourceId, item, now);
        sourceIds.add(sourceId);
        revisionIds.add(revisionId);

        // Entries without text that are not deleted keep their existing chunks.
        const replacesChunks = Boolean(item.text) || item.status.toLowerCase() === 'deleted';
        if (replacesChunks) tally.deleted += deleteChunksForRevision(db, revisionId);
        tally.inserted += insertChunks(db, revisionId, item, now, maxChunkChars, chunkOverlapChars);
      }

      return {
        path: options.input,
        db_path: options.dbPath,
        items_seen: items.length,
        sources_upserted: sourceIds.size,
        revisions_upserted: revisionIds.size,
        chunks_inserted: tally.inserted,
        chunks_deleted: tally.deleted,
        // No entry is ever skipped by this loop, so the counter stays at zero.
        skipped: 0,
      };
    });
    return ingestAll();
  } finally {
    db.close();
  }
}