@hasna/knowledge 0.2.4 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,423 @@
1
+ import { createHash } from 'node:crypto';
2
+ import { existsSync, readFileSync } from 'node:fs';
3
+ import { basename } from 'node:path';
4
+ import type { Database } from 'bun:sqlite';
5
+ import { migrateKnowledgeDb, openKnowledgeDb } from './knowledge-db';
6
+ import { parseSourceRef, type SourceRef } from './source-ref';
7
+ import type { KnowledgeConfig } from './workspace';
8
+
9
/** Options accepted by `ingestOpenFilesManifest`. */
export interface ManifestIngestOptions {
  /** Path of the knowledge database; it is migrated and then opened for the ingest. */
  dbPath: string;
  /** Manifest location: either an `s3://` URI or a local file path. */
  input: string;
  /** Optional workspace configuration, forwarded to the manifest reader for S3 access. */
  config?: KnowledgeConfig;
  /** Timestamp to use for the ingest; the current time is used when omitted. */
  now?: Date;
  /** Upper bound for a chunk, in characters. Defaults to 4000; values below 500 are rejected. */
  maxChunkChars?: number;
  /** Overlap between consecutive chunks, in characters. Defaults to 200; must be >= 0 and < `maxChunkChars`. */
  chunkOverlapChars?: number;
}
17
+
18
/** Summary returned by `ingestOpenFilesManifest` after the transaction commits. */
export interface ManifestIngestResult {
  /** The manifest input (path or URI) exactly as it was passed in. */
  path: string;
  /** The database path exactly as it was passed in. */
  db_path: string;
  /** Number of items parsed from the manifest. */
  items_seen: number;
  /** Number of distinct source rows touched by the ingest. */
  sources_upserted: number;
  /** Number of distinct source-revision rows touched by the ingest. */
  revisions_upserted: number;
  /** Number of chunk rows written. */
  chunks_inserted: number;
  /** Number of pre-existing chunk rows removed before re-chunking. */
  chunks_deleted: number;
  /** Count of skipped items. NOTE(review): the ingest loop initialises this to 0 and never increments it. */
  skipped: number;
}
28
+
29
/** A JSON object taken from a manifest: string keys with values that have not been validated yet. */
type ManifestObject = { [key: string]: unknown };
30
+
31
/** A manifest item after normalization, ready to be written to the database. */
interface NormalizedManifestItem {
  /** The manifest object this item was derived from, untouched. */
  raw: ManifestObject;
  /** Source reference, either given explicitly or built from file/source identifiers. */
  sourceRef: string;
  /** `sourceRef` without a trailing `/revision/<id>` segment for open-files file references. */
  sourceUri: string;
  /** Kind reported by the source-ref parser. */
  kind: SourceRef['kind'];
  /** `title`, `name`, or the base name of `path`; `null` when none is present. */
  title: string | null;
  /** Revision identifier; falls back to the hash, then `updated_at`, then `'current'`. */
  revision: string;
  /** `hash`, `checksum`, or `sha256` from the manifest, if any. */
  hash: string | null;
  /** Reference to externally stored extracted text, if any. */
  extractedTextUri: string | null;
  /** Inline text carried by the manifest item, if any. */
  text: string | null;
  /** Manifest fields minus the inline text fields, plus the normalized ref, URI and status. */
  metadata: ManifestObject;
  /** `permissions` or `acl` from the manifest, or an empty object. */
  acl: unknown;
  /** Item status; `'active'` when the manifest does not specify one. */
  status: string;
  /** `updated_at` from the manifest, or the ingest timestamp. */
  updatedAt: string;
}
46
+
47
/**
 * Derives a deterministic identifier from a value.
 *
 * @param prefix Label placed before the underscore.
 * @param value Text that is hashed with SHA-256.
 * @returns `<prefix>_` followed by the first 20 hex characters of the digest.
 */
function stableId(prefix: string, value: string): string {
  const hasher = createHash('sha256');
  hasher.update(value);
  const shortDigest = hasher.digest('hex').slice(0, 20);
  return `${prefix}_${shortDigest}`;
}
50
+
51
/**
 * Narrows an unknown value to a plain (non-array) object.
 *
 * @returns The same value typed as a manifest object, or `undefined` for
 * falsy values, primitives and arrays.
 */
function asObject(value: unknown): ManifestObject | undefined {
  if (!value) return undefined;
  if (typeof value !== 'object') return undefined;
  if (Array.isArray(value)) return undefined;
  return value as ManifestObject;
}
54
+
55
/**
 * Narrows an unknown value to a non-empty string.
 *
 * @returns The string itself, or `undefined` for non-strings and `''`.
 */
function asString(value: unknown): string | undefined {
  if (typeof value !== 'string') return undefined;
  return value.length > 0 ? value : undefined;
}
58
+
59
/**
 * Narrows an unknown value to a finite number.
 *
 * @returns The number itself, or `undefined` for non-numbers, `NaN` and infinities.
 */
function asNumber(value: unknown): number | undefined {
  if (typeof value !== 'number') return undefined;
  if (!Number.isFinite(value)) return undefined;
  return value;
}
62
+
63
/**
 * Determines the source reference for a raw manifest item.
 *
 * Precedence: an explicit `source_ref`, `source_uri` or `uri`; then an
 * `open-files://file/...` reference built from `file_id` (with an optional
 * revision segment); then an `open-files://source/.../path/...` reference
 * built from `source_id` and `path`.
 *
 * @throws Error when none of those identifiers is available.
 */
function buildSourceRefFromItem(item: ManifestObject): string {
  for (const candidate of [item.source_ref, item.source_uri, item.uri]) {
    const explicit = asString(candidate);
    if (explicit) return explicit;
  }

  const fileId = asString(item.file_id);
  if (fileId) {
    const fileRef = `open-files://file/${encodeURIComponent(fileId)}`;
    const revision = asString(item.revision_id) ?? asString(item.revision);
    if (!revision) return fileRef;
    return `${fileRef}/revision/${encodeURIComponent(revision)}`;
  }

  const sourceId = asString(item.source_id);
  const path = asString(item.path);
  if (!sourceId || !path) {
    throw new Error('Manifest item is missing source_ref, file_id, or source_id/path.');
  }
  return `open-files://source/${encodeURIComponent(sourceId)}/path/${encodeURIComponent(path)}`;
}
82
+
83
/**
 * Returns the revision-independent URI for a source reference.
 *
 * Only open-files file references that carry a revision are changed: their
 * trailing `/revision/<id>` segment is removed. Everything else is returned as is.
 */
function baseSourceUri(sourceRef: string, parsed: SourceRef): string {
  const isRevisionedFile =
    parsed.kind === 'open-files' && parsed.entity === 'file' && Boolean(parsed.revision_id);
  if (!isRevisionedFile) return sourceRef;
  return sourceRef.replace(/\/revision\/[^/]+$/, '');
}
89
+
90
/**
 * Picks the inline text of a manifest item.
 *
 * The first non-empty string among `extracted_text`, `text`, `content_text`
 * and `markdown` wins. Otherwise `content` is used when it is a string
 * (even an empty one); in every other case the result is `null`.
 */
function textFromItem(item: ManifestObject): string | null {
  const candidates = [item.extracted_text, item.text, item.content_text, item.markdown];
  for (const candidate of candidates) {
    const found = asString(candidate);
    if (found !== undefined) return found;
  }
  const { content } = item;
  if (typeof content === 'string') return content;
  return null;
}
100
+
101
/**
 * Finds a reference to externally stored extracted text.
 *
 * Top-level `extracted_text_ref`, `extracted_text_uri` and `text_ref` are
 * checked first; after that the same two `extracted_text_*` keys are looked up
 * inside an object-valued `content`.
 *
 * @returns The reference, or `null` when none is present.
 */
function extractedTextUriFromItem(item: ManifestObject): string | null {
  for (const candidate of [item.extracted_text_ref, item.extracted_text_uri, item.text_ref]) {
    const topLevel = asString(candidate);
    if (topLevel) return topLevel;
  }
  const nested = asObject(item.content);
  if (!nested) return null;
  return asString(nested.extracted_text_ref) ?? asString(nested.extracted_text_uri) ?? null;
}
107
+
108
/**
 * Chooses a display title: `title`, then `name`, then the base name of
 * `path`; `null` when none of them is a non-empty string.
 */
function titleFromItem(item: ManifestObject): string | null {
  const explicit = asString(item.title) ?? asString(item.name);
  if (explicit !== undefined) return explicit;
  const path = asString(item.path);
  return path === undefined ? null : basename(path);
}
112
+
113
/** Returns the first non-empty string among `hash`, `checksum` and `sha256`, or `null`. */
function hashFromItem(item: ManifestObject): string | null {
  for (const candidate of [item.hash, item.checksum, item.sha256]) {
    const digest = asString(candidate);
    if (digest !== undefined) return digest;
  }
  return null;
}
116
+
117
/**
 * Resolves the revision label for an item.
 *
 * Order of preference: `revision_id`, `revision`, `version_id` from the item;
 * the revision carried by an open-files source reference; the content hash;
 * the item's `updated_at`; and finally the literal `'current'`.
 */
function revisionFromItem(item: ManifestObject, parsed: SourceRef, hash: string | null): string {
  const fromItem = asString(item.revision_id) ?? asString(item.revision) ?? asString(item.version_id);
  if (fromItem !== undefined) return fromItem;

  const fromRef = parsed.kind === 'open-files' ? parsed.revision_id : undefined;
  if (fromRef != null) return fromRef;

  if (hash != null) return hash;

  return asString(item.updated_at) ?? 'current';
}
127
+
128
/**
 * Builds the metadata object stored alongside a source.
 *
 * Every manifest field is copied except the inline text fields (`text`,
 * `content`, `content_text`, `extracted_text`, `markdown`). The normalized
 * `source_ref`, `source_uri` and `status` are then written on top, replacing
 * any same-named manifest values.
 */
function metadataFromItem(item: ManifestObject, normalized: {
  sourceRef: string;
  sourceUri: string;
  status: string;
}): ManifestObject {
  const textFields = ['text', 'content', 'content_text', 'extracted_text', 'markdown'];
  const metadata: ManifestObject = {};
  Object.entries(item).forEach(([key, value]) => {
    if (!textFields.includes(key)) metadata[key] = value;
  });
  metadata.source_ref = normalized.sourceRef;
  metadata.source_uri = normalized.sourceUri;
  metadata.status = normalized.status;
  return metadata;
}
143
+
144
/**
 * Converts a raw manifest object into the normalized form used by the ingest.
 *
 * @param item Raw manifest object.
 * @param now ISO timestamp used as `updatedAt` when the item has no `updated_at`.
 * @returns The normalized item; `status` defaults to `'active'` and `acl` to `{}`.
 * @throws Whatever the source-reference builder or parser throws for an unusable item.
 */
function normalizeManifestItem(item: ManifestObject, now: string): NormalizedManifestItem {
  const sourceRef = buildSourceRefFromItem(item);
  const parsed = parseSourceRef(sourceRef);
  const sourceUri = baseSourceUri(sourceRef, parsed);
  const hash = hashFromItem(item);
  const status = asString(item.status) ?? 'active';

  const title = titleFromItem(item);
  const revision = revisionFromItem(item, parsed, hash);
  const extractedTextUri = extractedTextUriFromItem(item);
  const text = textFromItem(item);
  const metadata = metadataFromItem(item, { sourceRef, sourceUri, status });
  const acl = item.permissions ?? item.acl ?? {};
  const updatedAt = asString(item.updated_at) ?? now;

  return {
    raw: item,
    sourceRef,
    sourceUri,
    kind: parsed.kind,
    title,
    revision,
    hash,
    extractedTextUri,
    text,
    metadata,
    acl,
    status,
    updatedAt,
  };
}
166
+
167
/**
 * Parses manifest text into a list of raw item objects.
 *
 * Accepted shapes:
 * - a JSON array of objects;
 * - a JSON object with an `items` array of objects;
 * - any other single JSON object, treated as one item;
 * - JSONL, i.e. one JSON object per non-blank line.
 *
 * @param text Raw manifest contents; surrounding whitespace is ignored.
 * @returns The item objects in manifest order; empty when the text is blank.
 * @throws Error when an array, `items` or JSONL entry is not a plain object.
 * @throws SyntaxError when the text (or a JSONL line) is not valid JSON.
 */
function parseManifestText(text: string): ManifestObject[] {
  const trimmed = text.trim();
  if (!trimmed) return [];

  const requireObjects = (entries: readonly unknown[], message: string): ManifestObject[] =>
    entries.map((entry) => {
      const item = asObject(entry);
      if (!item) throw new Error(message);
      return item;
    });

  const nonBlankLines = (): string[] => trimmed.split(/\r?\n/).filter((line) => line.trim().length > 0);

  // Parse and validate line by line so the first offending line determines the error.
  const parseJsonLines = (lines: readonly string[]): ManifestObject[] =>
    lines.map((line) => {
      const item = asObject(JSON.parse(line));
      if (!item) throw new Error('Manifest JSONL entries must be objects.');
      return item;
    });

  if (trimmed.startsWith('[')) {
    const parsed: unknown = JSON.parse(trimmed);
    if (!Array.isArray(parsed)) throw new Error('Manifest array parse failed.');
    return requireObjects(parsed, 'Manifest array entries must be objects.');
  }

  if (trimmed.startsWith('{')) {
    let parsed: unknown;
    try {
      parsed = JSON.parse(trimmed);
    } catch (error) {
      // Not a single JSON document. With several lines this is JSONL; with one
      // line it is simply malformed, so surface the original syntax error.
      const lines = nonBlankLines();
      if (lines.length <= 1) throw error;
      return parseJsonLines(lines);
    }

    // Validation happens outside the try block so that these errors are never
    // mistaken for a JSON syntax failure and replaced by a JSONL parse error.
    const object = asObject(parsed);
    if (!object) throw new Error('Manifest object parse failed.');
    if (Array.isArray(object.items)) {
      return requireObjects(object.items, 'Manifest items entries must be objects.');
    }
    // A lone object is one item, whichever identifier keys it uses; missing
    // identifiers are reported later when the item is normalized.
    return [object];
  }

  return parseJsonLines(nonBlankLines());
}
211
+
212
/**
 * Downloads an S3 object and returns its body as text.
 *
 * The AWS SDK modules are imported lazily, so they are only loaded when an
 * `s3://` input is actually used.
 *
 * @param uri Object location in the form `s3://<bucket>/<key>`.
 * @param config Optional workspace configuration. Its S3 settings (region,
 * profile, max attempts) are applied only when the storage type is `s3` and
 * the configured bucket equals the bucket in `uri`.
 * @returns The object body as a string, or `''` when the response has no body.
 * @throws Error when the URI has no bucket or no key.
 */
async function readS3Text(uri: string, config?: KnowledgeConfig): Promise<string> {
  const parsed = new URL(uri);
  const bucket = parsed.hostname;
  const key = decodeURIComponent(parsed.pathname.replace(/^\/+/, ''));
  if (!bucket || !key) throw new Error(`Invalid S3 manifest URI: ${uri}`);
  const [{ S3Client, GetObjectCommand }, { fromIni }] = await Promise.all([
    import('@aws-sdk/client-s3'),
    import('@aws-sdk/credential-providers'),
  ]);
  const s3Config = config?.storage.type === 's3' && config.storage.s3?.bucket === bucket ? config.storage.s3 : undefined;
  const client = new S3Client({
    region: s3Config?.region,
    credentials: s3Config?.profile ? fromIni({ profile: s3Config.profile }) : undefined,
    maxAttempts: s3Config?.max_attempts,
  });
  try {
    const response = await client.send(new GetObjectCommand({ Bucket: bucket, Key: key }));
    if (!response.Body) return '';
    // Read the whole stream before the client is torn down in `finally`.
    return await response.Body.transformToString();
  } finally {
    // The client is created per call, so release its sockets once the read is done.
    client.destroy();
  }
}
231
+
232
/**
 * Loads the manifest text from an `s3://` URI or from a local file.
 *
 * @param input `s3://` URI or filesystem path.
 * @param config Optional workspace configuration, used only for S3 inputs.
 * @returns The manifest contents decoded as UTF-8 (local files) or as returned by the S3 reader.
 * @throws Error when a local path does not exist.
 */
async function readManifestInput(input: string, config?: KnowledgeConfig): Promise<string> {
  const isS3Input = input.startsWith('s3://');
  if (isS3Input) {
    return readS3Text(input, config);
  }
  if (existsSync(input)) {
    return readFileSync(input, 'utf8');
  }
  throw new Error(`Manifest not found: ${input}`);
}
237
+
238
/** One piece of a chunked text. */
interface TextChunk {
  /** Zero-based position of the chunk among the emitted chunks. */
  ordinal: number;
  /** Chunk content with surrounding whitespace trimmed. */
  text: string;
  /** Start of the untrimmed slice within the text after `\r\n` has been normalized to `\n`. */
  startOffset: number;
  /** End (exclusive) of the untrimmed slice within the same normalized text. */
  endOffset: number;
}
244
+
245
/**
 * Splits text into overlapping chunks of at most `maxChars` characters.
 *
 * `\r\n` is normalized to `\n` first. When a chunk would end mid-text, the cut
 * is moved back to the latest paragraph break (`\n\n`) or sentence break
 * (`. `) inside the window, provided that break lies beyond the first half of
 * the window. Chunk text is trimmed; offsets describe the untrimmed slice of
 * the normalized text.
 *
 * @param text Text to split.
 * @param maxChars Maximum chunk length in characters; must be at least 1.
 * @param overlapChars Characters repeated at the start of the next chunk; must be >= 0.
 * @returns The chunks in order; empty when the text is blank.
 * @throws RangeError when `maxChars` or `overlapChars` is out of range or NaN.
 */
function chunkText(text: string, maxChars: number, overlapChars: number): TextChunk[] {
  // Negated comparisons so that NaN is rejected as well.
  if (!(maxChars >= 1)) throw new RangeError('maxChars must be at least 1.');
  if (!(overlapChars >= 0)) throw new RangeError('overlapChars must not be negative.');

  const normalized = text.replace(/\r\n/g, '\n');
  if (!normalized.trim()) return [];

  const chunks: TextChunk[] = [];
  let start = 0;
  while (start < normalized.length) {
    const hardEnd = Math.min(normalized.length, start + maxChars);
    let end = hardEnd;
    if (hardEnd < normalized.length) {
      // Search only for breaks that end at or before hardEnd, so that a chosen
      // break can never push the chunk past maxChars.
      const paragraphBreak = normalized.lastIndexOf('\n\n', hardEnd - 2);
      const sentenceBreak = normalized.lastIndexOf('. ', hardEnd - 1);
      const candidate = Math.max(paragraphBreak, sentenceBreak);
      if (candidate > start + Math.floor(maxChars * 0.5)) {
        // Keep the paragraph separator (2 chars) or the full stop (1 char) with this chunk.
        end = candidate + (candidate === paragraphBreak ? 2 : 1);
      }
    }
    const chunk = normalized.slice(start, end).trim();
    if (chunk) {
      chunks.push({
        ordinal: chunks.length,
        text: chunk,
        startOffset: start,
        endOffset: end,
      });
    }
    if (end >= normalized.length) break;
    // Step back by the overlap, but always move forward: when an early break
    // point combined with a large overlap would land on or before the current
    // start, continue without overlap instead of looping forever.
    const overlapStart = end - overlapChars;
    start = overlapStart > start ? overlapStart : end;
  }
  return chunks;
}
273
+
274
/**
 * Roughly estimates the token count of a text as 1.25 tokens per
 * whitespace-separated word, rounded up, with a minimum of 1.
 */
function estimateTokenCount(text: string): number {
  const wordCount = text
    .trim()
    .split(/\s+/)
    .filter((word) => word.length > 0).length;
  const estimate = Math.ceil(wordCount * 1.25);
  return estimate < 1 ? 1 : estimate;
}
278
+
279
/**
 * Removes every chunk of a source revision, together with the matching
 * full-text index rows.
 *
 * @param db Open knowledge database.
 * @param sourceRevisionId Revision whose chunks are removed.
 * @returns Number of chunk rows that existed for the revision.
 */
function deleteChunksForRevision(db: Database, sourceRevisionId: string): number {
  const selectChunkIds = db.query<{ id: string }, [string]>('SELECT id FROM chunks WHERE source_revision_id = ?');
  const existing = selectChunkIds.all(sourceRevisionId);
  // Clear the index rows one chunk at a time before dropping the chunks themselves.
  existing.forEach(({ id }) => {
    db.run('DELETE FROM chunks_fts WHERE chunk_id = ?', [id]);
  });
  db.run('DELETE FROM chunks WHERE source_revision_id = ?', [sourceRevisionId]);
  return existing.length;
}
287
+
288
/**
 * Inserts the source row for an item, or refreshes the existing row with the
 * same URI (kind, title, metadata, ACL and `updated_at`).
 *
 * @param db Open knowledge database.
 * @param item Normalized manifest item.
 * @param now ISO timestamp stored as `created_at` for a new row.
 * @returns The id of the row stored for the item's URI, read back after the
 * upsert (a pre-existing row keeps its own id).
 * @throws Error when the row cannot be read back.
 */
function upsertSource(db: Database, item: NormalizedManifestItem, now: string): string {
  const upsertSql = `INSERT INTO sources (id, uri, kind, title, metadata_json, acl_json, created_at, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(uri) DO UPDATE SET
kind = excluded.kind,
title = excluded.title,
metadata_json = excluded.metadata_json,
acl_json = excluded.acl_json,
updated_at = excluded.updated_at`;
  const candidateId = stableId('src', item.sourceUri);
  const metadataJson = JSON.stringify(item.metadata);
  const aclJson = JSON.stringify(item.acl ?? {});
  db.run(upsertSql, [
    candidateId,
    item.sourceUri,
    item.kind,
    item.title,
    metadataJson,
    aclJson,
    now,
    item.updatedAt,
  ]);

  const lookup = db.query<{ id: string }, [string]>('SELECT id FROM sources WHERE uri = ?');
  const stored = lookup.get(item.sourceUri);
  if (stored) return stored.id;
  throw new Error(`Failed to upsert source: ${item.sourceUri}`);
}
314
+
315
/**
 * Inserts the revision row for an item, or refreshes the hash, extracted-text
 * reference and metadata of the existing `(source_id, revision)` row.
 *
 * @param db Open knowledge database.
 * @param sourceId Id of the owning source row.
 * @param item Normalized manifest item.
 * @param now ISO timestamp stored as `created_at` for a new row.
 * @returns The id of the stored revision row, read back after the upsert.
 * @throws Error when the row cannot be read back.
 */
function upsertRevision(db: Database, sourceId: string, item: NormalizedManifestItem, now: string): string {
  const upsertSql = `INSERT INTO source_revisions (id, source_id, revision, hash, extracted_text_uri, metadata_json, created_at)
VALUES (?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(source_id, revision) DO UPDATE SET
hash = excluded.hash,
extracted_text_uri = excluded.extracted_text_uri,
metadata_json = excluded.metadata_json`;
  const candidateId = stableId('rev', `${sourceId}\u0000${item.revision}`);
  const metadataJson = JSON.stringify(item.metadata);
  db.run(upsertSql, [
    candidateId,
    sourceId,
    item.revision,
    item.hash,
    item.extractedTextUri,
    metadataJson,
    now,
  ]);

  const lookup = db.query<{ id: string }, [string, string]>(
    'SELECT id FROM source_revisions WHERE source_id = ? AND revision = ?',
  );
  const stored = lookup.get(sourceId, item.revision);
  if (stored) return stored.id;
  throw new Error(`Failed to upsert source revision: ${item.sourceRef}`);
}
340
+
341
/**
 * Chunks an item's text and stores each chunk plus its full-text index row.
 *
 * Nothing is written when the item has no text or its status is `deleted`
 * (case-insensitive).
 *
 * @param db Open knowledge database.
 * @param sourceRevisionId Revision the chunks belong to.
 * @param item Normalized manifest item providing text and metadata.
 * @param now ISO timestamp stored as `created_at`.
 * @param maxChars Maximum chunk length in characters.
 * @param overlapChars Overlap between consecutive chunks in characters.
 * @returns Number of chunks inserted.
 */
function insertChunks(db: Database, sourceRevisionId: string, item: NormalizedManifestItem, now: string, maxChars: number, overlapChars: number): number {
  const isDeleted = item.status.toLowerCase() === 'deleted';
  if (!item.text || isDeleted) return 0;

  const pieces = chunkText(item.text, maxChars, overlapChars);
  const insertChunkSql = `INSERT INTO chunks (id, source_revision_id, kind, ordinal, text, token_count, start_offset, end_offset, metadata_json, created_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`;
  const insertFtsSql = 'INSERT INTO chunks_fts (chunk_id, text, title, source_uri) VALUES (?, ?, ?, ?)';
  const ftsTitle = item.title ?? '';

  pieces.forEach((piece) => {
    const chunkId = stableId('chk', `${sourceRevisionId}\u0000${piece.ordinal}\u0000${piece.text}`);
    // The metadata depends only on the item, but it is rebuilt per chunk to keep
    // the evaluation order of the original loop.
    const metadataJson = JSON.stringify({
      source_ref: item.sourceRef,
      source_uri: item.sourceUri,
      hash: item.hash,
      status: item.status,
      path: asString(item.raw.path) ?? null,
      mime: asString(item.raw.mime) ?? asString(item.raw.content_type) ?? null,
      size: asNumber(item.raw.size) ?? null,
    });
    db.run(insertChunkSql, [
      chunkId,
      sourceRevisionId,
      'source',
      piece.ordinal,
      piece.text,
      estimateTokenCount(piece.text),
      piece.startOffset,
      piece.endOffset,
      metadataJson,
      now,
    ]);
    db.run(insertFtsSql, [chunkId, piece.text, ftsTitle, item.sourceUri]);
  });
  return pieces.length;
}
378
+
379
/**
 * Ingests a manifest into the knowledge database.
 *
 * The database is migrated, the manifest is read (local file or `s3://`) and
 * parsed, and every item is written inside a single transaction: its source
 * and revision rows are upserted, existing chunks of the revision are removed
 * when the item carries text or is marked `deleted`, and new chunks are
 * inserted from the item's text.
 *
 * @param options Database path, manifest input and optional chunking settings.
 * @returns Counters describing what was written.
 * @throws Error when `maxChunkChars` is below 500 or not a number, or when
 * `chunkOverlapChars` is negative, not a number, or not below `maxChunkChars`.
 * Errors from reading, parsing or writing propagate unchanged.
 */
export async function ingestOpenFilesManifest(options: ManifestIngestOptions): Promise<ManifestIngestResult> {
  const now = (options.now ?? new Date()).toISOString();
  const maxChunkChars = options.maxChunkChars ?? 4000;
  const chunkOverlapChars = options.chunkOverlapChars ?? 200;
  // Written as negated comparisons so that NaN, for which every comparison is
  // false, is rejected instead of slipping through and producing no chunks.
  if (!(maxChunkChars >= 500)) throw new Error('maxChunkChars must be at least 500.');
  if (!(chunkOverlapChars >= 0 && chunkOverlapChars < maxChunkChars)) {
    throw new Error('chunkOverlapChars must be less than maxChunkChars.');
  }

  migrateKnowledgeDb(options.dbPath);
  const text = await readManifestInput(options.input, options.config);
  const items = parseManifestText(text);
  const db = openKnowledgeDb(options.dbPath);
  try {
    const result = db.transaction(() => {
      const seenSources = new Set<string>();
      const seenRevisions = new Set<string>();
      let chunksInserted = 0;
      let chunksDeleted = 0;
      // No item is currently skipped; the counter is kept for the result shape.
      const skipped = 0;
      for (const raw of items) {
        const item = normalizeManifestItem(raw, now);
        const sourceId = upsertSource(db, item, now);
        const revisionId = upsertRevision(db, sourceId, item, now);
        seenSources.add(sourceId);
        seenRevisions.add(revisionId);
        // Items without text that are not deleted keep their existing chunks.
        if (item.text || item.status.toLowerCase() === 'deleted') {
          chunksDeleted += deleteChunksForRevision(db, revisionId);
        }
        chunksInserted += insertChunks(db, revisionId, item, now, maxChunkChars, chunkOverlapChars);
      }
      return {
        path: options.input,
        db_path: options.dbPath,
        items_seen: items.length,
        sources_upserted: seenSources.size,
        revisions_upserted: seenRevisions.size,
        chunks_inserted: chunksInserted,
        chunks_deleted: chunksDeleted,
        skipped,
      };
    })();
    return result;
  } finally {
    db.close();
  }
}