@hasna/knowledge 0.2.7 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/cli.ts CHANGED
@@ -10,7 +10,9 @@ import { getKnowledgeDbStats, migrateKnowledgeDb, openKnowledgeDb } from './know
10
10
  import { createArtifactStore } from './artifact-store';
11
11
  import { initializeWikiLayout } from './wiki-layout';
12
12
  import { ingestOpenFilesManifest } from './manifest-ingest';
13
+ import { ingestSourceRef } from './source-ingest';
13
14
  import { consumeOpenFilesOutbox } from './outbox-consume';
15
+ import { resolveOpenFilesSource } from './source-resolver';
14
16
  import { approvalStatus, assertS3ReadAllowed, assertWebSearchAllowed, createApprovalGate, recordAuditEvent, recordRedactionFindings, redactSecrets, resolveSafetyPolicy } from './safety';
15
17
  import pkg from '../package.json' with { type: 'json' };
16
18
 
@@ -49,6 +51,7 @@ interface Flags {
49
51
  tag?: string;
50
52
  format?: string;
51
53
  completions?: string;
54
+ purpose?: string;
52
55
  noColor?: boolean;
53
56
  scope?: string;
54
57
  olderThan?: number;
@@ -62,7 +65,7 @@ interface ParseResult {
62
65
  flags: Flags;
63
66
  }
64
67
 
65
- const COMMANDS = ['add', 'list', 'get', 'delete', 'update', 'archive', 'restore', 'upsert', 'untag', 'export', 'prune', 'dedupe', 'stats', 'paths', 'db', 'wiki', 'ingest', 'reindex', 'safety', 'help'];
68
+ const COMMANDS = ['add', 'list', 'get', 'delete', 'update', 'archive', 'restore', 'upsert', 'untag', 'export', 'prune', 'dedupe', 'stats', 'paths', 'db', 'wiki', 'source', 'ingest', 'reindex', 'safety', 'help'];
66
69
  const COMMAND_ALIASES: Record<string, string> = {
67
70
  ls: 'list',
68
71
  rm: 'delete',
@@ -97,6 +100,7 @@ function parseArgs(argv: string[]): ParseResult {
97
100
  case '--tag': case '-t': flags.tag = argv[i + 1]; i += 1; break;
98
101
  case '--format': flags.format = argv[i + 1]; i += 1; break;
99
102
  case '--completions': flags.completions = argv[i + 1]; i += 1; break;
103
+ case '--purpose': flags.purpose = argv[i + 1]; i += 1; break;
100
104
  case '--no-color': flags.noColor = true; break;
101
105
  case '--scope': flags.scope = argv[i + 1]; i += 1; break;
102
106
  case '--older-than': flags.olderThan = Number(argv[i + 1]); i += 1; break;
@@ -165,7 +169,9 @@ Commands:
165
169
  paths Show resolved workspace/store paths
166
170
  db init|stats Initialize or inspect local knowledge.db
167
171
  wiki init Initialize scalable wiki/schema/index/log artifacts
172
+ source resolve <source-ref> Resolve read-only source content and citation evidence
168
173
  ingest manifest <file|s3://> Ingest an open-files manifest into knowledge.db
174
+ ingest source <source-ref> Ingest a read-only source ref into knowledge.db
169
175
  reindex outbox <file|s3://> Consume open-files change events and invalidate chunks
170
176
  safety status|check|approve|audit|redact
171
177
  help [command] Show help
@@ -173,6 +179,7 @@ Commands:
173
179
  Global Options:
174
180
  --json Output JSON
175
181
  --store <path> Override store path
182
+ --purpose <name> Read-only source purpose (default: knowledge_answer)
176
183
  --scope local|global|project Store scope (default: global ~/.hasna/apps/knowledge/)
177
184
  --no-color Disable color output
178
185
  --completions <shell> Output completions for bash|zsh|fish
@@ -229,7 +236,8 @@ function printCommandHelp(command: string): void {
229
236
  if (command === 'paths') { console.log('Usage: open-knowledge paths [--scope local|global|project] [--json]'); return; }
230
237
  if (command === 'db') { console.log('Usage: open-knowledge db init|stats [--scope local|global|project] [--json]'); return; }
231
238
  if (command === 'wiki') { console.log('Usage: open-knowledge wiki init [--scope local|global|project] [--json]'); return; }
232
- if (command === 'ingest') { console.log('Usage: open-knowledge ingest manifest <file|s3://bucket/key> [--scope local|global|project] [--json]'); return; }
239
+ if (command === 'source') { console.log('Usage: open-knowledge source resolve <source-ref> [--purpose knowledge_answer|knowledge_index] [--limit <n>] [--scope local|global|project] [--json]'); return; }
240
+ if (command === 'ingest') { console.log('Usage: open-knowledge ingest manifest <file|s3://bucket/key> | source <source-ref> [--purpose knowledge_index] [--scope local|global|project] [--json]'); return; }
233
241
  if (command === 'reindex') { console.log('Usage: open-knowledge reindex outbox <file|s3://bucket/key> [--scope local|global|project] [--json]'); return; }
234
242
  if (command === 'safety') { console.log('Usage: open-knowledge safety status|check|approve|audit|redact [args] [--scope local|global|project] [--json]'); return; }
235
243
  printGlobalHelp();
@@ -276,11 +284,11 @@ async function run(argv: string[]): Promise<void> {
276
284
  if (flags.completions) {
277
285
  const shell = flags.completions;
278
286
  if (shell === 'bash') {
279
- console.log(`_open_knowledge() { local cur; cur="${"$"}{COMP_WORDS[COMP_CWORD]}"; COMPREPLY=($(compgen -W "add list get update archive restore upsert untag delete export prune dedupe stats paths db wiki ingest reindex safety help ls rm edit unarchive --json --yes --help --version --desc --page --limit --search --sort --id --store --title --content --url --tag --format --completions --no-color --scope --archived --include-archived" -- "$cur")); }; complete -F _open_knowledge open-knowledge`);
287
+ console.log(`_open_knowledge() { local cur; cur="${"$"}{COMP_WORDS[COMP_CWORD]}"; COMPREPLY=($(compgen -W "add list get update archive restore upsert untag delete export prune dedupe stats paths db wiki source ingest reindex safety help ls rm edit unarchive --json --yes --help --version --desc --page --limit --search --sort --id --store --title --content --url --tag --format --completions --purpose --no-color --scope --archived --include-archived" -- "$cur")); }; complete -F _open_knowledge open-knowledge`);
280
288
  } else if (shell === 'zsh') {
281
- console.log(`#compdef open-knowledge\n_open_knowledge() { _arguments -C "1: :(add list get update archive restore upsert untag delete export prune dedupe stats paths db wiki ingest reindex safety help ls rm edit unarchive)" "(--json)--json" "(--yes)-y" "(--help)--help" "(--version)--version" "(--desc)--desc" "(--archived)--archived" "(--include-archived)--include-archived" "(-p --page)"{-p,--page}"[page number]:number:" "(-l --limit)"{-l,--limit}"[items per page]:number:" "(-s --search)"{-s,--search}"[search text]:text:" "(--sort)--sort"\{created,title\}:" "(--id)--id[item id]:id:" "(--store)--store[store path]:path:" "(--title)--title[new title]:" "(--content)--content[new content]:" "(--url)--url[source url]:" "(-t --tag)"{-t,--tag}"[tag]:tag:" "(--format)--format[json|jsonl]:" "(--completions)--completions[output completions]:shell:(bash zsh fish):" "(--no-color)--no-color[disable color]" "(--scope)--scope"\{local,global,project\}:" }; _open_knowledge`);
289
+ console.log(`#compdef open-knowledge\n_open_knowledge() { _arguments -C "1: :(add list get update archive restore upsert untag delete export prune dedupe stats paths db wiki source ingest reindex safety help ls rm edit unarchive)" "(--json)--json" "(--yes)-y" "(--help)--help" "(--version)--version" "(--desc)--desc" "(--archived)--archived" "(--include-archived)--include-archived" "(-p --page)"{-p,--page}"[page number]:number:" "(-l --limit)"{-l,--limit}"[items per page]:number:" "(-s --search)"{-s,--search}"[search text]:text:" "(--sort)--sort"\{created,title\}:" "(--id)--id[item id]:id:" "(--store)--store[store path]:path:" "(--title)--title[new title]:" "(--content)--content[new content]:" "(--url)--url[source url]:" "(-t --tag)"{-t,--tag}"[tag]:tag:" "(--format)--format[json|jsonl]:" "(--completions)--completions[output completions]:shell:(bash zsh fish):" "(--purpose)--purpose[purpose]:" "(--no-color)--no-color[disable color]" "(--scope)--scope"\{local,global,project\}:" }; _open_knowledge`);
282
290
  } else if (shell === 'fish') {
283
- console.log(`complete -c open-knowledge -f; complete -c open-knowledge -a "add list get update archive restore upsert untag delete export prune dedupe stats paths db wiki ingest reindex safety help ls rm edit unarchive"; complete -c open-knowledge -l json; complete -c open-knowledge -l yes -s y; complete -c open-knowledge -l help -s h; complete -c open-knowledge -l version -s v; complete -c open-knowledge -l desc; complete -c open-knowledge -l archived; complete -c open-knowledge -l include-archived; complete -c open-knowledge -s p -l page; complete -c open-knowledge -s l -l limit; complete -c open-knowledge -s s -l search; complete -c open-knowledge -l sort; complete -c open-knowledge -l id; complete -c open-knowledge -l store; complete -c open-knowledge -l title; complete -c open-knowledge -l content; complete -c open-knowledge -l url; complete -c open-knowledge -s t -l tag; complete -c open-knowledge -l format; complete -c open-knowledge -l completions; complete -c open-knowledge -l no-color; complete -c open-knowledge -l scope -a "local global project"`);
291
+ console.log(`complete -c open-knowledge -f; complete -c open-knowledge -a "add list get update archive restore upsert untag delete export prune dedupe stats paths db wiki source ingest reindex safety help ls rm edit unarchive"; complete -c open-knowledge -l json; complete -c open-knowledge -l yes -s y; complete -c open-knowledge -l help -s h; complete -c open-knowledge -l version -s v; complete -c open-knowledge -l desc; complete -c open-knowledge -l archived; complete -c open-knowledge -l include-archived; complete -c open-knowledge -s p -l page; complete -c open-knowledge -s l -l limit; complete -c open-knowledge -s s -l search; complete -c open-knowledge -l sort; complete -c open-knowledge -l id; complete -c open-knowledge -l store; complete -c open-knowledge -l title; complete -c open-knowledge -l content; complete -c open-knowledge -l url; complete -c open-knowledge -s t -l tag; complete -c open-knowledge -l format; complete -c open-knowledge -l completions; complete -c open-knowledge -l purpose; complete -c open-knowledge -l no-color; complete -c open-knowledge -l scope -a "local global project"`);
284
292
  } else {
285
293
  throw new Error("Invalid --completions value. Use 'bash', 'zsh', or 'fish'.");
286
294
  }
@@ -476,24 +484,64 @@ async function run(argv: string[]): Promise<void> {
476
484
  }
477
485
  }
478
486
 
479
- if (command === 'ingest') {
487
+ if (command === 'source') {
480
488
  const action = positional[1] ?? '';
481
- if (action !== 'manifest') throw new Error("Invalid ingest action. Use 'manifest'.");
482
- const input = positional[2];
483
- if (!input) throw new Error('Usage: open-knowledge ingest manifest <file|s3://bucket/key>');
489
+ if (action !== 'resolve') throw new Error("Invalid source action. Use 'resolve'.");
490
+ const sourceRef = positional[2];
491
+ if (!sourceRef) throw new Error('Usage: open-knowledge source resolve <source-ref>');
484
492
  const resolvedWorkspace = ensureKnowledgeWorkspace(workspace.home);
485
493
  const config = readKnowledgeConfig(resolvedWorkspace.configPath);
486
494
  const safetyPolicy = resolveSafetyPolicy(config, resolvedWorkspace);
487
- const result = await ingestOpenFilesManifest({
495
+ const result = await resolveOpenFilesSource({
488
496
  dbPath: resolvedWorkspace.knowledgeDbPath,
489
- input,
490
- config,
497
+ sourceRef,
498
+ purpose: flags.purpose,
499
+ limit: flags.limit,
491
500
  safetyPolicy,
492
501
  });
493
- output({ ok: true, ...result, message: `Ingested ${result.items_seen} manifest item(s)` }, flags.json);
502
+ output({
503
+ ok: true,
504
+ ...result,
505
+ message: result.resolved
506
+ ? `Resolved ${result.source_ref} (${result.content.chunks_returned}/${result.content.chunks_total} chunks)`
507
+ : `Source not indexed: ${sourceRef}`,
508
+ }, flags.json);
494
509
  return;
495
510
  }
496
511
 
512
+ if (command === 'ingest') {
513
+ const action = positional[1] ?? '';
514
+ const resolvedWorkspace = ensureKnowledgeWorkspace(workspace.home);
515
+ const config = readKnowledgeConfig(resolvedWorkspace.configPath);
516
+ const safetyPolicy = resolveSafetyPolicy(config, resolvedWorkspace);
517
+ if (action === 'manifest') {
518
+ const input = positional[2];
519
+ if (!input) throw new Error('Usage: open-knowledge ingest manifest <file|s3://bucket/key>');
520
+ const result = await ingestOpenFilesManifest({
521
+ dbPath: resolvedWorkspace.knowledgeDbPath,
522
+ input,
523
+ config,
524
+ safetyPolicy,
525
+ });
526
+ output({ ok: true, ...result, message: `Ingested ${result.items_seen} manifest item(s)` }, flags.json);
527
+ return;
528
+ }
529
+ if (action === 'source') {
530
+ const sourceRef = positional[2];
531
+ if (!sourceRef) throw new Error('Usage: open-knowledge ingest source <source-ref>');
532
+ const result = await ingestSourceRef({
533
+ dbPath: resolvedWorkspace.knowledgeDbPath,
534
+ sourceRef,
535
+ purpose: flags.purpose,
536
+ config,
537
+ safetyPolicy,
538
+ });
539
+ output({ ok: true, ...result, message: `Ingested source ${result.source_ref} (${result.chunks_inserted} chunks)` }, flags.json);
540
+ return;
541
+ }
542
+ throw new Error("Invalid ingest action. Use 'manifest' or 'source'.");
543
+ }
544
+
497
545
  if (command === 'reindex') {
498
546
  const action = positional[1] ?? '';
499
547
  if (action !== 'outbox') throw new Error("Invalid reindex action. Use 'outbox'.");
@@ -24,6 +24,17 @@ export interface ManifestIngestOptions {
24
24
  chunkOverlapChars?: number;
25
25
  }
26
26
 
27
/**
 * Options for `ingestOpenFilesManifestItems`, which ingests manifest items
 * that have already been parsed into objects.
 */
export interface ManifestItemsIngestOptions {
  /** Path of the knowledge database to migrate, open, and write to. */
  dbPath: string;
  /** Parsed manifest items to ingest. */
  items: ManifestObject[];
  /** Label recorded as the audit `target_uri` and returned as the result `path`. */
  sourceLabel: string;
  /**
   * Audit action name for the read event. When omitted the action is
   * `'s3_manifest_read'` if `sourceLabel` starts with `s3://`, otherwise
   * `'local_manifest_read'`.
   */
  readAction?: string;
  /** When present, the database write is checked against this policy first. */
  safetyPolicy?: SafetyPolicy;
  /** Clock override; the current time is used when omitted. */
  now?: Date;
  /** Maximum chunk size in characters; 4000 when omitted. */
  maxChunkChars?: number;
  /** Chunk overlap in characters; 200 when omitted. */
  chunkOverlapChars?: number;
}
37
+
27
38
  export interface ManifestIngestResult {
28
39
  path: string;
29
40
  db_path: string;
@@ -36,7 +47,7 @@ export interface ManifestIngestResult {
36
47
  skipped: number;
37
48
  }
38
49
 
39
- type ManifestObject = Record<string, unknown>;
50
+ export type ManifestObject = Record<string, unknown>;
40
51
 
41
52
  interface NormalizedManifestItem {
42
53
  raw: ManifestObject;
@@ -405,6 +416,23 @@ function insertChunks(db: Database, sourceRevisionId: string, item: NormalizedMa
405
416
  }
406
417
 
407
418
  export async function ingestOpenFilesManifest(options: ManifestIngestOptions): Promise<ManifestIngestResult> {
419
+ const now = options.now ?? new Date();
420
+ if (options.safetyPolicy) assertWriteAllowed(options.dbPath, options.safetyPolicy);
421
+ migrateKnowledgeDb(options.dbPath);
422
+ const text = await readManifestInput(options.input, options.config, options.safetyPolicy);
423
+ const items = parseManifestText(text);
424
+ return ingestOpenFilesManifestItems({
425
+ dbPath: options.dbPath,
426
+ items,
427
+ sourceLabel: options.input,
428
+ safetyPolicy: options.safetyPolicy,
429
+ now,
430
+ maxChunkChars: options.maxChunkChars,
431
+ chunkOverlapChars: options.chunkOverlapChars,
432
+ });
433
+ }
434
+
435
+ export async function ingestOpenFilesManifestItems(options: ManifestItemsIngestOptions): Promise<ManifestIngestResult> {
408
436
  const now = (options.now ?? new Date()).toISOString();
409
437
  const maxChunkChars = options.maxChunkChars ?? 4000;
410
438
  const chunkOverlapChars = options.chunkOverlapChars ?? 200;
@@ -413,8 +441,6 @@ export async function ingestOpenFilesManifest(options: ManifestIngestOptions): P
413
441
 
414
442
  if (options.safetyPolicy) assertWriteAllowed(options.dbPath, options.safetyPolicy);
415
443
  migrateKnowledgeDb(options.dbPath);
416
- const text = await readManifestInput(options.input, options.config, options.safetyPolicy);
417
- const items = parseManifestText(text);
418
444
  const db = openKnowledgeDb(options.dbPath);
419
445
  try {
420
446
  const result = db.transaction(() => {
@@ -426,13 +452,13 @@ export async function ingestOpenFilesManifest(options: ManifestIngestOptions): P
426
452
  let skipped = 0;
427
453
  recordAuditEvent(db, {
428
454
  event_type: 'source_read',
429
- action: options.input.startsWith('s3://') ? 's3_manifest_read' : 'local_manifest_read',
430
- target_uri: options.input,
455
+ action: options.readAction ?? (options.sourceLabel.startsWith('s3://') ? 's3_manifest_read' : 'local_manifest_read'),
456
+ target_uri: options.sourceLabel,
431
457
  decision: 'allow',
432
- metadata: { items: items.length, read_only: true },
458
+ metadata: { items: options.items.length, read_only: true },
433
459
  created_at: now,
434
460
  });
435
- for (const raw of items) {
461
+ for (const raw of options.items) {
436
462
  const item = normalizeManifestItem(raw, now);
437
463
  const sourceId = upsertSource(db, item, now);
438
464
  const revisionId = upsertRevision(db, sourceId, item, now);
@@ -450,13 +476,13 @@ export async function ingestOpenFilesManifest(options: ManifestIngestOptions): P
450
476
  action: 'knowledge_manifest_ingest',
451
477
  target_uri: options.dbPath,
452
478
  decision: 'allow',
453
- metadata: { items: items.length, sources: seenSources.size, revisions: seenRevisions.size, chunks_inserted: chunksInserted, redactions },
479
+ metadata: { items: options.items.length, sources: seenSources.size, revisions: seenRevisions.size, chunks_inserted: chunksInserted, redactions },
454
480
  created_at: now,
455
481
  });
456
482
  return {
457
- path: options.input,
483
+ path: options.sourceLabel,
458
484
  db_path: options.dbPath,
459
- items_seen: items.length,
485
+ items_seen: options.items.length,
460
486
  sources_upserted: seenSources.size,
461
487
  revisions_upserted: seenRevisions.size,
462
488
  chunks_inserted: chunksInserted,
package/src/mcp.js CHANGED
@@ -7,6 +7,8 @@ import pkg from '../package.json' with { type: 'json' };
7
7
  import { defaultStorePath, loadStore, saveStore, makeId, withLock } from './store.ts';
8
8
  import { ensureKnowledgeWorkspace, readKnowledgeConfig, resolveScopedWorkspace } from './workspace.ts';
9
9
  import { parseSourceRef } from './source-ref.ts';
10
+ import { resolveOpenFilesSource } from './source-resolver.ts';
11
+ import { resolveSafetyPolicy } from './safety.ts';
10
12
 
11
13
  const storePathField = z.string().optional().describe('Path to the JSON store file');
12
14
  const scopeField = z.enum(['local', 'global', 'project']).optional().describe('Workspace scope');
@@ -102,6 +104,29 @@ export function buildServer() {
102
104
  }
103
105
  });
104
106
 
107
+ registerTool(server, 'ok_resolve_source', 'Resolve source content', 'Resolve an indexed source ref through the read-only open-files boundary and return chunk citation evidence', {
108
+ source_ref: z.string().describe('Source reference URI, preferably open-files://...'),
109
+ purpose: z.string().optional().describe('Read-only purpose label, default knowledge_answer'),
110
+ limit: z.number().optional().describe('Maximum chunks to return, default 10'),
111
+ scope: scopeField,
112
+ }, async ({ source_ref, purpose, limit, scope }) => {
113
+ const workspace = ensureKnowledgeWorkspace(resolveScopedWorkspace(scope).home);
114
+ const config = readKnowledgeConfig(workspace.configPath);
115
+ const safetyPolicy = resolveSafetyPolicy(config, workspace);
116
+ try {
117
+ const result = await resolveOpenFilesSource({
118
+ dbPath: workspace.knowledgeDbPath,
119
+ sourceRef: source_ref,
120
+ purpose,
121
+ limit,
122
+ safetyPolicy,
123
+ });
124
+ return jsonText({ ok: true, ...result });
125
+ } catch (error) {
126
+ return errorText(error instanceof Error ? error.message : String(error));
127
+ }
128
+ });
129
+
105
130
  registerTool(server, 'ok_add', 'Add a knowledge item', 'Add a new item to the knowledge store', {
106
131
  title: z.string().describe('Item title'),
107
132
  content: z.string().describe('Item content/body'),
@@ -0,0 +1,268 @@
1
+ import { createHash } from 'node:crypto';
2
+ import { existsSync, readFileSync } from 'node:fs';
3
+ import { basename } from 'node:path';
4
+ import { ingestOpenFilesManifestItems, type ManifestIngestResult, type ManifestObject } from './manifest-ingest';
5
+ import { parseSourceRef, type SourceRef } from './source-ref';
6
+ import { resolveOpenFilesSource } from './source-resolver';
7
+ import type { KnowledgeConfig } from './workspace';
8
+ import { assertS3ReadAllowed, assertWebSearchAllowed, type SafetyPolicy } from './safety';
9
+
10
/** Options accepted by `ingestSourceRef`. */
export interface SourceIngestOptions {
  /** Path of the knowledge database used for resolution and for the ingest write. */
  dbPath: string;
  /** Source reference URI to ingest; it is parsed with `parseSourceRef`. */
  sourceRef: string;
  /** Purpose label for the read; `'knowledge_index'` is used when omitted. */
  purpose?: string;
  /** Workspace configuration; consulted for S3 client settings when reading `s3://` content. */
  config?: KnowledgeConfig;
  /** Safety policy handed to the read/write permission checks; the checks are skipped when absent. */
  safetyPolicy?: SafetyPolicy;
  /** Clock override forwarded to the resolver and to the ingest step. */
  now?: Date;
}
18
+
19
/**
 * Result of `ingestSourceRef`: the manifest ingest summary plus details about
 * the source that was read.
 */
export interface SourceIngestResult extends ManifestIngestResult {
  /** The source ref string that was passed in. */
  source_ref: string;
  /** Where the ingested text was obtained from. */
  content_source: 'catalog_chunks' | 'extracted_text_ref' | 'file' | 's3' | 'web';
  /** Always `true`. */
  read_only: true;
  /** Hash recorded on the generated manifest item. */
  hash: string;
}
25
+
26
/** Text and descriptive fields gathered for a source before it is turned into a manifest item. */
interface ResolvedText {
  /** The text that will be ingested. */
  text: string;
  /** Where `text` was obtained from. */
  contentSource: SourceIngestResult['content_source'];
  /** Display title, or `null` to let the caller derive one from the ref. */
  title: string | null;
  /** Content type, or `null` when unknown. */
  mime: string | null;
  /** Size value for the manifest item; the readers in this file set it to `text.length`. */
  size: number | null;
  /** Content hash, or `null` to let the caller hash `text`. */
  hash: string | null;
  /** Revision identifier, or `null` when the source has none. */
  revision: string | null;
  /** URI of the extracted-text ref when one is known. */
  extractedTextRef: string | null;
  /** Free-form metadata copied onto the manifest item. */
  metadata: Record<string, unknown>;
  /** Permission entries copied onto the manifest item. */
  permissions: Record<string, unknown>;
}
38
+
39
/**
 * Hashes `text` with SHA-256.
 *
 * @returns The digest as lowercase hex, prefixed with `sha256:`.
 */
function sha256Text(text: string): string {
  const hasher = createHash('sha256');
  hasher.update(text);
  const hex = hasher.digest('hex');
  return 'sha256:' + hex;
}
42
+
43
/**
 * Reduces an HTML document to rough plain text.
 *
 * Steps, in order: drop `<script>` and `<style>` elements with their content,
 * replace every remaining tag with a space, decode a small set of character
 * references, then collapse whitespace around newlines and runs of
 * spaces/tabs, and trim.
 *
 * Character references are decoded in a single pass so that already-escaped
 * text is decoded exactly once: `&amp;lt;` becomes the literal text `&lt;`,
 * not `<`. Decoding `&amp;` in a separate pass before `&lt;`/`&gt;` would
 * double-decode such input.
 *
 * @param html Raw HTML markup.
 * @returns The text content with markup removed.
 */
function stripHtml(html: string): string {
  const entityText: Record<string, string> = {
    nbsp: ' ',
    lt: '<',
    gt: '>',
    quot: '"',
    '#39': "'",
    amp: '&',
  };
  return html
    .replace(/<script[\s\S]*?<\/script>/gi, ' ')
    .replace(/<style[\s\S]*?<\/style>/gi, ' ')
    .replace(/<[^>]+>/g, ' ')
    // Single pass: the output of one replacement is never rescanned.
    .replace(/&(nbsp|lt|gt|quot|#39|amp);/g, (match: string, name: string) => entityText[name] ?? match)
    .replace(/\s+\n/g, '\n')
    .replace(/\n\s+/g, '\n')
    .replace(/[ \t]{2,}/g, ' ')
    .trim();
}
57
+
58
/**
 * Downloads the object named by an `s3://bucket/key` URI and returns its body
 * as a string.
 *
 * @param uri S3 URI; the host is the bucket and the URL-decoded path is the key.
 * @param config Workspace configuration. Its S3 storage settings (region,
 *   profile, max attempts) are applied only when they name the same bucket.
 * @param safetyPolicy When present, the read is checked with `assertS3ReadAllowed`.
 * @returns The object body, or `''` when the response carries no body.
 * @throws Error when the URI has no bucket or no key.
 */
async function readS3Text(uri: string, config?: KnowledgeConfig, safetyPolicy?: SafetyPolicy): Promise<string> {
  const location = new URL(uri);
  const bucket = location.hostname;
  const key = decodeURIComponent(location.pathname.replace(/^\/+/, ''));
  if (!bucket || !key) {
    throw new Error(`Invalid S3 source URI: ${uri}`);
  }
  if (safetyPolicy) {
    assertS3ReadAllowed(uri, safetyPolicy);
  }

  // The AWS SDK modules are loaded on demand, only once an S3 read is requested.
  const [s3Module, credentialsModule] = await Promise.all([
    import('@aws-sdk/client-s3'),
    import('@aws-sdk/credential-providers'),
  ]);
  const { S3Client, GetObjectCommand } = s3Module;
  const { fromIni } = credentialsModule;

  const bucketSettings = config?.storage.type === 's3' && config.storage.s3?.bucket === bucket
    ? config.storage.s3
    : undefined;
  const profile = bucketSettings?.profile;
  const client = new S3Client({
    region: bucketSettings?.region,
    credentials: profile ? fromIni({ profile }) : undefined,
    maxAttempts: bucketSettings?.max_attempts,
  });

  const { Body: body } = await client.send(new GetObjectCommand({ Bucket: bucket, Key: key }));
  return body ? await body.transformToString() : '';
}
78
+
79
/**
 * Fetches a web URL and returns its text together with the reported content type.
 *
 * @param uri URL to fetch.
 * @param safetyPolicy When present, the request is checked with `assertWebSearchAllowed` first.
 * @returns The response text and the `content-type` header value (`null` when
 *   absent). The body is passed through `stripHtml` when the content type
 *   contains `html`.
 * @throws Error when the response status is not OK.
 */
async function readWebText(uri: string, safetyPolicy?: SafetyPolicy): Promise<{ text: string; mime: string | null }> {
  if (safetyPolicy) {
    assertWebSearchAllowed(safetyPolicy);
  }

  const requestHeaders = {
    accept: 'text/markdown,text/plain,text/html,application/json;q=0.8,*/*;q=0.5',
    'user-agent': '@hasna/knowledge source-ingest',
  };
  const response = await fetch(uri, { headers: requestHeaders });
  if (!response.ok) {
    throw new Error(`Web source read failed ${response.status}: ${uri}`);
  }

  const mime = response.headers.get('content-type');
  const body = await response.text();
  if (mime?.includes('html')) {
    return { text: stripHtml(body), mime };
  }
  return { text: body, mime };
}
92
+
93
/**
 * Derives a display title from a parsed source ref.
 *
 * - `file`: base name of the path.
 * - `s3`: base name of the object key.
 * - `web`: base name of the URL path, or the whole URL when that is empty.
 * - anything else: base name of `path` when it is set, otherwise the ref `id`.
 */
function titleForRef(parsed: SourceRef): string | null {
  switch (parsed.kind) {
    case 'file':
      return basename(parsed.path);
    case 's3':
      return basename(parsed.key);
    case 'web': {
      const lastSegment = basename(new URL(parsed.url).pathname);
      return lastSegment || parsed.url;
    }
    default:
      if (parsed.path) {
        return basename(parsed.path);
      }
      return parsed.id;
  }
}
99
+
100
/**
 * Reads the text of a `file`, `s3`, or `web` source ref directly.
 *
 * Every successful read is reported as read-only, with no revision and no
 * extracted-text ref; the hash is computed from the text and the size is the
 * text's `length`.
 *
 * @param parsed Parsed source ref.
 * @param config Workspace configuration, forwarded to the S3 reader.
 * @param safetyPolicy Safety policy, forwarded to the S3 and web readers.
 * @throws Error when a `file` ref points at a missing file, or when the ref
 *   kind is not one of `file`, `s3`, `web`.
 */
async function readDirectSourceText(parsed: SourceRef, config?: KnowledgeConfig, safetyPolicy?: SafetyPolicy): Promise<ResolvedText> {
  // Shared shape of all three results; only the text, origin, mime and metadata vary.
  const describe = (
    text: string,
    contentSource: ResolvedText['contentSource'],
    mime: string | null,
    metadata: Record<string, unknown>,
  ): ResolvedText => ({
    text,
    contentSource,
    title: titleForRef(parsed),
    mime,
    size: text.length,
    hash: sha256Text(text),
    revision: null,
    extractedTextRef: null,
    metadata,
    permissions: { mode: 'read_only' },
  });

  switch (parsed.kind) {
    case 'file': {
      if (!existsSync(parsed.path)) {
        throw new Error(`Source file not found: ${parsed.path}`);
      }
      const fileText = readFileSync(parsed.path, 'utf8');
      return describe(fileText, 'file', 'text/plain', { path: parsed.path });
    }
    case 's3': {
      const objectText = await readS3Text(parsed.uri, config, safetyPolicy);
      return describe(objectText, 's3', 'text/plain', { bucket: parsed.bucket, key: parsed.key });
    }
    case 'web': {
      const page = await readWebText(parsed.url, safetyPolicy);
      return describe(page.text, 'web', page.mime, { url: parsed.url });
    }
    default:
      throw new Error(`Direct source reading is not available for ${parsed.uri}`);
  }
}
152
+
153
/**
 * Reads the text behind an extracted-text ref.
 *
 * `open-files://` refs are rejected up front; any other URI is parsed and
 * read through `readDirectSourceText`, and the result is labelled
 * `'extracted_text_ref'` regardless of the underlying scheme.
 *
 * @throws Error when `uri` starts with `open-files://`.
 */
async function readTextRef(uri: string, config?: KnowledgeConfig, safetyPolicy?: SafetyPolicy): Promise<{ text: string; contentSource: SourceIngestResult['content_source'] }> {
  const isOpenFilesRef = uri.startsWith('open-files://');
  if (isOpenFilesRef) {
    throw new Error('Open-files extracted text refs require an open-files resolver API. Ingest an open-files manifest with extracted_text or an extracted_text_ref using file://, s3://, or https://.');
  }

  const { text } = await readDirectSourceText(parseSourceRef(uri), config, safetyPolicy);
  return { text, contentSource: 'extracted_text_ref' };
}
161
+
162
/**
 * Obtains the text of an `open-files` source from the local knowledge catalog.
 *
 * The ref is resolved with `resolveOpenFilesSource`. If the resolved revision
 * names an extracted-text URI and the catalog reports no text available, the
 * text is read from that URI. Otherwise the returned catalog chunks are joined
 * with blank lines.
 *
 * NOTE(review): the resolver is called with a fixed `limit: 100`; if a source
 * can have more chunks than that, the joined text would presumably be
 * incomplete — confirm against the resolver's limit semantics.
 *
 * @throws Error when the source is not in the catalog, or when it resolves
 *   but has no chunks and no readable extracted-text ref.
 */
async function readOpenFilesSourceText(options: SourceIngestOptions): Promise<ResolvedText> {
  const { dbPath, sourceRef, config, safetyPolicy, now } = options;
  const resolved = await resolveOpenFilesSource({
    dbPath,
    sourceRef,
    purpose: options.purpose ?? 'knowledge_index',
    limit: 100,
    safetyPolicy,
    now,
  });
  if (!resolved.resolved) {
    throw new Error('Open-files source is not in the local knowledge catalog. Ingest an open-files manifest first or use the open-files resolver API.');
  }

  const revision = resolved.revision;
  const extractedUri = revision?.extracted_text_uri;

  // Prefer the extracted-text ref when the catalog itself holds no text.
  if (revision && extractedUri && !resolved.content.text_available) {
    const fetched = await readTextRef(extractedUri, config, safetyPolicy);
    return {
      text: fetched.text,
      contentSource: fetched.contentSource,
      title: resolved.source?.title ?? null,
      mime: resolved.content.mime,
      size: fetched.text.length,
      hash: revision.hash ?? sha256Text(fetched.text),
      revision: revision.revision,
      extractedTextRef: extractedUri,
      metadata: resolved.source?.metadata ?? {},
      permissions: resolved.source?.permissions ?? { mode: 'read_only' },
    };
  }

  if (resolved.chunks.length === 0) {
    throw new Error('Open-files source has no extracted text chunks yet. Ingest an open-files manifest with extracted_text or extracted_text_ref first.');
  }

  const joined = resolved.chunks.map(({ text }) => text).join('\n\n');
  return {
    text: joined,
    contentSource: 'catalog_chunks',
    title: resolved.source?.title ?? null,
    mime: resolved.content.mime,
    size: joined.length,
    hash: revision?.hash ?? sha256Text(joined),
    revision: revision?.revision ?? null,
    extractedTextRef: revision?.extracted_text_uri ?? null,
    metadata: resolved.source?.metadata ?? {},
    permissions: resolved.source?.permissions ?? { mode: 'read_only' },
  };
}
206
+
207
/**
 * Builds the single manifest item that represents a resolved source.
 *
 * - `hash` falls back to a SHA-256 of the text, and `revision` falls back to the hash.
 * - `metadata` starts from the resolved metadata; `source_ref`, `content_source`
 *   and `read_only` are then set on top of it.
 * - `permissions` start from `{ mode: 'read_only', allowed_purposes: [purpose] }`;
 *   entries in `resolved.permissions` are spread last and therefore override those defaults.
 * - Identity fields (`file_id`, `source_id`, `path`, `url`) are filled in per ref kind.
 *
 * @param sourceRef Source ref string as supplied by the caller.
 * @param parsed Parsed form of `sourceRef`.
 * @param resolved Text and descriptive fields gathered for the source.
 * @param purpose Purpose label recorded in `permissions.allowed_purposes`.
 * @param now Timestamp written to `updated_at`; defaults to the current time.
 *   Callers that pin a clock pass it here so the item agrees with the rest of the ingest.
 */
function manifestItemForSource(sourceRef: string, parsed: SourceRef, resolved: ResolvedText, purpose: string, now: Date = new Date()): ManifestObject {
  const hash = resolved.hash ?? sha256Text(resolved.text);
  const metadata = {
    ...resolved.metadata,
    source_ref: sourceRef,
    content_source: resolved.contentSource,
    read_only: true,
  };
  const item: ManifestObject = {
    source_ref: sourceRef,
    name: resolved.title ?? titleForRef(parsed),
    mime: resolved.mime ?? 'text/plain',
    size: resolved.size ?? resolved.text.length,
    hash,
    revision: resolved.revision ?? hash,
    status: 'active',
    updated_at: now.toISOString(),
    permissions: {
      mode: 'read_only',
      allowed_purposes: [purpose],
      ...resolved.permissions,
    },
    metadata,
    extracted_text_ref: resolved.extractedTextRef,
    extracted_text: resolved.text,
  };
  if (parsed.kind === 'open-files') {
    if (parsed.entity === 'file') item.file_id = parsed.id;
    if (parsed.entity === 'source') {
      item.source_id = parsed.id;
      item.path = parsed.path;
    }
  }
  if (parsed.kind === 'file') item.path = parsed.path;
  if (parsed.kind === 's3') item.path = parsed.key;
  if (parsed.kind === 'web') item.url = parsed.url;
  return item;
}

/**
 * Ingests one source ref into the knowledge database.
 *
 * `open-files` refs are resolved through the local catalog; every other kind
 * is read directly. The text is wrapped in a single manifest item and ingested
 * with `ingestOpenFilesManifestItems` under the audit action
 * `'source_ref_ingest_read'`.
 *
 * @param options Source ref, database path, and optional purpose/config/policy/clock.
 *   `purpose` defaults to `'knowledge_index'`. `now`, when given, is used both
 *   for the ingest step and for the manifest item's `updated_at`.
 * @returns The manifest ingest summary extended with the source ref, content
 *   origin, the read-only flag, and the item hash.
 */
export async function ingestSourceRef(options: SourceIngestOptions): Promise<SourceIngestResult> {
  const purpose = options.purpose ?? 'knowledge_index';
  const parsed = parseSourceRef(options.sourceRef);
  const resolved = parsed.kind === 'open-files'
    ? await readOpenFilesSourceText(options)
    : await readDirectSourceText(parsed, options.config, options.safetyPolicy);
  // Pass the caller's clock through; an undefined `now` falls back to the current time.
  const item = manifestItemForSource(options.sourceRef, parsed, resolved, purpose, options.now);
  const result = await ingestOpenFilesManifestItems({
    dbPath: options.dbPath,
    items: [item],
    sourceLabel: options.sourceRef,
    readAction: 'source_ref_ingest_read',
    safetyPolicy: options.safetyPolicy,
    now: options.now,
  });
  return {
    ...result,
    source_ref: options.sourceRef,
    content_source: resolved.contentSource,
    read_only: true,
    hash: String(item.hash),
  };
}
package/src/source-ref.ts CHANGED
@@ -82,6 +82,18 @@ export function parseSourceRef(uri: string): SourceRef {
82
82
  throw new Error(`Unsupported source ref scheme: ${uri}`);
83
83
  }
84
84
 
85
/**
 * Maps a source ref to the URI form without a revision suffix.
 *
 * For an `open-files` file ref that carries a revision id, a trailing
 * `/revision/<id>` segment is removed. Every other ref is returned unchanged.
 *
 * @param uri Source ref URI.
 * @param parsed Parsed form of `uri`; parsed on demand when omitted.
 */
export function catalogSourceUriForRef(uri: string, parsed = parseSourceRef(uri)): string {
  const isFileRef = parsed.kind === 'open-files' && parsed.entity === 'file';
  if (!isFileRef || !parsed.revision_id) {
    return uri;
  }
  return uri.replace(/\/revision\/[^/]+$/, '');
}
91
+
92
/**
 * Extracts the revision id from a source ref.
 *
 * @param uri Source ref URI; it is parsed with `parseSourceRef`.
 * @returns The revision id of an `open-files` file ref, or `null` when the ref
 *   is of another kind or carries no revision id.
 */
export function revisionIdForSourceRef(uri: string): string | null {
  const ref = parseSourceRef(uri);
  if (ref.kind !== 'open-files' || ref.entity !== 'file') {
    return null;
  }
  return ref.revision_id ?? null;
}
96
+
85
97
  export function isSupportedSourceRef(uri: string): boolean {
86
98
  try {
87
99
  parseSourceRef(uri);