@hasna/knowledge 0.2.8 → 0.2.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,268 @@
1
+ import { createHash } from 'node:crypto';
2
+ import { existsSync, readFileSync } from 'node:fs';
3
+ import { basename } from 'node:path';
4
+ import { ingestOpenFilesManifestItems, type ManifestIngestResult, type ManifestObject } from './manifest-ingest';
5
+ import { parseSourceRef, type SourceRef } from './source-ref';
6
+ import { resolveOpenFilesSource } from './source-resolver';
7
+ import type { KnowledgeConfig } from './workspace';
8
+ import { assertS3ReadAllowed, assertWebSearchAllowed, type SafetyPolicy } from './safety';
9
+
10
/**
 * Input for {@link ingestSourceRef}.
 */
export interface SourceIngestOptions {
  /** Database path handed to the open-files resolver and to the manifest ingest step. */
  dbPath: string;
  /** Source reference to ingest; it is parsed with `parseSourceRef`. */
  sourceRef: string;
  /** Access purpose; callers in this module fall back to `'knowledge_index'` when omitted. */
  purpose?: string;
  /** Workspace configuration; consulted for S3 client settings when reading `s3` sources. */
  config?: KnowledgeConfig;
  /** When provided, S3 and web reads are checked against this policy before they run. */
  safetyPolicy?: SafetyPolicy;
  /** Clock override forwarded to the resolver and to the manifest ingest step. */
  now?: Date;
}
18
+
19
/**
 * Result of {@link ingestSourceRef}: the manifest ingest result plus details
 * about where the ingested text came from.
 */
export interface SourceIngestResult extends ManifestIngestResult {
  /** The source reference exactly as it was passed in the options. */
  source_ref: string;
  /** Which reader produced the text that was ingested. */
  content_source: 'catalog_chunks' | 'extracted_text_ref' | 'file' | 's3' | 'web';
  /** Always `true`; this module only reads from sources. */
  read_only: true;
  /** String form of the hash recorded on the ingested manifest item. */
  hash: string;
}
25
+
26
/**
 * Text obtained for a source together with the descriptive fields needed to
 * build a manifest item from it. File-local; not exported.
 */
interface ResolvedText {
  /** The text content that will be ingested. */
  text: string;
  /** Which reader produced `text`. */
  contentSource: SourceIngestResult['content_source'];
  /** Display title, or `null` when none is known. */
  title: string | null;
  /** MIME type, or `null` when none is known. */
  mime: string | null;
  /** Size of the content; readers in this file set it to `text.length`. */
  size: number | null;
  /** Content hash, or `null` to let the manifest builder compute one from `text`. */
  hash: string | null;
  /** Revision identifier, or `null` when the source has none. */
  revision: string | null;
  /** URI the text was extracted from, when it came through an extracted-text reference. */
  extractedTextRef: string | null;
  /** Free-form metadata copied into the manifest item. */
  metadata: Record<string, unknown>;
  /** Permission fields merged over the manifest item's default permissions. */
  permissions: Record<string, unknown>;
}
38
+
39
/**
 * Hashes a string with SHA-256.
 *
 * @param text - Text to hash.
 * @returns The lowercase hex digest prefixed with `sha256:`.
 */
function sha256Text(text: string): string {
  const hasher = createHash('sha256');
  hasher.update(text);
  const hex = hasher.digest('hex');
  return 'sha256:' + hex;
}
42
+
43
/** Plain-text replacements for the character references that `stripHtml` decodes. */
const HTML_ENTITY_TEXT: Record<string, string> = {
  '&nbsp;': ' ',
  '&amp;': '&',
  '&lt;': '<',
  '&gt;': '>',
  '&quot;': '"',
  '&#39;': "'",
};

/**
 * Reduces an HTML document to rough plain text.
 *
 * Removes `<script>` and `<style>` elements with their contents, replaces every
 * remaining tag with a space, decodes a small set of character references, and
 * collapses redundant whitespace. This is a regex-based text extractor, not an
 * HTML sanitizer; its output must not be treated as safe markup.
 *
 * @param html - Raw HTML.
 * @returns The trimmed text content.
 */
function stripHtml(html: string): string {
  return html
    .replace(/<script[\s\S]*?<\/script>/gi, ' ')
    .replace(/<style[\s\S]*?<\/style>/gi, ' ')
    .replace(/<[^>]+>/g, ' ')
    // Decode every reference in one pass. Decoding `&amp;` in a separate pass
    // before `&lt;`/`&gt;` would turn escaped text such as `&amp;lt;` into `<`
    // instead of the literal `&lt;` the author wrote.
    .replace(/&(?:nbsp|amp|lt|gt|quot|#39);/g, (entity) => HTML_ENTITY_TEXT[entity] ?? entity)
    // Drop whitespace around line breaks (this also removes blank lines).
    .replace(/\s+\n/g, '\n')
    .replace(/\n\s+/g, '\n')
    // Collapse runs of spaces and tabs left behind by removed tags.
    .replace(/[ \t]{2,}/g, ' ')
    .trim();
}
57
+
58
/**
 * Downloads an S3 object and returns its body as a string.
 *
 * The bucket is taken from the URI host and the key from the URL-decoded path.
 * When `config` describes S3 storage for the same bucket, its region, profile
 * and retry settings are applied to the client; otherwise the SDK defaults are used.
 *
 * @param uri - Object location in `s3://bucket/key` form.
 * @param config - Optional workspace configuration to source client settings from.
 * @param safetyPolicy - When provided, the read is checked with `assertS3ReadAllowed` first.
 * @returns The object body, or an empty string when the response has no body.
 * @throws Error when the URI has no bucket or no key.
 */
async function readS3Text(uri: string, config?: KnowledgeConfig, safetyPolicy?: SafetyPolicy): Promise<string> {
  const location = new URL(uri);
  const bucket = location.hostname;
  const key = decodeURIComponent(location.pathname.replace(/^\/+/, ''));
  if (!bucket || !key) {
    throw new Error(`Invalid S3 source URI: ${uri}`);
  }
  if (safetyPolicy) {
    assertS3ReadAllowed(uri, safetyPolicy);
  }

  // The AWS SDK is loaded lazily so it is only required when an S3 read actually happens.
  const [s3Module, credentialsModule] = await Promise.all([
    import('@aws-sdk/client-s3'),
    import('@aws-sdk/credential-providers'),
  ]);
  const { S3Client, GetObjectCommand } = s3Module;
  const { fromIni } = credentialsModule;

  // Only reuse the configured settings when they describe this very bucket.
  const settings =
    config?.storage.type === 's3' && config.storage.s3?.bucket === bucket ? config.storage.s3 : undefined;
  const credentials = settings?.profile ? fromIni({ profile: settings.profile }) : undefined;
  const client = new S3Client({
    region: settings?.region,
    credentials,
    maxAttempts: settings?.max_attempts,
  });

  const command = new GetObjectCommand({ Bucket: bucket, Key: key });
  const { Body: body } = await client.send(command);
  return body ? await body.transformToString() : '';
}
78
+
79
/**
 * Fetches a URL and returns its body as text.
 *
 * Bodies whose `content-type` header mentions `html` are passed through
 * `stripHtml`; anything else is returned unchanged.
 *
 * @param uri - URL to fetch.
 * @param safetyPolicy - When provided, checked with `assertWebSearchAllowed` before fetching.
 * @returns The text and the raw `content-type` header value (`null` when absent).
 * @throws Error when the response status is not OK.
 */
async function readWebText(uri: string, safetyPolicy?: SafetyPolicy): Promise<{ text: string; mime: string | null }> {
  if (safetyPolicy) {
    assertWebSearchAllowed(safetyPolicy);
  }

  const headers = {
    accept: 'text/markdown,text/plain,text/html,application/json;q=0.8,*/*;q=0.5',
    'user-agent': '@hasna/knowledge source-ingest',
  };
  const response = await fetch(uri, { headers });
  if (!response.ok) {
    throw new Error(`Web source read failed ${response.status}: ${uri}`);
  }

  const mime = response.headers.get('content-type');
  const body = await response.text();
  const looksLikeHtml = mime?.includes('html') === true;
  const text = looksLikeHtml ? stripHtml(body) : body;
  return { text, mime };
}
92
+
93
/**
 * Derives a display title from a parsed source reference.
 *
 * @param parsed - Parsed source reference.
 * @returns The last path segment for `file`, `s3` and `web` references (a web
 *   URL with an empty last segment yields the full URL); for any other kind,
 *   the last segment of its path when it has one, otherwise its id.
 */
function titleForRef(parsed: SourceRef): string | null {
  switch (parsed.kind) {
    case 'file':
      return basename(parsed.path);
    case 's3':
      return basename(parsed.key);
    case 'web': {
      const lastSegment = basename(new URL(parsed.url).pathname);
      return lastSegment || parsed.url;
    }
    default:
      return parsed.path ? basename(parsed.path) : parsed.id;
  }
}
99
+
100
/**
 * Reads text straight from a `file`, `s3` or `web` source reference.
 *
 * @param parsed - Parsed source reference.
 * @param config - Optional workspace configuration, forwarded to the S3 reader.
 * @param safetyPolicy - Optional policy, forwarded to the S3 and web readers.
 * @returns The text plus descriptive fields; `size` is the text length and
 *   `hash` is computed from the text.
 * @throws Error when a file source does not exist, or when the reference kind
 *   cannot be read directly.
 */
async function readDirectSourceText(parsed: SourceRef, config?: KnowledgeConfig, safetyPolicy?: SafetyPolicy): Promise<ResolvedText> {
  // Every direct read produces the same shape; only these four inputs vary.
  const describe = (
    text: string,
    contentSource: ResolvedText['contentSource'],
    mime: string | null,
    metadata: Record<string, unknown>,
  ): ResolvedText => ({
    text,
    contentSource,
    title: titleForRef(parsed),
    mime,
    size: text.length,
    hash: sha256Text(text),
    revision: null,
    extractedTextRef: null,
    metadata,
    permissions: { mode: 'read_only' },
  });

  switch (parsed.kind) {
    case 'file': {
      if (!existsSync(parsed.path)) {
        throw new Error(`Source file not found: ${parsed.path}`);
      }
      const contents = readFileSync(parsed.path, 'utf8');
      return describe(contents, 'file', 'text/plain', { path: parsed.path });
    }
    case 's3': {
      const contents = await readS3Text(parsed.uri, config, safetyPolicy);
      return describe(contents, 's3', 'text/plain', { bucket: parsed.bucket, key: parsed.key });
    }
    case 'web': {
      const page = await readWebText(parsed.url, safetyPolicy);
      return describe(page.text, 'web', page.mime, { url: parsed.url });
    }
    default:
      throw new Error(`Direct source reading is not available for ${parsed.uri}`);
  }
}
152
+
153
/**
 * Reads the text behind an extracted-text reference URI.
 *
 * @param uri - Reference URI; `open-files://` URIs are rejected.
 * @param config - Optional workspace configuration, forwarded to the direct reader.
 * @param safetyPolicy - Optional policy, forwarded to the direct reader.
 * @returns The text, always tagged with content source `'extracted_text_ref'`.
 * @throws Error when the URI uses the `open-files://` scheme.
 */
async function readTextRef(uri: string, config?: KnowledgeConfig, safetyPolicy?: SafetyPolicy): Promise<{ text: string; contentSource: SourceIngestResult['content_source'] }> {
  const isOpenFilesRef = uri.startsWith('open-files://');
  if (isOpenFilesRef) {
    throw new Error('Open-files extracted text refs require an open-files resolver API. Ingest an open-files manifest with extracted_text or an extracted_text_ref using file://, s3://, or https://.');
  }

  const { text } = await readDirectSourceText(parseSourceRef(uri), config, safetyPolicy);
  return { text, contentSource: 'extracted_text_ref' };
}
161
+
162
/**
 * Obtains text for an `open-files` source reference through the local catalog.
 *
 * When the catalog revision points at an extracted-text URI and reports no
 * available text, the text is read from that URI. Otherwise the returned
 * catalog chunks are joined with blank lines.
 *
 * @param options - Ingest options; `purpose` defaults to `'knowledge_index'`.
 * @returns The text plus descriptive fields taken from the catalog entry.
 * @throws Error when the source is not in the catalog, or when it has neither
 *   a usable extracted-text reference nor any chunks.
 */
async function readOpenFilesSourceText(options: SourceIngestOptions): Promise<ResolvedText> {
  const { dbPath, sourceRef, config, safetyPolicy, now } = options;
  const resolved = await resolveOpenFilesSource({
    dbPath,
    sourceRef,
    purpose: options.purpose ?? 'knowledge_index',
    limit: 100,
    safetyPolicy,
    now,
  });
  if (!resolved.resolved) {
    throw new Error('Open-files source is not in the local knowledge catalog. Ingest an open-files manifest first or use the open-files resolver API.');
  }

  // Fields that are identical no matter where the text ends up coming from.
  const title = resolved.source?.title ?? null;
  const mime = resolved.content.mime;
  const metadata: Record<string, unknown> = resolved.source?.metadata ?? {};
  const permissions: Record<string, unknown> = resolved.source?.permissions ?? { mode: 'read_only' };
  const revision = resolved.revision;
  const textUri = revision?.extracted_text_uri;

  if (revision && textUri && !resolved.content.text_available) {
    const { text, contentSource } = await readTextRef(textUri, config, safetyPolicy);
    return {
      text,
      contentSource,
      title,
      mime,
      size: text.length,
      hash: revision.hash ?? sha256Text(text),
      revision: revision.revision,
      extractedTextRef: textUri,
      metadata,
      permissions,
    };
  }

  if (resolved.chunks.length === 0) {
    throw new Error('Open-files source has no extracted text chunks yet. Ingest an open-files manifest with extracted_text or extracted_text_ref first.');
  }

  const text = resolved.chunks.map((chunk) => chunk.text).join('\n\n');
  return {
    text,
    contentSource: 'catalog_chunks',
    title,
    mime,
    size: text.length,
    hash: revision?.hash ?? sha256Text(text),
    revision: revision?.revision ?? null,
    extractedTextRef: revision?.extracted_text_uri ?? null,
    metadata,
    permissions,
  };
}
206
+
207
/**
 * Builds the manifest item that represents one resolved source.
 *
 * @param sourceRef - The source reference as supplied by the caller.
 * @param parsed - The parsed form of `sourceRef`; decides which locator fields are set.
 * @param resolved - The text and descriptive fields obtained for the source.
 * @param purpose - Purpose recorded as the default `allowed_purposes` entry.
 * @param now - Timestamp recorded as `updated_at`; defaults to the current time.
 * @returns A manifest item carrying the text inline as `extracted_text`.
 */
function manifestItemForSource(
  sourceRef: string,
  parsed: SourceRef,
  resolved: ResolvedText,
  purpose: string,
  now: Date = new Date(),
): ManifestObject {
  const hash = resolved.hash ?? sha256Text(resolved.text);
  const metadata = {
    ...resolved.metadata,
    source_ref: sourceRef,
    content_source: resolved.contentSource,
    read_only: true,
  };
  const item: ManifestObject = {
    source_ref: sourceRef,
    name: resolved.title ?? titleForRef(parsed),
    mime: resolved.mime ?? 'text/plain',
    size: resolved.size ?? resolved.text.length,
    hash,
    // Sources without their own revision are versioned by content hash.
    revision: resolved.revision ?? hash,
    status: 'active',
    // Use the injected clock so the timestamp agrees with the rest of the ingest run.
    updated_at: now.toISOString(),
    permissions: {
      mode: 'read_only',
      allowed_purposes: [purpose],
      // Spread last: permissions supplied by the source override the defaults above.
      ...resolved.permissions,
    },
    metadata,
    extracted_text_ref: resolved.extractedTextRef,
    extracted_text: resolved.text,
  };
  if (parsed.kind === 'open-files') {
    if (parsed.entity === 'file') item.file_id = parsed.id;
    if (parsed.entity === 'source') {
      item.source_id = parsed.id;
      item.path = parsed.path;
    }
  }
  if (parsed.kind === 'file') item.path = parsed.path;
  if (parsed.kind === 's3') item.path = parsed.key;
  if (parsed.kind === 'web') item.url = parsed.url;
  return item;
}

/**
 * Reads the text behind a source reference and ingests it as a single manifest item.
 *
 * `open-files` references are resolved through the local catalog; every other
 * kind is read directly from its file, S3 object or URL.
 *
 * @param options - What to ingest and how; `purpose` defaults to `'knowledge_index'`
 *   and `now`, when given, is used for the manifest item's `updated_at`.
 * @returns The manifest ingest result extended with the source reference, the
 *   content source that produced the text, and the item hash.
 * @throws Error when the source cannot be resolved or read.
 */
export async function ingestSourceRef(options: SourceIngestOptions): Promise<SourceIngestResult> {
  const purpose = options.purpose ?? 'knowledge_index';
  const parsed = parseSourceRef(options.sourceRef);
  const resolved = parsed.kind === 'open-files'
    ? await readOpenFilesSourceText(options)
    : await readDirectSourceText(parsed, options.config, options.safetyPolicy);
  // Passing `undefined` for `now` falls back to the builder's default of the current time.
  const item = manifestItemForSource(options.sourceRef, parsed, resolved, purpose, options.now);
  const result = await ingestOpenFilesManifestItems({
    dbPath: options.dbPath,
    items: [item],
    sourceLabel: options.sourceRef,
    readAction: 'source_ref_ingest_read',
    safetyPolicy: options.safetyPolicy,
    now: options.now,
  });
  return {
    ...result,
    source_ref: options.sourceRef,
    content_source: resolved.contentSource,
    read_only: true,
    hash: String(item.hash),
  };
}