@de-otio/chaoskb-client 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51):
  1. package/dist/cli/index.d.ts.map +1 -1
  2. package/dist/cli/index.js +12 -1
  3. package/dist/cli/index.js.map +1 -1
  4. package/dist/cli/mcp-server.d.ts +16 -1
  5. package/dist/cli/mcp-server.d.ts.map +1 -1
  6. package/dist/cli/mcp-server.js +29 -12
  7. package/dist/cli/mcp-server.js.map +1 -1
  8. package/dist/cli/tools/kb-ingest.d.ts +3 -1
  9. package/dist/cli/tools/kb-ingest.d.ts.map +1 -1
  10. package/dist/cli/tools/kb-ingest.js +45 -5
  11. package/dist/cli/tools/kb-ingest.js.map +1 -1
  12. package/dist/cli/tools/kb-query.d.ts +2 -0
  13. package/dist/cli/tools/kb-query.d.ts.map +1 -1
  14. package/dist/cli/tools/kb-query.js +11 -2
  15. package/dist/cli/tools/kb-query.js.map +1 -1
  16. package/dist/pipeline/content-pipeline.d.ts +2 -0
  17. package/dist/pipeline/content-pipeline.d.ts.map +1 -1
  18. package/dist/pipeline/content-pipeline.js +27 -1
  19. package/dist/pipeline/content-pipeline.js.map +1 -1
  20. package/dist/pipeline/extract.d.ts.map +1 -1
  21. package/dist/pipeline/extract.js +129 -4
  22. package/dist/pipeline/extract.js.map +1 -1
  23. package/dist/pipeline/fetch.d.ts +11 -0
  24. package/dist/pipeline/fetch.d.ts.map +1 -1
  25. package/dist/pipeline/fetch.js +153 -1
  26. package/dist/pipeline/fetch.js.map +1 -1
  27. package/dist/pipeline/file-extract.d.ts +16 -0
  28. package/dist/pipeline/file-extract.d.ts.map +1 -0
  29. package/dist/pipeline/file-extract.js +249 -0
  30. package/dist/pipeline/file-extract.js.map +1 -0
  31. package/dist/pipeline/index.d.ts +2 -0
  32. package/dist/pipeline/index.d.ts.map +1 -1
  33. package/dist/pipeline/index.js +2 -0
  34. package/dist/pipeline/index.js.map +1 -1
  35. package/dist/pipeline/types.d.ts +6 -0
  36. package/dist/pipeline/types.d.ts.map +1 -1
  37. package/dist/pipeline/validate.d.ts +36 -0
  38. package/dist/pipeline/validate.d.ts.map +1 -0
  39. package/dist/pipeline/validate.js +632 -0
  40. package/dist/pipeline/validate.js.map +1 -0
  41. package/dist/storage/source-repo.d.ts +2 -0
  42. package/dist/storage/source-repo.d.ts.map +1 -1
  43. package/dist/storage/source-repo.js +9 -2
  44. package/dist/storage/source-repo.js.map +1 -1
  45. package/dist/storage/types.d.ts +1 -0
  46. package/dist/storage/types.d.ts.map +1 -1
  47. package/dist/sync/ssh-signer.d.ts +4 -0
  48. package/dist/sync/ssh-signer.d.ts.map +1 -1
  49. package/dist/sync/ssh-signer.js +45 -2
  50. package/dist/sync/ssh-signer.js.map +1 -1
  51. package/package.json +5 -2
@@ -0,0 +1,249 @@
1
+ /**
2
+ * Local file content extraction.
3
+ *
4
+ * Dispatches to format-specific extractors based on file extension.
5
+ * Supports PDF, DOCX, PPTX, HTML, TXT, and Markdown.
6
+ */
7
+ import * as fs from 'node:fs/promises';
8
+ import * as path from 'node:path';
9
// ===== Supported formats ===================================================
/** Lower-cased file extension (leading dot included) mapped to its extractor format id. */
const EXTENSION_MAP = Object.fromEntries([
    ['.pdf', 'pdf'],
    ['.docx', 'docx'],
    ['.pptx', 'pptx'],
    ['.html', 'html'],
    ['.htm', 'html'],
    ['.txt', 'txt'],
    ['.md', 'md'],
    ['.markdown', 'md'],
]);
/** Comma-separated list of every accepted extension, in declaration order. */
const SUPPORTED_EXTENSIONS = Object.keys(EXTENSION_MAP).join(', ');
// ===== Limits ==============================================================
/** Maximum file size in bytes (50 MB). */
const MAX_FILE_SIZE = 50 * 2 ** 20;
/** Maximum total uncompressed size for ZIP-based formats (100 MB). */
const MAX_UNCOMPRESSED_SIZE = 100 * 2 ** 20;
26
+ // ===== Public API ==========================================================
27
/**
 * Extract content from a local file.
 *
 * The path is resolved to an absolute path and symlinks are followed. The
 * extractor is chosen from the extension of the *resolved* file, so a link
 * such as `notes.txt -> report.pdf` is parsed as a PDF rather than being
 * read as plain text.
 *
 * @param filePath - Path to the file (resolved to absolute).
 * @returns Extracted content with title, text, and the resolved absolute path as `url`.
 * @throws On missing/unreadable file, unsupported format (of the given path or
 *   of the symlink target), non-regular file, oversized file, or empty content.
 */
export async function extractFromFile(filePath) {
    // Check extension first for a clear error on unsupported formats
    const ext = path.extname(filePath).toLowerCase();
    if (!EXTENSION_MAP[ext]) {
        throw new Error(`Unsupported file format "${ext}". Supported formats: ${SUPPORTED_EXTENSIONS}`);
    }
    const absPath = path.resolve(filePath);
    // Only the basename is reported so error messages do not leak full paths.
    const notReadable = () => new Error(`File not found or not readable: ${path.basename(filePath)}`);
    // Resolve symlinks to get the real path, then re-check the extension
    let realPath;
    try {
        realPath = await fs.realpath(absPath);
    }
    catch {
        throw notReadable();
    }
    const realExt = path.extname(realPath).toLowerCase();
    // Dispatch on the target's format: the link name and the target may disagree.
    const format = EXTENSION_MAP[realExt];
    if (!format) {
        throw new Error(`Symlink target has unsupported extension "${realExt}". Supported formats: ${SUPPORTED_EXTENSIONS}`);
    }
    // Check file is regular and within size limit
    let stat;
    try {
        stat = await fs.stat(realPath);
    }
    catch {
        // The file can disappear or become unreadable between realpath() and stat().
        throw notReadable();
    }
    if (!stat.isFile()) {
        throw new Error('Path is not a regular file.');
    }
    if (stat.size > MAX_FILE_SIZE) {
        throw new Error(`File is too large (${(stat.size / 1024 / 1024).toFixed(1)} MB). Maximum supported size is ${MAX_FILE_SIZE / 1024 / 1024} MB.`);
    }
    switch (format) {
        case 'pdf':
            return extractPdf(realPath);
        case 'docx':
            return extractDocx(realPath);
        case 'pptx':
            return extractPptx(realPath);
        case 'html':
            return extractHtmlFile(realPath);
        case 'txt':
        case 'md':
            return extractPlainText(realPath);
        default:
            throw new Error(`Unsupported format: ${format}`);
    }
}
78
+ // ===== Format extractors ===================================================
79
/**
 * Extract text and a title from a PDF file using `pdf-parse`.
 *
 * @param filePath - Path of the PDF to read.
 * @returns Title (PDF metadata `Title`, else the filename), cleaned text,
 *   the path as `url`, and the UTF-8 byte length of the text.
 * @throws If the PDF yields no text after cleaning (e.g. a scan without OCR),
 *   or if reading/parsing fails.
 */
async function extractPdf(filePath) {
    const { PDFParse } = await import('pdf-parse');
    const buffer = await fs.readFile(filePath);
    // View over the Buffer's bytes (no copy); offset/length are needed because
    // a Buffer may be a slice of a larger pooled ArrayBuffer.
    const uint8 = new Uint8Array(buffer.buffer, buffer.byteOffset, buffer.byteLength);
    const parser = new PDFParse({ data: uint8, isEvalSupported: false });
    try {
        const textResult = await parser.getText();
        // Strip the page footer markers ("-- N of M --")
        const rawText = textResult.text.replace(/\n-- \d+ of \d+ --\n/g, '\n');
        const content = cleanText(rawText);
        if (content.length === 0) {
            throw new Error('No extractable text in PDF. The file may be a scanned document without OCR.');
        }
        const infoResult = await parser.getInfo();
        // Ignore non-string or whitespace-only metadata titles.
        const rawTitle = infoResult?.info?.Title;
        const metaTitle = typeof rawTitle === 'string' ? rawTitle.trim() : '';
        return {
            title: metaTitle || filenameTitle(filePath),
            content,
            url: filePath,
            byteLength: Buffer.byteLength(content, 'utf-8'),
        };
    }
    finally {
        // Release parser resources on every path, including the error paths above.
        await parser.destroy();
    }
}
101
/**
 * Extract content from a DOCX file.
 *
 * The document is converted to HTML with `mammoth` and run through the
 * shared `extractContent` step. If that step throws, the HTML tags are
 * stripped and the remaining text is returned instead.
 *
 * @param filePath - Path of the DOCX to read.
 * @returns Extracted content with the path as `url`; the title falls back to the filename.
 * @throws If the conversion yields no HTML, or the fallback text is empty.
 */
async function extractDocx(filePath) {
    const docxLib = await import('mammoth');
    const { value: html } = await docxLib.convertToHtml({ path: filePath });
    if (!html || html.trim().length === 0) {
        throw new Error('No extractable content in DOCX file.');
    }
    // Reuse the HTML extraction path for the converted markup.
    const { extractContent: extractArticle } = await import('./extract.js');
    try {
        const article = extractArticle(html, filePath);
        const title = article.title || filenameTitle(filePath);
        return { ...article, url: filePath, title };
    }
    catch {
        // Extraction failed (e.g. very simple doc): fall back to tag-stripped text.
        const stripped = html.replace(/<[^>]+>/g, ' ');
        const plain = cleanText(stripped);
        if (plain.length === 0) {
            throw new Error('No extractable content in DOCX file.');
        }
        return {
            title: filenameTitle(filePath),
            content: plain,
            url: filePath,
            byteLength: Buffer.byteLength(plain, 'utf-8'),
        };
    }
}
131
/**
 * Extract text from a PPTX file by reading its slide XML parts.
 *
 * Slides are processed in numeric order. The running total of decompressed
 * bytes is checked after each slide against `MAX_UNCOMPRESSED_SIZE`.
 *
 * @param filePath - Path of the PPTX to read.
 * @returns Title (first line of the first slide that has text, else the
 *   filename), cleaned text, the path as `url`, and the UTF-8 byte length.
 * @throws If the archive has no slides, exceeds the decompression limit, or has no text.
 */
async function extractPptx(filePath) {
    const { default: JSZip } = await import('jszip');
    const archive = await JSZip.loadAsync(await fs.readFile(filePath));
    // Numeric slide index taken from the entry name ("slide12.xml" -> 12).
    const slideIndex = (entry) => parseInt(entry.match(/slide(\d+)/)?.[1] ?? '0');
    const slideEntries = Object.keys(archive.files)
        .filter((entry) => /^ppt\/slides\/slide\d+\.xml$/.test(entry))
        .sort((left, right) => slideIndex(left) - slideIndex(right));
    if (slideEntries.length === 0) {
        throw new Error('No slides found in PPTX file.');
    }
    // Decompress one slide at a time so the byte budget is enforced as we go.
    const perSlideText = [];
    let deckTitle = '';
    let inflatedBytes = 0;
    for (const entry of slideEntries) {
        const raw = await archive.file(entry).async('uint8array');
        inflatedBytes += raw.byteLength;
        if (inflatedBytes > MAX_UNCOMPRESSED_SIZE) {
            throw new Error(`PPTX uncompressed content exceeds ${MAX_UNCOMPRESSED_SIZE / 1024 / 1024} MB limit.`);
        }
        const slideText = extractSlideText(new TextDecoder().decode(raw));
        if (!slideText) {
            continue;
        }
        perSlideText.push(slideText);
        if (!deckTitle) {
            deckTitle = slideText.split('\n')[0].trim();
        }
    }
    const content = cleanText(perSlideText.join('\n\n'));
    if (content.length === 0) {
        throw new Error('No extractable text in PPTX file.');
    }
    const title = deckTitle || filenameTitle(filePath);
    const byteLength = Buffer.byteLength(content, 'utf-8');
    return { title, content, url: filePath, byteLength };
}
176
/**
 * Extract content from a local HTML file via the shared `extractContent` step.
 *
 * @param filePath - Path of the HTML file, read as UTF-8.
 * @returns The extracted content with the path as `url`; the title falls
 *   back to the filename when extraction yields none.
 */
async function extractHtmlFile(filePath) {
    const markup = await fs.readFile(filePath, 'utf-8');
    const { extractContent: extractArticle } = await import('./extract.js');
    const article = extractArticle(markup, filePath);
    const title = article.title || filenameTitle(filePath);
    return { ...article, url: filePath, title };
}
186
/**
 * Extract content from a plain-text or Markdown file.
 *
 * @param filePath - Path of the file, read as UTF-8.
 * @returns Title, cleaned text, the path as `url`, and the UTF-8 byte length.
 *   The title is the first Markdown `# ` heading; otherwise the first
 *   non-blank line if it is shorter than 120 characters; otherwise the filename.
 * @throws If the file is empty after cleaning.
 */
async function extractPlainText(filePath) {
    const raw = await fs.readFile(filePath, 'utf-8');
    const content = cleanText(raw);
    if (content.length === 0) {
        throw new Error('Empty file.');
    }
    // Title: first markdown heading, or first short line, or filename
    let title = '';
    // Only horizontal whitespace may separate "#" from the heading text:
    // with `\s+` a bare "#" line would take the *next* line as the title.
    const headingMatch = content.match(/^#[ \t]+(.+)$/m);
    if (headingMatch) {
        title = headingMatch[1].trim();
    }
    else {
        const firstLine = content.split('\n').find((l) => l.trim().length > 0);
        if (firstLine && firstLine.length < 120) {
            title = firstLine.trim();
        }
    }
    return {
        title: title || filenameTitle(filePath),
        content,
        url: filePath,
        byteLength: Buffer.byteLength(content, 'utf-8'),
    };
}
211
+ // ===== Utilities ===========================================================
212
/** The five predefined XML entities and the characters they stand for. */
const XML_NAMED_ENTITIES = { lt: '<', gt: '>', amp: '&', quot: '"', apos: "'" };
/**
 * Decode predefined and numeric XML character references in a single pass.
 * A single pass ensures `&amp;lt;` becomes `&lt;` and is not decoded twice.
 * References to invalid code points are left untouched.
 */
function decodeXmlEntities(text) {
    return text.replace(/&(#x[0-9a-fA-F]+|#\d+|lt|gt|amp|quot|apos);/g, (whole, body) => {
        if (body[0] !== '#') {
            return XML_NAMED_ENTITIES[body];
        }
        const isHex = body[1] === 'x' || body[1] === 'X';
        const codePoint = isHex ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10);
        // String.fromCodePoint throws above U+10FFFF; keep such references verbatim.
        return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
    });
}
/**
 * Extract text from a PPTX slide XML string.
 * Groups by `<a:p>` paragraphs, collects `<a:t>` text runs within each,
 * and decodes XML entities so the result is plain text (`R&amp;D` -> `R&D`).
 *
 * @param xml - Raw XML of one slide part.
 * @returns One line per paragraph that contains text, joined with `\n`;
 *   an empty string when the slide has no text runs.
 */
function extractSlideText(xml) {
    const paragraphs = [];
    const pRegex = /<a:p\b[^>]*>([\s\S]*?)<\/a:p>/g;
    const tRegex = /<a:t>([\s\S]*?)<\/a:t>/g;
    let pMatch;
    while ((pMatch = pRegex.exec(xml)) !== null) {
        const pContent = pMatch[1];
        const texts = [];
        let tMatch;
        // The run regex is reused across paragraphs, so restart it each time.
        tRegex.lastIndex = 0;
        while ((tMatch = tRegex.exec(pContent)) !== null) {
            texts.push(tMatch[1]);
        }
        if (texts.length > 0) {
            // Runs are adjacent fragments of one paragraph: join without a separator.
            paragraphs.push(decodeXmlEntities(texts.join('')));
        }
    }
    return paragraphs.join('\n');
}
235
/** Derive a display title from a path: its final segment with the extension removed. */
function filenameTitle(filePath) {
    const extension = path.extname(filePath);
    return path.basename(filePath, extension);
}
239
/**
 * Clean extracted text: normalize line endings, strip steganographic chars,
 * collapse whitespace, trim lines.
 *
 * @param text - Raw extracted text.
 * @returns Text using `\n` line endings only, with runs of spaces/tabs
 *   collapsed to one space, no leading/trailing spaces on any line, at most
 *   one blank line in a row, and no leading/trailing whitespace overall.
 */
function cleanText(text) {
    return text
        .replace(/\r\n?/g, '\n') // CRLF / lone CR → newline, so the steps below see one kind of line break
        .replace(/[\u2028\u2029]/g, '\n') // Unicode line/paragraph separators → newline
        .replace(/[\u200B-\u200F\u202A-\u202F\u2060-\u206F\uFEFF]/g, '') // strip zero-width / bidi / invisible chars
        .replace(/[\t ]+/g, ' ')
        .replace(/^ +| +$/gm, '') // trim each line first, so whitespace-only lines become truly blank
        .replace(/\n{3,}/g, '\n\n') // then collapse runs of blank lines
        .trim();
}
249
+ //# sourceMappingURL=file-extract.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"file-extract.js","sourceRoot":"","sources":["../../pipeline/file-extract.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,MAAM,kBAAkB,CAAC;AACvC,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAGlC,8EAA8E;AAE9E,MAAM,aAAa,GAA2B;IAC5C,MAAM,EAAE,KAAK;IACb,OAAO,EAAE,MAAM;IACf,OAAO,EAAE,MAAM;IACf,OAAO,EAAE,MAAM;IACf,MAAM,EAAE,MAAM;IACd,MAAM,EAAE,KAAK;IACb,KAAK,EAAE,IAAI;IACX,WAAW,EAAE,IAAI;CAClB,CAAC;AAEF,MAAM,oBAAoB,GAAG,MAAM,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAEnE,8EAA8E;AAE9E,0CAA0C;AAC1C,MAAM,aAAa,GAAG,EAAE,GAAG,IAAI,GAAG,IAAI,CAAC;AAEvC,sEAAsE;AACtE,MAAM,qBAAqB,GAAG,GAAG,GAAG,IAAI,GAAG,IAAI,CAAC;AAEhD,8EAA8E;AAE9E;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CAAC,QAAgB;IACpD,iEAAiE;IACjE,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAC;IACjD,MAAM,MAAM,GAAG,aAAa,CAAC,GAAG,CAAC,CAAC;IAClC,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,IAAI,KAAK,CACb,4BAA4B,GAAG,yBAAyB,oBAAoB,EAAE,CAC/E,CAAC;IACJ,CAAC;IAED,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;IAEvC,qEAAqE;IACrE,IAAI,QAAgB,CAAC;IACrB,IAAI,CAAC;QACH,QAAQ,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;IACxC,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,KAAK,CAAC,mCAAmC,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC;IAChF,CAAC;IAED,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAC;IACrD,IAAI,CAAC,aAAa,CAAC,OAAO,CAAC,EAAE,CAAC;QAC5B,MAAM,IAAI,KAAK,CACb,6CAA6C,OAAO,yBAAyB,oBAAoB,EAAE,CACpG,CAAC;IACJ,CAAC;IAED,8CAA8C;IAC9C,MAAM,IAAI,GAAG,MAAM,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IACrC,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,EAAE,CAAC;QACnB,MAAM,IAAI,KAAK,CAAC,6BAA6B,CAAC,CAAC;IACjD,CAAC;IACD,IAAI,IAAI,CAAC,IAAI,GAAG,aAAa,EAAE,CAAC;QAC9B,MAAM,IAAI,KAAK,CACb,sBAAsB,CAAC,IAAI,CAAC,IAAI,GAAG,IAAI,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,mCAAmC,aAAa,GAAG,IAAI,GAAG,IAAI,MAAM,CAC/H,CAAC;IACJ,CAAC;IAED,QAAQ,MAAM,EAAE,CAAC;QACf,KAAK,KAAK;YACR,OAAO,UAAU,CAAC,QAAQ,CAAC,CAAC;QAC9B,KAAK,MAAM;YACT,OAAO,WAAW,CAAC,QAAQ,CAAC,CAAC;QAC/B,KAAK,MAAM;YACT,OAAO,WAAW,CAAC,QAAQ,CAAC,CAAC;QAC/B,KAAK,MAAM;YACT,OAA
O,eAAe,CAAC,QAAQ,CAAC,CAAC;QACnC,KAAK,KAAK,CAAC;QACX,KAAK,IAAI;YACP,OAAO,gBAAgB,CAAC,QAAQ,CAAC,CAAC;QACpC;YACE,MAAM,IAAI,KAAK,CAAC,uBAAuB,MAAM,EAAE,CAAC,CAAC;IACrD,CAAC;AACH,CAAC;AAED,8EAA8E;AAE9E,KAAK,UAAU,UAAU,CAAC,QAAgB;IACxC,MAAM,EAAE,QAAQ,EAAE,GAAG,MAAM,MAAM,CAAC,WAAW,CAAC,CAAC;IAC/C,MAAM,MAAM,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAC3C,MAAM,KAAK,GAAG,IAAI,UAAU,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,UAAU,EAAE,MAAM,CAAC,UAAU,CAAC,CAAC;IAElF,MAAM,MAAM,GAAG,IAAI,QAAQ,CAAC,EAAE,IAAI,EAAE,KAAK,EAAE,eAAe,EAAE,KAAK,EAAE,CAAC,CAAC;IACrE,MAAM,UAAU,GAAG,MAAM,MAAM,CAAC,OAAO,EAAE,CAAC;IAC1C,iDAAiD;IACjD,MAAM,OAAO,GAAG,UAAU,CAAC,IAAI,CAAC,OAAO,CAAC,uBAAuB,EAAE,IAAI,CAAC,CAAC;IACvE,MAAM,OAAO,GAAG,SAAS,CAAC,OAAO,CAAC,CAAC;IAEnC,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,MAAM,IAAI,KAAK,CACb,6EAA6E,CAC9E,CAAC;IACJ,CAAC;IAED,MAAM,UAAU,GAAG,MAAM,MAAM,CAAC,OAAO,EAAE,CAAC;IAC1C,MAAM,KAAK,GAAG,UAAU,EAAE,IAAI,EAAE,KAAK,IAAI,aAAa,CAAC,QAAQ,CAAC,CAAC;IACjE,MAAM,CAAC,OAAO,EAAE,CAAC;IAEjB,OAAO;QACL,KAAK;QACL,OAAO;QACP,GAAG,EAAE,QAAQ;QACb,UAAU,EAAE,MAAM,CAAC,UAAU,CAAC,OAAO,EAAE,OAAO,CAAC;KAChD,CAAC;AACJ,CAAC;AAED,KAAK,UAAU,WAAW,CAAC,QAAgB;IACzC,MAAM,OAAO,GAAG,MAAM,MAAM,CAAC,SAAS,CAAC,CAAC;IACxC,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,aAAa,CAAC,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC,CAAC;IAE/D,IAAI,CAAC,MAAM,CAAC,KAAK,IAAI,MAAM,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtD,MAAM,IAAI,KAAK,CAAC,sCAAsC,CAAC,CAAC;IAC1D,CAAC;IAED,8DAA8D;IAC9D,MAAM,EAAE,cAAc,EAAE,GAAG,MAAM,MAAM,CAAC,cAAc,CAAC,CAAC;IACxD,IAAI,CAAC;QACH,MAAM,SAAS,GAAG,cAAc,CAAC,MAAM,CAAC,KAAK,EAAE,QAAQ,CAAC,CAAC;QACzD,OAAO;YACL,GAAG,SAAS;YACZ,GAAG,EAAE,QAAQ;YACb,KAAK,EAAE,SAAS,CAAC,KAAK,IAAI,aAAa,CAAC,QAAQ,CAAC;SAClD,CAAC;IACJ,CAAC;IAAC,MAAM,CAAC;QACP,uEAAuE;QACvE,MAAM,IAAI,GAAG,SAAS,CAAC,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC,CAAC,CAAC;QAC9D,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACtB,MAAM,IAAI,KAAK,CAAC,sCAAsC,CAAC,CAAC;QAC1D,CAAC;QACD,OAAO;YACL,KAAK,EAAE,aAAa,CAAC,QAAQ,CAAC;YAC9B,OAAO,EAAE,IAAI;YACb,GAAG,EAAE,QAA
Q;YACb,UAAU,EAAE,MAAM,CAAC,UAAU,CAAC,IAAI,EAAE,OAAO,CAAC;SAC7C,CAAC;IACJ,CAAC;AACH,CAAC;AAED,KAAK,UAAU,WAAW,CAAC,QAAgB;IACzC,MAAM,KAAK,GAAG,CAAC,MAAM,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC;IAC9C,MAAM,MAAM,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAC3C,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;IAE1C,gDAAgD;IAChD,MAAM,UAAU,GAAG,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC;SACtC,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,8BAA8B,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;SAC3D,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QACb,MAAM,IAAI,GAAG,QAAQ,CAAC,CAAC,CAAC,KAAK,CAAC,YAAY,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,GAAG,CAAC,CAAC;QACzD,MAAM,IAAI,GAAG,QAAQ,CAAC,CAAC,CAAC,KAAK,CAAC,YAAY,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,GAAG,CAAC,CAAC;QACzD,OAAO,IAAI,GAAG,IAAI,CAAC;IACrB,CAAC,CAAC,CAAC;IAEL,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC5B,MAAM,IAAI,KAAK,CAAC,+BAA+B,CAAC,CAAC;IACnD,CAAC;IAED,oFAAoF;IACpF,MAAM,UAAU,GAAa,EAAE,CAAC;IAChC,IAAI,eAAe,GAAG,EAAE,CAAC;IACzB,IAAI,iBAAiB,GAAG,CAAC,CAAC;IAE1B,KAAK,MAAM,SAAS,IAAI,UAAU,EAAE,CAAC;QACnC,MAAM,KAAK,GAAG,MAAM,GAAG,CAAC,IAAI,CAAC,SAAS,CAAE,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC;QAC7D,iBAAiB,IAAI,KAAK,CAAC,UAAU,CAAC;QACtC,IAAI,iBAAiB,GAAG,qBAAqB,EAAE,CAAC;YAC9C,MAAM,IAAI,KAAK,CACb,qCAAqC,qBAAqB,GAAG,IAAI,GAAG,IAAI,YAAY,CACrF,CAAC;QACJ,CAAC;QACD,MAAM,GAAG,GAAG,IAAI,WAAW,EAAE,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;QAC5C,MAAM,IAAI,GAAG,gBAAgB,CAAC,GAAG,CAAC,CAAC;QACnC,IAAI,IAAI,EAAE,CAAC;YACT,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACtB,IAAI,CAAC,eAAe,EAAE,CAAC;gBACrB,eAAe,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;YAC/C,CAAC;QACH,CAAC;IACH,CAAC;IAED,MAAM,OAAO,GAAG,SAAS,CAAC,UAAU,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC;IACnD,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,MAAM,IAAI,KAAK,CAAC,mCAAmC,CAAC,CAAC;IACvD,CAAC;IAED,OAAO;QACL,KAAK,EAAE,eAAe,IAAI,aAAa,CAAC,QAAQ,CAAC;QACjD,OAAO;QACP,GAAG,EAAE,QAAQ;QACb,UAAU,EAAE,MAAM,CAAC,UAAU,CAAC,OAAO,EAAE,OAAO,CAAC;KAChD,CAAC;AACJ,CAAC;AAED,KAAK,UAAU,eAAe,CAAC,QAAgB;IAC7C,MAAM,IAAI,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,EAA
E,OAAO,CAAC,CAAC;IAClD,MAAM,EAAE,cAAc,EAAE,GAAG,MAAM,MAAM,CAAC,cAAc,CAAC,CAAC;IACxD,MAAM,SAAS,GAAG,cAAc,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC;IACjD,OAAO;QACL,GAAG,SAAS;QACZ,GAAG,EAAE,QAAQ;QACb,KAAK,EAAE,SAAS,CAAC,KAAK,IAAI,aAAa,CAAC,QAAQ,CAAC;KAClD,CAAC;AACJ,CAAC;AAED,KAAK,UAAU,gBAAgB,CAAC,QAAgB;IAC9C,MAAM,GAAG,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IACjD,MAAM,OAAO,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC;IAE/B,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,MAAM,IAAI,KAAK,CAAC,aAAa,CAAC,CAAC;IACjC,CAAC;IAED,kEAAkE;IAClE,IAAI,KAAK,GAAG,EAAE,CAAC;IACf,MAAM,YAAY,GAAG,OAAO,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;IAClD,IAAI,YAAY,EAAE,CAAC;QACjB,KAAK,GAAG,YAAY,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IACjC,CAAC;SAAM,CAAC;QACN,MAAM,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QACvE,IAAI,SAAS,IAAI,SAAS,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;YACxC,KAAK,GAAG,SAAS,CAAC,IAAI,EAAE,CAAC;QAC3B,CAAC;IACH,CAAC;IAED,OAAO;QACL,KAAK,EAAE,KAAK,IAAI,aAAa,CAAC,QAAQ,CAAC;QACvC,OAAO;QACP,GAAG,EAAE,QAAQ;QACb,UAAU,EAAE,MAAM,CAAC,UAAU,CAAC,OAAO,EAAE,OAAO,CAAC;KAChD,CAAC;AACJ,CAAC;AAED,8EAA8E;AAE9E;;;GAGG;AACH,SAAS,gBAAgB,CAAC,GAAW;IACnC,MAAM,UAAU,GAAa,EAAE,CAAC;IAChC,MAAM,MAAM,GAAG,gCAAgC,CAAC;IAChD,MAAM,MAAM,GAAG,yBAAyB,CAAC;IAEzC,IAAI,MAAM,CAAC;IACX,OAAO,CAAC,MAAM,GAAG,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QAC5C,MAAM,QAAQ,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;QAC3B,MAAM,KAAK,GAAa,EAAE,CAAC;QAC3B,IAAI,MAAM,CAAC;QACX,MAAM,CAAC,SAAS,GAAG,CAAC,CAAC;QACrB,OAAO,CAAC,MAAM,GAAG,MAAM,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YACjD,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;QACxB,CAAC;QACD,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACrB,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC;QAClC,CAAC;IACH,CAAC;IAED,OAAO,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC/B,CAAC;AAED,8EAA8E;AAC9E,SAAS,aAAa,CAAC,QAAgB;IACrC,OAAO,IAAI,CAAC,QAAQ,CAAC,QAAQ,EAAE,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,CAAC;AACzD,CAAC;AAED,yFAAyF;AACzF,SAAS,SA
AS,CAAC,IAAY;IAC7B,OAAO,IAAI;SACR,OAAO,CAAC,iBAAiB,EAAE,IAAI,CAAC,CAAgC,8CAA8C;SAC9G,OAAO,CAAC,kDAAkD,EAAE,EAAE,CAAC,CAAC,4CAA4C;SAC5G,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC;SACvB,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC;SAC1B,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC;SACxB,IAAI,EAAE,CAAC;AACZ,CAAC"}
@@ -7,4 +7,6 @@ export { Embedder } from './embedder.js';
7
7
  export { ModelManager } from './model-manager.js';
8
8
  export { ContentPipeline } from './content-pipeline.js';
9
9
  export { cosineSimilarity, searchEmbeddings } from './search.js';
10
+ export { validateContent, validateFileContent, type ContentIssue } from './validate.js';
11
+ export { extractFromFile } from './file-extract.js';
10
12
  //# sourceMappingURL=index.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../pipeline/index.ts"],"names":[],"mappings":"AAAA,cAAc,YAAY,CAAC;AAC3B,OAAO,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AACtC,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAC9C,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,EAAE,WAAW,EAAE,MAAM,gBAAgB,CAAC;AAC7C,OAAO,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAC;AACzC,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AACxD,OAAO,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../pipeline/index.ts"],"names":[],"mappings":"AAAA,cAAc,YAAY,CAAC;AAC3B,OAAO,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AACtC,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAC9C,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,EAAE,WAAW,EAAE,MAAM,gBAAgB,CAAC;AAC7C,OAAO,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAC;AACzC,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AACxD,OAAO,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AACjE,OAAO,EAAE,eAAe,EAAE,mBAAmB,EAAE,KAAK,YAAY,EAAE,MAAM,eAAe,CAAC;AACxF,OAAO,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC"}
@@ -7,4 +7,6 @@ export { Embedder } from './embedder.js';
7
7
  export { ModelManager } from './model-manager.js';
8
8
  export { ContentPipeline } from './content-pipeline.js';
9
9
  export { cosineSimilarity, searchEmbeddings } from './search.js';
10
+ export { validateContent, validateFileContent } from './validate.js';
11
+ export { extractFromFile } from './file-extract.js';
10
12
  //# sourceMappingURL=index.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.js","sourceRoot":"","sources":["../../pipeline/index.ts"],"names":[],"mappings":"AAAA,cAAc,YAAY,CAAC;AAC3B,OAAO,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AACtC,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAC9C,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,EAAE,WAAW,EAAE,MAAM,gBAAgB,CAAC;AAC7C,OAAO,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAC;AACzC,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AACxD,OAAO,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC"}
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../pipeline/index.ts"],"names":[],"mappings":"AAAA,cAAc,YAAY,CAAC;AAC3B,OAAO,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AACtC,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAC9C,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,EAAE,WAAW,EAAE,MAAM,gBAAgB,CAAC;AAC7C,OAAO,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAC;AACzC,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AACxD,OAAO,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AACjE,OAAO,EAAE,eAAe,EAAE,mBAAmB,EAAqB,MAAM,eAAe,CAAC;AACxF,OAAO,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC"}
@@ -12,6 +12,8 @@ export interface PipelineConfig {
12
12
  maxRedirects: number;
13
13
  /** User-Agent header */
14
14
  userAgent: string;
15
+ /** Skip SSRF validation — ONLY for tests against localhost servers */
16
+ _skipSsrfCheck?: boolean;
15
17
  }
16
18
  /** Extracted content from a URL */
17
19
  export interface ExtractedContent {
@@ -21,6 +23,8 @@ export interface ExtractedContent {
21
23
  url: string;
22
24
  /** Byte length of extracted content */
23
25
  byteLength: number;
26
+ /** Validation warnings (content was stored, but quality may be degraded) */
27
+ warnings?: string[];
24
28
  }
25
29
  /** A text chunk from extracted content */
26
30
  export interface Chunk {
@@ -59,6 +63,8 @@ export type DownloadProgressCallback = (downloaded: number, total: number) => vo
59
63
  export interface IContentPipeline {
60
64
  /** Fetch and extract content from a URL */
61
65
  fetchAndExtract(url: string): Promise<ExtractedContent>;
66
+ /** Extract content from a local file (PDF, DOCX, PPTX, HTML, TXT, MD) */
67
+ extractFromFile(filePath: string): Promise<ExtractedContent>;
62
68
  /** Split extracted text into chunks */
63
69
  chunk(text: string): Chunk[];
64
70
  /** Embed a single text string */
@@ -1 +1 @@
1
- {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../pipeline/types.ts"],"names":[],"mappings":"AAAA,6CAA6C;AAC7C,MAAM,WAAW,cAAc;IAC7B,iEAAiE;IACjE,SAAS,EAAE,MAAM,CAAC;IAClB,8CAA8C;IAC9C,cAAc,EAAE,MAAM,CAAC;IACvB,iDAAiD;IACjD,aAAa,EAAE,MAAM,CAAC;IACtB,kDAAkD;IAClD,cAAc,EAAE,MAAM,CAAC;IACvB,+CAA+C;IAC/C,YAAY,EAAE,MAAM,CAAC;IACrB,wBAAwB;IACxB,SAAS,EAAE,MAAM,CAAC;CACnB;AAED,mCAAmC;AACnC,MAAM,WAAW,gBAAgB;IAC/B,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,MAAM,CAAC;IAChB,mBAAmB;IACnB,GAAG,EAAE,MAAM,CAAC;IACZ,uCAAuC;IACvC,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,0CAA0C;AAC1C,MAAM,WAAW,KAAK;IACpB,yBAAyB;IACzB,OAAO,EAAE,MAAM,CAAC;IAChB,yCAAyC;IACzC,KAAK,EAAE,MAAM,CAAC;IACd,8BAA8B;IAC9B,UAAU,EAAE,MAAM,CAAC;IACnB,6CAA6C;IAC7C,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,sEAAsE;AACtE,MAAM,MAAM,eAAe,GAAG,YAAY,CAAC;AAE3C,0CAA0C;AAC1C,MAAM,WAAW,aAAc,SAAQ,KAAK;IAC1C,uCAAuC;IACvC,SAAS,EAAE,eAAe,CAAC;IAC3B,uDAAuD;IACvD,KAAK,EAAE,MAAM,CAAC;CACf;AAED,6CAA6C;AAC7C,MAAM,WAAW,YAAY;IAC3B,qBAAqB;IACrB,QAAQ,EAAE,MAAM,CAAC;IACjB,oCAAoC;IACpC,UAAU,EAAE,MAAM,CAAC;IACnB,yBAAyB;IACzB,OAAO,EAAE,MAAM,CAAC;IAChB,oCAAoC;IACpC,KAAK,EAAE,MAAM,CAAC;CACf;AAED,uCAAuC;AACvC,MAAM,MAAM,wBAAwB,GAAG,CAAC,UAAU,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,KAAK,IAAI,CAAC;AAEnF,yCAAyC;AACzC,MAAM,WAAW,gBAAgB;IAC/B,2CAA2C;IAC3C,eAAe,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,gBAAgB,CAAC,CAAC;IACxD,uCAAuC;IACvC,KAAK,CAAC,IAAI,EAAE,MAAM,GAAG,KAAK,EAAE,CAAC;IAC7B,iCAAiC;IACjC,KAAK,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,eAAe,CAAC,CAAC;IAC9C,4BAA4B;IAC5B,WAAW,CAAC,MAAM,EAAE,KAAK,EAAE,GAAG,OAAO,CAAC,aAAa,EAAE,CAAC,CAAC;IACvD,4CAA4C;IAC5C,MAAM,CAAC,KAAK,EAAE,eAAe,EAAE,UAAU,EAAE,eAAe,EAAE,EAAE,IAAI,EAAE,MAAM,GAAG,MAAM,EAAE,CAAC;CACvF;AAED,iEAAiE;AACjE,MAAM,WAAW,aAAa;IAC5B,gDAAgD;IAChD,YAAY,IAAI,OAAO,CAAC,OAAO,CAAC,CAAC;IACjC,2EAA2E;IAC3E,WAAW,CAAC,UAAU,CAAC,EAAE,wBAAwB,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;IACpE,6BAA6B;IAC7B,YAAY,IAAI,MAAM,CAAC;IACvB,kCAAkC;IAClC,YAAY,IAAI,MAAM,CAAC;IACvB,qDAAqD;IACrD,YAAY,IAAI,OAAO,CAAC,OAAO,CAAC,CAAC;IACjC,yDAAyD;IACzD,WAAW,IAAI,OAAO,CAAC,
MAAM,CAAC,CAAC;CAChC"}
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../pipeline/types.ts"],"names":[],"mappings":"AAAA,6CAA6C;AAC7C,MAAM,WAAW,cAAc;IAC7B,iEAAiE;IACjE,SAAS,EAAE,MAAM,CAAC;IAClB,8CAA8C;IAC9C,cAAc,EAAE,MAAM,CAAC;IACvB,iDAAiD;IACjD,aAAa,EAAE,MAAM,CAAC;IACtB,kDAAkD;IAClD,cAAc,EAAE,MAAM,CAAC;IACvB,+CAA+C;IAC/C,YAAY,EAAE,MAAM,CAAC;IACrB,wBAAwB;IACxB,SAAS,EAAE,MAAM,CAAC;IAClB,sEAAsE;IACtE,cAAc,CAAC,EAAE,OAAO,CAAC;CAC1B;AAED,mCAAmC;AACnC,MAAM,WAAW,gBAAgB;IAC/B,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,MAAM,CAAC;IAChB,mBAAmB;IACnB,GAAG,EAAE,MAAM,CAAC;IACZ,uCAAuC;IACvC,UAAU,EAAE,MAAM,CAAC;IACnB,4EAA4E;IAC5E,QAAQ,CAAC,EAAE,MAAM,EAAE,CAAC;CACrB;AAED,0CAA0C;AAC1C,MAAM,WAAW,KAAK;IACpB,yBAAyB;IACzB,OAAO,EAAE,MAAM,CAAC;IAChB,yCAAyC;IACzC,KAAK,EAAE,MAAM,CAAC;IACd,8BAA8B;IAC9B,UAAU,EAAE,MAAM,CAAC;IACnB,6CAA6C;IAC7C,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,sEAAsE;AACtE,MAAM,MAAM,eAAe,GAAG,YAAY,CAAC;AAE3C,0CAA0C;AAC1C,MAAM,WAAW,aAAc,SAAQ,KAAK;IAC1C,uCAAuC;IACvC,SAAS,EAAE,eAAe,CAAC;IAC3B,uDAAuD;IACvD,KAAK,EAAE,MAAM,CAAC;CACf;AAED,6CAA6C;AAC7C,MAAM,WAAW,YAAY;IAC3B,qBAAqB;IACrB,QAAQ,EAAE,MAAM,CAAC;IACjB,oCAAoC;IACpC,UAAU,EAAE,MAAM,CAAC;IACnB,yBAAyB;IACzB,OAAO,EAAE,MAAM,CAAC;IAChB,oCAAoC;IACpC,KAAK,EAAE,MAAM,CAAC;CACf;AAED,uCAAuC;AACvC,MAAM,MAAM,wBAAwB,GAAG,CAAC,UAAU,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,KAAK,IAAI,CAAC;AAEnF,yCAAyC;AACzC,MAAM,WAAW,gBAAgB;IAC/B,2CAA2C;IAC3C,eAAe,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,gBAAgB,CAAC,CAAC;IACxD,yEAAyE;IACzE,eAAe,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,gBAAgB,CAAC,CAAC;IAC7D,uCAAuC;IACvC,KAAK,CAAC,IAAI,EAAE,MAAM,GAAG,KAAK,EAAE,CAAC;IAC7B,iCAAiC;IACjC,KAAK,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,eAAe,CAAC,CAAC;IAC9C,4BAA4B;IAC5B,WAAW,CAAC,MAAM,EAAE,KAAK,EAAE,GAAG,OAAO,CAAC,aAAa,EAAE,CAAC,CAAC;IACvD,4CAA4C;IAC5C,MAAM,CAAC,KAAK,EAAE,eAAe,EAAE,UAAU,EAAE,eAAe,EAAE,EAAE,IAAI,EAAE,MAAM,GAAG,MAAM,EAAE,CAAC;CACvF;AAED,iEAAiE;AACjE,MAAM,WAAW,aAAa;IAC5B,gDAAgD;IAChD,YAAY,IAAI,OAAO,CAAC,OAAO,CAAC,CAAC;IACjC,2EAA2E;IAC3E,WAAW,CAAC,UAAU,CAAC,EAAE,wBAAwB,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;IA
CpE,6BAA6B;IAC7B,YAAY,IAAI,MAAM,CAAC;IACvB,kCAAkC;IAClC,YAAY,IAAI,MAAM,CAAC;IACvB,qDAAqD;IACrD,YAAY,IAAI,OAAO,CAAC,OAAO,CAAC,CAAC;IACjC,yDAAyD;IACzD,WAAW,IAAI,OAAO,CAAC,MAAM,CAAC,CAAC;CAChC"}
@@ -0,0 +1,36 @@
1
+ /**
2
+ * Content validation for the ingestion pipeline.
3
+ *
4
+ * Runs quality checks on fetched HTML and extracted content, returning
5
+ * structured issues. Errors block ingestion; warnings are surfaced
6
+ * to the user alongside the stored content.
7
+ *
8
+ * Checks are organised into three tiers:
9
+ * 1. Pattern matching — known services / phrases
10
+ * 2. Structural HTML — HTML characteristics regardless of service
11
+ * 3. Content heuristics — statistical properties of extracted text
12
+ *
13
+ * All patterns are English-only for now.
14
+ */
15
+ import type { ExtractedContent } from './types.js';
16
/** One problem detected while validating content for ingestion. */
export interface ContentIssue {
    /** `error` issues block ingestion; `warning` issues are surfaced alongside the stored content. */
    severity: 'error' | 'warning';
    /** Identifier for the kind of issue (presumably a stable machine-readable code; confirm against validate.js). */
    code: string;
    /** Description of the issue (presumably human-readable; confirm against validate.js). */
    message: string;
}
21
+ /**
22
+ * Validate fetched HTML and its extracted content.
23
+ *
24
+ * Returns all detected issues (not just the first). Callers should
25
+ * treat `error`-severity issues as ingestion blockers and `warning`
26
+ * issues as informational.
27
+ */
28
+ export declare function validateContent(html: string, extracted: ExtractedContent): ContentIssue[];
29
+ /**
30
+ * Validate extracted content from a local file.
31
+ *
32
+ * Runs only Tier 3 (content heuristic) checks. Tier 1 (pattern matching)
33
+ * and Tier 2 (structural HTML analysis) are specific to URL-fetched content.
34
+ */
35
+ export declare function validateFileContent(extracted: ExtractedContent): ContentIssue[];
36
+ //# sourceMappingURL=validate.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"validate.d.ts","sourceRoot":"","sources":["../../pipeline/validate.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAEH,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,YAAY,CAAC;AAInD,MAAM,WAAW,YAAY;IAC3B,QAAQ,EAAE,OAAO,GAAG,SAAS,CAAC;IAC9B,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,EAAE,MAAM,CAAC;CACjB;AA8RD;;;;;;GAMG;AACH,wBAAgB,eAAe,CAC7B,IAAI,EAAE,MAAM,EACZ,SAAS,EAAE,gBAAgB,GAC1B,YAAY,EAAE,CAgChB;AAED;;;;;GAKG;AACH,wBAAgB,mBAAmB,CAAC,SAAS,EAAE,gBAAgB,GAAG,YAAY,EAAE,CAa/E"}