@de-otio/chaoskb-client 0.3.3 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/mcp-server.d.ts +16 -1
- package/dist/cli/mcp-server.d.ts.map +1 -1
- package/dist/cli/mcp-server.js +29 -12
- package/dist/cli/mcp-server.js.map +1 -1
- package/dist/cli/tools/kb-ingest.d.ts +3 -1
- package/dist/cli/tools/kb-ingest.d.ts.map +1 -1
- package/dist/cli/tools/kb-ingest.js +45 -5
- package/dist/cli/tools/kb-ingest.js.map +1 -1
- package/dist/cli/tools/kb-query.d.ts +2 -0
- package/dist/cli/tools/kb-query.d.ts.map +1 -1
- package/dist/cli/tools/kb-query.js +11 -2
- package/dist/cli/tools/kb-query.js.map +1 -1
- package/dist/pipeline/content-pipeline.d.ts +2 -0
- package/dist/pipeline/content-pipeline.d.ts.map +1 -1
- package/dist/pipeline/content-pipeline.js +27 -1
- package/dist/pipeline/content-pipeline.js.map +1 -1
- package/dist/pipeline/extract.d.ts.map +1 -1
- package/dist/pipeline/extract.js +129 -4
- package/dist/pipeline/extract.js.map +1 -1
- package/dist/pipeline/fetch.d.ts +11 -0
- package/dist/pipeline/fetch.d.ts.map +1 -1
- package/dist/pipeline/fetch.js +153 -1
- package/dist/pipeline/fetch.js.map +1 -1
- package/dist/pipeline/file-extract.d.ts +16 -0
- package/dist/pipeline/file-extract.d.ts.map +1 -0
- package/dist/pipeline/file-extract.js +249 -0
- package/dist/pipeline/file-extract.js.map +1 -0
- package/dist/pipeline/index.d.ts +2 -0
- package/dist/pipeline/index.d.ts.map +1 -1
- package/dist/pipeline/index.js +2 -0
- package/dist/pipeline/index.js.map +1 -1
- package/dist/pipeline/types.d.ts +6 -0
- package/dist/pipeline/types.d.ts.map +1 -1
- package/dist/pipeline/validate.d.ts +36 -0
- package/dist/pipeline/validate.d.ts.map +1 -0
- package/dist/pipeline/validate.js +632 -0
- package/dist/pipeline/validate.js.map +1 -0
- package/dist/storage/source-repo.d.ts +2 -0
- package/dist/storage/source-repo.d.ts.map +1 -1
- package/dist/storage/source-repo.js +9 -2
- package/dist/storage/source-repo.js.map +1 -1
- package/dist/storage/types.d.ts +1 -0
- package/dist/storage/types.d.ts.map +1 -1
- package/package.json +4 -1
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Local file content extraction.
|
|
3
|
+
*
|
|
4
|
+
* Dispatches to format-specific extractors based on file extension.
|
|
5
|
+
* Supports PDF, DOCX, PPTX, HTML, TXT, and Markdown.
|
|
6
|
+
*/
|
|
7
|
+
import * as fs from 'node:fs/promises';
|
|
8
|
+
import * as path from 'node:path';
|
|
9
|
+
// ===== Supported formats ===================================================
|
|
10
|
+
/**
 * Lookup from lowercase file extension (including the leading dot) to the
 * format key used to pick an extractor.
 */
const EXTENSION_MAP = Object.fromEntries([
    ['.pdf', 'pdf'],
    ['.docx', 'docx'],
    ['.pptx', 'pptx'],
    ['.html', 'html'],
    ['.htm', 'html'],
    ['.txt', 'txt'],
    ['.md', 'md'],
    ['.markdown', 'md'],
]);

/** Comma-separated list of accepted extensions, interpolated into error messages. */
const SUPPORTED_EXTENSIONS = Object.keys(EXTENSION_MAP).join(', ');

// ===== Limits ==============================================================

/** Largest file accepted, in bytes (50 MB). */
const MAX_FILE_SIZE = 50 * 1024 ** 2;

/** Ceiling on total decompressed bytes read from ZIP-based formats (100 MB). */
const MAX_UNCOMPRESSED_SIZE = 100 * 1024 ** 2;
|
|
26
|
+
// ===== Public API ==========================================================
|
|
27
|
+
/**
 * Extract content from a local file.
 *
 * The extension of the path as given is checked first so that unsupported
 * formats fail with a clear message before any filesystem access. Symlinks are
 * then resolved and the extractor is chosen from the extension of the *real*
 * file, so a link named `notes.txt` that points at `report.pdf` is parsed as a
 * PDF rather than read as plain text.
 *
 * @param filePath - Path to the file (resolved to absolute).
 * @returns Extracted content with title, text, and the real absolute path as `url`.
 * @throws On missing/unreadable file, unsupported format, non-regular file,
 *   oversized file, or empty content.
 */
export async function extractFromFile(filePath) {
    // Check extension first for a clear error on unsupported formats
    const ext = path.extname(filePath).toLowerCase();
    if (!EXTENSION_MAP[ext]) {
        throw new Error(`Unsupported file format "${ext}". Supported formats: ${SUPPORTED_EXTENSIONS}`);
    }
    const absPath = path.resolve(filePath);
    // Resolve symlinks to get the real path, then re-check the extension
    let realPath;
    try {
        realPath = await fs.realpath(absPath);
    }
    catch {
        // Only the basename is reported so the full path is not echoed back.
        throw new Error(`File not found or not readable: ${path.basename(filePath)}`);
    }
    const realExt = path.extname(realPath).toLowerCase();
    // Dispatch on the resolved target's format, not on the link's own name.
    const format = EXTENSION_MAP[realExt];
    if (!format) {
        throw new Error(`Symlink target has unsupported extension "${realExt}". Supported formats: ${SUPPORTED_EXTENSIONS}`);
    }
    // Check file is regular and within size limit
    const stat = await fs.stat(realPath);
    if (!stat.isFile()) {
        throw new Error('Path is not a regular file.');
    }
    if (stat.size > MAX_FILE_SIZE) {
        throw new Error(`File is too large (${(stat.size / 1024 / 1024).toFixed(1)} MB). Maximum supported size is ${MAX_FILE_SIZE / 1024 / 1024} MB.`);
    }
    switch (format) {
        case 'pdf':
            return extractPdf(realPath);
        case 'docx':
            return extractDocx(realPath);
        case 'pptx':
            return extractPptx(realPath);
        case 'html':
            return extractHtmlFile(realPath);
        case 'txt':
        case 'md':
            return extractPlainText(realPath);
        default:
            throw new Error(`Unsupported format: ${format}`);
    }
}
|
|
78
|
+
// ===== Format extractors ===================================================
|
|
79
|
+
/**
 * Extract text and title from a PDF file using `pdf-parse`.
 *
 * The parser is always destroyed, including when extraction throws, so its
 * resources are not leaked on the "no text" and failure paths.
 *
 * @param filePath - Path of the PDF to read.
 * @returns Title (document metadata `Title`, else the filename), cleaned text,
 *   the path as `url`, and the UTF-8 byte length of the text.
 * @throws If the PDF yields no text after cleaning (e.g. a scan without OCR).
 */
async function extractPdf(filePath) {
    const { PDFParse } = await import('pdf-parse');
    const buffer = await fs.readFile(filePath);
    // View over the same bytes as the Node Buffer (no copy).
    const uint8 = new Uint8Array(buffer.buffer, buffer.byteOffset, buffer.byteLength);
    const parser = new PDFParse({ data: uint8, isEvalSupported: false });
    try {
        const textResult = await parser.getText();
        // Strip the page footer markers ("-- N of M --")
        const rawText = textResult.text.replace(/\n-- \d+ of \d+ --\n/g, '\n');
        const content = cleanText(rawText);
        if (content.length === 0) {
            throw new Error('No extractable text in PDF. The file may be a scanned document without OCR.');
        }
        const infoResult = await parser.getInfo();
        const title = infoResult?.info?.Title || filenameTitle(filePath);
        return {
            title,
            content,
            url: filePath,
            byteLength: Buffer.byteLength(content, 'utf-8'),
        };
    }
    finally {
        // Awaited so that a rejected teardown surfaces here instead of as an unhandled rejection.
        await parser.destroy();
    }
}
|
|
101
|
+
/**
 * Extract content from a DOCX file.
 *
 * The document is converted to HTML with `mammoth` and passed to the shared
 * `extractContent` routine. If that step throws, the HTML tags are stripped
 * with a regex and the remaining text is returned instead.
 *
 * @param filePath - Path of the DOCX file.
 * @returns Extracted content with `url` set to `filePath`.
 * @throws If the document converts to nothing, or the fallback text is empty.
 */
async function extractDocx(filePath) {
    const mammoth = await import('mammoth');
    const conversion = await mammoth.convertToHtml({ path: filePath });
    const hasHtml = Boolean(conversion.value) && conversion.value.trim().length !== 0;
    if (!hasHtml) {
        throw new Error('No extractable content in DOCX file.');
    }
    // Pipe the clean HTML through existing Readability extraction
    const { extractContent } = await import('./extract.js');
    try {
        const article = extractContent(conversion.value, filePath);
        return {
            ...article,
            url: filePath,
            title: article.title || filenameTitle(filePath),
        };
    }
    catch {
        // If Readability fails (e.g. very simple doc), fall back to plain text
        const withoutTags = conversion.value.replace(/<[^>]+>/g, ' ');
        const plain = cleanText(withoutTags);
        if (plain.length === 0) {
            throw new Error('No extractable content in DOCX file.');
        }
        return {
            title: filenameTitle(filePath),
            content: plain,
            url: filePath,
            byteLength: Buffer.byteLength(plain, 'utf-8'),
        };
    }
}
|
|
131
|
+
/**
 * Extract text from a PPTX file.
 *
 * Opens the archive with JSZip, reads `ppt/slides/slideN.xml` entries in
 * numeric order, and concatenates their text. The first line of the first
 * slide that has text becomes the title; the filename is the fallback.
 *
 * @param filePath - Path of the PPTX file.
 * @returns Title, cleaned text, the path as `url`, and UTF-8 byte length.
 * @throws If there are no slides, the decompressed slide bytes exceed
 *   `MAX_UNCOMPRESSED_SIZE`, or no text could be extracted.
 */
async function extractPptx(filePath) {
    const { default: JSZip } = await import('jszip');
    const fileBytes = await fs.readFile(filePath);
    const archive = await JSZip.loadAsync(fileBytes);
    // Find slide XML files and sort by slide number
    const slidePattern = /^ppt\/slides\/slide\d+\.xml$/;
    const slideNumber = (entry) => Number.parseInt(entry.match(/slide(\d+)/)?.[1] ?? '0', 10);
    const slideFiles = Object.keys(archive.files).filter((entry) => slidePattern.test(entry));
    slideFiles.sort((left, right) => slideNumber(left) - slideNumber(right));
    if (slideFiles.length === 0) {
        throw new Error('No slides found in PPTX file.');
    }
    // Decompress slides incrementally, tracking actual bytes to guard against zip bombs
    const slideTexts = [];
    let firstSlideTitle = '';
    let totalDecompressed = 0;
    for (const entry of slideFiles) {
        const raw = await archive.file(entry).async('uint8array');
        totalDecompressed += raw.byteLength;
        if (totalDecompressed > MAX_UNCOMPRESSED_SIZE) {
            throw new Error(`PPTX uncompressed content exceeds ${MAX_UNCOMPRESSED_SIZE / 1024 / 1024} MB limit.`);
        }
        const text = extractSlideText(new TextDecoder().decode(raw));
        if (!text) {
            continue;
        }
        slideTexts.push(text);
        if (!firstSlideTitle) {
            firstSlideTitle = text.split('\n')[0].trim();
        }
    }
    const content = cleanText(slideTexts.join('\n\n'));
    if (content.length === 0) {
        throw new Error('No extractable text in PPTX file.');
    }
    return {
        title: firstSlideTitle || filenameTitle(filePath),
        content,
        url: filePath,
        byteLength: Buffer.byteLength(content, 'utf-8'),
    };
}
|
|
176
|
+
/**
 * Extract content from a local HTML file by reading it as UTF-8 and handing
 * the markup to the shared `extractContent` routine.
 *
 * @param filePath - Path of the HTML file.
 * @returns The extraction result with `url` set to `filePath` and the title
 *   falling back to the filename when extraction yields none.
 */
async function extractHtmlFile(filePath) {
    const markup = await fs.readFile(filePath, 'utf-8');
    const { extractContent } = await import('./extract.js');
    const article = extractContent(markup, filePath);
    const result = {
        ...article,
        url: filePath,
        title: article.title || filenameTitle(filePath),
    };
    return result;
}
|
|
186
|
+
/**
 * Extract content from a plain-text or Markdown file.
 *
 * Title selection, in order: the first `# heading` line anywhere in the text,
 * else the first non-blank line if it is shorter than 120 characters, else the
 * filename without extension.
 *
 * @param filePath - Path of the text file (read as UTF-8).
 * @returns Title, cleaned text, the path as `url`, and UTF-8 byte length.
 * @throws If the file is empty after cleaning.
 */
async function extractPlainText(filePath) {
    const content = cleanText(await fs.readFile(filePath, 'utf-8'));
    if (content.length === 0) {
        throw new Error('Empty file.');
    }
    // Title: first markdown heading, or first short line, or filename
    const heading = /^#\s+(.+)$/m.exec(content);
    let title;
    if (heading) {
        title = heading[1].trim();
    }
    else {
        const firstLine = content.split('\n').find((line) => line.trim().length > 0);
        title = firstLine && firstLine.length < 120 ? firstLine.trim() : '';
    }
    return {
        title: title || filenameTitle(filePath),
        content,
        url: filePath,
        byteLength: Buffer.byteLength(content, 'utf-8'),
    };
}
|
|
211
|
+
// ===== Utilities ===========================================================
|
|
212
|
+
/**
 * Extract text from a PPTX slide XML string.
 *
 * Groups by `<a:p>` paragraphs and collects the `<a:t>` text runs within each.
 * Runs of one paragraph are concatenated without a separator; paragraphs are
 * joined with a newline. Paragraphs with no `<a:t>` run are skipped.
 *
 * XML character references in the run text are decoded: the five predefined
 * entities (`&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`) and decimal / hex
 * numeric references. Anything else is left untouched.
 *
 * @param xml - Raw slide XML.
 * @returns The slide's text, one paragraph per line (empty string if none).
 */
function extractSlideText(xml) {
    const namedEntities = new Map([
        ['amp', '&'],
        ['lt', '<'],
        ['gt', '>'],
        ['quot', '"'],
        ['apos', "'"],
    ]);
    // Single pass, so "&amp;lt;" decodes to "&lt;" and is not decoded twice.
    const entityRegex = /&(#x[0-9a-fA-F]+|#\d+|amp|lt|gt|quot|apos);/g;
    const decodeEntities = (raw) => raw.replace(entityRegex, (whole, name) => {
        if (name[0] !== '#') {
            return namedEntities.get(name) ?? whole;
        }
        const codePoint = name[1] === 'x'
            ? parseInt(name.slice(2), 16)
            : parseInt(name.slice(1), 10);
        // Out-of-range references are kept verbatim (fromCodePoint would throw).
        return codePoint > 0 && codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
    });
    const paragraphs = [];
    const pRegex = /<a:p\b[^>]*>([\s\S]*?)<\/a:p>/g;
    const tRegex = /<a:t>([\s\S]*?)<\/a:t>/g;
    let pMatch;
    while ((pMatch = pRegex.exec(xml)) !== null) {
        const pContent = pMatch[1];
        const texts = [];
        let tMatch;
        // tRegex is global and reused per paragraph, so rewind it each time.
        tRegex.lastIndex = 0;
        while ((tMatch = tRegex.exec(pContent)) !== null) {
            texts.push(tMatch[1]);
        }
        if (texts.length > 0) {
            paragraphs.push(decodeEntities(texts.join('')));
        }
    }
    return paragraphs.join('\n');
}
|
|
235
|
+
/**
 * Derive a fallback title from a path: the final path segment with its
 * extension removed.
 */
function filenameTitle(filePath) {
    const extension = path.extname(filePath);
    const title = path.basename(filePath, extension);
    return title;
}
|
|
239
|
+
/**
 * Clean extracted text: normalise line endings, strip steganographic chars,
 * collapse whitespace, trim lines.
 *
 * Steps, in order:
 *  1. CRLF / lone CR become `\n`, so Windows files do not leave stray `\r`.
 *  2. Unicode line/paragraph separators become `\n`.
 *  3. Zero-width, bidi-control and other invisible characters are removed.
 *  4. Runs of tabs/spaces collapse to a single space.
 *  5. Leading/trailing spaces are trimmed on every line.
 *  6. Runs of three or more newlines collapse to one blank line. This runs
 *     after step 5 so that whitespace-only lines count as blank.
 *  7. The whole string is trimmed.
 *
 * @param text - Raw extracted text.
 * @returns The cleaned text (possibly empty).
 */
function cleanText(text) {
    return text
        .replace(/\r\n?/g, '\n') // CRLF / CR → LF
        .replace(/[\u2028\u2029]/g, '\n') // Unicode line/paragraph separators → newline
        .replace(/[\u200B-\u200F\u202A-\u202F\u2060-\u206F\uFEFF]/g, '') // strip zero-width / bidi / invisible chars
        .replace(/[\t ]+/g, ' ')
        .replace(/^ +| +$/gm, '')
        .replace(/\n{3,}/g, '\n\n')
        .trim();
}
|
|
249
|
+
//# sourceMappingURL=file-extract.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"file-extract.js","sourceRoot":"","sources":["../../pipeline/file-extract.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,MAAM,kBAAkB,CAAC;AACvC,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAGlC,8EAA8E;AAE9E,MAAM,aAAa,GAA2B;IAC5C,MAAM,EAAE,KAAK;IACb,OAAO,EAAE,MAAM;IACf,OAAO,EAAE,MAAM;IACf,OAAO,EAAE,MAAM;IACf,MAAM,EAAE,MAAM;IACd,MAAM,EAAE,KAAK;IACb,KAAK,EAAE,IAAI;IACX,WAAW,EAAE,IAAI;CAClB,CAAC;AAEF,MAAM,oBAAoB,GAAG,MAAM,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAEnE,8EAA8E;AAE9E,0CAA0C;AAC1C,MAAM,aAAa,GAAG,EAAE,GAAG,IAAI,GAAG,IAAI,CAAC;AAEvC,sEAAsE;AACtE,MAAM,qBAAqB,GAAG,GAAG,GAAG,IAAI,GAAG,IAAI,CAAC;AAEhD,8EAA8E;AAE9E;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CAAC,QAAgB;IACpD,iEAAiE;IACjE,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAC;IACjD,MAAM,MAAM,GAAG,aAAa,CAAC,GAAG,CAAC,CAAC;IAClC,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,IAAI,KAAK,CACb,4BAA4B,GAAG,yBAAyB,oBAAoB,EAAE,CAC/E,CAAC;IACJ,CAAC;IAED,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;IAEvC,qEAAqE;IACrE,IAAI,QAAgB,CAAC;IACrB,IAAI,CAAC;QACH,QAAQ,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;IACxC,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,KAAK,CAAC,mCAAmC,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC;IAChF,CAAC;IAED,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAC;IACrD,IAAI,CAAC,aAAa,CAAC,OAAO,CAAC,EAAE,CAAC;QAC5B,MAAM,IAAI,KAAK,CACb,6CAA6C,OAAO,yBAAyB,oBAAoB,EAAE,CACpG,CAAC;IACJ,CAAC;IAED,8CAA8C;IAC9C,MAAM,IAAI,GAAG,MAAM,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IACrC,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,EAAE,CAAC;QACnB,MAAM,IAAI,KAAK,CAAC,6BAA6B,CAAC,CAAC;IACjD,CAAC;IACD,IAAI,IAAI,CAAC,IAAI,GAAG,aAAa,EAAE,CAAC;QAC9B,MAAM,IAAI,KAAK,CACb,sBAAsB,CAAC,IAAI,CAAC,IAAI,GAAG,IAAI,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,mCAAmC,aAAa,GAAG,IAAI,GAAG,IAAI,MAAM,CAC/H,CAAC;IACJ,CAAC;IAED,QAAQ,MAAM,EAAE,CAAC;QACf,KAAK,KAAK;YACR,OAAO,UAAU,CAAC,QAAQ,CAAC,CAAC;QAC9B,KAAK,MAAM;YACT,OAAO,WAAW,CAAC,QAAQ,CAAC,CAAC;QAC/B,KAAK,MAAM;YACT,OAAO,WAAW,CAAC,QAAQ,CAAC,CAAC;QAC/B,KAAK,MAAM;YACT,OAAO,
eAAe,CAAC,QAAQ,CAAC,CAAC;QACnC,KAAK,KAAK,CAAC;QACX,KAAK,IAAI;YACP,OAAO,gBAAgB,CAAC,QAAQ,CAAC,CAAC;QACpC;YACE,MAAM,IAAI,KAAK,CAAC,uBAAuB,MAAM,EAAE,CAAC,CAAC;IACrD,CAAC;AACH,CAAC;AAED,8EAA8E;AAE9E,KAAK,UAAU,UAAU,CAAC,QAAgB;IACxC,MAAM,EAAE,QAAQ,EAAE,GAAG,MAAM,MAAM,CAAC,WAAW,CAAC,CAAC;IAC/C,MAAM,MAAM,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAC3C,MAAM,KAAK,GAAG,IAAI,UAAU,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,UAAU,EAAE,MAAM,CAAC,UAAU,CAAC,CAAC;IAElF,MAAM,MAAM,GAAG,IAAI,QAAQ,CAAC,EAAE,IAAI,EAAE,KAAK,EAAE,eAAe,EAAE,KAAK,EAAE,CAAC,CAAC;IACrE,MAAM,UAAU,GAAG,MAAM,MAAM,CAAC,OAAO,EAAE,CAAC;IAC1C,iDAAiD;IACjD,MAAM,OAAO,GAAG,UAAU,CAAC,IAAI,CAAC,OAAO,CAAC,uBAAuB,EAAE,IAAI,CAAC,CAAC;IACvE,MAAM,OAAO,GAAG,SAAS,CAAC,OAAO,CAAC,CAAC;IAEnC,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,MAAM,IAAI,KAAK,CACb,6EAA6E,CAC9E,CAAC;IACJ,CAAC;IAED,MAAM,UAAU,GAAG,MAAM,MAAM,CAAC,OAAO,EAAE,CAAC;IAC1C,MAAM,KAAK,GAAG,UAAU,EAAE,IAAI,EAAE,KAAK,IAAI,aAAa,CAAC,QAAQ,CAAC,CAAC;IACjE,MAAM,CAAC,OAAO,EAAE,CAAC;IAEjB,OAAO;QACL,KAAK;QACL,OAAO;QACP,GAAG,EAAE,QAAQ;QACb,UAAU,EAAE,MAAM,CAAC,UAAU,CAAC,OAAO,EAAE,OAAO,CAAC;KAChD,CAAC;AACJ,CAAC;AAED,KAAK,UAAU,WAAW,CAAC,QAAgB;IACzC,MAAM,OAAO,GAAG,MAAM,MAAM,CAAC,SAAS,CAAC,CAAC;IACxC,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,aAAa,CAAC,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC,CAAC;IAE/D,IAAI,CAAC,MAAM,CAAC,KAAK,IAAI,MAAM,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtD,MAAM,IAAI,KAAK,CAAC,sCAAsC,CAAC,CAAC;IAC1D,CAAC;IAED,8DAA8D;IAC9D,MAAM,EAAE,cAAc,EAAE,GAAG,MAAM,MAAM,CAAC,cAAc,CAAC,CAAC;IACxD,IAAI,CAAC;QACH,MAAM,SAAS,GAAG,cAAc,CAAC,MAAM,CAAC,KAAK,EAAE,QAAQ,CAAC,CAAC;QACzD,OAAO;YACL,GAAG,SAAS;YACZ,GAAG,EAAE,QAAQ;YACb,KAAK,EAAE,SAAS,CAAC,KAAK,IAAI,aAAa,CAAC,QAAQ,CAAC;SAClD,CAAC;IACJ,CAAC;IAAC,MAAM,CAAC;QACP,uEAAuE;QACvE,MAAM,IAAI,GAAG,SAAS,CAAC,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC,CAAC,CAAC;QAC9D,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACtB,MAAM,IAAI,KAAK,CAAC,sCAAsC,CAAC,CAAC;QAC1D,CAAC;QACD,OAAO;YACL,KAAK,EAAE,aAAa,CAAC,QAAQ,CAAC;YAC9B,OAAO,EAAE,IAAI;YACb,GAAG,EAAE,QAAQ;
YACb,UAAU,EAAE,MAAM,CAAC,UAAU,CAAC,IAAI,EAAE,OAAO,CAAC;SAC7C,CAAC;IACJ,CAAC;AACH,CAAC;AAED,KAAK,UAAU,WAAW,CAAC,QAAgB;IACzC,MAAM,KAAK,GAAG,CAAC,MAAM,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC;IAC9C,MAAM,MAAM,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAC3C,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;IAE1C,gDAAgD;IAChD,MAAM,UAAU,GAAG,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC;SACtC,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,8BAA8B,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;SAC3D,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QACb,MAAM,IAAI,GAAG,QAAQ,CAAC,CAAC,CAAC,KAAK,CAAC,YAAY,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,GAAG,CAAC,CAAC;QACzD,MAAM,IAAI,GAAG,QAAQ,CAAC,CAAC,CAAC,KAAK,CAAC,YAAY,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,GAAG,CAAC,CAAC;QACzD,OAAO,IAAI,GAAG,IAAI,CAAC;IACrB,CAAC,CAAC,CAAC;IAEL,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC5B,MAAM,IAAI,KAAK,CAAC,+BAA+B,CAAC,CAAC;IACnD,CAAC;IAED,oFAAoF;IACpF,MAAM,UAAU,GAAa,EAAE,CAAC;IAChC,IAAI,eAAe,GAAG,EAAE,CAAC;IACzB,IAAI,iBAAiB,GAAG,CAAC,CAAC;IAE1B,KAAK,MAAM,SAAS,IAAI,UAAU,EAAE,CAAC;QACnC,MAAM,KAAK,GAAG,MAAM,GAAG,CAAC,IAAI,CAAC,SAAS,CAAE,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC;QAC7D,iBAAiB,IAAI,KAAK,CAAC,UAAU,CAAC;QACtC,IAAI,iBAAiB,GAAG,qBAAqB,EAAE,CAAC;YAC9C,MAAM,IAAI,KAAK,CACb,qCAAqC,qBAAqB,GAAG,IAAI,GAAG,IAAI,YAAY,CACrF,CAAC;QACJ,CAAC;QACD,MAAM,GAAG,GAAG,IAAI,WAAW,EAAE,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;QAC5C,MAAM,IAAI,GAAG,gBAAgB,CAAC,GAAG,CAAC,CAAC;QACnC,IAAI,IAAI,EAAE,CAAC;YACT,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACtB,IAAI,CAAC,eAAe,EAAE,CAAC;gBACrB,eAAe,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;YAC/C,CAAC;QACH,CAAC;IACH,CAAC;IAED,MAAM,OAAO,GAAG,SAAS,CAAC,UAAU,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC;IACnD,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,MAAM,IAAI,KAAK,CAAC,mCAAmC,CAAC,CAAC;IACvD,CAAC;IAED,OAAO;QACL,KAAK,EAAE,eAAe,IAAI,aAAa,CAAC,QAAQ,CAAC;QACjD,OAAO;QACP,GAAG,EAAE,QAAQ;QACb,UAAU,EAAE,MAAM,CAAC,UAAU,CAAC,OAAO,EAAE,OAAO,CAAC;KAChD,CAAC;AACJ,CAAC;AAED,KAAK,UAAU,eAAe,CAAC,QAAgB;IAC7C,MAAM,IAAI,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,EAAE,
OAAO,CAAC,CAAC;IAClD,MAAM,EAAE,cAAc,EAAE,GAAG,MAAM,MAAM,CAAC,cAAc,CAAC,CAAC;IACxD,MAAM,SAAS,GAAG,cAAc,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC;IACjD,OAAO;QACL,GAAG,SAAS;QACZ,GAAG,EAAE,QAAQ;QACb,KAAK,EAAE,SAAS,CAAC,KAAK,IAAI,aAAa,CAAC,QAAQ,CAAC;KAClD,CAAC;AACJ,CAAC;AAED,KAAK,UAAU,gBAAgB,CAAC,QAAgB;IAC9C,MAAM,GAAG,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IACjD,MAAM,OAAO,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC;IAE/B,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,MAAM,IAAI,KAAK,CAAC,aAAa,CAAC,CAAC;IACjC,CAAC;IAED,kEAAkE;IAClE,IAAI,KAAK,GAAG,EAAE,CAAC;IACf,MAAM,YAAY,GAAG,OAAO,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;IAClD,IAAI,YAAY,EAAE,CAAC;QACjB,KAAK,GAAG,YAAY,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IACjC,CAAC;SAAM,CAAC;QACN,MAAM,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QACvE,IAAI,SAAS,IAAI,SAAS,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;YACxC,KAAK,GAAG,SAAS,CAAC,IAAI,EAAE,CAAC;QAC3B,CAAC;IACH,CAAC;IAED,OAAO;QACL,KAAK,EAAE,KAAK,IAAI,aAAa,CAAC,QAAQ,CAAC;QACvC,OAAO;QACP,GAAG,EAAE,QAAQ;QACb,UAAU,EAAE,MAAM,CAAC,UAAU,CAAC,OAAO,EAAE,OAAO,CAAC;KAChD,CAAC;AACJ,CAAC;AAED,8EAA8E;AAE9E;;;GAGG;AACH,SAAS,gBAAgB,CAAC,GAAW;IACnC,MAAM,UAAU,GAAa,EAAE,CAAC;IAChC,MAAM,MAAM,GAAG,gCAAgC,CAAC;IAChD,MAAM,MAAM,GAAG,yBAAyB,CAAC;IAEzC,IAAI,MAAM,CAAC;IACX,OAAO,CAAC,MAAM,GAAG,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QAC5C,MAAM,QAAQ,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;QAC3B,MAAM,KAAK,GAAa,EAAE,CAAC;QAC3B,IAAI,MAAM,CAAC;QACX,MAAM,CAAC,SAAS,GAAG,CAAC,CAAC;QACrB,OAAO,CAAC,MAAM,GAAG,MAAM,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YACjD,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;QACxB,CAAC;QACD,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACrB,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC;QAClC,CAAC;IACH,CAAC;IAED,OAAO,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC/B,CAAC;AAED,8EAA8E;AAC9E,SAAS,aAAa,CAAC,QAAgB;IACrC,OAAO,IAAI,CAAC,QAAQ,CAAC,QAAQ,EAAE,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,CAAC;AACzD,CAAC;AAED,yFAAyF;AACzF,SAAS,SAAS
,CAAC,IAAY;IAC7B,OAAO,IAAI;SACR,OAAO,CAAC,iBAAiB,EAAE,IAAI,CAAC,CAAgC,8CAA8C;SAC9G,OAAO,CAAC,kDAAkD,EAAE,EAAE,CAAC,CAAC,4CAA4C;SAC5G,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC;SACvB,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC;SAC1B,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC;SACxB,IAAI,EAAE,CAAC;AACZ,CAAC"}
|
package/dist/pipeline/index.d.ts
CHANGED
|
@@ -7,4 +7,6 @@ export { Embedder } from './embedder.js';
|
|
|
7
7
|
export { ModelManager } from './model-manager.js';
|
|
8
8
|
export { ContentPipeline } from './content-pipeline.js';
|
|
9
9
|
export { cosineSimilarity, searchEmbeddings } from './search.js';
|
|
10
|
+
export { validateContent, validateFileContent, type ContentIssue } from './validate.js';
|
|
11
|
+
export { extractFromFile } from './file-extract.js';
|
|
10
12
|
//# sourceMappingURL=index.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../pipeline/index.ts"],"names":[],"mappings":"AAAA,cAAc,YAAY,CAAC;AAC3B,OAAO,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AACtC,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAC9C,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,EAAE,WAAW,EAAE,MAAM,gBAAgB,CAAC;AAC7C,OAAO,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAC;AACzC,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AACxD,OAAO,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../pipeline/index.ts"],"names":[],"mappings":"AAAA,cAAc,YAAY,CAAC;AAC3B,OAAO,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AACtC,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAC9C,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,EAAE,WAAW,EAAE,MAAM,gBAAgB,CAAC;AAC7C,OAAO,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAC;AACzC,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AACxD,OAAO,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AACjE,OAAO,EAAE,eAAe,EAAE,mBAAmB,EAAE,KAAK,YAAY,EAAE,MAAM,eAAe,CAAC;AACxF,OAAO,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC"}
|
package/dist/pipeline/index.js
CHANGED
|
@@ -7,4 +7,6 @@ export { Embedder } from './embedder.js';
|
|
|
7
7
|
export { ModelManager } from './model-manager.js';
|
|
8
8
|
export { ContentPipeline } from './content-pipeline.js';
|
|
9
9
|
export { cosineSimilarity, searchEmbeddings } from './search.js';
|
|
10
|
+
export { validateContent, validateFileContent } from './validate.js';
|
|
11
|
+
export { extractFromFile } from './file-extract.js';
|
|
10
12
|
//# sourceMappingURL=index.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../pipeline/index.ts"],"names":[],"mappings":"AAAA,cAAc,YAAY,CAAC;AAC3B,OAAO,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AACtC,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAC9C,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,EAAE,WAAW,EAAE,MAAM,gBAAgB,CAAC;AAC7C,OAAO,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAC;AACzC,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AACxD,OAAO,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../pipeline/index.ts"],"names":[],"mappings":"AAAA,cAAc,YAAY,CAAC;AAC3B,OAAO,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AACtC,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAC9C,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,EAAE,WAAW,EAAE,MAAM,gBAAgB,CAAC;AAC7C,OAAO,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAC;AACzC,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AACxD,OAAO,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AACjE,OAAO,EAAE,eAAe,EAAE,mBAAmB,EAAqB,MAAM,eAAe,CAAC;AACxF,OAAO,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC"}
|
package/dist/pipeline/types.d.ts
CHANGED
|
@@ -12,6 +12,8 @@ export interface PipelineConfig {
|
|
|
12
12
|
maxRedirects: number;
|
|
13
13
|
/** User-Agent header */
|
|
14
14
|
userAgent: string;
|
|
15
|
+
/** Skip SSRF validation — ONLY for tests against localhost servers */
|
|
16
|
+
_skipSsrfCheck?: boolean;
|
|
15
17
|
}
|
|
16
18
|
/** Extracted content from a URL */
|
|
17
19
|
export interface ExtractedContent {
|
|
@@ -21,6 +23,8 @@ export interface ExtractedContent {
|
|
|
21
23
|
url: string;
|
|
22
24
|
/** Byte length of extracted content */
|
|
23
25
|
byteLength: number;
|
|
26
|
+
/** Validation warnings (content was stored, but quality may be degraded) */
|
|
27
|
+
warnings?: string[];
|
|
24
28
|
}
|
|
25
29
|
/** A text chunk from extracted content */
|
|
26
30
|
export interface Chunk {
|
|
@@ -59,6 +63,8 @@ export type DownloadProgressCallback = (downloaded: number, total: number) => vo
|
|
|
59
63
|
export interface IContentPipeline {
|
|
60
64
|
/** Fetch and extract content from a URL */
|
|
61
65
|
fetchAndExtract(url: string): Promise<ExtractedContent>;
|
|
66
|
+
/** Extract content from a local file (PDF, DOCX, PPTX, HTML, TXT, MD) */
|
|
67
|
+
extractFromFile(filePath: string): Promise<ExtractedContent>;
|
|
62
68
|
/** Split extracted text into chunks */
|
|
63
69
|
chunk(text: string): Chunk[];
|
|
64
70
|
/** Embed a single text string */
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../pipeline/types.ts"],"names":[],"mappings":"AAAA,6CAA6C;AAC7C,MAAM,WAAW,cAAc;IAC7B,iEAAiE;IACjE,SAAS,EAAE,MAAM,CAAC;IAClB,8CAA8C;IAC9C,cAAc,EAAE,MAAM,CAAC;IACvB,iDAAiD;IACjD,aAAa,EAAE,MAAM,CAAC;IACtB,kDAAkD;IAClD,cAAc,EAAE,MAAM,CAAC;IACvB,+CAA+C;IAC/C,YAAY,EAAE,MAAM,CAAC;IACrB,wBAAwB;IACxB,SAAS,EAAE,MAAM,CAAC;
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../pipeline/types.ts"],"names":[],"mappings":"AAAA,6CAA6C;AAC7C,MAAM,WAAW,cAAc;IAC7B,iEAAiE;IACjE,SAAS,EAAE,MAAM,CAAC;IAClB,8CAA8C;IAC9C,cAAc,EAAE,MAAM,CAAC;IACvB,iDAAiD;IACjD,aAAa,EAAE,MAAM,CAAC;IACtB,kDAAkD;IAClD,cAAc,EAAE,MAAM,CAAC;IACvB,+CAA+C;IAC/C,YAAY,EAAE,MAAM,CAAC;IACrB,wBAAwB;IACxB,SAAS,EAAE,MAAM,CAAC;IAClB,sEAAsE;IACtE,cAAc,CAAC,EAAE,OAAO,CAAC;CAC1B;AAED,mCAAmC;AACnC,MAAM,WAAW,gBAAgB;IAC/B,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,MAAM,CAAC;IAChB,mBAAmB;IACnB,GAAG,EAAE,MAAM,CAAC;IACZ,uCAAuC;IACvC,UAAU,EAAE,MAAM,CAAC;IACnB,4EAA4E;IAC5E,QAAQ,CAAC,EAAE,MAAM,EAAE,CAAC;CACrB;AAED,0CAA0C;AAC1C,MAAM,WAAW,KAAK;IACpB,yBAAyB;IACzB,OAAO,EAAE,MAAM,CAAC;IAChB,yCAAyC;IACzC,KAAK,EAAE,MAAM,CAAC;IACd,8BAA8B;IAC9B,UAAU,EAAE,MAAM,CAAC;IACnB,6CAA6C;IAC7C,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,sEAAsE;AACtE,MAAM,MAAM,eAAe,GAAG,YAAY,CAAC;AAE3C,0CAA0C;AAC1C,MAAM,WAAW,aAAc,SAAQ,KAAK;IAC1C,uCAAuC;IACvC,SAAS,EAAE,eAAe,CAAC;IAC3B,uDAAuD;IACvD,KAAK,EAAE,MAAM,CAAC;CACf;AAED,6CAA6C;AAC7C,MAAM,WAAW,YAAY;IAC3B,qBAAqB;IACrB,QAAQ,EAAE,MAAM,CAAC;IACjB,oCAAoC;IACpC,UAAU,EAAE,MAAM,CAAC;IACnB,yBAAyB;IACzB,OAAO,EAAE,MAAM,CAAC;IAChB,oCAAoC;IACpC,KAAK,EAAE,MAAM,CAAC;CACf;AAED,uCAAuC;AACvC,MAAM,MAAM,wBAAwB,GAAG,CAAC,UAAU,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,KAAK,IAAI,CAAC;AAEnF,yCAAyC;AACzC,MAAM,WAAW,gBAAgB;IAC/B,2CAA2C;IAC3C,eAAe,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,gBAAgB,CAAC,CAAC;IACxD,yEAAyE;IACzE,eAAe,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,gBAAgB,CAAC,CAAC;IAC7D,uCAAuC;IACvC,KAAK,CAAC,IAAI,EAAE,MAAM,GAAG,KAAK,EAAE,CAAC;IAC7B,iCAAiC;IACjC,KAAK,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,eAAe,CAAC,CAAC;IAC9C,4BAA4B;IAC5B,WAAW,CAAC,MAAM,EAAE,KAAK,EAAE,GAAG,OAAO,CAAC,aAAa,EAAE,CAAC,CAAC;IACvD,4CAA4C;IAC5C,MAAM,CAAC,KAAK,EAAE,eAAe,EAAE,UAAU,EAAE,eAAe,EAAE,EAAE,IAAI,EAAE,MAAM,GAAG,MAAM,EAAE,CAAC;CACvF;AAED,iEAAiE;AACjE,MAAM,WAAW,aAAa;IAC5B,gDAAgD;IAChD,YAAY,IAAI,OAAO,CAAC,OAAO,CAAC,CAAC;IACjC,2EAA2E;IAC3E,WAAW,CAAC,UAAU,CAAC,EAAE,wBAAwB,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;IACp
E,6BAA6B;IAC7B,YAAY,IAAI,MAAM,CAAC;IACvB,kCAAkC;IAClC,YAAY,IAAI,MAAM,CAAC;IACvB,qDAAqD;IACrD,YAAY,IAAI,OAAO,CAAC,OAAO,CAAC,CAAC;IACjC,yDAAyD;IACzD,WAAW,IAAI,OAAO,CAAC,MAAM,CAAC,CAAC;CAChC"}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Content validation for the ingestion pipeline.
|
|
3
|
+
*
|
|
4
|
+
* Runs quality checks on fetched HTML and extracted content, returning
|
|
5
|
+
* structured issues. Errors block ingestion; warnings are surfaced
|
|
6
|
+
* to the user alongside the stored content.
|
|
7
|
+
*
|
|
8
|
+
* Checks are organised into three tiers:
|
|
9
|
+
* 1. Pattern matching — known services / phrases
|
|
10
|
+
* 2. Structural HTML — HTML characteristics regardless of service
|
|
11
|
+
* 3. Content heuristics — statistical properties of extracted text
|
|
12
|
+
*
|
|
13
|
+
* All patterns are English-only for now.
|
|
14
|
+
*/
|
|
15
|
+
import type { ExtractedContent } from './types.js';
|
|
16
|
+
/** A single finding reported by the content validation functions declared in this file. */
export interface ContentIssue {
|
|
17
|
+
// 'error' issues block ingestion; 'warning' issues are surfaced alongside the stored content.
severity: 'error' | 'warning';
|
|
18
|
+
// NOTE(review): presumably a stable machine-readable identifier for the check — confirm against validate.js.
code: string;
|
|
19
|
+
// NOTE(review): presumably the human-readable description shown to the user — confirm against validate.js.
message: string;
|
|
20
|
+
}
|
|
21
|
+
/**
|
|
22
|
+
* Validate fetched HTML and its extracted content.
|
|
23
|
+
*
|
|
24
|
+
* Returns all detected issues (not just the first). Callers should
|
|
25
|
+
* treat `error`-severity issues as ingestion blockers and `warning`
|
|
26
|
+
* issues as informational.
|
|
27
|
+
*/
|
|
28
|
+
// `html` is the fetched HTML and `extracted` the content extracted from it; the result lists every detected issue, not just the first.
export declare function validateContent(html: string, extracted: ExtractedContent): ContentIssue[];
|
|
29
|
+
/**
|
|
30
|
+
* Validate extracted content from a local file.
|
|
31
|
+
*
|
|
32
|
+
* Runs only Tier 3 (content heuristic) checks. Tier 1 (pattern matching)
|
|
33
|
+
* and Tier 2 (structural HTML analysis) are specific to URL-fetched content.
|
|
34
|
+
*/
|
|
35
|
+
// `extracted` is content extracted from a local file; only the Tier 3 (content heuristic) checks are run on it.
export declare function validateFileContent(extracted: ExtractedContent): ContentIssue[];
|
|
36
|
+
//# sourceMappingURL=validate.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"validate.d.ts","sourceRoot":"","sources":["../../pipeline/validate.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAEH,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,YAAY,CAAC;AAInD,MAAM,WAAW,YAAY;IAC3B,QAAQ,EAAE,OAAO,GAAG,SAAS,CAAC;IAC9B,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,EAAE,MAAM,CAAC;CACjB;AA8RD;;;;;;GAMG;AACH,wBAAgB,eAAe,CAC7B,IAAI,EAAE,MAAM,EACZ,SAAS,EAAE,gBAAgB,GAC1B,YAAY,EAAE,CAgChB;AAED;;;;;GAKG;AACH,wBAAgB,mBAAmB,CAAC,SAAS,EAAE,gBAAgB,GAAG,YAAY,EAAE,CAa/E"}
|