@de-otio/chaoskb-client 0.3.3 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/mcp-server.d.ts +16 -1
- package/dist/cli/mcp-server.d.ts.map +1 -1
- package/dist/cli/mcp-server.js +29 -12
- package/dist/cli/mcp-server.js.map +1 -1
- package/dist/cli/tools/kb-ingest.d.ts +3 -1
- package/dist/cli/tools/kb-ingest.d.ts.map +1 -1
- package/dist/cli/tools/kb-ingest.js +45 -5
- package/dist/cli/tools/kb-ingest.js.map +1 -1
- package/dist/cli/tools/kb-query.d.ts +2 -0
- package/dist/cli/tools/kb-query.d.ts.map +1 -1
- package/dist/cli/tools/kb-query.js +11 -2
- package/dist/cli/tools/kb-query.js.map +1 -1
- package/dist/pipeline/content-pipeline.d.ts +2 -0
- package/dist/pipeline/content-pipeline.d.ts.map +1 -1
- package/dist/pipeline/content-pipeline.js +27 -1
- package/dist/pipeline/content-pipeline.js.map +1 -1
- package/dist/pipeline/extract.d.ts.map +1 -1
- package/dist/pipeline/extract.js +129 -4
- package/dist/pipeline/extract.js.map +1 -1
- package/dist/pipeline/fetch.d.ts +11 -0
- package/dist/pipeline/fetch.d.ts.map +1 -1
- package/dist/pipeline/fetch.js +153 -1
- package/dist/pipeline/fetch.js.map +1 -1
- package/dist/pipeline/file-extract.d.ts +16 -0
- package/dist/pipeline/file-extract.d.ts.map +1 -0
- package/dist/pipeline/file-extract.js +249 -0
- package/dist/pipeline/file-extract.js.map +1 -0
- package/dist/pipeline/index.d.ts +2 -0
- package/dist/pipeline/index.d.ts.map +1 -1
- package/dist/pipeline/index.js +2 -0
- package/dist/pipeline/index.js.map +1 -1
- package/dist/pipeline/types.d.ts +6 -0
- package/dist/pipeline/types.d.ts.map +1 -1
- package/dist/pipeline/validate.d.ts +36 -0
- package/dist/pipeline/validate.d.ts.map +1 -0
- package/dist/pipeline/validate.js +632 -0
- package/dist/pipeline/validate.js.map +1 -0
- package/dist/storage/source-repo.d.ts +2 -0
- package/dist/storage/source-repo.d.ts.map +1 -1
- package/dist/storage/source-repo.js +9 -2
- package/dist/storage/source-repo.js.map +1 -1
- package/dist/storage/types.d.ts +1 -0
- package/dist/storage/types.d.ts.map +1 -1
- package/package.json +4 -1
package/dist/pipeline/extract.js
CHANGED
|
@@ -4,8 +4,53 @@
|
|
|
4
4
|
* Parses HTML with `linkedom` and runs it through Readability to pull out
|
|
5
5
|
* the main article content, stripped of navigation, ads, and boilerplate.
|
|
6
6
|
*/
|
|
7
|
+
import { basename, win32 } from 'node:path';
|
|
7
8
|
import { Readability } from '@mozilla/readability';
|
|
8
9
|
import { parseHTML } from 'linkedom';
|
|
10
|
+
/**
 * Regex matching inline `style` attribute values that visually hide an element.
 *
 * These patterns detect CSS-based hiding tricks that attackers use to embed
 * invisible prompt-injection payloads in web pages. Because `linkedom` does
 * not compute styles, Readability would otherwise include this hidden text.
 *
 * The expression is assembled from one sub-pattern per trick so each can be
 * read (and adjusted) on its own. Matching is case-insensitive and the regex
 * carries no `g` flag, so `.test()` is stateless.
 *
 * Notes on the numeric alternatives:
 * - `opacity` only matches values below 0.1 (`0`, `0.0`, `0.05`, `.01`, ...).
 *   Partially transparent but readable content such as `opacity: 0.8` is kept.
 * - `opacity` and `font-size` tolerate trailing whitespace and `!important`
 *   before the terminating `;` / end of the attribute.
 */
const HIDDEN_STYLE_RE = new RegExp([
    // display: none
    /display\s*:\s*none/,
    // visibility: hidden
    /visibility\s*:\s*hidden/,
    // font-size: 0 (optionally 0.0 and/or a unit)
    /font-size\s*:\s*0(?:\.0+)?(?:px|pt|em|rem|%)?\s*(?:!\s*important\s*)?(?:;|$)/,
    // opacity below 0.1 — effectively invisible
    /opacity\s*:\s*(?:0(?:\.0\d*)?|\.0\d*)\s*(?:!\s*important\s*)?(?:;|$)/,
    // absolutely/fixed positioned far off-screen (left/top of -1000 or less)
    /position\s*:\s*(?:absolute|fixed)[\s\S]*?(?:left|top)\s*:\s*-\d{4,}/,
    // legacy clip: rect(0 ...) hiding
    /clip\s*:\s*rect\(\s*0/,
].map((part) => part.source).join('|'), 'i');
|
|
18
|
+
/**
|
|
19
|
+
* Strip elements that are visually hidden from the DOM before Readability runs.
|
|
20
|
+
*
|
|
21
|
+
* Removes:
|
|
22
|
+
* - Elements with the `hidden` attribute
|
|
23
|
+
* - Elements with `aria-hidden="true"`
|
|
24
|
+
* - `<noscript>` elements (content is irrelevant for non-JS fetching)
|
|
25
|
+
* - Elements whose inline `style` matches common hiding patterns
|
|
26
|
+
*/
|
|
27
|
+
function stripHiddenElements(document) {
|
|
28
|
+
// Remove <noscript> — irrelevant when we don't execute JS
|
|
29
|
+
for (const el of document.querySelectorAll('noscript')) {
|
|
30
|
+
el.remove();
|
|
31
|
+
}
|
|
32
|
+
// Remove elements with the `hidden` attribute
|
|
33
|
+
for (const el of document.querySelectorAll('[hidden]')) {
|
|
34
|
+
el.remove();
|
|
35
|
+
}
|
|
36
|
+
// Remove elements with aria-hidden="true"
|
|
37
|
+
for (const el of document.querySelectorAll('[aria-hidden="true"]')) {
|
|
38
|
+
el.remove();
|
|
39
|
+
}
|
|
40
|
+
// Remove elements whose inline style indicates visual hiding
|
|
41
|
+
for (const el of document.querySelectorAll('[style]')) {
|
|
42
|
+
const style = el.getAttribute('style') ?? '';
|
|
43
|
+
if (HIDDEN_STYLE_RE.test(style)) {
|
|
44
|
+
el.remove();
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
/**
 * Sanitize a source identifier for use in error messages.
 *
 * Absolute local paths are reduced to their final component so messages do
 * not disclose directory layout; anything else (e.g. an http(s) URL) is
 * returned unchanged.
 *
 * @param url - Source identifier: a URL or an absolute file path.
 * @returns The file name for absolute paths, otherwise `url` as given.
 */
function safeSourceLabel(url) {
    // Windows drive-letter (C:\dir\f, C:/dir/f) or UNC (\\server\share\f) path.
    // Parsed with win32 rules explicitly: the platform `basename` on POSIX does
    // not treat `\` as a separator and would return the whole path.
    if (/^(?:[A-Z]:[\\/]|\\\\)/i.test(url))
        return win32.basename(url);
    // POSIX-style absolute path.
    if (url.startsWith('/'))
        return basename(url);
    return url;
}
|
|
9
54
|
/**
|
|
10
55
|
* Extract the main article content from an HTML string.
|
|
11
56
|
*
|
|
@@ -16,9 +61,19 @@ import { parseHTML } from 'linkedom';
|
|
|
16
61
|
*/
|
|
17
62
|
export function extractContent(html, url) {
|
|
18
63
|
if (!html || html.trim().length === 0) {
|
|
19
|
-
throw new Error(`Empty HTML content from ${url}`);
|
|
64
|
+
throw new Error(`Empty HTML content from ${safeSourceLabel(url)}`);
|
|
65
|
+
}
|
|
66
|
+
// Early SPA detection: check the raw HTML before we strip <noscript>.
|
|
67
|
+
// SPA pages have <noscript> as their only meaningful content; once we strip it
|
|
68
|
+
// below, there's nothing left to extract and we'd get a confusing error.
|
|
69
|
+
if (looksLikeSpaHtml(html)) {
|
|
70
|
+
throw new Error(`This page appears to require JavaScript to render its content (${safeSourceLabel(url)}). ` +
|
|
71
|
+
`Only the noscript fallback was captured. ChaosKB does not yet support JavaScript-rendered pages.`);
|
|
20
72
|
}
|
|
21
73
|
const { document } = parseHTML(html);
|
|
74
|
+
// Strip visually-hidden elements before Readability sees them.
|
|
75
|
+
// This prevents CSS-based prompt-injection payloads from surviving extraction.
|
|
76
|
+
stripHiddenElements(document);
|
|
22
77
|
// Attempt Readability extraction
|
|
23
78
|
const reader = new Readability(document);
|
|
24
79
|
const article = reader.parse();
|
|
@@ -29,17 +84,18 @@ export function extractContent(html, url) {
|
|
|
29
84
|
rawContent = article.textContent;
|
|
30
85
|
}
|
|
31
86
|
else {
|
|
32
|
-
// Fallback: extract text from body (strip script/style first)
|
|
87
|
+
// Fallback: extract text from body (strip script/style/hidden first)
|
|
33
88
|
// Wrap in a full HTML document to ensure linkedom creates a body element
|
|
34
89
|
const wrappedHtml = html.includes('<body') ? html : `<html><body>${html}</body></html>`;
|
|
35
90
|
const { document: fallbackDoc } = parseHTML(wrappedHtml);
|
|
91
|
+
stripHiddenElements(fallbackDoc);
|
|
36
92
|
for (const el of fallbackDoc.querySelectorAll('script, style')) {
|
|
37
93
|
el.remove();
|
|
38
94
|
}
|
|
39
95
|
const body = fallbackDoc.querySelector('body');
|
|
40
96
|
rawContent = body ? body.textContent ?? '' : '';
|
|
41
97
|
if (rawContent.trim().length === 0) {
|
|
42
|
-
throw new Error(`No extractable content from ${url}`);
|
|
98
|
+
throw new Error(`No extractable content from ${safeSourceLabel(url)}`);
|
|
43
99
|
}
|
|
44
100
|
title = '';
|
|
45
101
|
}
|
|
@@ -54,14 +110,83 @@ export function extractContent(html, url) {
|
|
|
54
110
|
if (content.length === 0) {
|
|
55
111
|
throw new Error(`No extractable content from ${url}`);
|
|
56
112
|
}
|
|
113
|
+
// Detect JavaScript-only SPA pages that didn't render
|
|
114
|
+
if (looksLikeJsOnlyPage(html, content)) {
|
|
115
|
+
throw new Error(`This page appears to require JavaScript to render its content (${safeSourceLabel(url)}). ` +
|
|
116
|
+
`Only the noscript fallback was captured. ChaosKB does not yet support JavaScript-rendered pages.`);
|
|
117
|
+
}
|
|
57
118
|
const byteLength = Buffer.byteLength(content, 'utf-8');
|
|
58
119
|
return { title, content, url, byteLength };
|
|
59
120
|
}
|
|
121
|
+
/** Patterns that indicate a noscript fallback message. */
const NOSCRIPT_PATTERNS = [
    /you need to enable javascript/i,
    /please enable javascript/i,
    /javascript is required/i,
    /javascript is disabled/i,
    /this app requires javascript/i,
    /enable javascript to run this app/i,
    /this application requires javascript/i,
    /javascript must be enabled/i,
    /works best with javascript enabled/i,
    /this site requires javascript/i,
];
/**
 * Minimum content length (in characters) below which a noscript message
 * is treated as the *entire* page content rather than an incidental mention.
 * A real article that happens to discuss JavaScript would be much longer.
 */
const SPA_CONTENT_THRESHOLD = 500;
/**
 * Matches an empty SPA mount point such as `<div id="root"></div>`.
 *
 * - The `id` may appear anywhere in the tag and other attributes are allowed
 *   (`<div class="shell" id="app" data-x></div>`); `data-id="root"` is not
 *   mistaken for an `id`.
 * - Recognised ids: `root`, `app`, `__next`, `__nuxt`, and Gatsby's mount
 *   point, which is `___gatsby` (three underscores; the two-underscore
 *   spelling is still accepted).
 * - Attribute runs are capped at 256 characters so a pathological tag cannot
 *   cause excessive backtracking on large documents.
 */
const EMPTY_SPA_ROOT_RE = /<div\s(?:[^>]{0,256}?\s)?id\s*=\s*(["']?)(?:root|app|__next|__nuxt|_{2,3}gatsby)\1(?=[\s>])[^>]{0,256}>\s*<\/div>/i;
/**
 * Quick pre-extraction check on raw HTML for SPA shells.
 *
 * Reports `true` only when all three signals are present in `html`:
 * a `<noscript>` element, one of the `NOSCRIPT_PATTERNS` messages, and an
 * empty SPA root container. Intended to run before `<noscript>` elements are
 * stripped so the caller can raise a clear error.
 *
 * The `<noscript>` probe only looks for the opening tag, so fallbacks that
 * wrap their message in markup (`<noscript><p>...</p></noscript>`) count too.
 *
 * @param html - Raw HTML of the page.
 * @returns Whether the page looks like an unrendered SPA shell.
 */
function looksLikeSpaHtml(html) {
    if (!/<noscript\b[^>]*>/i.test(html))
        return false;
    if (!NOSCRIPT_PATTERNS.some((p) => p.test(html)))
        return false;
    return EMPTY_SPA_ROOT_RE.test(html);
}
/**
 * Detect if extracted content is likely a JavaScript-only noscript fallback.
 *
 * A noscript-style message must be present in either the extracted text or
 * the raw HTML (the raw HTML is consulted because `<noscript>` elements are
 * stripped before extraction). Given such a message, the page is flagged when
 * - the extracted text is shorter than `SPA_CONTENT_THRESHOLD` characters, or
 * - the raw HTML contains an empty SPA root container.
 *
 * @param html - Raw HTML of the page.
 * @param extractedText - Text produced by extraction.
 * @returns Whether the page looks like it needs JavaScript to render.
 */
function looksLikeJsOnlyPage(html, extractedText) {
    const hasNoscriptMessage = NOSCRIPT_PATTERNS.some((p) => p.test(extractedText) || p.test(html));
    if (!hasNoscriptMessage)
        return false;
    // Short content + noscript message → almost certainly an SPA shell
    if (extractedText.length < SPA_CONTENT_THRESHOLD)
        return true;
    // Longer content, but an empty mount point sits alongside the message
    return EMPTY_SPA_ROOT_RE.test(html);
}
|
|
60
182
|
/**
|
|
61
|
-
* Clean extracted text by
|
|
183
|
+
* Clean extracted text by stripping steganographic characters,
|
|
184
|
+
* collapsing whitespace, and trimming.
|
|
62
185
|
*/
|
|
63
186
|
function cleanText(text) {
|
|
64
187
|
return text
|
|
188
|
+
.replace(/[\u2028\u2029]/g, '\n') // Unicode line/paragraph separators → newline
|
|
189
|
+
.replace(/[\u200B-\u200F\u202A-\u202F\u2060-\u206F\uFEFF]/g, '') // strip zero-width / bidi / invisible chars
|
|
65
190
|
.replace(/[\t ]+/g, ' ') // collapse horizontal whitespace
|
|
66
191
|
.replace(/\n{3,}/g, '\n\n') // collapse excessive newlines
|
|
67
192
|
.replace(/^ +| +$/gm, '') // trim each line
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"extract.js","sourceRoot":"","sources":["../../pipeline/extract.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAC;AACnD,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AAGrC;;;;;;;GAOG;AACH,MAAM,UAAU,cAAc,CAAC,IAAY,EAAE,GAAW;IACtD,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtC,MAAM,IAAI,KAAK,CAAC,2BAA2B,GAAG,EAAE,CAAC,CAAC;
|
|
1
|
+
{"version":3,"file":"extract.js","sourceRoot":"","sources":["../../pipeline/extract.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AACrC,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAC;AACnD,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AAGrC;;;;;;GAMG;AACH,MAAM,eAAe,GACnB,oNAAoN,CAAC;AAEvN;;;;;;;;GAQG;AACH,SAAS,mBAAmB,CAAC,QAAa;IACxC,0DAA0D;IAC1D,KAAK,MAAM,EAAE,IAAI,QAAQ,CAAC,gBAAgB,CAAC,UAAU,CAAC,EAAE,CAAC;QACvD,EAAE,CAAC,MAAM,EAAE,CAAC;IACd,CAAC;IAED,8CAA8C;IAC9C,KAAK,MAAM,EAAE,IAAI,QAAQ,CAAC,gBAAgB,CAAC,UAAU,CAAC,EAAE,CAAC;QACvD,EAAE,CAAC,MAAM,EAAE,CAAC;IACd,CAAC;IAED,0CAA0C;IAC1C,KAAK,MAAM,EAAE,IAAI,QAAQ,CAAC,gBAAgB,CAAC,sBAAsB,CAAC,EAAE,CAAC;QACnE,EAAE,CAAC,MAAM,EAAE,CAAC;IACd,CAAC;IAED,6DAA6D;IAC7D,KAAK,MAAM,EAAE,IAAI,QAAQ,CAAC,gBAAgB,CAAC,SAAS,CAAC,EAAE,CAAC;QACtD,MAAM,KAAK,GAAG,EAAE,CAAC,YAAY,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC;QAC7C,IAAI,eAAe,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;YAChC,EAAE,CAAC,MAAM,EAAE,CAAC;QACd,CAAC;IACH,CAAC;AACH,CAAC;AAED,0EAA0E;AAC1E,SAAS,eAAe,CAAC,GAAW;IAClC,IAAI,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,YAAY,CAAC,IAAI,CAAC,GAAG,CAAC;QAAE,OAAO,QAAQ,CAAC,GAAG,CAAC,CAAC;IACxE,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,cAAc,CAAC,IAAY,EAAE,GAAW;IACtD,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtC,MAAM,IAAI,KAAK,CAAC,2BAA2B,eAAe,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IACrE,CAAC;IAED,sEAAsE;IACtE,+EAA+E;IAC/E,yEAAyE;IACzE,IAAI,gBAAgB,CAAC,IAAI,CAAC,EAAE,CAAC;QAC3B,MAAM,IAAI,KAAK,CACb,kEAAkE,eAAe,CAAC,GAAG,CAAC,KAAK;YAC3F,kGAAkG,CACnG,CAAC;IACJ,CAAC;IAED,MAAM,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;IAErC,+DAA+D;IAC/D,+EAA+E;IAC/E,mBAAmB,CAAC,QAAQ,CAAC,CAAC;IAE9B,iCAAiC;IACjC,MAAM,MAAM,GAAG,IAAI,WAAW,CAAC,QAAe,CAAC,CAAC;IAChD,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAAE,CAAC;IAE/B,IAAI,KAAa,CAAC;IAClB,IAAI,UAAkB,CAAC;IAEvB,IAAI,OAAO,IAAI,OAAO,CAAC,WAAW,IAAI,OAAO,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC5E,KAAK,GAAG,OAAO,CAAC,KAAK,IAAI,EAAE,CAAC;QAC5B,UAAU,GAAG,OAAO,CAAC,WAAW,CAAC;IACnC,CAAC;SAAM,CA
AC;QACN,qEAAqE;QACrE,yEAAyE;QACzE,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,eAAe,IAAI,gBAAgB,CAAC;QACxF,MAAM,EAAE,QAAQ,EAAE,WAAW,EAAE,GAAG,SAAS,CAAC,WAAW,CAAC,CAAC;QACzD,mBAAmB,CAAC,WAAW,CAAC,CAAC;QACjC,KAAK,MAAM,EAAE,IAAI,WAAW,CAAC,gBAAgB,CAAC,eAAe,CAAC,EAAE,CAAC;YAC/D,EAAE,CAAC,MAAM,EAAE,CAAC;QACd,CAAC;QACD,MAAM,IAAI,GAAG,WAAW,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;QAC/C,UAAU,GAAG,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAEhD,IAAI,UAAU,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACnC,MAAM,IAAI,KAAK,CAAC,+BAA+B,eAAe,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;QACzE,CAAC;QAED,KAAK,GAAG,EAAE,CAAC;IACb,CAAC;IAED,qDAAqD;IACrD,IAAI,CAAC,KAAK,EAAE,CAAC;QACX,MAAM,EAAE,QAAQ,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;QAC/C,MAAM,OAAO,GAAG,QAAQ,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC;QAChD,KAAK,GAAG,OAAO,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IAC7C,CAAC;IAED,0DAA0D;IAC1D,MAAM,OAAO,GAAG,SAAS,CAAC,UAAU,CAAC,CAAC;IAEtC,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,MAAM,IAAI,KAAK,CAAC,+BAA+B,GAAG,EAAE,CAAC,CAAC;IACxD,CAAC;IAED,sDAAsD;IACtD,IAAI,mBAAmB,CAAC,IAAI,EAAE,OAAO,CAAC,EAAE,CAAC;QACvC,MAAM,IAAI,KAAK,CACb,kEAAkE,eAAe,CAAC,GAAG,CAAC,KAAK;YAC3F,kGAAkG,CACnG,CAAC;IACJ,CAAC;IAED,MAAM,UAAU,GAAG,MAAM,CAAC,UAAU,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC;IAEvD,OAAO,EAAE,KAAK,EAAE,OAAO,EAAE,GAAG,EAAE,UAAU,EAAE,CAAC;AAC7C,CAAC;AAED,0DAA0D;AAC1D,MAAM,iBAAiB,GAAG;IACxB,gCAAgC;IAChC,2BAA2B;IAC3B,yBAAyB;IACzB,yBAAyB;IACzB,+BAA+B;IAC/B,oCAAoC;IACpC,uCAAuC;IACvC,6BAA6B;IAC7B,qCAAqC;IACrC,gCAAgC;CACjC,CAAC;AAEF;;;;GAIG;AACH,MAAM,qBAAqB,GAAG,GAAG,CAAC;AAElC;;;;;;GAMG;AACH,SAAS,gBAAgB,CAAC,IAAY;IACpC,MAAM,WAAW,GAAG,qCAAqC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACrE,IAAI,CAAC,WAAW;QAAE,OAAO,KAAK,CAAC;IAE/B,MAAM,kBAAkB,GAAG,iBAAiB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC;IACvE,IAAI,CAAC,kBAAkB;QAAE,OAAO,KAAK,CAAC;IAEtC,MAAM,UAAU,GACd,sEAAsE,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAEpF,OAAO,UAAU,CAAC;AACpB,CAAC;AAED;;;;;;;;;;GAUG;AACH,SAAS,mBAAmB,
CAAC,IAAY,EAAE,aAAqB;IAC9D,wFAAwF;IACxF,MAAM,kBAAkB,GACtB,iBAAiB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QACpD,iBAAiB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC;IAC9C,IAAI,CAAC,kBAAkB;QAAE,OAAO,KAAK,CAAC;IAEtC,mEAAmE;IACnE,IAAI,aAAa,CAAC,MAAM,GAAG,qBAAqB;QAAE,OAAO,IAAI,CAAC;IAE9D,8DAA8D;IAC9D,8DAA8D;IAC9D,MAAM,UAAU,GACd,sEAAsE,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAEpF,OAAO,UAAU,CAAC;AACpB,CAAC;AAED;;;GAGG;AACH,SAAS,SAAS,CAAC,IAAY;IAC7B,OAAO,IAAI;SACR,OAAO,CAAC,iBAAiB,EAAE,IAAI,CAAC,CAAgC,8CAA8C;SAC9G,OAAO,CAAC,kDAAkD,EAAE,EAAE,CAAC,CAAC,4CAA4C;SAC5G,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC,CAAO,iCAAiC;SAC/D,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAI,8BAA8B;SAC5D,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC,CAAM,iBAAiB;SAC/C,IAAI,EAAE,CAAC;AACZ,CAAC"}
|
package/dist/pipeline/fetch.d.ts
CHANGED
|
@@ -3,8 +3,19 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Uses Node.js built-in `fetch` (available since Node 18) with
|
|
5
5
|
* configurable timeout, redirect limits, and user-agent.
|
|
6
|
+
*
|
|
7
|
+
* Includes SSRF protection to prevent fetching from private/internal networks.
|
|
6
8
|
*/
|
|
7
9
|
import type { PipelineConfig } from './types.js';
|
|
10
|
+
/** Maximum response body size in bytes: 10 MiB (10 * 1024 * 1024). */
|
|
11
|
+
export declare const MAX_RESPONSE_BYTES: number;
|
|
12
|
+
/**
 * Validate a URL for SSRF safety before fetching.
 *
 * Rejects non-HTTP(S) schemes, private/internal IPs, and known
 * cloud metadata endpoints. Hostnames that are not IP literals are resolved
 * via DNS and every returned address is checked; a failed DNS lookup is not
 * treated as a validation failure.
 *
 * @param url - The URL string to validate.
 * @returns A promise that resolves (with no value) when the URL is acceptable.
 * @throws Error when the URL cannot be parsed, uses a scheme other than
 *   `http:`/`https:`, names a blocked metadata host, or targets a
 *   private/internal address.
 */
|
|
18
|
+
export declare function validateUrl(url: string): Promise<void>;
|
|
8
19
|
/** Result of a successful URL fetch. */
|
|
9
20
|
export interface FetchResult {
|
|
10
21
|
/** Raw HTML body. */
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"fetch.d.ts","sourceRoot":"","sources":["../../pipeline/fetch.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"fetch.d.ts","sourceRoot":"","sources":["../../pipeline/fetch.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAIH,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AASjD,mDAAmD;AACnD,eAAO,MAAM,kBAAkB,QAAmB,CAAC;AAoDnD;;;;;GAKG;AACH,wBAAsB,WAAW,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAiD5D;AA2CD,wCAAwC;AACxC,MAAM,WAAW,WAAW;IAC1B,qBAAqB;IACrB,IAAI,EAAE,MAAM,CAAC;IACb,qCAAqC;IACrC,QAAQ,EAAE,MAAM,CAAC;IACjB,iCAAiC;IACjC,WAAW,EAAE,MAAM,CAAC;CACrB;AAED;;;;;;;GAOG;AACH,wBAAsB,QAAQ,CAC5B,GAAG,EAAE,MAAM,EACX,MAAM,CAAC,EAAE,OAAO,CAAC,cAAc,CAAC,GAC/B,OAAO,CAAC,WAAW,CAAC,CAkFtB"}
|
package/dist/pipeline/fetch.js
CHANGED
|
@@ -3,13 +3,161 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Uses Node.js built-in `fetch` (available since Node 18) with
|
|
5
5
|
* configurable timeout, redirect limits, and user-agent.
|
|
6
|
+
*
|
|
7
|
+
* Includes SSRF protection to prevent fetching from private/internal networks.
|
|
6
8
|
*/
|
|
9
|
+
import dns from 'node:dns/promises';
|
|
10
|
+
import { isIP } from 'node:net';
|
|
7
11
|
/** Fallback values for the fetch-related pipeline settings. */
const DEFAULTS = {
    fetchTimeoutMs: 30 * 1000,
    maxRedirects: 5,
    userAgent: 'ChaosKB/0.1',
};
/** Upper bound on a response body, in bytes (10 MiB). */
export const MAX_RESPONSE_BYTES = 10 * (1 << 20);
/**
 * Cloud metadata hostnames that are refused outright, listed both with and
 * without the trailing root dot.
 */
const BLOCKED_HOSTNAMES = new Set(['metadata.google.internal', 'metadata.google.internal.']);
|
|
24
|
+
/**
 * Parse an IPv6 literal (already accepted by `isIP`) into its eight 16-bit
 * groups, expanding `::` and any trailing dotted-quad and ignoring a zone id.
 */
function ipv6Groups(address) {
    let text = address.split('%', 1)[0].toLowerCase();
    // Rewrite a trailing dotted quad (`::ffff:1.2.3.4`) as two hex groups.
    const quad = /(\d+)\.(\d+)\.(\d+)\.(\d+)$/.exec(text);
    if (quad) {
        const hi = ((Number(quad[1]) << 8) | Number(quad[2])).toString(16);
        const lo = ((Number(quad[3]) << 8) | Number(quad[4])).toString(16);
        text = `${text.slice(0, -quad[0].length)}${hi}:${lo}`;
    }
    const [head, tail] = text.split('::');
    const left = head ? head.split(':') : [];
    const right = tail ? tail.split(':') : [];
    // `::` stands for however many zero groups are needed to reach eight.
    const fill = text.includes('::') ? new Array(8 - left.length - right.length).fill('0') : [];
    return [...left, ...fill, ...right].map((group) => parseInt(group, 16));
}
/**
 * Check if an IP address belongs to a private/reserved range.
 *
 * IPv4: 0.0.0.0/8, 10/8, 100.64/10 (shared/CGNAT), 127/8 loopback,
 * 169.254/16 link-local (incl. cloud metadata), 172.16/12, 192.0.0.0/24,
 * 192.168/16, 198.18/15 (benchmarking), and 224.0.0.0 and above
 * (multicast, reserved, broadcast).
 *
 * IPv6: `::/96` (unspecified, loopback, deprecated IPv4-compatible),
 * IPv4-mapped addresses whose embedded IPv4 address is private, ULA
 * fc00::/7, link-local fe80::/10, and multicast ff00::/8. Addresses are
 * expanded to their numeric groups first, so non-canonical spellings
 * (`0:0:0:0:0:0:0:1`, upper case, zone ids) are classified the same way.
 *
 * @param ip - IP address literal, without surrounding brackets.
 * @returns `true` for private/reserved addresses; `false` for public
 *   addresses and for strings that are not IP literals.
 */
function isPrivateIp(ip) {
    const family = isIP(ip);
    if (family === 4) {
        const [a, b, c] = ip.split('.').map(Number);
        if (a === 0)
            return true; // 0.0.0.0/8
        if (a === 10)
            return true; // 10.0.0.0/8
        if (a === 100 && b >= 64 && b <= 127)
            return true; // 100.64.0.0/10 shared address space (CGNAT)
        if (a === 127)
            return true; // 127.0.0.0/8 loopback
        if (a === 169 && b === 254)
            return true; // 169.254.0.0/16 link-local / cloud metadata
        if (a === 172 && b >= 16 && b <= 31)
            return true; // 172.16.0.0/12
        if (a === 192 && b === 0 && c === 0)
            return true; // 192.0.0.0/24 IETF protocol assignments
        if (a === 192 && b === 168)
            return true; // 192.168.0.0/16
        if (a === 198 && (b === 18 || b === 19))
            return true; // 198.18.0.0/15 benchmarking
        if (a >= 224)
            return true; // multicast, reserved, broadcast
        return false;
    }
    if (family === 6) {
        const groups = ipv6Groups(ip);
        const [first, , , , , sixth, seventh, eighth] = groups;
        const firstFiveZero = groups.slice(0, 5).every((group) => group === 0);
        if (firstFiveZero && sixth === 0)
            return true; // ::/96 — unspecified, loopback, IPv4-compatible
        if (firstFiveZero && sixth === 0xffff) {
            // IPv4-mapped (::ffff:a.b.c.d) — judge the embedded IPv4 address
            const embedded = `${seventh >> 8}.${seventh & 0xff}.${eighth >> 8}.${eighth & 0xff}`;
            return isPrivateIp(embedded);
        }
        if ((first & 0xfe00) === 0xfc00)
            return true; // ULA fc00::/7
        if ((first & 0xffc0) === 0xfe80)
            return true; // link-local fe80::/10
        if ((first & 0xff00) === 0xff00)
            return true; // multicast ff00::/8
        return false;
    }
    return false;
}
|
|
76
|
+
/**
|
|
77
|
+
* Validate a URL for SSRF safety before fetching.
|
|
78
|
+
*
|
|
79
|
+
* Rejects non-HTTP(S) schemes, private/internal IPs, and known
|
|
80
|
+
* cloud metadata endpoints.
|
|
81
|
+
*/
|
|
82
|
+
export async function validateUrl(url) {
|
|
83
|
+
let parsed;
|
|
84
|
+
try {
|
|
85
|
+
parsed = new URL(url);
|
|
86
|
+
}
|
|
87
|
+
catch {
|
|
88
|
+
throw new Error(`Invalid URL: ${url}`);
|
|
89
|
+
}
|
|
90
|
+
// Only allow http and https
|
|
91
|
+
if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
|
|
92
|
+
throw new Error(`URL scheme "${parsed.protocol}" is not allowed. Only http: and https: are supported.`);
|
|
93
|
+
}
|
|
94
|
+
const hostname = parsed.hostname;
|
|
95
|
+
// Block known cloud metadata hostnames
|
|
96
|
+
if (BLOCKED_HOSTNAMES.has(hostname.toLowerCase())) {
|
|
97
|
+
throw new Error('URL targets a cloud metadata endpoint and cannot be fetched.');
|
|
98
|
+
}
|
|
99
|
+
// Strip IPv6 brackets for IP checks (URL parses [::1] with brackets)
|
|
100
|
+
const bareHost = hostname.startsWith('[') && hostname.endsWith(']')
|
|
101
|
+
? hostname.slice(1, -1)
|
|
102
|
+
: hostname;
|
|
103
|
+
// If hostname is already an IP literal, check it directly
|
|
104
|
+
if (isIP(bareHost)) {
|
|
105
|
+
if (isPrivateIp(bareHost)) {
|
|
106
|
+
throw new Error('URL targets a private/internal network address and cannot be fetched.');
|
|
107
|
+
}
|
|
108
|
+
return;
|
|
109
|
+
}
|
|
110
|
+
// Resolve hostname and check all resulting IPs
|
|
111
|
+
let addresses;
|
|
112
|
+
try {
|
|
113
|
+
addresses = await dns.lookup(hostname, { all: true });
|
|
114
|
+
}
|
|
115
|
+
catch {
|
|
116
|
+
// DNS failure will be caught later by fetch itself
|
|
117
|
+
return;
|
|
118
|
+
}
|
|
119
|
+
for (const addr of addresses) {
|
|
120
|
+
if (isPrivateIp(addr.address)) {
|
|
121
|
+
throw new Error('URL targets a private/internal network address and cannot be fetched.');
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
/**
|
|
126
|
+
* Read a Response body with a size limit to prevent memory exhaustion.
|
|
127
|
+
*/
|
|
128
|
+
async function readResponseWithLimit(response, maxBytes) {
|
|
129
|
+
const reader = response.body?.getReader();
|
|
130
|
+
if (!reader) {
|
|
131
|
+
// Fallback for environments without streaming
|
|
132
|
+
return response.text();
|
|
133
|
+
}
|
|
134
|
+
const chunks = [];
|
|
135
|
+
let totalBytes = 0;
|
|
136
|
+
try {
|
|
137
|
+
while (true) {
|
|
138
|
+
const { done, value } = await reader.read();
|
|
139
|
+
if (done)
|
|
140
|
+
break;
|
|
141
|
+
totalBytes += value.byteLength;
|
|
142
|
+
if (totalBytes > maxBytes) {
|
|
143
|
+
reader.cancel();
|
|
144
|
+
throw new Error(`Response body exceeds ${maxBytes / 1024 / 1024} MB limit. ` +
|
|
145
|
+
'The page is too large to ingest.');
|
|
146
|
+
}
|
|
147
|
+
chunks.push(value);
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
finally {
|
|
151
|
+
reader.releaseLock();
|
|
152
|
+
}
|
|
153
|
+
const combined = new Uint8Array(totalBytes);
|
|
154
|
+
let offset = 0;
|
|
155
|
+
for (const chunk of chunks) {
|
|
156
|
+
combined.set(chunk, offset);
|
|
157
|
+
offset += chunk.byteLength;
|
|
158
|
+
}
|
|
159
|
+
return new TextDecoder().decode(combined);
|
|
160
|
+
}
|
|
13
161
|
/**
|
|
14
162
|
* Fetch the HTML content of a URL.
|
|
15
163
|
*
|
|
@@ -19,6 +167,10 @@ const DEFAULTS = {
|
|
|
19
167
|
* @throws On network errors, non-2xx status codes, or non-HTML content.
|
|
20
168
|
*/
|
|
21
169
|
export async function fetchUrl(url, config) {
|
|
170
|
+
// SSRF protection: reject private/internal network targets
|
|
171
|
+
if (!config?._skipSsrfCheck) {
|
|
172
|
+
await validateUrl(url);
|
|
173
|
+
}
|
|
22
174
|
const timeoutMs = config?.fetchTimeoutMs ?? DEFAULTS.fetchTimeoutMs;
|
|
23
175
|
const userAgent = config?.userAgent ?? DEFAULTS.userAgent;
|
|
24
176
|
const controller = new AbortController();
|
|
@@ -84,7 +236,7 @@ export async function fetchUrl(url, config) {
|
|
|
84
236
|
if (!isHtml) {
|
|
85
237
|
throw new Error(`Non-HTML content type "${contentType}" for ${url}. Only text/html is supported.`);
|
|
86
238
|
}
|
|
87
|
-
const html = await response
|
|
239
|
+
const html = await readResponseWithLimit(response, MAX_RESPONSE_BYTES);
|
|
88
240
|
const finalUrl = response.url || url;
|
|
89
241
|
return { html, finalUrl, contentType };
|
|
90
242
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"fetch.js","sourceRoot":"","sources":["../../pipeline/fetch.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"fetch.js","sourceRoot":"","sources":["../../pipeline/fetch.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,GAAG,MAAM,mBAAmB,CAAC;AACpC,OAAO,EAAE,IAAI,EAAE,MAAM,UAAU,CAAC;AAGhC,kEAAkE;AAClE,MAAM,QAAQ,GAA0E;IACtF,cAAc,EAAE,MAAM;IACtB,YAAY,EAAE,CAAC;IACf,SAAS,EAAE,aAAa;CACzB,CAAC;AAEF,mDAAmD;AACnD,MAAM,CAAC,MAAM,kBAAkB,GAAG,EAAE,GAAG,IAAI,GAAG,IAAI,CAAC;AAEnD,oDAAoD;AACpD,MAAM,iBAAiB,GAAG,IAAI,GAAG,CAAC;IAChC,0BAA0B;IAC1B,2BAA2B;CAC5B,CAAC,CAAC;AAEH;;;;;GAKG;AACH,SAAS,WAAW,CAAC,EAAU;IAC7B,cAAc;IACd,IAAI,IAAI,CAAC,EAAE,CAAC,KAAK,CAAC,EAAE,CAAC;QACnB,MAAM,KAAK,GAAG,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;QACxC,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,KAAK,CAAC;QACrB,IAAI,CAAC,KAAK,GAAG;YAAE,OAAO,IAAI,CAAC,CAAwB,uBAAuB;QAC1E,IAAI,CAAC,KAAK,EAAE;YAAE,OAAO,IAAI,CAAC,CAAyB,aAAa;QAChE,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE;YAAE,OAAO,IAAI,CAAC,CAAE,gBAAgB;QACnE,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,GAAG;YAAE,OAAO,IAAI,CAAC,CAAW,iBAAiB;QACpE,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,GAAG;YAAE,OAAO,IAAI,CAAC,CAAW,6CAA6C;QAChG,IAAI,CAAC,KAAK,CAAC;YAAE,OAAO,IAAI,CAAC,CAA0B,YAAY;QAC/D,OAAO,KAAK,CAAC;IACf,CAAC;IAED,cAAc;IACd,IAAI,IAAI,CAAC,EAAE,CAAC,KAAK,CAAC,EAAE,CAAC;QACnB,MAAM,UAAU,GAAG,EAAE,CAAC,WAAW,EAAE,CAAC;QACpC,IAAI,UAAU,KAAK,KAAK;YAAE,OAAO,IAAI,CAAC,CAAmB,WAAW;QACpE,IAAI,UAAU,KAAK,IAAI;YAAE,OAAO,IAAI,CAAC,CAAoB,cAAc;QACvE,IAAI,UAAU,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,UAAU,CAAC,UAAU,CAAC,IAAI,CAAC;YAAE,OAAO,IAAI,CAAC,CAAC,eAAe;QAC5F,IAAI,UAAU,CAAC,UAAU,CAAC,MAAM,CAAC;YAAE,OAAO,IAAI,CAAC,CAAU,aAAa;QACtE,oDAAoD;QACpD,MAAM,QAAQ,GAAG,UAAU,CAAC,KAAK,CAAC,+BAA+B,CAAC,CAAC;QACnE,IAAI,QAAQ;YAAE,OAAO,WAAW,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;QAC9C,0EAA0E;QAC1E,MAAM,KAAK,GAAG,UAAU,CAAC,KAAK,CAAC,0CAA0C,CAAC,CAAC;QAC3E,IAAI,KAAK,EAAE,CAAC;YACV,MAAM,EAAE,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YAClC,MAAM,EAAE,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YAClC,MAAM,EAAE,GAAG,GAAG,CAAC,EAAE,IAAI,CAAC,CAAC,GAAG,IAAI,IAAI
,EAAE,GAAG,IAAI,IAAI,CAAC,EAAE,IAAI,CAAC,CAAC,GAAG,IAAI,IAAI,EAAE,GAAG,IAAI,EAAE,CAAC;YAC/E,OAAO,WAAW,CAAC,EAAE,CAAC,CAAC;QACzB,CAAC;QACD,OAAO,KAAK,CAAC;IACf,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAAC,GAAW;IAC3C,IAAI,MAAW,CAAC;IAChB,IAAI,CAAC;QACH,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;IACxB,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,KAAK,CAAC,gBAAgB,GAAG,EAAE,CAAC,CAAC;IACzC,CAAC;IAED,4BAA4B;IAC5B,IAAI,MAAM,CAAC,QAAQ,KAAK,OAAO,IAAI,MAAM,CAAC,QAAQ,KAAK,QAAQ,EAAE,CAAC;QAChE,MAAM,IAAI,KAAK,CACb,eAAe,MAAM,CAAC,QAAQ,wDAAwD,CACvF,CAAC;IACJ,CAAC;IAED,MAAM,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC;IAEjC,uCAAuC;IACvC,IAAI,iBAAiB,CAAC,GAAG,CAAC,QAAQ,CAAC,WAAW,EAAE,CAAC,EAAE,CAAC;QAClD,MAAM,IAAI,KAAK,CAAC,8DAA8D,CAAC,CAAC;IAClF,CAAC;IAED,qEAAqE;IACrE,MAAM,QAAQ,GAAG,QAAQ,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,QAAQ,CAAC,QAAQ,CAAC,GAAG,CAAC;QACjE,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;QACvB,CAAC,CAAC,QAAQ,CAAC;IAEb,0DAA0D;IAC1D,IAAI,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC;QACnB,IAAI,WAAW,CAAC,QAAQ,CAAC,EAAE,CAAC;YAC1B,MAAM,IAAI,KAAK,CAAC,uEAAuE,CAAC,CAAC;QAC3F,CAAC;QACD,OAAO;IACT,CAAC;IAED,+CAA+C;IAC/C,IAAI,SAAgD,CAAC;IACrD,IAAI,CAAC;QACH,SAAS,GAAG,MAAM,GAAG,CAAC,MAAM,CAAC,QAAQ,EAAE,EAAE,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC;IACxD,CAAC;IAAC,MAAM,CAAC;QACP,mDAAmD;QACnD,OAAO;IACT,CAAC;IAED,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;QAC7B,IAAI,WAAW,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;YAC9B,MAAM,IAAI,KAAK,CAAC,uEAAuE,CAAC,CAAC;QAC3F,CAAC;IACH,CAAC;AACH,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,qBAAqB,CAAC,QAAkB,EAAE,QAAgB;IACvE,MAAM,MAAM,GAAG,QAAQ,CAAC,IAAI,EAAE,SAAS,EAAE,CAAC;IAC1C,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,8CAA8C;QAC9C,OAAO,QAAQ,CAAC,IAAI,EAAE,CAAC;IACzB,CAAC;IAED,MAAM,MAAM,GAAiB,EAAE,CAAC;IAChC,IAAI,UAAU,GAAG,CAAC,CAAC;IAEnB,IAAI,CAAC;QACH,OAAO,IAAI,EAAE,CAAC;YACZ,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,MAAM,MAAM,CAAC,IAAI,EAAE,CAAC;YAC5C,IAAI,IAAI;gBAAE,MAAM;YAChB,UAAU,IAAI,KAAK,CAAC,UAAU,CAAC;YAC/B,IAAI,UAAU,GAAG,QAAQ,EAAE,CAAC;gBAC1B,MAAM,CAAC,MAAM,EAAE,CAAC;gBAChB,MAAM,IAAI,KAAK,CACb,y
BAAyB,QAAQ,GAAG,IAAI,GAAG,IAAI,aAAa;oBAC5D,kCAAkC,CACnC,CAAC;YACJ,CAAC;YACD,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACrB,CAAC;IACH,CAAC;YAAS,CAAC;QACT,MAAM,CAAC,WAAW,EAAE,CAAC;IACvB,CAAC;IAED,MAAM,QAAQ,GAAG,IAAI,UAAU,CAAC,UAAU,CAAC,CAAC;IAC5C,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,QAAQ,CAAC,GAAG,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC;QAC5B,MAAM,IAAI,KAAK,CAAC,UAAU,CAAC;IAC7B,CAAC;IAED,OAAO,IAAI,WAAW,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;AAC5C,CAAC;AAYD;;;;;;;GAOG;AACH,MAAM,CAAC,KAAK,UAAU,QAAQ,CAC5B,GAAW,EACX,MAAgC;IAEhC,2DAA2D;IAC3D,IAAI,CAAC,MAAM,EAAE,cAAc,EAAE,CAAC;QAC5B,MAAM,WAAW,CAAC,GAAG,CAAC,CAAC;IACzB,CAAC;IAED,MAAM,SAAS,GAAG,MAAM,EAAE,cAAc,IAAI,QAAQ,CAAC,cAAc,CAAC;IACpE,MAAM,SAAS,GAAG,MAAM,EAAE,SAAS,IAAI,QAAQ,CAAC,SAAS,CAAC;IAE1D,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;IACzC,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,SAAS,CAAC,CAAC;IAE9D,IAAI,QAAkB,CAAC;IACvB,IAAI,CAAC;QACH,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;YAC1B,MAAM,EAAE,KAAK;YACb,OAAO,EAAE;gBACP,YAAY,EAAE,SAAS;gBACvB,MAAM,EAAE,iEAAiE;aAC1E;YACD,MAAM,EAAE,UAAU,CAAC,MAAM;YACzB,QAAQ,EAAE,QAAQ;SACnB,CAAC,CAAC;IACL,CAAC;IAAC,OAAO,KAAc,EAAE,CAAC;QACxB,YAAY,CAAC,KAAK,CAAC,CAAC;QACpB,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;YAC3B,IAAI,KAAK,CAAC,IAAI,KAAK,YAAY,EAAE,CAAC;gBAChC,MAAM,IAAI,KAAK,CAAC,yBAAyB,SAAS,OAAO,GAAG,EAAE,CAAC,CAAC;YAClE,CAAC;YACD,0BAA0B;YAC1B,IAAI,KAAK,CAAC,KAAK,IAAI,OAAO,KAAK,CAAC,KAAK,KAAK,QAAQ,IAAI,MAAM,IAAI,KAAK,CAAC,KAAK,EAAE,CAAC;gBAC5E,MAAM,IAAI,GAAI,KAAK,CAAC,KAA2B,CAAC,IAAI,CAAC;gBACrD,IAAI,IAAI,KAAK,WAAW,EAAE,CAAC;oBACzB,MAAM,IAAI,KAAK,CAAC,6BAA6B,GAAG,kBAAkB,CAAC,CAAC;gBACtE,CAAC;gBACD,IAAI,IAAI,KAAK,cAAc,EAAE,CAAC;oBAC5B,MAAM,IAAI,KAAK,CAAC,0BAA0B,GAAG,EAAE,CAAC,CAAC;gBACnD,CAAC;gBACD,IAAI,IAAI,KAAK,iCAAiC,IAAI,IAAI,KAAK,8BAA8B,EAAE,CAAC;oBAC1F,MAAM,IAAI,KAAK,CAAC,6BAA6B,GAAG,KAAK,IAAI,EAAE,CAAC,CAAC;gBAC/D,CAAC;YACH,CAAC;YACD,0CAA0C;YAC1C,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAC,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAC,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAA
C,aAAa,CAAC,EAAE,CAAC;gBAC5G,MAAM,IAAI,KAAK,CAAC,sBAAsB,GAAG,KAAK,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;YACjE,CAAC;YACD,MAAM,IAAI,KAAK,CAAC,mBAAmB,GAAG,KAAK,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;QAC9D,CAAC;QACD,MAAM,IAAI,KAAK,CAAC,mBAAmB,GAAG,iBAAiB,CAAC,CAAC;IAC3D,CAAC;YAAS,CAAC;QACT,YAAY,CAAC,KAAK,CAAC,CAAC;IACtB,CAAC;IAED,oBAAoB;IACpB,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;QACjB,MAAM,MAAM,GAAG,QAAQ,CAAC,MAAM,CAAC;QAC/B,IAAI,MAAM,IAAI,GAAG,IAAI,MAAM,GAAG,GAAG,EAAE,CAAC;YAClC,MAAM,IAAI,KAAK,CAAC,QAAQ,MAAM,qBAAqB,GAAG,KAAK,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;QACpF,CAAC;QACD,IAAI,MAAM,IAAI,GAAG,EAAE,CAAC;YAClB,MAAM,IAAI,KAAK,CAAC,QAAQ,MAAM,qBAAqB,GAAG,KAAK,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;QACpF,CAAC;QACD,MAAM,IAAI,KAAK,CAAC,QAAQ,MAAM,QAAQ,GAAG,KAAK,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;IACvE,CAAC;IAED,mCAAmC;IACnC,MAAM,WAAW,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,EAAE,CAAC;IAC/D,MAAM,MAAM,GACV,WAAW,CAAC,QAAQ,CAAC,WAAW,CAAC;QACjC,WAAW,CAAC,QAAQ,CAAC,uBAAuB,CAAC;QAC7C,WAAW,CAAC,QAAQ,CAAC,iBAAiB,CAAC,CAAC;IAE1C,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,IAAI,KAAK,CACb,0BAA0B,WAAW,SAAS,GAAG,gCAAgC,CAClF,CAAC;IACJ,CAAC;IAED,MAAM,IAAI,GAAG,MAAM,qBAAqB,CAAC,QAAQ,EAAE,kBAAkB,CAAC,CAAC;IACvE,MAAM,QAAQ,GAAG,QAAQ,CAAC,GAAG,IAAI,GAAG,CAAC;IAErC,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,WAAW,EAAE,CAAC;AACzC,CAAC"}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Local file content extraction.
|
|
3
|
+
*
|
|
4
|
+
* Dispatches to format-specific extractors based on file extension.
|
|
5
|
+
* Supports PDF, DOCX, PPTX, HTML, TXT, and Markdown.
|
|
6
|
+
*/
|
|
7
|
+
import type { ExtractedContent } from './types.js';
|
|
8
|
+
/**
 * Extract content from a local file.
 *
 * Per this module's header, extraction is dispatched to a format-specific
 * extractor based on the file extension; PDF, DOCX, PPTX, HTML, TXT, and
 * Markdown are supported.
 *
 * @param filePath - Path to the file (resolved to absolute).
 * @returns Extracted content with title, text, and the absolute path as `url`.
 * @throws On missing/unreadable file, unsupported format, or empty content.
 */
|
|
15
|
+
export declare function extractFromFile(filePath: string): Promise<ExtractedContent>;
|
|
16
|
+
//# sourceMappingURL=file-extract.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"file-extract.d.ts","sourceRoot":"","sources":["../../pipeline/file-extract.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAIH,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,YAAY,CAAC;AA2BnD;;;;;;GAMG;AACH,wBAAsB,eAAe,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,gBAAgB,CAAC,CAqDjF"}
|