@de-otio/chaoskb-client 0.3.3 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. package/dist/cli/mcp-server.d.ts +16 -1
  2. package/dist/cli/mcp-server.d.ts.map +1 -1
  3. package/dist/cli/mcp-server.js +29 -12
  4. package/dist/cli/mcp-server.js.map +1 -1
  5. package/dist/cli/tools/kb-ingest.d.ts +3 -1
  6. package/dist/cli/tools/kb-ingest.d.ts.map +1 -1
  7. package/dist/cli/tools/kb-ingest.js +45 -5
  8. package/dist/cli/tools/kb-ingest.js.map +1 -1
  9. package/dist/cli/tools/kb-query.d.ts +2 -0
  10. package/dist/cli/tools/kb-query.d.ts.map +1 -1
  11. package/dist/cli/tools/kb-query.js +11 -2
  12. package/dist/cli/tools/kb-query.js.map +1 -1
  13. package/dist/pipeline/content-pipeline.d.ts +2 -0
  14. package/dist/pipeline/content-pipeline.d.ts.map +1 -1
  15. package/dist/pipeline/content-pipeline.js +27 -1
  16. package/dist/pipeline/content-pipeline.js.map +1 -1
  17. package/dist/pipeline/extract.d.ts.map +1 -1
  18. package/dist/pipeline/extract.js +129 -4
  19. package/dist/pipeline/extract.js.map +1 -1
  20. package/dist/pipeline/fetch.d.ts +11 -0
  21. package/dist/pipeline/fetch.d.ts.map +1 -1
  22. package/dist/pipeline/fetch.js +153 -1
  23. package/dist/pipeline/fetch.js.map +1 -1
  24. package/dist/pipeline/file-extract.d.ts +16 -0
  25. package/dist/pipeline/file-extract.d.ts.map +1 -0
  26. package/dist/pipeline/file-extract.js +249 -0
  27. package/dist/pipeline/file-extract.js.map +1 -0
  28. package/dist/pipeline/index.d.ts +2 -0
  29. package/dist/pipeline/index.d.ts.map +1 -1
  30. package/dist/pipeline/index.js +2 -0
  31. package/dist/pipeline/index.js.map +1 -1
  32. package/dist/pipeline/types.d.ts +6 -0
  33. package/dist/pipeline/types.d.ts.map +1 -1
  34. package/dist/pipeline/validate.d.ts +36 -0
  35. package/dist/pipeline/validate.d.ts.map +1 -0
  36. package/dist/pipeline/validate.js +632 -0
  37. package/dist/pipeline/validate.js.map +1 -0
  38. package/dist/storage/source-repo.d.ts +2 -0
  39. package/dist/storage/source-repo.d.ts.map +1 -1
  40. package/dist/storage/source-repo.js +9 -2
  41. package/dist/storage/source-repo.js.map +1 -1
  42. package/dist/storage/types.d.ts +1 -0
  43. package/dist/storage/types.d.ts.map +1 -1
  44. package/package.json +4 -1
@@ -4,8 +4,53 @@
4
4
  * Parses HTML with `linkedom` and runs it through Readability to pull out
5
5
  * the main article content, stripped of navigation, ads, and boilerplate.
6
6
  */
7
+ import { basename } from 'node:path';
7
8
  import { Readability } from '@mozilla/readability';
8
9
  import { parseHTML } from 'linkedom';
10
+ /**
11
+ * Regex matching inline `style` attribute values that visually hide an element.
12
+ *
13
+ * These patterns detect CSS-based hiding tricks that attackers use to embed
14
+ * invisible prompt-injection payloads in web pages. Because `linkedom` does
15
+ * not compute styles, Readability would otherwise include this hidden text.
16
+ */
17
+ const HIDDEN_STYLE_RE = /display\s*:\s*none|visibility\s*:\s*hidden|font-size\s*:\s*0(?:px|em|rem|%)?\s*(?:;|$)|opacity\s*:\s*0(?:\.\d+)?(?:;|$)|position\s*:\s*(?:absolute|fixed)[\s\S]*?(?:left|top)\s*:\s*-\d{4,}|clip\s*:\s*rect\(\s*0/i;
18
+ /**
19
+ * Strip elements that are visually hidden from the DOM before Readability runs.
20
+ *
21
+ * Removes:
22
+ * - Elements with the `hidden` attribute
23
+ * - Elements with `aria-hidden="true"`
24
+ * - `<noscript>` elements (content is irrelevant for non-JS fetching)
25
+ * - Elements whose inline `style` matches common hiding patterns
26
+ */
27
+ function stripHiddenElements(document) {
28
+ // Remove <noscript> — irrelevant when we don't execute JS
29
+ for (const el of document.querySelectorAll('noscript')) {
30
+ el.remove();
31
+ }
32
+ // Remove elements with the `hidden` attribute
33
+ for (const el of document.querySelectorAll('[hidden]')) {
34
+ el.remove();
35
+ }
36
+ // Remove elements with aria-hidden="true"
37
+ for (const el of document.querySelectorAll('[aria-hidden="true"]')) {
38
+ el.remove();
39
+ }
40
+ // Remove elements whose inline style indicates visual hiding
41
+ for (const el of document.querySelectorAll('[style]')) {
42
+ const style = el.getAttribute('style') ?? '';
43
+ if (HIDDEN_STYLE_RE.test(style)) {
44
+ el.remove();
45
+ }
46
+ }
47
+ }
48
+ /** Sanitize a source identifier for error messages (strip full paths). */
49
+ function safeSourceLabel(url) {
50
+ if (url.startsWith('/') || /^[A-Z]:\\/i.test(url))
51
+ return basename(url);
52
+ return url;
53
+ }
9
54
  /**
10
55
  * Extract the main article content from an HTML string.
11
56
  *
@@ -16,9 +61,19 @@ import { parseHTML } from 'linkedom';
16
61
  */
17
62
  export function extractContent(html, url) {
18
63
  if (!html || html.trim().length === 0) {
19
- throw new Error(`Empty HTML content from ${url}`);
64
+ throw new Error(`Empty HTML content from ${safeSourceLabel(url)}`);
65
+ }
66
+ // Early SPA detection: check the raw HTML before we strip <noscript>.
67
+ // SPA pages have <noscript> as their only meaningful content; once we strip it
68
+ // below, there's nothing left to extract and we'd get a confusing error.
69
+ if (looksLikeSpaHtml(html)) {
70
+ throw new Error(`This page appears to require JavaScript to render its content (${safeSourceLabel(url)}). ` +
71
+ `Only the noscript fallback was captured. ChaosKB does not yet support JavaScript-rendered pages.`);
20
72
  }
21
73
  const { document } = parseHTML(html);
74
+ // Strip visually-hidden elements before Readability sees them.
75
+ // This prevents CSS-based prompt-injection payloads from surviving extraction.
76
+ stripHiddenElements(document);
22
77
  // Attempt Readability extraction
23
78
  const reader = new Readability(document);
24
79
  const article = reader.parse();
@@ -29,17 +84,18 @@ export function extractContent(html, url) {
29
84
  rawContent = article.textContent;
30
85
  }
31
86
  else {
32
- // Fallback: extract text from body (strip script/style first)
87
+ // Fallback: extract text from body (strip script/style/hidden first)
33
88
  // Wrap in a full HTML document to ensure linkedom creates a body element
34
89
  const wrappedHtml = html.includes('<body') ? html : `<html><body>${html}</body></html>`;
35
90
  const { document: fallbackDoc } = parseHTML(wrappedHtml);
91
+ stripHiddenElements(fallbackDoc);
36
92
  for (const el of fallbackDoc.querySelectorAll('script, style')) {
37
93
  el.remove();
38
94
  }
39
95
  const body = fallbackDoc.querySelector('body');
40
96
  rawContent = body ? body.textContent ?? '' : '';
41
97
  if (rawContent.trim().length === 0) {
42
- throw new Error(`No extractable content from ${url}`);
98
+ throw new Error(`No extractable content from ${safeSourceLabel(url)}`);
43
99
  }
44
100
  title = '';
45
101
  }
@@ -54,14 +110,83 @@ export function extractContent(html, url) {
54
110
  if (content.length === 0) {
55
111
  throw new Error(`No extractable content from ${url}`);
56
112
  }
113
+ // Detect JavaScript-only SPA pages that didn't render
114
+ if (looksLikeJsOnlyPage(html, content)) {
115
+ throw new Error(`This page appears to require JavaScript to render its content (${safeSourceLabel(url)}). ` +
116
+ `Only the noscript fallback was captured. ChaosKB does not yet support JavaScript-rendered pages.`);
117
+ }
57
118
  const byteLength = Buffer.byteLength(content, 'utf-8');
58
119
  return { title, content, url, byteLength };
59
120
  }
121
+ /** Patterns that indicate a noscript fallback message. */
122
+ const NOSCRIPT_PATTERNS = [
123
+ /you need to enable javascript/i,
124
+ /please enable javascript/i,
125
+ /javascript is required/i,
126
+ /javascript is disabled/i,
127
+ /this app requires javascript/i,
128
+ /enable javascript to run this app/i,
129
+ /this application requires javascript/i,
130
+ /javascript must be enabled/i,
131
+ /works best with javascript enabled/i,
132
+ /this site requires javascript/i,
133
+ ];
134
+ /**
135
+ * Minimum content length (in characters) below which a noscript message
136
+ * is treated as the *entire* page content rather than an incidental mention.
137
+ * A real article that happens to discuss JavaScript would be much longer.
138
+ */
139
+ const SPA_CONTENT_THRESHOLD = 500;
140
+ /**
141
+ * Quick pre-extraction check on raw HTML for SPA shells.
142
+ *
143
+ * Detects pages that have a noscript message AND an empty SPA root container,
144
+ * which is a strong signal that the page requires JS. This runs before
145
+ * `<noscript>` stripping so we can give a clear error message.
146
+ */
147
+ function looksLikeSpaHtml(html) {
148
+ const hasNoscript = /<noscript\b[^>]*>[^<]*<\/noscript>/i.test(html);
149
+ if (!hasNoscript)
150
+ return false;
151
+ const hasNoscriptMessage = NOSCRIPT_PATTERNS.some((p) => p.test(html));
152
+ if (!hasNoscriptMessage)
153
+ return false;
154
+ const hasSpaRoot = /<div\s+id=["'](?:root|app|__next|__nuxt|__gatsby)["']\s*>\s*<\/div>/i.test(html);
155
+ return hasSpaRoot;
156
+ }
157
+ /**
158
+ * Detect if extracted content is likely a JavaScript-only noscript fallback.
159
+ *
160
+ * The heuristic is intentionally conservative: both a short extracted text
161
+ * AND a noscript-style message must be present. A real article that
162
+ * discusses JavaScript would have far more than 500 characters of content.
163
+ *
164
+ * Because `<noscript>` elements are stripped during hidden-element removal
165
+ * (they are irrelevant for non-JS fetching), we check the raw HTML for
166
+ * noscript patterns rather than the extracted text.
167
+ */
168
+ function looksLikeJsOnlyPage(html, extractedText) {
169
+ // Check raw HTML for noscript messages (since <noscript> is stripped before extraction)
170
+ const hasNoscriptMessage = NOSCRIPT_PATTERNS.some((p) => p.test(extractedText)) ||
171
+ NOSCRIPT_PATTERNS.some((p) => p.test(html));
172
+ if (!hasNoscriptMessage)
173
+ return false;
174
+ // Short content + noscript message → almost certainly an SPA shell
175
+ if (extractedText.length < SPA_CONTENT_THRESHOLD)
176
+ return true;
177
+ // Longer content but the HTML has an empty SPA root container
178
+ // (e.g. <div id="root"></div>) alongside the noscript message
179
+ const hasSpaRoot = /<div\s+id=["'](?:root|app|__next|__nuxt|__gatsby)["']\s*>\s*<\/div>/i.test(html);
180
+ return hasSpaRoot;
181
+ }
60
182
  /**
61
- * Clean extracted text by collapsing whitespace and trimming.
183
+ * Clean extracted text by stripping steganographic characters,
184
+ * collapsing whitespace, and trimming.
62
185
  */
63
186
  function cleanText(text) {
64
187
  return text
188
+ .replace(/[\u2028\u2029]/g, '\n') // Unicode line/paragraph separators → newline
189
+ .replace(/[\u200B-\u200F\u202A-\u202F\u2060-\u206F\uFEFF]/g, '') // strip zero-width / bidi / invisible chars
65
190
  .replace(/[\t ]+/g, ' ') // collapse horizontal whitespace
66
191
  .replace(/\n{3,}/g, '\n\n') // collapse excessive newlines
67
192
  .replace(/^ +| +$/gm, '') // trim each line
@@ -1 +1 @@
1
- {"version":3,"file":"extract.js","sourceRoot":"","sources":["../../pipeline/extract.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAC;AACnD,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AAGrC;;;;;;;GAOG;AACH,MAAM,UAAU,cAAc,CAAC,IAAY,EAAE,GAAW;IACtD,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtC,MAAM,IAAI,KAAK,CAAC,2BAA2B,GAAG,EAAE,CAAC,CAAC;IACpD,CAAC;IAED,MAAM,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;IAErC,iCAAiC;IACjC,MAAM,MAAM,GAAG,IAAI,WAAW,CAAC,QAAe,CAAC,CAAC;IAChD,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAAE,CAAC;IAE/B,IAAI,KAAa,CAAC;IAClB,IAAI,UAAkB,CAAC;IAEvB,IAAI,OAAO,IAAI,OAAO,CAAC,WAAW,IAAI,OAAO,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC5E,KAAK,GAAG,OAAO,CAAC,KAAK,IAAI,EAAE,CAAC;QAC5B,UAAU,GAAG,OAAO,CAAC,WAAW,CAAC;IACnC,CAAC;SAAM,CAAC;QACN,8DAA8D;QAC9D,yEAAyE;QACzE,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,eAAe,IAAI,gBAAgB,CAAC;QACxF,MAAM,EAAE,QAAQ,EAAE,WAAW,EAAE,GAAG,SAAS,CAAC,WAAW,CAAC,CAAC;QACzD,KAAK,MAAM,EAAE,IAAI,WAAW,CAAC,gBAAgB,CAAC,eAAe,CAAC,EAAE,CAAC;YAC/D,EAAE,CAAC,MAAM,EAAE,CAAC;QACd,CAAC;QACD,MAAM,IAAI,GAAG,WAAW,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;QAC/C,UAAU,GAAG,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAEhD,IAAI,UAAU,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACnC,MAAM,IAAI,KAAK,CAAC,+BAA+B,GAAG,EAAE,CAAC,CAAC;QACxD,CAAC;QAED,KAAK,GAAG,EAAE,CAAC;IACb,CAAC;IAED,qDAAqD;IACrD,IAAI,CAAC,KAAK,EAAE,CAAC;QACX,MAAM,EAAE,QAAQ,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;QAC/C,MAAM,OAAO,GAAG,QAAQ,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC;QAChD,KAAK,GAAG,OAAO,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IAC7C,CAAC;IAED,0DAA0D;IAC1D,MAAM,OAAO,GAAG,SAAS,CAAC,UAAU,CAAC,CAAC;IAEtC,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,MAAM,IAAI,KAAK,CAAC,+BAA+B,GAAG,EAAE,CAAC,CAAC;IACxD,CAAC;IAED,MAAM,UAAU,GAAG,MAAM,CAAC,UAAU,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC;IAEvD,OAAO,EAAE,KAAK,EAAE,OAAO,EAAE,GAAG,EAAE,UAAU,EAAE,CAAC;AAC7C,CAAC;AAED;;GAEG;AACH,SAAS,SAAS,CAAC,IAAY;IAC7B,OAAO,IAAI;SACR,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC,CAAO,iCAAiC;SAC/D,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAI,8BAA8B;SAC5D,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC,CAAM,iBAAiB;SAC/C,IAAI,EAAE,CAAC;AACZ,CAAC"}
1
+ {"version":3,"file":"extract.js","sourceRoot":"","sources":["../../pipeline/extract.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AACrC,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAC;AACnD,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AAGrC;;;;;;GAMG;AACH,MAAM,eAAe,GACnB,oNAAoN,CAAC;AAEvN;;;;;;;;GAQG;AACH,SAAS,mBAAmB,CAAC,QAAa;IACxC,0DAA0D;IAC1D,KAAK,MAAM,EAAE,IAAI,QAAQ,CAAC,gBAAgB,CAAC,UAAU,CAAC,EAAE,CAAC;QACvD,EAAE,CAAC,MAAM,EAAE,CAAC;IACd,CAAC;IAED,8CAA8C;IAC9C,KAAK,MAAM,EAAE,IAAI,QAAQ,CAAC,gBAAgB,CAAC,UAAU,CAAC,EAAE,CAAC;QACvD,EAAE,CAAC,MAAM,EAAE,CAAC;IACd,CAAC;IAED,0CAA0C;IAC1C,KAAK,MAAM,EAAE,IAAI,QAAQ,CAAC,gBAAgB,CAAC,sBAAsB,CAAC,EAAE,CAAC;QACnE,EAAE,CAAC,MAAM,EAAE,CAAC;IACd,CAAC;IAED,6DAA6D;IAC7D,KAAK,MAAM,EAAE,IAAI,QAAQ,CAAC,gBAAgB,CAAC,SAAS,CAAC,EAAE,CAAC;QACtD,MAAM,KAAK,GAAG,EAAE,CAAC,YAAY,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC;QAC7C,IAAI,eAAe,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;YAChC,EAAE,CAAC,MAAM,EAAE,CAAC;QACd,CAAC;IACH,CAAC;AACH,CAAC;AAED,0EAA0E;AAC1E,SAAS,eAAe,CAAC,GAAW;IAClC,IAAI,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,YAAY,CAAC,IAAI,CAAC,GAAG,CAAC;QAAE,OAAO,QAAQ,CAAC,GAAG,CAAC,CAAC;IACxE,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,cAAc,CAAC,IAAY,EAAE,GAAW;IACtD,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtC,MAAM,IAAI,KAAK,CAAC,2BAA2B,eAAe,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IACrE,CAAC;IAED,sEAAsE;IACtE,+EAA+E;IAC/E,yEAAyE;IACzE,IAAI,gBAAgB,CAAC,IAAI,CAAC,EAAE,CAAC;QAC3B,MAAM,IAAI,KAAK,CACb,kEAAkE,eAAe,CAAC,GAAG,CAAC,KAAK;YAC3F,kGAAkG,CACnG,CAAC;IACJ,CAAC;IAED,MAAM,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;IAErC,+DAA+D;IAC/D,+EAA+E;IAC/E,mBAAmB,CAAC,QAAQ,CAAC,CAAC;IAE9B,iCAAiC;IACjC,MAAM,MAAM,GAAG,IAAI,WAAW,CAAC,QAAe,CAAC,CAAC;IAChD,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAAE,CAAC;IAE/B,IAAI,KAAa,CAAC;IAClB,IAAI,UAAkB,CAAC;IAEvB,IAAI,OAAO,IAAI,OAAO,CAAC,WAAW,IAAI,OAAO,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC5E,KAAK,GAAG,OAAO,CAAC,KAAK,IAAI,EAAE,CAAC;QAC5B,UAAU,GAAG,OAAO,CAAC,WAAW,CAAC;IACnC,CAAC;SAAM,CAAC;QACN,qEAAqE;QACrE,yEAAyE;QACzE,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,eAAe,IAAI,gBAAgB,CAAC;QACxF,MAAM,EAAE,QAAQ,EAAE,WAAW,EAAE,GAAG,SAAS,CAAC,WAAW,CAAC,CAAC;QACzD,mBAAmB,CAAC,WAAW,CAAC,CAAC;QACjC,KAAK,MAAM,EAAE,IAAI,WAAW,CAAC,gBAAgB,CAAC,eAAe,CAAC,EAAE,CAAC;YAC/D,EAAE,CAAC,MAAM,EAAE,CAAC;QACd,CAAC;QACD,MAAM,IAAI,GAAG,WAAW,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;QAC/C,UAAU,GAAG,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAEhD,IAAI,UAAU,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACnC,MAAM,IAAI,KAAK,CAAC,+BAA+B,eAAe,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;QACzE,CAAC;QAED,KAAK,GAAG,EAAE,CAAC;IACb,CAAC;IAED,qDAAqD;IACrD,IAAI,CAAC,KAAK,EAAE,CAAC;QACX,MAAM,EAAE,QAAQ,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;QAC/C,MAAM,OAAO,GAAG,QAAQ,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC;QAChD,KAAK,GAAG,OAAO,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IAC7C,CAAC;IAED,0DAA0D;IAC1D,MAAM,OAAO,GAAG,SAAS,CAAC,UAAU,CAAC,CAAC;IAEtC,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,MAAM,IAAI,KAAK,CAAC,+BAA+B,GAAG,EAAE,CAAC,CAAC;IACxD,CAAC;IAED,sDAAsD;IACtD,IAAI,mBAAmB,CAAC,IAAI,EAAE,OAAO,CAAC,EAAE,CAAC;QACvC,MAAM,IAAI,KAAK,CACb,kEAAkE,eAAe,CAAC,GAAG,CAAC,KAAK;YAC3F,kGAAkG,CACnG,CAAC;IACJ,CAAC;IAED,MAAM,UAAU,GAAG,MAAM,CAAC,UAAU,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC;IAEvD,OAAO,EAAE,KAAK,EAAE,OAAO,EAAE,GAAG,EAAE,UAAU,EAAE,CAAC;AAC7C,CAAC;AAED,0DAA0D;AAC1D,MAAM,iBAAiB,GAAG;IACxB,gCAAgC;IAChC,2BAA2B;IAC3B,yBAAyB;IACzB,yBAAyB;IACzB,+BAA+B;IAC/B,oCAAoC;IACpC,uCAAuC;IACvC,6BAA6B;IAC7B,qCAAqC;IACrC,gCAAgC;CACjC,CAAC;AAEF;;;;GAIG;AACH,MAAM,qBAAqB,GAAG,GAAG,CAAC;AAElC;;;;;;GAMG;AACH,SAAS,gBAAgB,CAAC,IAAY;IACpC,MAAM,WAAW,GAAG,qCAAqC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACrE,IAAI,CAAC,WAAW;QAAE,OAAO,KAAK,CAAC;IAE/B,MAAM,kBAAkB,GAAG,iBAAiB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC;IACvE,IAAI,CAAC,kBAAkB;QAAE,OAAO,KAAK,CAAC;IAEtC,MAAM,UAAU,GACd,sEAAsE,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAEpF,OAAO,UAAU,CAAC;AACpB,CAAC;AAED;;;;;;;;;;GAUG;AACH,SAAS,mBAAmB,CAAC,IAAY,EAAE,aAAqB;IAC9D,wFAAwF;IACxF,MAAM,kBAAkB,GACtB,iBAAiB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QACpD,iBAAiB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC;IAC9C,IAAI,CAAC,kBAAkB;QAAE,OAAO,KAAK,CAAC;IAEtC,mEAAmE;IACnE,IAAI,aAAa,CAAC,MAAM,GAAG,qBAAqB;QAAE,OAAO,IAAI,CAAC;IAE9D,8DAA8D;IAC9D,8DAA8D;IAC9D,MAAM,UAAU,GACd,sEAAsE,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAEpF,OAAO,UAAU,CAAC;AACpB,CAAC;AAED;;;GAGG;AACH,SAAS,SAAS,CAAC,IAAY;IAC7B,OAAO,IAAI;SACR,OAAO,CAAC,iBAAiB,EAAE,IAAI,CAAC,CAAgC,8CAA8C;SAC9G,OAAO,CAAC,kDAAkD,EAAE,EAAE,CAAC,CAAC,4CAA4C;SAC5G,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC,CAAO,iCAAiC;SAC/D,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAI,8BAA8B;SAC5D,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC,CAAM,iBAAiB;SAC/C,IAAI,EAAE,CAAC;AACZ,CAAC"}
@@ -3,8 +3,19 @@
3
3
  *
4
4
  * Uses Node.js built-in `fetch` (available since Node 18) with
5
5
  * configurable timeout, redirect limits, and user-agent.
6
+ *
7
+ * Includes SSRF protection to prevent fetching from private/internal networks.
6
8
  */
7
9
  import type { PipelineConfig } from './types.js';
10
+ /** Maximum response body size in bytes (10 MB). */
11
+ export declare const MAX_RESPONSE_BYTES: number;
12
+ /**
13
+ * Validate a URL for SSRF safety before fetching.
14
+ *
15
+ * Rejects non-HTTP(S) schemes, private/internal IPs, and known
16
+ * cloud metadata endpoints.
17
+ */
18
+ export declare function validateUrl(url: string): Promise<void>;
8
19
  /** Result of a successful URL fetch. */
9
20
  export interface FetchResult {
10
21
  /** Raw HTML body. */
@@ -1 +1 @@
1
- {"version":3,"file":"fetch.d.ts","sourceRoot":"","sources":["../../pipeline/fetch.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AASjD,wCAAwC;AACxC,MAAM,WAAW,WAAW;IAC1B,qBAAqB;IACrB,IAAI,EAAE,MAAM,CAAC;IACb,qCAAqC;IACrC,QAAQ,EAAE,MAAM,CAAC;IACjB,iCAAiC;IACjC,WAAW,EAAE,MAAM,CAAC;CACrB;AAED;;;;;;;GAOG;AACH,wBAAsB,QAAQ,CAC5B,GAAG,EAAE,MAAM,EACX,MAAM,CAAC,EAAE,OAAO,CAAC,cAAc,CAAC,GAC/B,OAAO,CAAC,WAAW,CAAC,CA6EtB"}
1
+ {"version":3,"file":"fetch.d.ts","sourceRoot":"","sources":["../../pipeline/fetch.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAIH,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AASjD,mDAAmD;AACnD,eAAO,MAAM,kBAAkB,QAAmB,CAAC;AAoDnD;;;;;GAKG;AACH,wBAAsB,WAAW,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAiD5D;AA2CD,wCAAwC;AACxC,MAAM,WAAW,WAAW;IAC1B,qBAAqB;IACrB,IAAI,EAAE,MAAM,CAAC;IACb,qCAAqC;IACrC,QAAQ,EAAE,MAAM,CAAC;IACjB,iCAAiC;IACjC,WAAW,EAAE,MAAM,CAAC;CACrB;AAED;;;;;;;GAOG;AACH,wBAAsB,QAAQ,CAC5B,GAAG,EAAE,MAAM,EACX,MAAM,CAAC,EAAE,OAAO,CAAC,cAAc,CAAC,GAC/B,OAAO,CAAC,WAAW,CAAC,CAkFtB"}
@@ -3,13 +3,161 @@
3
3
  *
4
4
  * Uses Node.js built-in `fetch` (available since Node 18) with
5
5
  * configurable timeout, redirect limits, and user-agent.
6
+ *
7
+ * Includes SSRF protection to prevent fetching from private/internal networks.
6
8
  */
9
+ import dns from 'node:dns/promises';
10
+ import { isIP } from 'node:net';
7
11
  /** Default pipeline configuration values relevant to fetching. */
8
12
  const DEFAULTS = {
9
13
  fetchTimeoutMs: 30_000,
10
14
  maxRedirects: 5,
11
15
  userAgent: 'ChaosKB/0.1',
12
16
  };
17
+ /** Maximum response body size in bytes (10 MB). */
18
+ export const MAX_RESPONSE_BYTES = 10 * 1024 * 1024;
19
+ /** Well-known cloud metadata hostnames to block. */
20
+ const BLOCKED_HOSTNAMES = new Set([
21
+ 'metadata.google.internal',
22
+ 'metadata.google.internal.',
23
+ ]);
24
+ /**
25
+ * Check if an IP address belongs to a private/reserved range.
26
+ *
27
+ * Blocks: loopback, RFC 1918, link-local (incl. cloud metadata 169.254.x.x),
28
+ * IPv6 loopback, IPv6 ULA, IPv6 link-local, and unspecified addresses.
29
+ */
30
+ function isPrivateIp(ip) {
31
+ // IPv4 checks
32
+ if (isIP(ip) === 4) {
33
+ const parts = ip.split('.').map(Number);
34
+ const [a, b] = parts;
35
+ if (a === 127)
36
+ return true; // 127.0.0.0/8 loopback
37
+ if (a === 10)
38
+ return true; // 10.0.0.0/8
39
+ if (a === 172 && b >= 16 && b <= 31)
40
+ return true; // 172.16.0.0/12
41
+ if (a === 192 && b === 168)
42
+ return true; // 192.168.0.0/16
43
+ if (a === 169 && b === 254)
44
+ return true; // 169.254.0.0/16 link-local / cloud metadata
45
+ if (a === 0)
46
+ return true; // 0.0.0.0/8
47
+ return false;
48
+ }
49
+ // IPv6 checks
50
+ if (isIP(ip) === 6) {
51
+ const normalized = ip.toLowerCase();
52
+ if (normalized === '::1')
53
+ return true; // loopback
54
+ if (normalized === '::')
55
+ return true; // unspecified
56
+ if (normalized.startsWith('fc') || normalized.startsWith('fd'))
57
+ return true; // ULA fc00::/7
58
+ if (normalized.startsWith('fe80'))
59
+ return true; // link-local
60
+ // IPv4-mapped IPv6 — dotted form (::ffff:127.0.0.1)
61
+ const v4dotted = normalized.match(/^::ffff:(\d+\.\d+\.\d+\.\d+)$/);
62
+ if (v4dotted)
63
+ return isPrivateIp(v4dotted[1]);
64
+ // IPv4-mapped IPv6 — hex form (::ffff:7f00:1) as normalized by URL parser
65
+ const v4hex = normalized.match(/^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/);
66
+ if (v4hex) {
67
+ const hi = parseInt(v4hex[1], 16);
68
+ const lo = parseInt(v4hex[2], 16);
69
+ const ip = `${(hi >> 8) & 0xff}.${hi & 0xff}.${(lo >> 8) & 0xff}.${lo & 0xff}`;
70
+ return isPrivateIp(ip);
71
+ }
72
+ return false;
73
+ }
74
+ return false;
75
+ }
76
+ /**
77
+ * Validate a URL for SSRF safety before fetching.
78
+ *
79
+ * Rejects non-HTTP(S) schemes, private/internal IPs, and known
80
+ * cloud metadata endpoints.
81
+ */
82
+ export async function validateUrl(url) {
83
+ let parsed;
84
+ try {
85
+ parsed = new URL(url);
86
+ }
87
+ catch {
88
+ throw new Error(`Invalid URL: ${url}`);
89
+ }
90
+ // Only allow http and https
91
+ if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
92
+ throw new Error(`URL scheme "${parsed.protocol}" is not allowed. Only http: and https: are supported.`);
93
+ }
94
+ const hostname = parsed.hostname;
95
+ // Block known cloud metadata hostnames
96
+ if (BLOCKED_HOSTNAMES.has(hostname.toLowerCase())) {
97
+ throw new Error('URL targets a cloud metadata endpoint and cannot be fetched.');
98
+ }
99
+ // Strip IPv6 brackets for IP checks (URL parses [::1] with brackets)
100
+ const bareHost = hostname.startsWith('[') && hostname.endsWith(']')
101
+ ? hostname.slice(1, -1)
102
+ : hostname;
103
+ // If hostname is already an IP literal, check it directly
104
+ if (isIP(bareHost)) {
105
+ if (isPrivateIp(bareHost)) {
106
+ throw new Error('URL targets a private/internal network address and cannot be fetched.');
107
+ }
108
+ return;
109
+ }
110
+ // Resolve hostname and check all resulting IPs
111
+ let addresses;
112
+ try {
113
+ addresses = await dns.lookup(hostname, { all: true });
114
+ }
115
+ catch {
116
+ // DNS failure will be caught later by fetch itself
117
+ return;
118
+ }
119
+ for (const addr of addresses) {
120
+ if (isPrivateIp(addr.address)) {
121
+ throw new Error('URL targets a private/internal network address and cannot be fetched.');
122
+ }
123
+ }
124
+ }
125
+ /**
126
+ * Read a Response body with a size limit to prevent memory exhaustion.
127
+ */
128
+ async function readResponseWithLimit(response, maxBytes) {
129
+ const reader = response.body?.getReader();
130
+ if (!reader) {
131
+ // Fallback for environments without streaming
132
+ return response.text();
133
+ }
134
+ const chunks = [];
135
+ let totalBytes = 0;
136
+ try {
137
+ while (true) {
138
+ const { done, value } = await reader.read();
139
+ if (done)
140
+ break;
141
+ totalBytes += value.byteLength;
142
+ if (totalBytes > maxBytes) {
143
+ reader.cancel();
144
+ throw new Error(`Response body exceeds ${maxBytes / 1024 / 1024} MB limit. ` +
145
+ 'The page is too large to ingest.');
146
+ }
147
+ chunks.push(value);
148
+ }
149
+ }
150
+ finally {
151
+ reader.releaseLock();
152
+ }
153
+ const combined = new Uint8Array(totalBytes);
154
+ let offset = 0;
155
+ for (const chunk of chunks) {
156
+ combined.set(chunk, offset);
157
+ offset += chunk.byteLength;
158
+ }
159
+ return new TextDecoder().decode(combined);
160
+ }
13
161
  /**
14
162
  * Fetch the HTML content of a URL.
15
163
  *
@@ -19,6 +167,10 @@ const DEFAULTS = {
19
167
  * @throws On network errors, non-2xx status codes, or non-HTML content.
20
168
  */
21
169
  export async function fetchUrl(url, config) {
170
+ // SSRF protection: reject private/internal network targets
171
+ if (!config?._skipSsrfCheck) {
172
+ await validateUrl(url);
173
+ }
22
174
  const timeoutMs = config?.fetchTimeoutMs ?? DEFAULTS.fetchTimeoutMs;
23
175
  const userAgent = config?.userAgent ?? DEFAULTS.userAgent;
24
176
  const controller = new AbortController();
@@ -84,7 +236,7 @@ export async function fetchUrl(url, config) {
84
236
  if (!isHtml) {
85
237
  throw new Error(`Non-HTML content type "${contentType}" for ${url}. Only text/html is supported.`);
86
238
  }
87
- const html = await response.text();
239
+ const html = await readResponseWithLimit(response, MAX_RESPONSE_BYTES);
88
240
  const finalUrl = response.url || url;
89
241
  return { html, finalUrl, contentType };
90
242
  }
@@ -1 +1 @@
1
- {"version":3,"file":"fetch.js","sourceRoot":"","sources":["../../pipeline/fetch.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAIH,kEAAkE;AAClE,MAAM,QAAQ,GAA0E;IACtF,cAAc,EAAE,MAAM;IACtB,YAAY,EAAE,CAAC;IACf,SAAS,EAAE,aAAa;CACzB,CAAC;AAYF;;;;;;;GAOG;AACH,MAAM,CAAC,KAAK,UAAU,QAAQ,CAC5B,GAAW,EACX,MAAgC;IAEhC,MAAM,SAAS,GAAG,MAAM,EAAE,cAAc,IAAI,QAAQ,CAAC,cAAc,CAAC;IACpE,MAAM,SAAS,GAAG,MAAM,EAAE,SAAS,IAAI,QAAQ,CAAC,SAAS,CAAC;IAE1D,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;IACzC,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,SAAS,CAAC,CAAC;IAE9D,IAAI,QAAkB,CAAC;IACvB,IAAI,CAAC;QACH,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;YAC1B,MAAM,EAAE,KAAK;YACb,OAAO,EAAE;gBACP,YAAY,EAAE,SAAS;gBACvB,MAAM,EAAE,iEAAiE;aAC1E;YACD,MAAM,EAAE,UAAU,CAAC,MAAM;YACzB,QAAQ,EAAE,QAAQ;SACnB,CAAC,CAAC;IACL,CAAC;IAAC,OAAO,KAAc,EAAE,CAAC;QACxB,YAAY,CAAC,KAAK,CAAC,CAAC;QACpB,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;YAC3B,IAAI,KAAK,CAAC,IAAI,KAAK,YAAY,EAAE,CAAC;gBAChC,MAAM,IAAI,KAAK,CAAC,yBAAyB,SAAS,OAAO,GAAG,EAAE,CAAC,CAAC;YAClE,CAAC;YACD,0BAA0B;YAC1B,IAAI,KAAK,CAAC,KAAK,IAAI,OAAO,KAAK,CAAC,KAAK,KAAK,QAAQ,IAAI,MAAM,IAAI,KAAK,CAAC,KAAK,EAAE,CAAC;gBAC5E,MAAM,IAAI,GAAI,KAAK,CAAC,KAA2B,CAAC,IAAI,CAAC;gBACrD,IAAI,IAAI,KAAK,WAAW,EAAE,CAAC;oBACzB,MAAM,IAAI,KAAK,CAAC,6BAA6B,GAAG,kBAAkB,CAAC,CAAC;gBACtE,CAAC;gBACD,IAAI,IAAI,KAAK,cAAc,EAAE,CAAC;oBAC5B,MAAM,IAAI,KAAK,CAAC,0BAA0B,GAAG,EAAE,CAAC,CAAC;gBACnD,CAAC;gBACD,IAAI,IAAI,KAAK,iCAAiC,IAAI,IAAI,KAAK,8BAA8B,EAAE,CAAC;oBAC1F,MAAM,IAAI,KAAK,CAAC,6BAA6B,GAAG,KAAK,IAAI,EAAE,CAAC,CAAC;gBAC/D,CAAC;YACH,CAAC;YACD,0CAA0C;YAC1C,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAC,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAC,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAC,EAAE,CAAC;gBAC5G,MAAM,IAAI,KAAK,CAAC,sBAAsB,GAAG,KAAK,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;YACjE,CAAC;YACD,MAAM,IAAI,KAAK,CAAC,mBAAmB,GAAG,KAAK,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;QAC9D,CAAC;QACD,MAAM,IAAI,KAAK,CAAC,mBAAmB,GAAG,iBAAiB,CAAC,CAAC;IAC3D,CAAC;YAAS,CAAC;QACT,YAAY,CAAC,KAAK,CAAC,CAAC;IACtB,CAAC;IAED,oBAAoB;IACpB,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;QACjB,MAAM,MAAM,GAAG,QAAQ,CAAC,MAAM,CAAC;QAC/B,IAAI,MAAM,IAAI,GAAG,IAAI,MAAM,GAAG,GAAG,EAAE,CAAC;YAClC,MAAM,IAAI,KAAK,CAAC,QAAQ,MAAM,qBAAqB,GAAG,KAAK,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;QACpF,CAAC;QACD,IAAI,MAAM,IAAI,GAAG,EAAE,CAAC;YAClB,MAAM,IAAI,KAAK,CAAC,QAAQ,MAAM,qBAAqB,GAAG,KAAK,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;QACpF,CAAC;QACD,MAAM,IAAI,KAAK,CAAC,QAAQ,MAAM,QAAQ,GAAG,KAAK,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;IACvE,CAAC;IAED,mCAAmC;IACnC,MAAM,WAAW,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,EAAE,CAAC;IAC/D,MAAM,MAAM,GACV,WAAW,CAAC,QAAQ,CAAC,WAAW,CAAC;QACjC,WAAW,CAAC,QAAQ,CAAC,uBAAuB,CAAC;QAC7C,WAAW,CAAC,QAAQ,CAAC,iBAAiB,CAAC,CAAC;IAE1C,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,IAAI,KAAK,CACb,0BAA0B,WAAW,SAAS,GAAG,gCAAgC,CAClF,CAAC;IACJ,CAAC;IAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;IACnC,MAAM,QAAQ,GAAG,QAAQ,CAAC,GAAG,IAAI,GAAG,CAAC;IAErC,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,WAAW,EAAE,CAAC;AACzC,CAAC"}
1
+ {"version":3,"file":"fetch.js","sourceRoot":"","sources":["../../pipeline/fetch.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,GAAG,MAAM,mBAAmB,CAAC;AACpC,OAAO,EAAE,IAAI,EAAE,MAAM,UAAU,CAAC;AAGhC,kEAAkE;AAClE,MAAM,QAAQ,GAA0E;IACtF,cAAc,EAAE,MAAM;IACtB,YAAY,EAAE,CAAC;IACf,SAAS,EAAE,aAAa;CACzB,CAAC;AAEF,mDAAmD;AACnD,MAAM,CAAC,MAAM,kBAAkB,GAAG,EAAE,GAAG,IAAI,GAAG,IAAI,CAAC;AAEnD,oDAAoD;AACpD,MAAM,iBAAiB,GAAG,IAAI,GAAG,CAAC;IAChC,0BAA0B;IAC1B,2BAA2B;CAC5B,CAAC,CAAC;AAEH;;;;;GAKG;AACH,SAAS,WAAW,CAAC,EAAU;IAC7B,cAAc;IACd,IAAI,IAAI,CAAC,EAAE,CAAC,KAAK,CAAC,EAAE,CAAC;QACnB,MAAM,KAAK,GAAG,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;QACxC,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,KAAK,CAAC;QACrB,IAAI,CAAC,KAAK,GAAG;YAAE,OAAO,IAAI,CAAC,CAAwB,uBAAuB;QAC1E,IAAI,CAAC,KAAK,EAAE;YAAE,OAAO,IAAI,CAAC,CAAyB,aAAa;QAChE,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE;YAAE,OAAO,IAAI,CAAC,CAAE,gBAAgB;QACnE,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,GAAG;YAAE,OAAO,IAAI,CAAC,CAAW,iBAAiB;QACpE,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,GAAG;YAAE,OAAO,IAAI,CAAC,CAAW,6CAA6C;QAChG,IAAI,CAAC,KAAK,CAAC;YAAE,OAAO,IAAI,CAAC,CAA0B,YAAY;QAC/D,OAAO,KAAK,CAAC;IACf,CAAC;IAED,cAAc;IACd,IAAI,IAAI,CAAC,EAAE,CAAC,KAAK,CAAC,EAAE,CAAC;QACnB,MAAM,UAAU,GAAG,EAAE,CAAC,WAAW,EAAE,CAAC;QACpC,IAAI,UAAU,KAAK,KAAK;YAAE,OAAO,IAAI,CAAC,CAAmB,WAAW;QACpE,IAAI,UAAU,KAAK,IAAI;YAAE,OAAO,IAAI,CAAC,CAAoB,cAAc;QACvE,IAAI,UAAU,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,UAAU,CAAC,UAAU,CAAC,IAAI,CAAC;YAAE,OAAO,IAAI,CAAC,CAAC,eAAe;QAC5F,IAAI,UAAU,CAAC,UAAU,CAAC,MAAM,CAAC;YAAE,OAAO,IAAI,CAAC,CAAU,aAAa;QACtE,oDAAoD;QACpD,MAAM,QAAQ,GAAG,UAAU,CAAC,KAAK,CAAC,+BAA+B,CAAC,CAAC;QACnE,IAAI,QAAQ;YAAE,OAAO,WAAW,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;QAC9C,0EAA0E;QAC1E,MAAM,KAAK,GAAG,UAAU,CAAC,KAAK,CAAC,0CAA0C,CAAC,CAAC;QAC3E,IAAI,KAAK,EAAE,CAAC;YACV,MAAM,EAAE,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YAClC,MAAM,EAAE,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YAClC,MAAM,EAAE,GAAG,GAAG,CAAC,EAAE,IAAI,CAAC,CAAC,GAAG,IAAI,IAAI,EAAE,GAAG,IAAI,IAAI,CAAC,EAAE,IAAI,CAAC,CAAC,GAAG,IAAI,IAAI,EAAE,GAAG,IAAI,EAAE,CAAC;YAC/E,OAAO,WAAW,CAAC,EAAE,CAAC,CAAC;QACzB,CAAC;QACD,OAAO,KAAK,CAAC;IACf,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAAC,GAAW;IAC3C,IAAI,MAAW,CAAC;IAChB,IAAI,CAAC;QACH,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;IACxB,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,KAAK,CAAC,gBAAgB,GAAG,EAAE,CAAC,CAAC;IACzC,CAAC;IAED,4BAA4B;IAC5B,IAAI,MAAM,CAAC,QAAQ,KAAK,OAAO,IAAI,MAAM,CAAC,QAAQ,KAAK,QAAQ,EAAE,CAAC;QAChE,MAAM,IAAI,KAAK,CACb,eAAe,MAAM,CAAC,QAAQ,wDAAwD,CACvF,CAAC;IACJ,CAAC;IAED,MAAM,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC;IAEjC,uCAAuC;IACvC,IAAI,iBAAiB,CAAC,GAAG,CAAC,QAAQ,CAAC,WAAW,EAAE,CAAC,EAAE,CAAC;QAClD,MAAM,IAAI,KAAK,CAAC,8DAA8D,CAAC,CAAC;IAClF,CAAC;IAED,qEAAqE;IACrE,MAAM,QAAQ,GAAG,QAAQ,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,QAAQ,CAAC,QAAQ,CAAC,GAAG,CAAC;QACjE,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;QACvB,CAAC,CAAC,QAAQ,CAAC;IAEb,0DAA0D;IAC1D,IAAI,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC;QACnB,IAAI,WAAW,CAAC,QAAQ,CAAC,EAAE,CAAC;YAC1B,MAAM,IAAI,KAAK,CAAC,uEAAuE,CAAC,CAAC;QAC3F,CAAC;QACD,OAAO;IACT,CAAC;IAED,+CAA+C;IAC/C,IAAI,SAAgD,CAAC;IACrD,IAAI,CAAC;QACH,SAAS,GAAG,MAAM,GAAG,CAAC,MAAM,CAAC,QAAQ,EAAE,EAAE,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC;IACxD,CAAC;IAAC,MAAM,CAAC;QACP,mDAAmD;QACnD,OAAO;IACT,CAAC;IAED,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;QAC7B,IAAI,WAAW,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;YAC9B,MAAM,IAAI,KAAK,CAAC,uEAAuE,CAAC,CAAC;QAC3F,CAAC;IACH,CAAC;AACH,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,qBAAqB,CAAC,QAAkB,EAAE,QAAgB;IACvE,MAAM,MAAM,GAAG,QAAQ,CAAC,IAAI,EAAE,SAAS,EAAE,CAAC;IAC1C,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,8CAA8C;QAC9C,OAAO,QAAQ,CAAC,IAAI,EAAE,CAAC;IACzB,CAAC;IAED,MAAM,MAAM,GAAiB,EAAE,CAAC;IAChC,IAAI,UAAU,GAAG,CAAC,CAAC;IAEnB,IAAI,CAAC;QACH,OAAO,IAAI,EAAE,CAAC;YACZ,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,MAAM,MAAM,CAAC,IAAI,EAAE,CAAC;YAC5C,IAAI,IAAI;gBAAE,MAAM;YAChB,UAAU,IAAI,KAAK,CAAC,UAAU,CAAC;YAC/B,IAAI,UAAU,GAAG,QAAQ,EAAE,CAAC;gBAC1B,MAAM,CAAC,MAAM,EAAE,CAAC;gBAChB,MAAM,IAAI,KAAK,CACb,yBAAyB,QAAQ,GAAG,IAAI,GAAG,IAAI,aAAa;oBAC5D,kCAAkC,CACnC,CAAC;YACJ,CAAC;YACD,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACrB,CAAC;IACH,CAAC;YAAS,CAAC;QACT,MAAM,CAAC,WAAW,EAAE,CAAC;IACvB,CAAC;IAED,MAAM,QAAQ,GAAG,IAAI,UAAU,CAAC,UAAU,CAAC,CAAC;IAC5C,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,QAAQ,CAAC,GAAG,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC;QAC5B,MAAM,IAAI,KAAK,CAAC,UAAU,CAAC;IAC7B,CAAC;IAED,OAAO,IAAI,WAAW,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;AAC5C,CAAC;AAYD;;;;;;;GAOG;AACH,MAAM,CAAC,KAAK,UAAU,QAAQ,CAC5B,GAAW,EACX,MAAgC;IAEhC,2DAA2D;IAC3D,IAAI,CAAC,MAAM,EAAE,cAAc,EAAE,CAAC;QAC5B,MAAM,WAAW,CAAC,GAAG,CAAC,CAAC;IACzB,CAAC;IAED,MAAM,SAAS,GAAG,MAAM,EAAE,cAAc,IAAI,QAAQ,CAAC,cAAc,CAAC;IACpE,MAAM,SAAS,GAAG,MAAM,EAAE,SAAS,IAAI,QAAQ,CAAC,SAAS,CAAC;IAE1D,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;IACzC,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,SAAS,CAAC,CAAC;IAE9D,IAAI,QAAkB,CAAC;IACvB,IAAI,CAAC;QACH,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;YAC1B,MAAM,EAAE,KAAK;YACb,OAAO,EAAE;gBACP,YAAY,EAAE,SAAS;gBACvB,MAAM,EAAE,iEAAiE;aAC1E;YACD,MAAM,EAAE,UAAU,CAAC,MAAM;YACzB,QAAQ,EAAE,QAAQ;SACnB,CAAC,CAAC;IACL,CAAC;IAAC,OAAO,KAAc,EAAE,CAAC;QACxB,YAAY,CAAC,KAAK,CAAC,CAAC;QACpB,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;YAC3B,IAAI,KAAK,CAAC,IAAI,KAAK,YAAY,EAAE,CAAC;gBAChC,MAAM,IAAI,KAAK,CAAC,yBAAyB,SAAS,OAAO,GAAG,EAAE,CAAC,CAAC;YAClE,CAAC;YACD,0BAA0B;YAC1B,IAAI,KAAK,CAAC,KAAK,IAAI,OAAO,KAAK,CAAC,KAAK,KAAK,QAAQ,IAAI,MAAM,IAAI,KAAK,CAAC,KAAK,EAAE,CAAC;gBAC5E,MAAM,IAAI,GAAI,KAAK,CAAC,KAA2B,CAAC,IAAI,CAAC;gBACrD,IAAI,IAAI,KAAK,WAAW,EAAE,CAAC;oBACzB,MAAM,IAAI,KAAK,CAAC,6BAA6B,GAAG,kBAAkB,CAAC,CAAC;gBACtE,CAAC;gBACD,IAAI,IAAI,KAAK,cAAc,EAAE,CAAC;oBAC5B,MAAM,IAAI,KAAK,CAAC,0BAA0B,GAAG,EAAE,CAAC,CAAC;gBACnD,CAAC;gBACD,IAAI,IAAI,KAAK,iCAAiC,IAAI,IAAI,KAAK,8BAA8B,EAAE,CAAC;oBAC1F,MAAM,IAAI,KAAK,CAAC,6BAA6B,GAAG,KAAK,IAAI,EAAE,CAAC,CAAC;gBAC/D,CAAC;YACH,CAAC;YACD,0CAA0C;YAC1C,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAC,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAC,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAC,EAAE,CAAC;gBAC5G,MAAM,IAAI,KAAK,CAAC,sBAAsB,GAAG,KAAK,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;YACjE,CAAC;YACD,MAAM,IAAI,KAAK,CAAC,mBAAmB,GAAG,KAAK,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;QAC9D,CAAC;QACD,MAAM,IAAI,KAAK,CAAC,mBAAmB,GAAG,iBAAiB,CAAC,CAAC;IAC3D,CAAC;YAAS,CAAC;QACT,YAAY,CAAC,KAAK,CAAC,CAAC;IACtB,CAAC;IAED,oBAAoB;IACpB,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;QACjB,MAAM,MAAM,GAAG,QAAQ,CAAC,MAAM,CAAC;QAC/B,IAAI,MAAM,IAAI,GAAG,IAAI,MAAM,GAAG,GAAG,EAAE,CAAC;YAClC,MAAM,IAAI,KAAK,CAAC,QAAQ,MAAM,qBAAqB,GAAG,KAAK,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;QACpF,CAAC;QACD,IAAI,MAAM,IAAI,GAAG,EAAE,CAAC;YAClB,MAAM,IAAI,KAAK,CAAC,QAAQ,MAAM,qBAAqB,GAAG,KAAK,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;QACpF,CAAC;QACD,MAAM,IAAI,KAAK,CAAC,QAAQ,MAAM,QAAQ,GAAG,KAAK,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;IACvE,CAAC;IAED,mCAAmC;IACnC,MAAM,WAAW,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,EAAE,CAAC;IAC/D,MAAM,MAAM,GACV,WAAW,CAAC,QAAQ,CAAC,WAAW,CAAC;QACjC,WAAW,CAAC,QAAQ,CAAC,uBAAuB,CAAC;QAC7C,WAAW,CAAC,QAAQ,CAAC,iBAAiB,CAAC,CAAC;IAE1C,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,IAAI,KAAK,CACb,0BAA0B,WAAW,SAAS,GAAG,gCAAgC,CAClF,CAAC;IACJ,CAAC;IAED,MAAM,IAAI,GAAG,MAAM,qBAAqB,CAAC,QAAQ,EAAE,kBAAkB,CAAC,CAAC;IACvE,MAAM,QAAQ,GAAG,QAAQ,CAAC,GAAG,IAAI,GAAG,CAAC;IAErC,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,WAAW,EAAE,CAAC;AACzC,CAAC"}
@@ -0,0 +1,16 @@
1
+ /**
2
+ * Local file content extraction.
3
+ *
4
+ * Dispatches to format-specific extractors based on file extension.
5
+ * Supports PDF, DOCX, PPTX, HTML, TXT, and Markdown.
6
+ */
7
+ import type { ExtractedContent } from './types.js';
8
+ /**
9
+ * Extract content from a local file.
10
+ *
11
+ * @param filePath - Path to the file (resolved to absolute).
12
+ * @returns Extracted content with title, text, and the absolute path as `url`.
13
+ * @throws On missing/unreadable file, unsupported format, or empty content.
14
+ */
15
+ export declare function extractFromFile(filePath: string): Promise<ExtractedContent>;
16
+ //# sourceMappingURL=file-extract.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"file-extract.d.ts","sourceRoot":"","sources":["../../pipeline/file-extract.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAIH,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,YAAY,CAAC;AA2BnD;;;;;;GAMG;AACH,wBAAsB,eAAe,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,gBAAgB,CAAC,CAqDjF"}