@virstack/doc-ingest 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. package/README.md +203 -0
  2. package/dist/adapters/aiAdapters.d.ts +25 -0
  3. package/dist/adapters/aiAdapters.d.ts.map +1 -0
  4. package/dist/adapters/aiAdapters.js +73 -0
  5. package/dist/adapters/aiAdapters.js.map +1 -0
  6. package/dist/adapters/vectorStore.d.ts +24 -0
  7. package/dist/adapters/vectorStore.d.ts.map +1 -0
  8. package/dist/adapters/vectorStore.js +22 -0
  9. package/dist/adapters/vectorStore.js.map +1 -0
  10. package/dist/aiAdapters.d.ts +25 -0
  11. package/dist/aiAdapters.d.ts.map +1 -0
  12. package/dist/aiAdapters.js +50 -0
  13. package/dist/aiAdapters.js.map +1 -0
  14. package/dist/assets/logo.png +0 -0
  15. package/dist/batchPipeline.d.ts +52 -0
  16. package/dist/batchPipeline.d.ts.map +1 -0
  17. package/dist/batchPipeline.js +81 -0
  18. package/dist/batchPipeline.js.map +1 -0
  19. package/dist/cli.d.ts +3 -0
  20. package/dist/cli.d.ts.map +1 -0
  21. package/dist/cli.js +217 -0
  22. package/dist/cli.js.map +1 -0
  23. package/dist/config.d.ts +26 -0
  24. package/dist/config.d.ts.map +1 -0
  25. package/dist/config.js +97 -0
  26. package/dist/config.js.map +1 -0
  27. package/dist/core/config.d.ts +26 -0
  28. package/dist/core/config.d.ts.map +1 -0
  29. package/dist/core/config.js +106 -0
  30. package/dist/core/config.js.map +1 -0
  31. package/dist/core/logger.d.ts +31 -0
  32. package/dist/core/logger.d.ts.map +1 -0
  33. package/dist/core/logger.js +42 -0
  34. package/dist/core/logger.js.map +1 -0
  35. package/dist/core/state.d.ts +52 -0
  36. package/dist/core/state.d.ts.map +1 -0
  37. package/dist/core/state.js +27 -0
  38. package/dist/core/state.js.map +1 -0
  39. package/dist/graphs/batchProcessor.d.ts +72 -0
  40. package/dist/graphs/batchProcessor.d.ts.map +1 -0
  41. package/dist/graphs/batchProcessor.js +94 -0
  42. package/dist/graphs/batchProcessor.js.map +1 -0
  43. package/dist/graphs/singleDocument.d.ts +303 -0
  44. package/dist/graphs/singleDocument.d.ts.map +1 -0
  45. package/dist/graphs/singleDocument.js +93 -0
  46. package/dist/graphs/singleDocument.js.map +1 -0
  47. package/dist/index.d.ts +8 -0
  48. package/dist/index.d.ts.map +1 -0
  49. package/dist/index.js +10 -0
  50. package/dist/index.js.map +1 -0
  51. package/dist/logger.d.ts +24 -0
  52. package/dist/logger.d.ts.map +1 -0
  53. package/dist/logger.js +36 -0
  54. package/dist/logger.js.map +1 -0
  55. package/dist/logo.d.ts +2 -0
  56. package/dist/logo.d.ts.map +1 -0
  57. package/dist/logo.js +3 -0
  58. package/dist/logo.js.map +1 -0
  59. package/dist/nodes/fileTypeRouter.d.ts +16 -0
  60. package/dist/nodes/fileTypeRouter.d.ts.map +1 -0
  61. package/dist/nodes/fileTypeRouter.js +72 -0
  62. package/dist/nodes/fileTypeRouter.js.map +1 -0
  63. package/dist/nodes/geminiExtraction.d.ts +19 -0
  64. package/dist/nodes/geminiExtraction.d.ts.map +1 -0
  65. package/dist/nodes/geminiExtraction.js +87 -0
  66. package/dist/nodes/geminiExtraction.js.map +1 -0
  67. package/dist/nodes/libreOfficeToPdf.d.ts +8 -0
  68. package/dist/nodes/libreOfficeToPdf.d.ts.map +1 -0
  69. package/dist/nodes/libreOfficeToPdf.js +61 -0
  70. package/dist/nodes/libreOfficeToPdf.js.map +1 -0
  71. package/dist/nodes/llmExtractionNode.d.ts +19 -0
  72. package/dist/nodes/llmExtractionNode.d.ts.map +1 -0
  73. package/dist/nodes/llmExtractionNode.js +68 -0
  74. package/dist/nodes/llmExtractionNode.js.map +1 -0
  75. package/dist/nodes/markdownChunker.d.ts +8 -0
  76. package/dist/nodes/markdownChunker.d.ts.map +1 -0
  77. package/dist/nodes/markdownChunker.js +24 -0
  78. package/dist/nodes/markdownChunker.js.map +1 -0
  79. package/dist/nodes/markdownMerger.d.ts +9 -0
  80. package/dist/nodes/markdownMerger.d.ts.map +1 -0
  81. package/dist/nodes/markdownMerger.js +33 -0
  82. package/dist/nodes/markdownMerger.js.map +1 -0
  83. package/dist/nodes/markdownNormalizer.d.ts +10 -0
  84. package/dist/nodes/markdownNormalizer.d.ts.map +1 -0
  85. package/dist/nodes/markdownNormalizer.js +46 -0
  86. package/dist/nodes/markdownNormalizer.js.map +1 -0
  87. package/dist/nodes/openrouterEmbedder.d.ts +7 -0
  88. package/dist/nodes/openrouterEmbedder.d.ts.map +1 -0
  89. package/dist/nodes/openrouterEmbedder.js +31 -0
  90. package/dist/nodes/openrouterEmbedder.js.map +1 -0
  91. package/dist/nodes/pdfSplitter.d.ts +7 -0
  92. package/dist/nodes/pdfSplitter.d.ts.map +1 -0
  93. package/dist/nodes/pdfSplitter.js +41 -0
  94. package/dist/nodes/pdfSplitter.js.map +1 -0
  95. package/dist/nodes/saveMarkdown.d.ts +7 -0
  96. package/dist/nodes/saveMarkdown.d.ts.map +1 -0
  97. package/dist/nodes/saveMarkdown.js +28 -0
  98. package/dist/nodes/saveMarkdown.js.map +1 -0
  99. package/dist/nodes/textExtractorNode.d.ts +7 -0
  100. package/dist/nodes/textExtractorNode.d.ts.map +1 -0
  101. package/dist/nodes/textExtractorNode.js +39 -0
  102. package/dist/nodes/textExtractorNode.js.map +1 -0
  103. package/dist/nodes/upstashUpsert.d.ts +7 -0
  104. package/dist/nodes/upstashUpsert.d.ts.map +1 -0
  105. package/dist/nodes/upstashUpsert.js +45 -0
  106. package/dist/nodes/upstashUpsert.js.map +1 -0
  107. package/dist/nodes/vectorEmbedderNode.d.ts +7 -0
  108. package/dist/nodes/vectorEmbedderNode.d.ts.map +1 -0
  109. package/dist/nodes/vectorEmbedderNode.js +23 -0
  110. package/dist/nodes/vectorEmbedderNode.js.map +1 -0
  111. package/dist/nodes/vectorUpsertNode.d.ts +7 -0
  112. package/dist/nodes/vectorUpsertNode.d.ts.map +1 -0
  113. package/dist/nodes/vectorUpsertNode.js +45 -0
  114. package/dist/nodes/vectorUpsertNode.js.map +1 -0
  115. package/dist/pipeline.d.ts +303 -0
  116. package/dist/pipeline.d.ts.map +1 -0
  117. package/dist/pipeline.js +93 -0
  118. package/dist/pipeline.js.map +1 -0
  119. package/dist/state.d.ts +52 -0
  120. package/dist/state.d.ts.map +1 -0
  121. package/dist/state.js +27 -0
  122. package/dist/state.js.map +1 -0
  123. package/dist/vectorStore.d.ts +24 -0
  124. package/dist/vectorStore.d.ts.map +1 -0
  125. package/dist/vectorStore.js +22 -0
  126. package/dist/vectorStore.js.map +1 -0
  127. package/package.json +55 -0
@@ -0,0 +1,87 @@
1
// OpenRouter client + pipeline config from ../config.js; apiLimit wraps each
// API call and requireInit guards against use before initialization.
import { openrouter, pipelineConfig, apiLimit, requireInit } from "../config.js";
import { logger, LogSource } from "../logger.js";
// Fallback system prompt used by geminiExtraction whenever
// pipelineConfig.systemPrompt is unset. This text is sent to the model
// verbatim, so edits here directly change extraction output.
const DEFAULT_SYSTEM_PROMPT = `You are an expert document extraction and formatting AI. Your task is to extract the exact, verbatim content from the provided document and convert it entirely into standard Markdown format.

You must strictly adhere to the following rules:

1. **Absolute Accuracy:** Extract the text exactly as it appears in the source document. Do not summarize, rephrase, omit, or add any text. Maintain the original spelling and punctuation.
2. **Markdown Structure:** - Replicate the document's structure using standard Markdown.
- Use correct heading levels ('#', '##', '###') to match the visual hierarchy of the PDF.
- Preserve text formatting, utilizing '**bold**' for bold text and '*italics*' for italicized text.
- Convert bulleted and numbered lists into their respective Markdown list formats.
- Convert all tabular data into standard Markdown tables. Ensure rows and columns align with the original document.
3. **Image Handling (CRITICAL):** For every image, photograph, chart, graph, or diagram in the PDF, you must insert a Markdown image placeholder.
- The format must be: '![Image Placeholder: <Detailed Description>](image_number)'
- Replace '<Detailed Description>' with a highly descriptive, comprehensive explanation of everything visible in the image. Include colors, subjects, layout, data points (if it's a chart), and transcribe any text that appears within the image itself.
- Example: '![Image Placeholder: A bar chart comparing Q1 and Q2 sales. Q1 shows $50,000 in blue, Q2 shows $75,000 in green. The x-axis is labeled 'Quarters' and the y-axis is labeled 'Revenue in USD'.](image_1)'
4. **Headers and Footers:** Omit repetitive page numbers, document titles in the header, and footers unless they contain crucial footnotes directly referenced in the main text. If footnotes are present, append them to the end of the relevant section or document.
5. **Formatting Artifacts:** Remove arbitrary line breaks caused by PDF page formatting. Stitch sentences back together so they flow naturally in the Markdown output.

Output the final Markdown only. Do not include conversational filler before or after the extracted content.`;
21
+ /**
22
+ * Unified Gemini node for all document extraction flows.
23
+ * Handles both:
24
+ * 1. Base64 PDF chunks via Vision (Parallel Map-Reduce branch)
25
+ * 2. Raw text extracted by textExtractorNode (Text branch)
26
+ */
27
+ export async function geminiExtraction(state) {
28
+ requireInit();
29
+ const isChunkFlow = state.chunk !== undefined && state.index !== undefined && state.totalChunks !== undefined;
30
+ const isTextFlow = !!state.rawText;
31
+ if (!isChunkFlow && !isTextFlow) {
32
+ throw new Error("[geminiExtraction] Neither chunk nor rawText was provided in the state.");
33
+ }
34
+ let userContent;
35
+ if (isChunkFlow) {
36
+ const { chunk: base64, totalChunks, index } = state;
37
+ logger.info(LogSource.GEMINI, `Processing PDF chunk ${index + 1}/${totalChunks} (${((base64.length * 0.75) / 1024).toFixed(0)} KB)`);
38
+ userContent = [
39
+ {
40
+ type: "file",
41
+ file: {
42
+ filename: `chunk_${index + 1}.pdf`,
43
+ file_data: `data:application/pdf;base64,${base64}`,
44
+ },
45
+ },
46
+ {
47
+ type: "text",
48
+ text: `Extract all content from this PDF (chunk ${index + 1} of ${totalChunks}) into clean Markdown.`,
49
+ },
50
+ ];
51
+ }
52
+ else {
53
+ logger.info(LogSource.GEMINI, `Sending ${state.rawText.length} chars to ${pipelineConfig.llmModel}`);
54
+ userContent = `Convert the following extracted document text into clean Markdown:\n\n${state.rawText}`;
55
+ }
56
+ const finalSystemPrompt = pipelineConfig.systemPrompt || DEFAULT_SYSTEM_PROMPT;
57
+ const response = await apiLimit(() => openrouter.chat.completions.create({
58
+ model: pipelineConfig.llmModel,
59
+ messages: [
60
+ { role: "system", content: finalSystemPrompt },
61
+ { role: "user", content: userContent },
62
+ ],
63
+ max_tokens: pipelineConfig.maxTokens,
64
+ temperature: 0,
65
+ }));
66
+ const markdown = response.choices[0]?.message?.content?.trim() ?? "";
67
+ if (isChunkFlow) {
68
+ logger.info(LogSource.GEMINI, `Chunk ${state.index + 1}/${state.totalChunks} extracted (${markdown.length} chars)`);
69
+ return { markdownParts: [markdown] };
70
+ }
71
+ logger.info(LogSource.GEMINI, `Extracted markdown: ${markdown.length} chars`);
72
+ return { markdown };
73
+ }
74
+ /**
75
+ * Conditional router to determine what happens after geminiExtraction.
76
+ * - If from PDF branch, it returns to markdownMerger
77
+ * - If from Text branch, it goes straight to markdownNormalizer
78
+ */
79
+ export function routeAfterGemini(state) {
80
+ // If markdownParts has contents but markdown is empty, we must merge
81
+ if (state.markdownParts && state.markdownParts.length > 0 && !state.markdown) {
82
+ return "markdownMerger";
83
+ }
84
+ // Otherwise, it was the raw text branch which already assigned state.markdown
85
+ return "markdownNormalizer";
86
+ }
87
+ //# sourceMappingURL=geminiExtraction.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"geminiExtraction.js","sourceRoot":"","sources":["../../src/nodes/geminiExtraction.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,cAAc,EAAE,QAAQ,EAAE,WAAW,EAAE,MAAM,cAAc,CAAC;AAEjF,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AAEjD,MAAM,qBAAqB,GAAG;;;;;;;;;;;;;;;;;4GAiB8E,CAAC;AAE7G;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,KAAwF;IAGxF,WAAW,EAAE,CAAC;IAEd,MAAM,WAAW,GAAG,KAAK,CAAC,KAAK,KAAK,SAAS,IAAI,KAAK,CAAC,KAAK,KAAK,SAAS,IAAI,KAAK,CAAC,WAAW,KAAK,SAAS,CAAC;IAC9G,MAAM,UAAU,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC;IAEnC,IAAI,CAAC,WAAW,IAAI,CAAC,UAAU,EAAE,CAAC;QAChC,MAAM,IAAI,KAAK,CAAC,yEAAyE,CAAC,CAAC;IAC7F,CAAC;IAED,IAAI,WAAgB,CAAC;IAErB,IAAI,WAAW,EAAE,CAAC;QAChB,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,WAAW,EAAE,KAAK,EAAE,GAAG,KAAK,CAAC;QACpD,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,wBAAwB,KAAM,GAAG,CAAC,IAAI,WAAW,KAAK,CAAC,CAAC,MAAO,CAAC,MAAM,GAAG,IAAI,CAAC,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;QACvI,WAAW,GAAG;YACZ;gBACE,IAAI,EAAE,MAAa;gBACnB,IAAI,EAAE;oBACJ,QAAQ,EAAE,SAAS,KAAM,GAAG,CAAC,MAAM;oBACnC,SAAS,EAAE,+BAA+B,MAAM,EAAE;iBACnD;aACK;YACR;gBACE,IAAI,EAAE,MAAM;gBACZ,IAAI,EAAE,4CAA4C,KAAM,GAAG,CAAC,OAAO,WAAW,wBAAwB;aACvG;SACF,CAAC;IACJ,CAAC;SAAM,CAAC;QACN,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,WAAW,KAAK,CAAC,OAAQ,CAAC,MAAM,aAAa,cAAc,CAAC,QAAQ,EAAE,CAAC,CAAC;QACtG,WAAW,GAAG,yEAAyE,KAAK,CAAC,OAAO,EAAE,CAAC;IACzG,CAAC;IAED,MAAM,iBAAiB,GAAG,cAAc,CAAC,YAAY,IAAI,qBAAqB,CAAC;IAE/E,MAAM,QAAQ,GAAG,MAAM,QAAQ,CAAC,GAAG,EAAE,CACnC,UAAU,CAAC,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC;QACjC,KAAK,EAAE,cAAc,CAAC,QAAQ;QAC9B,QAAQ,EAAE;YACR,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,iBAAiB,EAAE;YAC9C,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,WAAW,EAAE;SACvC;QACD,UAAU,EAAE,cAAc,CAAC,SAAS;QACpC,WAAW,EAAE,CAAC;KACf,CAAC,CACH,CAAC;IAEF,MAAM,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IAErE,IAAI,WAAW,EAAE,CAAC;QAChB,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,SAAS,KAAK,CAAC,KAAM,GAAG,CAAC,IAAI,KAAK,CAAC,WAAW,eAAe,QAAQ,CAAC,MAAM,SAAS,CAAC,
CAAC;QACrH,OAAO,EAAE,aAAa,EAAE,CAAC,QAAQ,CAAC,EAAE,CAAC;IACvC,CAAC;IAED,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,uBAAuB,QAAQ,CAAC,MAAM,QAAQ,CAAC,CAAC;IAC9E,OAAO,EAAE,QAAQ,EAAE,CAAC;AACtB,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,gBAAgB,CAAC,KAAoB;IACnD,qEAAqE;IACrE,IAAI,KAAK,CAAC,aAAa,IAAI,KAAK,CAAC,aAAa,CAAC,MAAM,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,QAAQ,EAAE,CAAC;QAC7E,OAAO,gBAAgB,CAAC;IAC1B,CAAC;IACD,8EAA8E;IAC9E,OAAO,oBAAoB,CAAC;AAC9B,CAAC"}
@@ -0,0 +1,8 @@
1
import type { PipelineState } from "../core/state.js";
/**
 * Converts the input file to PDF using LibreOffice headless.
 * Updates state.filePath to point to the newly generated PDF.
 * Supported: DOCX, DOC, RTF, ODT, EPUB, PPTX, PPT, ODP
 *
 * @param state - Pipeline state; `filePath` must reference the source document.
 * @returns Partial state update with `filePath` pointing at the generated PDF.
 * @throws Error if LibreOffice cannot be run or no output PDF is produced.
 */
export declare function libreOfficeToPdf(state: PipelineState): Promise<Partial<PipelineState>>;
//# sourceMappingURL=libreOfficeToPdf.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"libreOfficeToPdf.d.ts","sourceRoot":"","sources":["../../src/nodes/libreOfficeToPdf.ts"],"names":[],"mappings":"AAKA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AActD;;;;GAIG;AACH,wBAAsB,gBAAgB,CACpC,KAAK,EAAE,aAAa,GACnB,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CA6CjC"}
@@ -0,0 +1,61 @@
1
+ import { execFile } from "node:child_process";
2
+ import { promisify } from "node:util";
3
+ import path from "node:path";
4
+ import os from "node:os";
5
+ import fs from "node:fs/promises";
6
+ import { logger, LogSource } from "../core/logger.js";
7
// Promise-returning wrapper around child_process.execFile.
const execFileAsync = promisify(execFile);
/**
 * Resolves the LibreOffice soffice binary.
 * Checks SOFFICE_PATH env var first, otherwise relies on system PATH.
 */
function getSofficePath() {
    // An explicit SOFFICE_PATH override wins; otherwise rely on PATH lookup.
    return process.env.SOFFICE_PATH || "soffice";
}
17
+ /**
18
+ * Converts the input file to PDF using LibreOffice headless.
19
+ * Updates state.filePath to point to the newly generated PDF.
20
+ * Supported: DOCX, DOC, RTF, ODT, EPUB, PPTX, PPT, ODP
21
+ */
22
+ export async function libreOfficeToPdf(state) {
23
+ const sofficePath = getSofficePath();
24
+ if (!state.filePath)
25
+ throw new Error("[libreOfficeToPdf] filePath is missing");
26
+ const inputPath = path.resolve(process.cwd(), state.filePath);
27
+ const outputDir = await fs.mkdtemp(path.join(os.tmpdir(), "lo-pdf-"));
28
+ logger.info(LogSource.LIBRE_OFFICE, `Converting: ${path.basename(inputPath)}`);
29
+ logger.info(LogSource.LIBRE_OFFICE, `Using soffice: ${sofficePath}`);
30
+ logger.info(LogSource.LIBRE_OFFICE, `Output dir: ${outputDir}`);
31
+ try {
32
+ await execFileAsync(sofficePath, [
33
+ "--headless",
34
+ "--norestore",
35
+ "--convert-to", "pdf",
36
+ "--outdir", outputDir,
37
+ inputPath,
38
+ ]);
39
+ }
40
+ catch (err) {
41
+ throw new Error(`LibreOffice conversion failed. Is LibreOffice installed?\n` +
42
+ ` Tried: ${sofficePath}\n` +
43
+ ` On macOS: brew install --cask libreoffice\n` +
44
+ ` Set SOFFICE_PATH in .env to override.\n` +
45
+ ` Original error: ${err.message}`);
46
+ }
47
+ // LibreOffice names the output file after the input file with .pdf extension
48
+ const baseName = path.parse(inputPath).name;
49
+ const pdfPath = path.join(outputDir, `${baseName}.pdf`);
50
+ // Verify the file exists
51
+ try {
52
+ await fs.access(pdfPath);
53
+ }
54
+ catch {
55
+ throw new Error(`LibreOffice ran but output PDF not found at: ${pdfPath}. ` +
56
+ `Check LibreOffice installation.`);
57
+ }
58
+ logger.success(LogSource.LIBRE_OFFICE, `Converted to: ${pdfPath}`);
59
+ return { filePath: pdfPath };
60
+ }
61
+ //# sourceMappingURL=libreOfficeToPdf.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"libreOfficeToPdf.js","sourceRoot":"","sources":["../../src/nodes/libreOfficeToPdf.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,oBAAoB,CAAC;AAC9C,OAAO,EAAE,SAAS,EAAE,MAAM,WAAW,CAAC;AACtC,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,EAAE,MAAM,SAAS,CAAC;AACzB,OAAO,EAAE,MAAM,kBAAkB,CAAC;AAElC,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAEtD,MAAM,aAAa,GAAG,SAAS,CAAC,QAAQ,CAAC,CAAC;AAE1C;;;GAGG;AACH,SAAS,cAAc;IACrB,IAAI,OAAO,CAAC,GAAG,CAAC,YAAY;QAAE,OAAO,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC;IAC9D,OAAO,SAAS,CAAC,CAAC,qCAAqC;AACzD,CAAC;AAED;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,KAAoB;IAEpB,MAAM,WAAW,GAAG,cAAc,EAAE,CAAC;IACrC,IAAI,CAAC,KAAK,CAAC,QAAQ;QAAE,MAAM,IAAI,KAAK,CAAC,wCAAwC,CAAC,CAAC;IAC/E,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,KAAK,CAAC,QAAQ,CAAC,CAAC;IAC9D,MAAM,SAAS,GAAG,MAAM,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,MAAM,EAAE,EAAE,SAAS,CAAC,CAAC,CAAC;IAEtE,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,YAAY,EAAE,eAAe,IAAI,CAAC,QAAQ,CAAC,SAAS,CAAC,EAAE,CAAC,CAAC;IAC/E,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,YAAY,EAAE,kBAAkB,WAAW,EAAE,CAAC,CAAC;IACrE,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,YAAY,EAAE,eAAe,SAAS,EAAE,CAAC,CAAC;IAEhE,IAAI,CAAC;QACH,MAAM,aAAa,CAAC,WAAW,EAAE;YAC/B,YAAY;YACZ,aAAa;YACb,cAAc,EAAE,KAAK;YACrB,UAAU,EAAE,SAAS;YACrB,SAAS;SACV,CAAC,CAAC;IACL,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,MAAM,IAAI,KAAK,CACb,4DAA4D;YAC5D,YAAY,WAAW,IAAI;YAC3B,+CAA+C;YAC/C,2CAA2C;YAC3C,qBAAqB,GAAG,CAAC,OAAO,EAAE,CACnC,CAAC;IACJ,CAAC;IAED,6EAA6E;IAC7E,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC;IAC5C,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,GAAG,QAAQ,MAAM,CAAC,CAAC;IAExD,yBAAyB;IACzB,IAAI,CAAC;QACH,MAAM,EAAE,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAC3B,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,KAAK,CACb,gDAAgD,OAAO,IAAI;YAC3D,iCAAiC,CAClC,CAAC;IACJ,CAAC;IAED,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,YAAY,EAAE,iBAAiB,OAAO,EAAE,CAAC,CAAC;IAEnE,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,CAAC;AAC/B,CAAC"}
@@ -0,0 +1,19 @@
1
import type { PipelineState } from "../core/state.js";
/**
 * Unified LLM node for all document extraction flows.
 * Handles both:
 * 1. Base64 PDF chunks via Vision (Parallel Map-Reduce branch)
 * 2. Raw text extracted by textExtractorNode (Text branch)
 *
 * @param state - Pipeline state. Chunk flow requires `chunk` (base64 PDF),
 *   `index`, and `totalChunks` all set; text flow requires `rawText`.
 * @returns `{ markdownParts: [...] }` for the chunk flow, `{ markdown }` for
 *   the text flow.
 * @throws Error when neither `chunk` nor `rawText` is provided.
 */
export declare function llmExtractionNode(state: Partial<PipelineState> & {
    chunk?: string;
    index?: number;
    totalChunks?: number;
}): Promise<Partial<PipelineState>>;
/**
 * Conditional router to determine what happens after llmExtractionNode.
 * - If from PDF branch, it returns to markdownMerger
 * - If from Text branch, it goes straight to markdownNormalizer
 *
 * @returns The name of the next graph node.
 */
export declare function routeAfterLlm(state: PipelineState): string;
//# sourceMappingURL=llmExtractionNode.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"llmExtractionNode.d.ts","sourceRoot":"","sources":["../../src/nodes/llmExtractionNode.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAuBtD;;;;;GAKG;AACH,wBAAsB,iBAAiB,CACrC,KAAK,EAAE,OAAO,CAAC,aAAa,CAAC,GAAG;IAAE,KAAK,CAAC,EAAE,MAAM,CAAC;IAAC,KAAK,CAAC,EAAE,MAAM,CAAC;IAAC,WAAW,CAAC,EAAE,MAAM,CAAA;CAAE,GACvF,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CAuCjC;AAED;;;;GAIG;AACH,wBAAgB,aAAa,CAAC,KAAK,EAAE,aAAa,GAAG,MAAM,CAK1D"}
@@ -0,0 +1,68 @@
1
// Pipeline config from ../core/config.js; apiLimit wraps each adapter call and
// requireInit guards against use before initialization.
import { pipelineConfig, apiLimit, requireInit } from "../core/config.js";
import { logger, LogSource } from "../core/logger.js";
// Fallback system prompt used by llmExtractionNode whenever
// pipelineConfig.systemPrompt is unset. This text is passed to the injected
// LLM adapter verbatim, so edits here directly change extraction output.
const DEFAULT_SYSTEM_PROMPT = `You are an expert document extraction and formatting AI. Your task is to extract the exact, verbatim content from the provided document and convert it entirely into standard Markdown format.

You must strictly adhere to the following rules:

1. **Absolute Accuracy:** Extract the text exactly as it appears in the source document. Do not summarize, rephrase, omit, or add any text. Maintain the original spelling and punctuation.
2. **Markdown Structure:** - Replicate the document's structure using standard Markdown.
- Use correct heading levels ('#', '##', '###') to match the visual hierarchy of the PDF.
- Preserve text formatting, utilizing '**bold**' for bold text and '*italics*' for italicized text.
- Convert bulleted and numbered lists into their respective Markdown list formats.
- Convert all tabular data into standard Markdown tables. Ensure rows and columns align with the original document.
3. **Image Handling (CRITICAL):** For every image, photograph, chart, graph, or diagram in the PDF, you must insert a Markdown image placeholder.
- The format must be: '![Image Placeholder: <Detailed Description>](image_number)'
- Replace '<Detailed Description>' with a highly descriptive, comprehensive explanation of everything visible in the image. Include colors, subjects, layout, data points (if it's a chart), and transcribe any text that appears within the image itself.
- Example: '![Image Placeholder: A bar chart comparing Q1 and Q2 sales. Q1 shows $50,000 in blue, Q2 shows $75,000 in green. The x-axis is labeled 'Quarters' and the y-axis is labeled 'Revenue in USD'.](image_1)'
4. **Headers and Footers:** Omit repetitive page numbers, document titles in the header, and footers unless they contain crucial footnotes directly referenced in the main text. If footnotes are present, append them to the end of the relevant section or document.
5. **Formatting Artifacts:** Remove arbitrary line breaks caused by PDF page formatting. Stitch sentences back together so they flow naturally in the Markdown output.

Output the final Markdown only. Do not include conversational filler before or after the extracted content.`;
21
+ /**
22
+ * Unified LLM node for all document extraction flows.
23
+ * Handles both:
24
+ * 1. Base64 PDF chunks via Vision (Parallel Map-Reduce branch)
25
+ * 2. Raw text extracted by textExtractorNode (Text branch)
26
+ */
27
+ export async function llmExtractionNode(state) {
28
+ requireInit();
29
+ const isChunkFlow = state.chunk !== undefined && state.index !== undefined && state.totalChunks !== undefined;
30
+ const isTextFlow = !!state.rawText;
31
+ if (!isChunkFlow && !isTextFlow) {
32
+ throw new Error("[llmExtractionNode] Neither chunk nor rawText was provided in the state.");
33
+ }
34
+ const finalSystemPrompt = pipelineConfig.systemPrompt || DEFAULT_SYSTEM_PROMPT;
35
+ const promptInput = {
36
+ systemPrompt: finalSystemPrompt,
37
+ userText: isChunkFlow
38
+ ? `Extract all content from this PDF (chunk ${state.index + 1} of ${state.totalChunks}) into clean Markdown.`
39
+ : `Convert the following extracted document text into clean Markdown:\n\n${state.rawText}`,
40
+ base64PdfChunk: isChunkFlow ? state.chunk : undefined
41
+ };
42
+ if (isChunkFlow) {
43
+ logger.info(LogSource.LLM_EXTRACTION, `Processing PDF chunk ${state.index + 1}/${state.totalChunks} (${((state.chunk.length * 0.75) / 1024).toFixed(0)} KB)`);
44
+ }
45
+ else {
46
+ logger.info(LogSource.LLM_EXTRACTION, `Sending ${state.rawText.length} chars to generic LLM Adapter`);
47
+ }
48
+ // Call the injected LLM adapter wrapped in your rate limiter!
49
+ const markdown = await apiLimit(() => pipelineConfig.llm.generateMarkdown(promptInput));
50
+ if (isChunkFlow) {
51
+ logger.info(LogSource.LLM_EXTRACTION, `Chunk ${state.index + 1}/${state.totalChunks} extracted (${markdown.length} chars)`);
52
+ return { markdownParts: [markdown] };
53
+ }
54
+ logger.info(LogSource.LLM_EXTRACTION, `Extracted markdown: ${markdown.length} chars`);
55
+ return { markdown };
56
+ }
57
+ /**
58
+ * Conditional router to determine what happens after llmExtractionNode.
59
+ * - If from PDF branch, it returns to markdownMerger
60
+ * - If from Text branch, it goes straight to markdownNormalizer
61
+ */
62
+ export function routeAfterLlm(state) {
63
+ if (state.markdownParts && state.markdownParts.length > 0 && !state.markdown) {
64
+ return "markdownMerger";
65
+ }
66
+ return "markdownNormalizer";
67
+ }
68
+ //# sourceMappingURL=llmExtractionNode.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"llmExtractionNode.js","sourceRoot":"","sources":["../../src/nodes/llmExtractionNode.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,cAAc,EAAE,QAAQ,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAE1E,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAGtD,MAAM,qBAAqB,GAAG;;;;;;;;;;;;;;;;;4GAiB8E,CAAC;AAE7G;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,KAAwF;IAGxF,WAAW,EAAE,CAAC;IAEd,MAAM,WAAW,GAAG,KAAK,CAAC,KAAK,KAAK,SAAS,IAAI,KAAK,CAAC,KAAK,KAAK,SAAS,IAAI,KAAK,CAAC,WAAW,KAAK,SAAS,CAAC;IAC9G,MAAM,UAAU,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC;IAEnC,IAAI,CAAC,WAAW,IAAI,CAAC,UAAU,EAAE,CAAC;QAChC,MAAM,IAAI,KAAK,CAAC,0EAA0E,CAAC,CAAC;IAC9F,CAAC;IAED,MAAM,iBAAiB,GAAG,cAAc,CAAC,YAAY,IAAI,qBAAqB,CAAC;IAE/E,MAAM,WAAW,GAAa;QAC5B,YAAY,EAAE,iBAAiB;QAC/B,QAAQ,EAAE,WAAW;YACnB,CAAC,CAAC,4CAA4C,KAAK,CAAC,KAAM,GAAG,CAAC,OAAO,KAAK,CAAC,WAAW,wBAAwB;YAC9G,CAAC,CAAC,yEAAyE,KAAK,CAAC,OAAO,EAAE;QAC5F,cAAc,EAAE,WAAW,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;KACtD,CAAC;IAEF,IAAI,WAAW,EAAE,CAAC;QAChB,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,wBAAwB,KAAK,CAAC,KAAM,GAAG,CAAC,IAAI,KAAK,CAAC,WAAW,KAAK,CAAC,CAAC,KAAK,CAAC,KAAM,CAAC,MAAM,GAAG,IAAI,CAAC,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;IAClK,CAAC;SAAM,CAAC;QACN,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,WAAW,KAAK,CAAC,OAAQ,CAAC,MAAM,+BAA+B,CAAC,CAAC;IACzG,CAAC;IAED,8DAA8D;IAC9D,MAAM,QAAQ,GAAG,MAAM,QAAQ,CAAC,GAAG,EAAE,CACnC,cAAc,CAAC,GAAG,CAAC,gBAAgB,CAAC,WAAW,CAAC,CACjD,CAAC;IAEF,IAAI,WAAW,EAAE,CAAC;QAChB,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,SAAS,KAAK,CAAC,KAAM,GAAG,CAAC,IAAI,KAAK,CAAC,WAAW,eAAe,QAAQ,CAAC,MAAM,SAAS,CAAC,CAAC;QAC7H,OAAO,EAAE,aAAa,EAAE,CAAC,QAAQ,CAAC,EAAE,CAAC;IACvC,CAAC;IAED,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,uBAAuB,QAAQ,CAAC,MAAM,QAAQ,CAAC,CAAC;IACtF,OAAO,EAAE,QAAQ,EAAE,CAAC;AACtB,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,aAAa,CAAC,KAAoB;IAChD,IAAI,KAAK,CAAC,aAAa,IAAI,KAAK,CAAC,aAAa,CAAC,MAAM,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,QAAQ,EAAE,CAAC;QAC7E,OAAO,gBAAgB,CAAC;IAC1B,CAAC;IACD,OAAO,oBAAoB,CAAC;AAC9B,CAAC"}
@@ -0,0 +1,8 @@
1
import type { PipelineState } from "../core/state.js";
/**
 * Splits markdown text into semantic chunks using LangChain's RecursiveCharacterTextSplitter.
 * This splitter tries to split on paragraphs, then sentences, then words to keep
 * related content together while respecting the chunk size.
 *
 * @param state - Pipeline state; `markdown` must be set.
 * @returns Partial state update with `textChunks` (trimmed chunk strings).
 */
export declare function markdownChunker(state: PipelineState): Promise<Partial<PipelineState>>;
//# sourceMappingURL=markdownChunker.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"markdownChunker.d.ts","sourceRoot":"","sources":["../../src/nodes/markdownChunker.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAGtD;;;;GAIG;AACH,wBAAsB,eAAe,CACnC,KAAK,EAAE,aAAa,GACnB,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CAmBjC"}
@@ -0,0 +1,24 @@
1
+ import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
2
+ import { pipelineConfig, requireInit } from "../core/config.js";
3
+ import { logger, LogSource } from "../core/logger.js";
4
+ /**
5
+ * Splits markdown text into semantic chunks using LangChain's RecursiveCharacterTextSplitter.
6
+ * This splitter tries to split on paragraphs, then sentences, then words to keep
7
+ * related content together while respecting the chunk size.
8
+ */
9
+ export async function markdownChunker(state) {
10
+ requireInit();
11
+ const { markdown } = state;
12
+ logger.info(LogSource.MARKDOWN_CHUNKER, `Input: ${markdown.length} chars`);
13
+ const splitter = new RecursiveCharacterTextSplitter({
14
+ chunkSize: pipelineConfig.chunkSize,
15
+ chunkOverlap: pipelineConfig.chunkOverlap,
16
+ // Optimal separators for Markdown
17
+ separators: ["\n\n", "\n", " ", ""],
18
+ });
19
+ const docs = await splitter.createDocuments([markdown]);
20
+ const textChunks = docs.map((doc) => doc.pageContent.trim());
21
+ logger.info(LogSource.MARKDOWN_CHUNKER, `Created ${textChunks.length} chunks`);
22
+ return { textChunks };
23
+ }
24
+ //# sourceMappingURL=markdownChunker.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"markdownChunker.js","sourceRoot":"","sources":["../../src/nodes/markdownChunker.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,8BAA8B,EAAE,MAAM,0BAA0B,CAAC;AAC1E,OAAO,EAAE,cAAc,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAEhE,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAEtD;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,KAAoB;IAEpB,WAAW,EAAE,CAAC;IACd,MAAM,EAAE,QAAQ,EAAE,GAAG,KAAK,CAAC;IAE3B,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,gBAAgB,EAAE,UAAU,QAAQ,CAAC,MAAM,QAAQ,CAAC,CAAC;IAE3E,MAAM,QAAQ,GAAG,IAAI,8BAA8B,CAAC;QAClD,SAAS,EAAE,cAAc,CAAC,SAAS;QACnC,YAAY,EAAE,cAAc,CAAC,YAAY;QACzC,kCAAkC;QAClC,UAAU,EAAE,CAAC,MAAM,EAAE,IAAI,EAAE,GAAG,EAAE,EAAE,CAAC;KACpC,CAAC,CAAC;IAEH,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,eAAe,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC;IACxD,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC,CAAC;IAE7D,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,gBAAgB,EAAE,WAAW,UAAU,CAAC,MAAM,SAAS,CAAC,CAAC;IAE/E,OAAO,EAAE,UAAU,EAAE,CAAC;AACxB,CAAC"}
@@ -0,0 +1,9 @@
1
import type { PipelineState } from "../core/state.js";
/**
 * Joins markdownParts from the PDF branch into a single markdown string.
 * Performs basic boundary-seam fixes:
 * - Removes duplicate headings at chunk boundaries
 * - Trims excessive whitespace between chunks
 *
 * @param state - Pipeline state; `markdownParts` must hold the per-chunk output.
 * @returns Partial state update with the merged `markdown`.
 */
export declare function markdownMerger(state: PipelineState): Promise<Partial<PipelineState>>;
//# sourceMappingURL=markdownMerger.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"markdownMerger.d.ts","sourceRoot":"","sources":["../../src/nodes/markdownMerger.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAGtD;;;;;GAKG;AACH,wBAAsB,cAAc,CAClC,KAAK,EAAE,aAAa,GACnB,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CAoCjC"}
@@ -0,0 +1,33 @@
1
+ import { logger, LogSource } from "../core/logger.js";
2
+ /**
3
+ * Joins markdownParts from the PDF branch into a single markdown string.
4
+ * Performs basic boundary-seam fixes:
5
+ * - Removes duplicate headings at chunk boundaries
6
+ * - Trims excessive whitespace between chunks
7
+ */
8
+ export async function markdownMerger(state) {
9
+ const { markdownParts } = state;
10
+ logger.info(LogSource.MARKDOWN_MERGER, `Merging ${markdownParts.length} markdown part(s)`);
11
+ let merged = "";
12
+ for (let i = 0; i < markdownParts.length; i++) {
13
+ let part = markdownParts[i].trim();
14
+ if (i > 0) {
15
+ // Check if the previous chunk ended with a heading that this chunk starts with
16
+ const prevLines = merged.trimEnd().split("\n");
17
+ const lastPrevLine = prevLines[prevLines.length - 1]?.trim() ?? "";
18
+ const firstLine = part.split("\n")[0]?.trim() ?? "";
19
+ if (lastPrevLine === firstLine &&
20
+ firstLine.startsWith("#")) {
21
+ // Duplicate heading at boundary — skip it in current chunk
22
+ part = part.split("\n").slice(1).join("\n").trim();
23
+ }
24
+ merged += "\n\n";
25
+ }
26
+ merged += part;
27
+ }
28
+ // Collapse triple+ newlines into double
29
+ merged = merged.replace(/\n{3,}/g, "\n\n");
30
+ logger.info(LogSource.MARKDOWN_MERGER, `Merged markdown: ${merged.length} chars`);
31
+ return { markdown: merged };
32
+ }
33
+ //# sourceMappingURL=markdownMerger.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"markdownMerger.js","sourceRoot":"","sources":["../../src/nodes/markdownMerger.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAEtD;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,KAAoB;IAEpB,MAAM,EAAE,aAAa,EAAE,GAAG,KAAK,CAAC;IAEhC,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,eAAe,EAAE,WAAW,aAAa,CAAC,MAAM,mBAAmB,CAAC,CAAC;IAE3F,IAAI,MAAM,GAAG,EAAE,CAAC;IAEhB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,aAAa,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC9C,IAAI,IAAI,GAAG,aAAa,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QAEnC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;YACV,+EAA+E;YAC/E,MAAM,SAAS,GAAG,MAAM,CAAC,OAAO,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;YAC/C,MAAM,YAAY,GAAG,SAAS,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;YACnE,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;YAEpD,IACE,YAAY,KAAK,SAAS;gBAC1B,SAAS,CAAC,UAAU,CAAC,GAAG,CAAC,EACzB,CAAC;gBACD,2DAA2D;gBAC3D,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;YACrD,CAAC;YAED,MAAM,IAAI,MAAM,CAAC;QACnB,CAAC;QAED,MAAM,IAAI,IAAI,CAAC;IACjB,CAAC;IAED,wCAAwC;IACxC,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IAE3C,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,eAAe,EAAE,oBAAoB,MAAM,CAAC,MAAM,QAAQ,CAAC,CAAC;IAElF,OAAO,EAAE,QAAQ,EAAE,MAAM,EAAE,CAAC;AAC9B,CAAC"}
@@ -0,0 +1,10 @@
1
+ import type { PipelineState } from "../core/state.js";
2
+ /**
3
+ * Normalizes the merged/extracted markdown:
4
+ * - Strips residual HTML tags
5
+ * - Fixes broken Markdown table alignment
6
+ * - Deduplicates repeated headers/footers
7
+ * - Trims excessive whitespace
8
+ */
9
+ export declare function markdownNormalizer(state: PipelineState): Promise<Partial<PipelineState>>;
10
+ //# sourceMappingURL=markdownNormalizer.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"markdownNormalizer.d.ts","sourceRoot":"","sources":["../../src/nodes/markdownNormalizer.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAGtD;;;;;;GAMG;AACH,wBAAsB,kBAAkB,CACtC,KAAK,EAAE,aAAa,GACnB,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CA8CjC"}
@@ -0,0 +1,46 @@
1
+ import { logger, LogSource } from "../core/logger.js";
2
/**
 * Normalizes the merged/extracted markdown:
 * - Strips residual HTML tags
 * - Fixes broken Markdown table alignment
 * - Deduplicates repeated headers/footers
 * - Trims excessive whitespace
 *
 * @param state Pipeline state; reads `state.markdown`.
 * @returns Partial state carrying the normalized `markdown` string.
 */
export async function markdownNormalizer(state) {
    let md = state.markdown;
    logger.info(LogSource.MARKDOWN_NORMALIZER, `Input: ${md.length} chars`);
    // 1. Strip HTML tags (but keep content)
    md = md.replace(/<\/?[^>]+(>|$)/g, "");
    // 2. Fix table alignment: trim whitespace inside each cell and rebuild the row.
    // Interior empty cells are preserved — dropping them (the old filter(Boolean)
    // behavior) shifted columns and misaligned rows against the header. Only the
    // empty strings produced by splitting on the outermost pipes are removed.
    md = md.replace(/^\|(.+)\|$/gm, (match) => {
        const cells = match
            .split("|")
            .slice(1, -1) // the leading/trailing "|" always yield empty edge entries
            .map((cell) => cell.trim());
        return `| ${cells.join(" | ")} |`;
    });
    // 3. Deduplicate consecutive identical headings
    const lines = md.split("\n");
    const deduped = [];
    for (const line of lines) {
        const prev = deduped[deduped.length - 1];
        if (line.startsWith("#") && prev === line) {
            continue; // skip duplicate heading
        }
        deduped.push(line);
    }
    md = deduped.join("\n");
    // 4. Collapse excessive blank lines
    md = md.replace(/\n{3,}/g, "\n\n");
    // 5. Trim
    md = md.trim();
    logger.info(LogSource.MARKDOWN_NORMALIZER, `Output: ${md.length} chars`);
    return { markdown: md };
}
46
+ //# sourceMappingURL=markdownNormalizer.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"markdownNormalizer.js","sourceRoot":"","sources":["../../src/nodes/markdownNormalizer.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAEtD;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,kBAAkB,CACtC,KAAoB;IAEpB,IAAI,EAAE,GAAG,KAAK,CAAC,QAAQ,CAAC;IAExB,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,mBAAmB,EAAE,UAAU,EAAE,CAAC,MAAM,QAAQ,CAAC,CAAC;IAExE,wCAAwC;IACxC,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,iBAAiB,EAAE,EAAE,CAAC,CAAC;IAEvC,oDAAoD;IACpD,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,cAAc,EAAE,CAAC,KAAK,EAAE,EAAE;QACxC,gDAAgD;QAChD,OAAO,KAAK;aACT,KAAK,CAAC,GAAG,CAAC;aACV,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;aAC1B,MAAM,CAAC,OAAO,CAAC;aACf,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,IAAI,GAAG,CAAC;aAC1B,IAAI,CAAC,GAAG,CAAC;aACT,OAAO,CAAC,GAAG,EAAE,GAAG,CAAC;aACjB,OAAO,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC;IACvB,CAAC,CAAC,CAAC;IAEH,gDAAgD;IAChD,MAAM,KAAK,GAAG,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAC7B,MAAM,OAAO,GAAa,EAAE,CAAC;IAC7B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QACtB,MAAM,IAAI,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QACzC,IACE,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC;YACpB,IAAI,KAAK,IAAI,EACb,CAAC;YACD,SAAS,CAAC,yBAAyB;QACrC,CAAC;QACD,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACrB,CAAC;IACD,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAExB,oCAAoC;IACpC,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IAEnC,UAAU;IACV,EAAE,GAAG,EAAE,CAAC,IAAI,EAAE,CAAC;IAEf,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,mBAAmB,EAAE,WAAW,EAAE,CAAC,MAAM,QAAQ,CAAC,CAAC;IAEzE,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,CAAC;AAC1B,CAAC"}
@@ -0,0 +1,7 @@
1
+ import type { PipelineState } from "../state.js";
2
+ /**
3
+ * Embeds all textChunks using the embedding model configured in pipelineConfig.embeddingModel.
4
+ * Processes in batches of BATCH_SIZE to stay within API limits.
5
+ */
6
+ export declare function openrouterEmbedder(state: PipelineState): Promise<Partial<PipelineState>>;
7
+ //# sourceMappingURL=openrouterEmbedder.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"openrouterEmbedder.d.ts","sourceRoot":"","sources":["../../src/nodes/openrouterEmbedder.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AAMjD;;;GAGG;AACH,wBAAsB,kBAAkB,CACtC,KAAK,EAAE,aAAa,GACnB,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CA6BjC"}
@@ -0,0 +1,31 @@
1
+ import { openrouter, pipelineConfig, requireInit } from "../config.js";
2
+ import { logger, LogSource } from "../logger.js";
3
+ /** Maximum chunks to embed in a single OpenAI API call */
4
+ const BATCH_SIZE = 50;
5
/**
 * Embeds all textChunks with the embedding model configured in
 * pipelineConfig.embeddingModel, requesting 1536-dimensional vectors.
 * Processes in batches of BATCH_SIZE to stay within API limits.
 */
export async function openrouterEmbedder(state) {
    requireInit();
    const { textChunks } = state;
    logger.info(LogSource.OPENROUTER_EMBEDDER, `Embedding ${textChunks.length} chunks with ${pipelineConfig.embeddingModel}`);
    const vectors = [];
    for (let offset = 0; offset < textChunks.length; offset += BATCH_SIZE) {
        const slice = textChunks.slice(offset, offset + BATCH_SIZE);
        logger.info(LogSource.OPENROUTER_EMBEDDER, `Batch ${Math.floor(offset / BATCH_SIZE) + 1}: ${slice.length} chunk(s)`);
        // NOTE: 'dimensions' may be missing from the basic OpenAI typings in
        // some SDK versions, hence the loose typing of this request object.
        const response = await openrouter.embeddings.create({
            model: pipelineConfig.embeddingModel,
            input: slice,
            dimensions: 1536,
        });
        // Sort by `index` so each embedding lines up with its source chunk.
        response.data
            .sort((a, b) => a.index - b.index)
            .forEach((entry) => vectors.push(entry.embedding));
    }
    logger.info(LogSource.OPENROUTER_EMBEDDER, `Generated ${vectors.length} vectors (${vectors[0]?.length ?? 0}d)`);
    return { vectors };
}
31
+ //# sourceMappingURL=openrouterEmbedder.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"openrouterEmbedder.js","sourceRoot":"","sources":["../../src/nodes/openrouterEmbedder.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,cAAc,EAAE,WAAW,EAAE,MAAM,cAAc,CAAC;AAEvE,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AAEjD,0DAA0D;AAC1D,MAAM,UAAU,GAAG,EAAE,CAAC;AAEtB;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,kBAAkB,CACtC,KAAoB;IAEpB,WAAW,EAAE,CAAC;IACd,MAAM,EAAE,UAAU,EAAE,GAAG,KAAK,CAAC;IAE7B,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,mBAAmB,EAAE,aAAa,UAAU,CAAC,MAAM,gBAAgB,cAAc,CAAC,cAAc,EAAE,CAAC,CAAC;IAE1H,MAAM,UAAU,GAAe,EAAE,CAAC;IAElC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,IAAI,UAAU,EAAE,CAAC;QACvD,MAAM,KAAK,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,CAAC;QAElD,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,mBAAmB,EAAE,SAAS,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,UAAU,CAAC,GAAG,CAAC,KAAK,KAAK,CAAC,MAAM,WAAW,CAAC,CAAC;QAEhH,MAAM,QAAQ,GAAG,MAAM,UAAU,CAAC,UAAU,CAAC,MAAM,CAAC;YAClD,KAAK,EAAE,cAAc,CAAC,cAAc;YACpC,KAAK,EAAE,KAAK;YACZ,UAAU,EAAE,IAAI;SACV,CAAC,CAAC,CAAC,yFAAyF;QAEpG,2CAA2C;QAC3C,MAAM,MAAM,GAAG,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAM,EAAE,CAAM,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;QACzE,KAAK,MAAM,IAAI,IAAI,MAAM,EAAE,CAAC;YAC1B,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;QAClC,CAAC;IACH,CAAC;IAED,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,mBAAmB,EAAE,aAAa,UAAU,CAAC,MAAM,aAAa,UAAU,CAAC,CAAC,CAAC,EAAE,MAAM,IAAI,CAAC,IAAI,CAAC,CAAC;IAEtH,OAAO,EAAE,OAAO,EAAE,UAAU,EAAE,CAAC;AACjC,CAAC"}
@@ -0,0 +1,7 @@
1
+ import type { PipelineState } from "../core/state.js";
2
+ /**
3
+ * Splits a PDF into sub-documents of PDF_PAGES_PER_CHUNK pages each.
4
+ * Each sub-document is serialised back to a Buffer for downstream Gemini processing.
5
+ */
6
+ export declare function pdfSplitter(state: PipelineState): Promise<Partial<PipelineState>>;
7
+ //# sourceMappingURL=pdfSplitter.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pdfSplitter.d.ts","sourceRoot":"","sources":["../../src/nodes/pdfSplitter.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAGtD;;;GAGG;AACH,wBAAsB,WAAW,CAC/B,KAAK,EAAE,aAAa,GACnB,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CAwCjC"}
@@ -0,0 +1,41 @@
1
+ import fs from "node:fs/promises";
2
+ import path from "node:path";
3
+ import { PDFDocument } from "pdf-lib";
4
+ import { pipelineConfig, requireInit } from "../core/config.js";
5
+ import { logger, LogSource } from "../core/logger.js";
6
/**
 * Splits a PDF into sub-documents of PDF_PAGES_PER_CHUNK pages each.
 * Each sub-document is serialised to a base64 string for downstream Gemini processing.
 */
export async function pdfSplitter(state) {
    requireInit();
    if (!state.filePath)
        throw new Error("[pdfSplitter] filePath is missing");
    const fullPath = path.resolve(process.cwd(), state.filePath);
    logger.info(LogSource.PDF_SPLITTER, `Reading file at: ${fullPath}`);
    let fileBuffer;
    try {
        fileBuffer = await fs.readFile(fullPath);
    }
    catch (err) {
        throw new Error(`Failed to read file at ${fullPath}: ${err.message}`);
    }
    const sourceDoc = await PDFDocument.load(fileBuffer);
    const pageCount = sourceDoc.getPageCount();
    logger.info(LogSource.PDF_SPLITTER, `Total pages: ${pageCount}`);
    logger.info(LogSource.PDF_SPLITTER, `Splitting into chunks of ${pipelineConfig.pdfPagesPerChunk} pages`);
    const encoded = [];
    for (let first = 0; first < pageCount; first += pipelineConfig.pdfPagesPerChunk) {
        const last = Math.min(first + pipelineConfig.pdfPagesPerChunk, pageCount);
        // Copy pages [first, last) from the source into a fresh document.
        const indices = Array.from({ length: last - first }, (_, k) => first + k);
        const subDoc = await PDFDocument.create();
        const pages = await subDoc.copyPages(sourceDoc, indices);
        pages.forEach((page) => subDoc.addPage(page));
        const bytes = await subDoc.save();
        encoded.push(Buffer.from(bytes).toString("base64"));
    }
    logger.info(LogSource.PDF_SPLITTER, `Created ${encoded.length} PDF chunk(s)`);
    return { pdfChunks: encoded };
}
41
+ //# sourceMappingURL=pdfSplitter.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pdfSplitter.js","sourceRoot":"","sources":["../../src/nodes/pdfSplitter.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,kBAAkB,CAAC;AAClC,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,EAAE,WAAW,EAAE,MAAM,SAAS,CAAC;AACtC,OAAO,EAAE,cAAc,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAEhE,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAEtD;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,KAAoB;IAEpB,WAAW,EAAE,CAAC;IACd,IAAI,CAAC,KAAK,CAAC,QAAQ;QAAE,MAAM,IAAI,KAAK,CAAC,mCAAmC,CAAC,CAAC;IAC1E,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,KAAK,CAAC,QAAQ,CAAC,CAAC;IAC7D,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,YAAY,EAAE,oBAAoB,QAAQ,EAAE,CAAC,CAAC;IAEpE,IAAI,UAAU,CAAC;IACf,IAAI,CAAC;QACH,UAAU,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAC3C,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,MAAM,IAAI,KAAK,CAAC,0BAA0B,QAAQ,KAAK,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC;IACxE,CAAC;IACD,MAAM,MAAM,GAAG,MAAM,WAAW,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;IAClD,MAAM,UAAU,GAAG,MAAM,CAAC,YAAY,EAAE,CAAC;IAEzC,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,YAAY,EAAE,gBAAgB,UAAU,EAAE,CAAC,CAAC;IAClE,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,YAAY,EAAE,4BAA4B,cAAc,CAAC,gBAAgB,QAAQ,CAAC,CAAC;IAEzG,MAAM,MAAM,GAAa,EAAE,CAAC;IAE5B,KAAK,IAAI,KAAK,GAAG,CAAC,EAAE,KAAK,GAAG,UAAU,EAAE,KAAK,IAAI,cAAc,CAAC,gBAAgB,EAAE,CAAC;QACjF,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,GAAG,cAAc,CAAC,gBAAgB,EAAE,UAAU,CAAC,CAAC;QAC1E,MAAM,MAAM,GAAG,MAAM,WAAW,CAAC,MAAM,EAAE,CAAC;QAE1C,MAAM,WAAW,GAAG,MAAM,MAAM,CAAC,SAAS,CACxC,MAAM,EACN,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,GAAG,GAAG,KAAK,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,KAAK,GAAG,CAAC,CAAC,CACzD,CAAC;QAEF,KAAK,MAAM,IAAI,IAAI,WAAW,EAAE,CAAC;YAC/B,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;QACvB,CAAC;QAED,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,IAAI,EAAE,CAAC;QACrC,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC,CAAC;IACxD,CAAC;IAED,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,YAAY,EAAE,WAAW,MAAM,CAAC,MAAM,eAAe,CAAC,CAAC;IAE7E,OAAO,EAAE,SAAS,EAAE,MAAM,EAAE,CAAC;AAC/B,CAAC"}
@@ -0,0 +1,7 @@
1
+ import type { PipelineState } from "../core/state.js";
2
+ /**
3
+ * Saves the final normalized markdown to a local file.
4
+ * Creates a folder named after the document (with a unique hash) in the 'outputs' directory.
5
+ */
6
+ export declare function saveMarkdown(state: PipelineState): Promise<Partial<PipelineState>>;
7
+ //# sourceMappingURL=saveMarkdown.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"saveMarkdown.d.ts","sourceRoot":"","sources":["../../src/nodes/saveMarkdown.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAGtD;;;GAGG;AACH,wBAAsB,YAAY,CAChC,KAAK,EAAE,aAAa,GACnB,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CAyBjC"}
@@ -0,0 +1,28 @@
1
+ import fs from "node:fs/promises";
2
+ import path from "node:path";
3
+ import crypto from "node:crypto";
4
+ import { logger, LogSource } from "../core/logger.js";
5
/**
 * Saves the final normalized markdown to a local file.
 * Creates a folder named after the document (with a unique hash) in the 'outputs' directory.
 *
 * @param state Pipeline state; reads `filePath` (may be unset for pasted text) and `markdown`.
 * @returns Empty partial state — this node only performs a filesystem side effect.
 */
export async function saveMarkdown(state) {
    const { filePath, markdown } = state;
    // Short md5 prefix used purely as a stable folder-name disambiguator
    // (non-cryptographic use): same input path → same output folder.
    const fileHash = crypto
        .createHash("md5")
        .update(filePath || "pasted_text")
        .digest("hex")
        .slice(0, 16);
    const baseName = filePath ? path.parse(filePath).name : "pasted_text";
    const outputDir = path.resolve(process.cwd(), "outputs", `${baseName}_${fileHash}`);
    const outputPath = path.join(outputDir, "full_content.md");
    logger.info(LogSource.SAVE_MARKDOWN, `Saving to: ${outputPath}`);
    // Create directory (and parents)
    await fs.mkdir(outputDir, { recursive: true });
    // Write markdown
    await fs.writeFile(outputPath, markdown, "utf-8");
    logger.success(LogSource.SAVE_MARKDOWN, `Markdown saved successfully`);
    return {};
}
28
+ //# sourceMappingURL=saveMarkdown.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"saveMarkdown.js","sourceRoot":"","sources":["../../src/nodes/saveMarkdown.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,kBAAkB,CAAC;AAClC,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,MAAM,MAAM,aAAa,CAAC;AAEjC,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAEtD;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,YAAY,CAChC,KAAoB;IAEpB,MAAM,EAAE,QAAQ,EAAE,QAAQ,EAAE,GAAG,KAAK,CAAC;IAErC,6CAA6C;IAC7C,MAAM,QAAQ,GAAG,MAAM;SACpB,UAAU,CAAC,KAAK,CAAC;SACjB,MAAM,CAAC,KAAK,CAAC,QAAQ,IAAI,aAAa,CAAC;SACvC,MAAM,CAAC,KAAK,CAAC;SACb,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IAEhB,MAAM,QAAQ,GAAG,KAAK,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,aAAa,CAAC;IAClF,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,SAAS,EAAE,GAAG,QAAQ,IAAI,QAAQ,EAAE,CAAC,CAAC;IACpF,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,iBAAiB,CAAC,CAAC;IAE3D,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,aAAa,EAAE,cAAc,UAAU,EAAE,CAAC,CAAC;IAEjE,iCAAiC;IACjC,MAAM,EAAE,CAAC,KAAK,CAAC,SAAS,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAE/C,iBAAiB;IACjB,MAAM,EAAE,CAAC,SAAS,CAAC,UAAU,EAAE,QAAQ,EAAE,OAAO,CAAC,CAAC;IAElD,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,aAAa,EAAE,6BAA6B,CAAC,CAAC;IAEvE,OAAO,EAAE,CAAC;AACZ,CAAC"}
@@ -0,0 +1,7 @@
1
+ import type { PipelineState } from "../core/state.js";
2
+ /**
3
+ * Extracts raw text from office documents (DOCX, PPTX, XLSX) using officeparser,
4
+ * CSV files using csv-parse, and TXT files via direct read.
5
+ */
6
+ export declare function textExtractorNode(state: PipelineState): Promise<Partial<PipelineState>>;
7
+ //# sourceMappingURL=textExtractorNode.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"textExtractorNode.d.ts","sourceRoot":"","sources":["../../src/nodes/textExtractorNode.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAGtD;;;GAGG;AACH,wBAAsB,iBAAiB,CACrC,KAAK,EAAE,aAAa,GACnB,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CA+BjC"}