@virstack/doc-ingest 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. package/README.md +203 -0
  2. package/dist/adapters/aiAdapters.d.ts +25 -0
  3. package/dist/adapters/aiAdapters.d.ts.map +1 -0
  4. package/dist/adapters/aiAdapters.js +73 -0
  5. package/dist/adapters/aiAdapters.js.map +1 -0
  6. package/dist/adapters/vectorStore.d.ts +24 -0
  7. package/dist/adapters/vectorStore.d.ts.map +1 -0
  8. package/dist/adapters/vectorStore.js +22 -0
  9. package/dist/adapters/vectorStore.js.map +1 -0
  10. package/dist/aiAdapters.d.ts +25 -0
  11. package/dist/aiAdapters.d.ts.map +1 -0
  12. package/dist/aiAdapters.js +50 -0
  13. package/dist/aiAdapters.js.map +1 -0
  14. package/dist/assets/logo.png +0 -0
  15. package/dist/batchPipeline.d.ts +52 -0
  16. package/dist/batchPipeline.d.ts.map +1 -0
  17. package/dist/batchPipeline.js +81 -0
  18. package/dist/batchPipeline.js.map +1 -0
  19. package/dist/cli.d.ts +3 -0
  20. package/dist/cli.d.ts.map +1 -0
  21. package/dist/cli.js +217 -0
  22. package/dist/cli.js.map +1 -0
  23. package/dist/config.d.ts +26 -0
  24. package/dist/config.d.ts.map +1 -0
  25. package/dist/config.js +97 -0
  26. package/dist/config.js.map +1 -0
  27. package/dist/core/config.d.ts +26 -0
  28. package/dist/core/config.d.ts.map +1 -0
  29. package/dist/core/config.js +106 -0
  30. package/dist/core/config.js.map +1 -0
  31. package/dist/core/logger.d.ts +31 -0
  32. package/dist/core/logger.d.ts.map +1 -0
  33. package/dist/core/logger.js +42 -0
  34. package/dist/core/logger.js.map +1 -0
  35. package/dist/core/state.d.ts +52 -0
  36. package/dist/core/state.d.ts.map +1 -0
  37. package/dist/core/state.js +27 -0
  38. package/dist/core/state.js.map +1 -0
  39. package/dist/graphs/batchProcessor.d.ts +72 -0
  40. package/dist/graphs/batchProcessor.d.ts.map +1 -0
  41. package/dist/graphs/batchProcessor.js +94 -0
  42. package/dist/graphs/batchProcessor.js.map +1 -0
  43. package/dist/graphs/singleDocument.d.ts +303 -0
  44. package/dist/graphs/singleDocument.d.ts.map +1 -0
  45. package/dist/graphs/singleDocument.js +93 -0
  46. package/dist/graphs/singleDocument.js.map +1 -0
  47. package/dist/index.d.ts +8 -0
  48. package/dist/index.d.ts.map +1 -0
  49. package/dist/index.js +10 -0
  50. package/dist/index.js.map +1 -0
  51. package/dist/logger.d.ts +24 -0
  52. package/dist/logger.d.ts.map +1 -0
  53. package/dist/logger.js +36 -0
  54. package/dist/logger.js.map +1 -0
  55. package/dist/logo.d.ts +2 -0
  56. package/dist/logo.d.ts.map +1 -0
  57. package/dist/logo.js +3 -0
  58. package/dist/logo.js.map +1 -0
  59. package/dist/nodes/fileTypeRouter.d.ts +16 -0
  60. package/dist/nodes/fileTypeRouter.d.ts.map +1 -0
  61. package/dist/nodes/fileTypeRouter.js +72 -0
  62. package/dist/nodes/fileTypeRouter.js.map +1 -0
  63. package/dist/nodes/geminiExtraction.d.ts +19 -0
  64. package/dist/nodes/geminiExtraction.d.ts.map +1 -0
  65. package/dist/nodes/geminiExtraction.js +87 -0
  66. package/dist/nodes/geminiExtraction.js.map +1 -0
  67. package/dist/nodes/libreOfficeToPdf.d.ts +8 -0
  68. package/dist/nodes/libreOfficeToPdf.d.ts.map +1 -0
  69. package/dist/nodes/libreOfficeToPdf.js +61 -0
  70. package/dist/nodes/libreOfficeToPdf.js.map +1 -0
  71. package/dist/nodes/llmExtractionNode.d.ts +19 -0
  72. package/dist/nodes/llmExtractionNode.d.ts.map +1 -0
  73. package/dist/nodes/llmExtractionNode.js +68 -0
  74. package/dist/nodes/llmExtractionNode.js.map +1 -0
  75. package/dist/nodes/markdownChunker.d.ts +8 -0
  76. package/dist/nodes/markdownChunker.d.ts.map +1 -0
  77. package/dist/nodes/markdownChunker.js +24 -0
  78. package/dist/nodes/markdownChunker.js.map +1 -0
  79. package/dist/nodes/markdownMerger.d.ts +9 -0
  80. package/dist/nodes/markdownMerger.d.ts.map +1 -0
  81. package/dist/nodes/markdownMerger.js +33 -0
  82. package/dist/nodes/markdownMerger.js.map +1 -0
  83. package/dist/nodes/markdownNormalizer.d.ts +10 -0
  84. package/dist/nodes/markdownNormalizer.d.ts.map +1 -0
  85. package/dist/nodes/markdownNormalizer.js +46 -0
  86. package/dist/nodes/markdownNormalizer.js.map +1 -0
  87. package/dist/nodes/openrouterEmbedder.d.ts +7 -0
  88. package/dist/nodes/openrouterEmbedder.d.ts.map +1 -0
  89. package/dist/nodes/openrouterEmbedder.js +31 -0
  90. package/dist/nodes/openrouterEmbedder.js.map +1 -0
  91. package/dist/nodes/pdfSplitter.d.ts +7 -0
  92. package/dist/nodes/pdfSplitter.d.ts.map +1 -0
  93. package/dist/nodes/pdfSplitter.js +41 -0
  94. package/dist/nodes/pdfSplitter.js.map +1 -0
  95. package/dist/nodes/saveMarkdown.d.ts +7 -0
  96. package/dist/nodes/saveMarkdown.d.ts.map +1 -0
  97. package/dist/nodes/saveMarkdown.js +28 -0
  98. package/dist/nodes/saveMarkdown.js.map +1 -0
  99. package/dist/nodes/textExtractorNode.d.ts +7 -0
  100. package/dist/nodes/textExtractorNode.d.ts.map +1 -0
  101. package/dist/nodes/textExtractorNode.js +39 -0
  102. package/dist/nodes/textExtractorNode.js.map +1 -0
  103. package/dist/nodes/upstashUpsert.d.ts +7 -0
  104. package/dist/nodes/upstashUpsert.d.ts.map +1 -0
  105. package/dist/nodes/upstashUpsert.js +45 -0
  106. package/dist/nodes/upstashUpsert.js.map +1 -0
  107. package/dist/nodes/vectorEmbedderNode.d.ts +7 -0
  108. package/dist/nodes/vectorEmbedderNode.d.ts.map +1 -0
  109. package/dist/nodes/vectorEmbedderNode.js +23 -0
  110. package/dist/nodes/vectorEmbedderNode.js.map +1 -0
  111. package/dist/nodes/vectorUpsertNode.d.ts +7 -0
  112. package/dist/nodes/vectorUpsertNode.d.ts.map +1 -0
  113. package/dist/nodes/vectorUpsertNode.js +45 -0
  114. package/dist/nodes/vectorUpsertNode.js.map +1 -0
  115. package/dist/pipeline.d.ts +303 -0
  116. package/dist/pipeline.d.ts.map +1 -0
  117. package/dist/pipeline.js +93 -0
  118. package/dist/pipeline.js.map +1 -0
  119. package/dist/state.d.ts +52 -0
  120. package/dist/state.d.ts.map +1 -0
  121. package/dist/state.js +27 -0
  122. package/dist/state.js.map +1 -0
  123. package/dist/vectorStore.d.ts +24 -0
  124. package/dist/vectorStore.d.ts.map +1 -0
  125. package/dist/vectorStore.js +22 -0
  126. package/dist/vectorStore.js.map +1 -0
  127. package/package.json +55 -0
@@ -0,0 +1,87 @@
1
// OpenRouter client + pipeline config from ../config.js; apiLimit wraps each
// API call and requireInit guards against use before initialization.
import { openrouter, pipelineConfig, apiLimit, requireInit } from "../config.js";
import { logger, LogSource } from "../logger.js";
// Fallback system prompt used by geminiExtraction whenever
// pipelineConfig.systemPrompt is unset. This text is sent to the model
// verbatim, so edits here directly change extraction output.
const DEFAULT_SYSTEM_PROMPT = `You are an expert document extraction and formatting AI. Your task is to extract the exact, verbatim content from the provided document and convert it entirely into standard Markdown format.

You must strictly adhere to the following rules:

1. **Absolute Accuracy:** Extract the text exactly as it appears in the source document. Do not summarize, rephrase, omit, or add any text. Maintain the original spelling and punctuation.
2. **Markdown Structure:** - Replicate the document's structure using standard Markdown.
- Use correct heading levels ('#', '##', '###') to match the visual hierarchy of the PDF.
- Preserve text formatting, utilizing '**bold**' for bold text and '*italics*' for italicized text.
- Convert bulleted and numbered lists into their respective Markdown list formats.
- Convert all tabular data into standard Markdown tables. Ensure rows and columns align with the original document.
3. **Image Handling (CRITICAL):** For every image, photograph, chart, graph, or diagram in the PDF, you must insert a Markdown image placeholder.
- The format must be: '![Image Placeholder: <Detailed Description>](image_number)'
- Replace '<Detailed Description>' with a highly descriptive, comprehensive explanation of everything visible in the image. Include colors, subjects, layout, data points (if it's a chart), and transcribe any text that appears within the image itself.
- Example: '![Image Placeholder: A bar chart comparing Q1 and Q2 sales. Q1 shows $50,000 in blue, Q2 shows $75,000 in green. The x-axis is labeled 'Quarters' and the y-axis is labeled 'Revenue in USD'.](image_1)'
4. **Headers and Footers:** Omit repetitive page numbers, document titles in the header, and footers unless they contain crucial footnotes directly referenced in the main text. If footnotes are present, append them to the end of the relevant section or document.
5. **Formatting Artifacts:** Remove arbitrary line breaks caused by PDF page formatting. Stitch sentences back together so they flow naturally in the Markdown output.

Output the final Markdown only. Do not include conversational filler before or after the extracted content.`;
21
+ /**
22
+ * Unified Gemini node for all document extraction flows.
23
+ * Handles both:
24
+ * 1. Base64 PDF chunks via Vision (Parallel Map-Reduce branch)
25
+ * 2. Raw text extracted by textExtractorNode (Text branch)
26
+ */
27
+ export async function geminiExtraction(state) {
28
+ requireInit();
29
+ const isChunkFlow = state.chunk !== undefined && state.index !== undefined && state.totalChunks !== undefined;
30
+ const isTextFlow = !!state.rawText;
31
+ if (!isChunkFlow && !isTextFlow) {
32
+ throw new Error("[geminiExtraction] Neither chunk nor rawText was provided in the state.");
33
+ }
34
+ let userContent;
35
+ if (isChunkFlow) {
36
+ const { chunk: base64, totalChunks, index } = state;
37
+ logger.info(LogSource.GEMINI, `Processing PDF chunk ${index + 1}/${totalChunks} (${((base64.length * 0.75) / 1024).toFixed(0)} KB)`);
38
+ userContent = [
39
+ {
40
+ type: "file",
41
+ file: {
42
+ filename: `chunk_${index + 1}.pdf`,
43
+ file_data: `data:application/pdf;base64,${base64}`,
44
+ },
45
+ },
46
+ {
47
+ type: "text",
48
+ text: `Extract all content from this PDF (chunk ${index + 1} of ${totalChunks}) into clean Markdown.`,
49
+ },
50
+ ];
51
+ }
52
+ else {
53
+ logger.info(LogSource.GEMINI, `Sending ${state.rawText.length} chars to ${pipelineConfig.llmModel}`);
54
+ userContent = `Convert the following extracted document text into clean Markdown:\n\n${state.rawText}`;
55
+ }
56
+ const finalSystemPrompt = pipelineConfig.systemPrompt || DEFAULT_SYSTEM_PROMPT;
57
+ const response = await apiLimit(() => openrouter.chat.completions.create({
58
+ model: pipelineConfig.llmModel,
59
+ messages: [
60
+ { role: "system", content: finalSystemPrompt },
61
+ { role: "user", content: userContent },
62
+ ],
63
+ max_tokens: pipelineConfig.maxTokens,
64
+ temperature: 0,
65
+ }));
66
+ const markdown = response.choices[0]?.message?.content?.trim() ?? "";
67
+ if (isChunkFlow) {
68
+ logger.info(LogSource.GEMINI, `Chunk ${state.index + 1}/${state.totalChunks} extracted (${markdown.length} chars)`);
69
+ return { markdownParts: [markdown] };
70
+ }
71
+ logger.info(LogSource.GEMINI, `Extracted markdown: ${markdown.length} chars`);
72
+ return { markdown };
73
+ }
74
+ /**
75
+ * Conditional router to determine what happens after geminiExtraction.
76
+ * - If from PDF branch, it returns to markdownMerger
77
+ * - If from Text branch, it goes straight to markdownNormalizer
78
+ */
79
+ export function routeAfterGemini(state) {
80
+ // If markdownParts has contents but markdown is empty, we must merge
81
+ if (state.markdownParts && state.markdownParts.length > 0 && !state.markdown) {
82
+ return "markdownMerger";
83
+ }
84
+ // Otherwise, it was the raw text branch which already assigned state.markdown
85
+ return "markdownNormalizer";
86
+ }
87
+ //# sourceMappingURL=geminiExtraction.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"geminiExtraction.js","sourceRoot":"","sources":["../../src/nodes/geminiExtraction.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,cAAc,EAAE,QAAQ,EAAE,WAAW,EAAE,MAAM,cAAc,CAAC;AAEjF,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AAEjD,MAAM,qBAAqB,GAAG;;;;;;;;;;;;;;;;;4GAiB8E,CAAC;AAE7G;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,KAAwF;IAGxF,WAAW,EAAE,CAAC;IAEd,MAAM,WAAW,GAAG,KAAK,CAAC,KAAK,KAAK,SAAS,IAAI,KAAK,CAAC,KAAK,KAAK,SAAS,IAAI,KAAK,CAAC,WAAW,KAAK,SAAS,CAAC;IAC9G,MAAM,UAAU,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC;IAEnC,IAAI,CAAC,WAAW,IAAI,CAAC,UAAU,EAAE,CAAC;QAChC,MAAM,IAAI,KAAK,CAAC,yEAAyE,CAAC,CAAC;IAC7F,CAAC;IAED,IAAI,WAAgB,CAAC;IAErB,IAAI,WAAW,EAAE,CAAC;QAChB,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,WAAW,EAAE,KAAK,EAAE,GAAG,KAAK,CAAC;QACpD,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,wBAAwB,KAAM,GAAG,CAAC,IAAI,WAAW,KAAK,CAAC,CAAC,MAAO,CAAC,MAAM,GAAG,IAAI,CAAC,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;QACvI,WAAW,GAAG;YACZ;gBACE,IAAI,EAAE,MAAa;gBACnB,IAAI,EAAE;oBACJ,QAAQ,EAAE,SAAS,KAAM,GAAG,CAAC,MAAM;oBACnC,SAAS,EAAE,+BAA+B,MAAM,EAAE;iBACnD;aACK;YACR;gBACE,IAAI,EAAE,MAAM;gBACZ,IAAI,EAAE,4CAA4C,KAAM,GAAG,CAAC,OAAO,WAAW,wBAAwB;aACvG;SACF,CAAC;IACJ,CAAC;SAAM,CAAC;QACN,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,WAAW,KAAK,CAAC,OAAQ,CAAC,MAAM,aAAa,cAAc,CAAC,QAAQ,EAAE,CAAC,CAAC;QACtG,WAAW,GAAG,yEAAyE,KAAK,CAAC,OAAO,EAAE,CAAC;IACzG,CAAC;IAED,MAAM,iBAAiB,GAAG,cAAc,CAAC,YAAY,IAAI,qBAAqB,CAAC;IAE/E,MAAM,QAAQ,GAAG,MAAM,QAAQ,CAAC,GAAG,EAAE,CACnC,UAAU,CAAC,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC;QACjC,KAAK,EAAE,cAAc,CAAC,QAAQ;QAC9B,QAAQ,EAAE;YACR,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,iBAAiB,EAAE;YAC9C,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,WAAW,EAAE;SACvC;QACD,UAAU,EAAE,cAAc,CAAC,SAAS;QACpC,WAAW,EAAE,CAAC;KACf,CAAC,CACH,CAAC;IAEF,MAAM,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IAErE,IAAI,WAAW,EAAE,CAAC;QAChB,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,SAAS,KAAK,CAAC,KAAM,GAAG,CAAC,IAAI,KAAK,CAAC,WAAW,eAAe,QAAQ,CAAC,MAAM,SAAS,CAAC,
CAAC;QACrH,OAAO,EAAE,aAAa,EAAE,CAAC,QAAQ,CAAC,EAAE,CAAC;IACvC,CAAC;IAED,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,uBAAuB,QAAQ,CAAC,MAAM,QAAQ,CAAC,CAAC;IAC9E,OAAO,EAAE,QAAQ,EAAE,CAAC;AACtB,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,gBAAgB,CAAC,KAAoB;IACnD,qEAAqE;IACrE,IAAI,KAAK,CAAC,aAAa,IAAI,KAAK,CAAC,aAAa,CAAC,MAAM,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,QAAQ,EAAE,CAAC;QAC7E,OAAO,gBAAgB,CAAC;IAC1B,CAAC;IACD,8EAA8E;IAC9E,OAAO,oBAAoB,CAAC;AAC9B,CAAC"}
@@ -0,0 +1,8 @@
1
import type { PipelineState } from "../core/state.js";
/**
 * Converts the input file to PDF using LibreOffice headless.
 * Updates state.filePath to point to the newly generated PDF.
 * Supported: DOCX, DOC, RTF, ODT, EPUB, PPTX, PPT, ODP
 *
 * @param state - Pipeline state; `filePath` must reference the source document.
 * @returns Partial state update with `filePath` pointing at the generated PDF.
 * @throws Error if LibreOffice cannot be run or no output PDF is produced.
 */
export declare function libreOfficeToPdf(state: PipelineState): Promise<Partial<PipelineState>>;
//# sourceMappingURL=libreOfficeToPdf.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"libreOfficeToPdf.d.ts","sourceRoot":"","sources":["../../src/nodes/libreOfficeToPdf.ts"],"names":[],"mappings":"AAKA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AActD;;;;GAIG;AACH,wBAAsB,gBAAgB,CACpC,KAAK,EAAE,aAAa,GACnB,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CA6CjC"}
@@ -0,0 +1,61 @@
1
+ import { execFile } from "node:child_process";
2
+ import { promisify } from "node:util";
3
+ import path from "node:path";
4
+ import os from "node:os";
5
+ import fs from "node:fs/promises";
6
+ import { logger, LogSource } from "../core/logger.js";
7
// Promise-returning wrapper around child_process.execFile.
const execFileAsync = promisify(execFile);
/**
 * Resolves the LibreOffice soffice binary.
 * Checks SOFFICE_PATH env var first, otherwise relies on system PATH.
 */
function getSofficePath() {
    // An explicit SOFFICE_PATH override wins; otherwise rely on PATH lookup.
    return process.env.SOFFICE_PATH || "soffice";
}
17
+ /**
18
+ * Converts the input file to PDF using LibreOffice headless.
19
+ * Updates state.filePath to point to the newly generated PDF.
20
+ * Supported: DOCX, DOC, RTF, ODT, EPUB, PPTX, PPT, ODP
21
+ */
22
+ export async function libreOfficeToPdf(state) {
23
+ const sofficePath = getSofficePath();
24
+ if (!state.filePath)
25
+ throw new Error("[libreOfficeToPdf] filePath is missing");
26
+ const inputPath = path.resolve(process.cwd(), state.filePath);
27
+ const outputDir = await fs.mkdtemp(path.join(os.tmpdir(), "lo-pdf-"));
28
+ logger.info(LogSource.LIBRE_OFFICE, `Converting: ${path.basename(inputPath)}`);
29
+ logger.info(LogSource.LIBRE_OFFICE, `Using soffice: ${sofficePath}`);
30
+ logger.info(LogSource.LIBRE_OFFICE, `Output dir: ${outputDir}`);
31
+ try {
32
+ await execFileAsync(sofficePath, [
33
+ "--headless",
34
+ "--norestore",
35
+ "--convert-to", "pdf",
36
+ "--outdir", outputDir,
37
+ inputPath,
38
+ ]);
39
+ }
40
+ catch (err) {
41
+ throw new Error(`LibreOffice conversion failed. Is LibreOffice installed?\n` +
42
+ ` Tried: ${sofficePath}\n` +
43
+ ` On macOS: brew install --cask libreoffice\n` +
44
+ ` Set SOFFICE_PATH in .env to override.\n` +
45
+ ` Original error: ${err.message}`);
46
+ }
47
+ // LibreOffice names the output file after the input file with .pdf extension
48
+ const baseName = path.parse(inputPath).name;
49
+ const pdfPath = path.join(outputDir, `${baseName}.pdf`);
50
+ // Verify the file exists
51
+ try {
52
+ await fs.access(pdfPath);
53
+ }
54
+ catch {
55
+ throw new Error(`LibreOffice ran but output PDF not found at: ${pdfPath}. ` +
56
+ `Check LibreOffice installation.`);
57
+ }
58
+ logger.success(LogSource.LIBRE_OFFICE, `Converted to: ${pdfPath}`);
59
+ return { filePath: pdfPath };
60
+ }
61
+ //# sourceMappingURL=libreOfficeToPdf.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"libreOfficeToPdf.js","sourceRoot":"","sources":["../../src/nodes/libreOfficeToPdf.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,oBAAoB,CAAC;AAC9C,OAAO,EAAE,SAAS,EAAE,MAAM,WAAW,CAAC;AACtC,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,EAAE,MAAM,SAAS,CAAC;AACzB,OAAO,EAAE,MAAM,kBAAkB,CAAC;AAElC,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAEtD,MAAM,aAAa,GAAG,SAAS,CAAC,QAAQ,CAAC,CAAC;AAE1C;;;GAGG;AACH,SAAS,cAAc;IACrB,IAAI,OAAO,CAAC,GAAG,CAAC,YAAY;QAAE,OAAO,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC;IAC9D,OAAO,SAAS,CAAC,CAAC,qCAAqC;AACzD,CAAC;AAED;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,KAAoB;IAEpB,MAAM,WAAW,GAAG,cAAc,EAAE,CAAC;IACrC,IAAI,CAAC,KAAK,CAAC,QAAQ;QAAE,MAAM,IAAI,KAAK,CAAC,wCAAwC,CAAC,CAAC;IAC/E,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,KAAK,CAAC,QAAQ,CAAC,CAAC;IAC9D,MAAM,SAAS,GAAG,MAAM,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,MAAM,EAAE,EAAE,SAAS,CAAC,CAAC,CAAC;IAEtE,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,YAAY,EAAE,eAAe,IAAI,CAAC,QAAQ,CAAC,SAAS,CAAC,EAAE,CAAC,CAAC;IAC/E,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,YAAY,EAAE,kBAAkB,WAAW,EAAE,CAAC,CAAC;IACrE,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,YAAY,EAAE,eAAe,SAAS,EAAE,CAAC,CAAC;IAEhE,IAAI,CAAC;QACH,MAAM,aAAa,CAAC,WAAW,EAAE;YAC/B,YAAY;YACZ,aAAa;YACb,cAAc,EAAE,KAAK;YACrB,UAAU,EAAE,SAAS;YACrB,SAAS;SACV,CAAC,CAAC;IACL,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,MAAM,IAAI,KAAK,CACb,4DAA4D;YAC5D,YAAY,WAAW,IAAI;YAC3B,+CAA+C;YAC/C,2CAA2C;YAC3C,qBAAqB,GAAG,CAAC,OAAO,EAAE,CACnC,CAAC;IACJ,CAAC;IAED,6EAA6E;IAC7E,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC;IAC5C,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,GAAG,QAAQ,MAAM,CAAC,CAAC;IAExD,yBAAyB;IACzB,IAAI,CAAC;QACH,MAAM,EAAE,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAC3B,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,KAAK,CACb,gDAAgD,OAAO,IAAI;YAC3D,iCAAiC,CAClC,CAAC;IACJ,CAAC;IAED,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,YAAY,EAAE,iBAAiB,OAAO,EAAE,CAAC,CAAC;IAEnE,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,CAAC;AAC/B,CAAC"}
@@ -0,0 +1,19 @@
1
import type { PipelineState } from "../core/state.js";
/**
 * Unified LLM node for all document extraction flows.
 * Handles both:
 * 1. Base64 PDF chunks via Vision (Parallel Map-Reduce branch)
 * 2. Raw text extracted by textExtractorNode (Text branch)
 *
 * @param state - Pipeline state. Chunk flow requires `chunk` (base64 PDF),
 *   `index`, and `totalChunks` all set; text flow requires `rawText`.
 * @returns `{ markdownParts: [...] }` for the chunk flow, `{ markdown }` for
 *   the text flow.
 * @throws Error when neither `chunk` nor `rawText` is provided.
 */
export declare function llmExtractionNode(state: Partial<PipelineState> & {
    chunk?: string;
    index?: number;
    totalChunks?: number;
}): Promise<Partial<PipelineState>>;
/**
 * Conditional router to determine what happens after llmExtractionNode.
 * - If from PDF branch, it returns to markdownMerger
 * - If from Text branch, it goes straight to markdownNormalizer
 *
 * @returns The name of the next graph node.
 */
export declare function routeAfterLlm(state: PipelineState): string;
//# sourceMappingURL=llmExtractionNode.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"llmExtractionNode.d.ts","sourceRoot":"","sources":["../../src/nodes/llmExtractionNode.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAuBtD;;;;;GAKG;AACH,wBAAsB,iBAAiB,CACrC,KAAK,EAAE,OAAO,CAAC,aAAa,CAAC,GAAG;IAAE,KAAK,CAAC,EAAE,MAAM,CAAC;IAAC,KAAK,CAAC,EAAE,MAAM,CAAC;IAAC,WAAW,CAAC,EAAE,MAAM,CAAA;CAAE,GACvF,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CAuCjC;AAED;;;;GAIG;AACH,wBAAgB,aAAa,CAAC,KAAK,EAAE,aAAa,GAAG,MAAM,CAK1D"}
@@ -0,0 +1,68 @@
1
// Pipeline config from ../core/config.js; apiLimit wraps each adapter call and
// requireInit guards against use before initialization.
import { pipelineConfig, apiLimit, requireInit } from "../core/config.js";
import { logger, LogSource } from "../core/logger.js";
// Fallback system prompt used by llmExtractionNode whenever
// pipelineConfig.systemPrompt is unset. This text is passed to the injected
// LLM adapter verbatim, so edits here directly change extraction output.
const DEFAULT_SYSTEM_PROMPT = `You are an expert document extraction and formatting AI. Your task is to extract the exact, verbatim content from the provided document and convert it entirely into standard Markdown format.

You must strictly adhere to the following rules:

1. **Absolute Accuracy:** Extract the text exactly as it appears in the source document. Do not summarize, rephrase, omit, or add any text. Maintain the original spelling and punctuation.
2. **Markdown Structure:** - Replicate the document's structure using standard Markdown.
- Use correct heading levels ('#', '##', '###') to match the visual hierarchy of the PDF.
- Preserve text formatting, utilizing '**bold**' for bold text and '*italics*' for italicized text.
- Convert bulleted and numbered lists into their respective Markdown list formats.
- Convert all tabular data into standard Markdown tables. Ensure rows and columns align with the original document.
3. **Image Handling (CRITICAL):** For every image, photograph, chart, graph, or diagram in the PDF, you must insert a Markdown image placeholder.
- The format must be: '![Image Placeholder: <Detailed Description>](image_number)'
- Replace '<Detailed Description>' with a highly descriptive, comprehensive explanation of everything visible in the image. Include colors, subjects, layout, data points (if it's a chart), and transcribe any text that appears within the image itself.
- Example: '![Image Placeholder: A bar chart comparing Q1 and Q2 sales. Q1 shows $50,000 in blue, Q2 shows $75,000 in green. The x-axis is labeled 'Quarters' and the y-axis is labeled 'Revenue in USD'.](image_1)'
4. **Headers and Footers:** Omit repetitive page numbers, document titles in the header, and footers unless they contain crucial footnotes directly referenced in the main text. If footnotes are present, append them to the end of the relevant section or document.
5. **Formatting Artifacts:** Remove arbitrary line breaks caused by PDF page formatting. Stitch sentences back together so they flow naturally in the Markdown output.

Output the final Markdown only. Do not include conversational filler before or after the extracted content.`;
21
+ /**
22
+ * Unified LLM node for all document extraction flows.
23
+ * Handles both:
24
+ * 1. Base64 PDF chunks via Vision (Parallel Map-Reduce branch)
25
+ * 2. Raw text extracted by textExtractorNode (Text branch)
26
+ */
27
+ export async function llmExtractionNode(state) {
28
+ requireInit();
29
+ const isChunkFlow = state.chunk !== undefined && state.index !== undefined && state.totalChunks !== undefined;
30
+ const isTextFlow = !!state.rawText;
31
+ if (!isChunkFlow && !isTextFlow) {
32
+ throw new Error("[llmExtractionNode] Neither chunk nor rawText was provided in the state.");
33
+ }
34
+ const finalSystemPrompt = pipelineConfig.systemPrompt || DEFAULT_SYSTEM_PROMPT;
35
+ const promptInput = {
36
+ systemPrompt: finalSystemPrompt,
37
+ userText: isChunkFlow
38
+ ? `Extract all content from this PDF (chunk ${state.index + 1} of ${state.totalChunks}) into clean Markdown.`
39
+ : `Convert the following extracted document text into clean Markdown:\n\n${state.rawText}`,
40
+ base64PdfChunk: isChunkFlow ? state.chunk : undefined
41
+ };
42
+ if (isChunkFlow) {
43
+ logger.info(LogSource.LLM_EXTRACTION, `Processing PDF chunk ${state.index + 1}/${state.totalChunks} (${((state.chunk.length * 0.75) / 1024).toFixed(0)} KB)`);
44
+ }
45
+ else {
46
+ logger.info(LogSource.LLM_EXTRACTION, `Sending ${state.rawText.length} chars to generic LLM Adapter`);
47
+ }
48
+ // Call the injected LLM adapter wrapped in your rate limiter!
49
+ const markdown = await apiLimit(() => pipelineConfig.llm.generateMarkdown(promptInput));
50
+ if (isChunkFlow) {
51
+ logger.info(LogSource.LLM_EXTRACTION, `Chunk ${state.index + 1}/${state.totalChunks} extracted (${markdown.length} chars)`);
52
+ return { markdownParts: [markdown] };
53
+ }
54
+ logger.info(LogSource.LLM_EXTRACTION, `Extracted markdown: ${markdown.length} chars`);
55
+ return { markdown };
56
+ }
57
+ /**
58
+ * Conditional router to determine what happens after llmExtractionNode.
59
+ * - If from PDF branch, it returns to markdownMerger
60
+ * - If from Text branch, it goes straight to markdownNormalizer
61
+ */
62
+ export function routeAfterLlm(state) {
63
+ if (state.markdownParts && state.markdownParts.length > 0 && !state.markdown) {
64
+ return "markdownMerger";
65
+ }
66
+ return "markdownNormalizer";
67
+ }
68
+ //# sourceMappingURL=llmExtractionNode.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"llmExtractionNode.js","sourceRoot":"","sources":["../../src/nodes/llmExtractionNode.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,cAAc,EAAE,QAAQ,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAE1E,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAGtD,MAAM,qBAAqB,GAAG;;;;;;;;;;;;;;;;;4GAiB8E,CAAC;AAE7G;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,KAAwF;IAGxF,WAAW,EAAE,CAAC;IAEd,MAAM,WAAW,GAAG,KAAK,CAAC,KAAK,KAAK,SAAS,IAAI,KAAK,CAAC,KAAK,KAAK,SAAS,IAAI,KAAK,CAAC,WAAW,KAAK,SAAS,CAAC;IAC9G,MAAM,UAAU,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC;IAEnC,IAAI,CAAC,WAAW,IAAI,CAAC,UAAU,EAAE,CAAC;QAChC,MAAM,IAAI,KAAK,CAAC,0EAA0E,CAAC,CAAC;IAC9F,CAAC;IAED,MAAM,iBAAiB,GAAG,cAAc,CAAC,YAAY,IAAI,qBAAqB,CAAC;IAE/E,MAAM,WAAW,GAAa;QAC5B,YAAY,EAAE,iBAAiB;QAC/B,QAAQ,EAAE,WAAW;YACnB,CAAC,CAAC,4CAA4C,KAAK,CAAC,KAAM,GAAG,CAAC,OAAO,KAAK,CAAC,WAAW,wBAAwB;YAC9G,CAAC,CAAC,yEAAyE,KAAK,CAAC,OAAO,EAAE;QAC5F,cAAc,EAAE,WAAW,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;KACtD,CAAC;IAEF,IAAI,WAAW,EAAE,CAAC;QAChB,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,wBAAwB,KAAK,CAAC,KAAM,GAAG,CAAC,IAAI,KAAK,CAAC,WAAW,KAAK,CAAC,CAAC,KAAK,CAAC,KAAM,CAAC,MAAM,GAAG,IAAI,CAAC,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;IAClK,CAAC;SAAM,CAAC;QACN,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,WAAW,KAAK,CAAC,OAAQ,CAAC,MAAM,+BAA+B,CAAC,CAAC;IACzG,CAAC;IAED,8DAA8D;IAC9D,MAAM,QAAQ,GAAG,MAAM,QAAQ,CAAC,GAAG,EAAE,CACnC,cAAc,CAAC,GAAG,CAAC,gBAAgB,CAAC,WAAW,CAAC,CACjD,CAAC;IAEF,IAAI,WAAW,EAAE,CAAC;QAChB,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,SAAS,KAAK,CAAC,KAAM,GAAG,CAAC,IAAI,KAAK,CAAC,WAAW,eAAe,QAAQ,CAAC,MAAM,SAAS,CAAC,CAAC;QAC7H,OAAO,EAAE,aAAa,EAAE,CAAC,QAAQ,CAAC,EAAE,CAAC;IACvC,CAAC;IAED,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,uBAAuB,QAAQ,CAAC,MAAM,QAAQ,CAAC,CAAC;IACtF,OAAO,EAAE,QAAQ,EAAE,CAAC;AACtB,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,aAAa,CAAC,KAAoB;IAChD,IAAI,KAAK,CAAC,aAAa,IAAI,KAAK,CAAC,aAAa,CAAC,MAAM,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,QAAQ,EAAE,CAAC;QAC7E,OAAO,gBAAgB,CAAC;IAC1B,CAAC;IACD,OAAO,oBAAoB,CAAC;AAC9B,CAAC"}
@@ -0,0 +1,8 @@
1
import type { PipelineState } from "../core/state.js";
/**
 * Splits markdown text into semantic chunks using LangChain's RecursiveCharacterTextSplitter.
 * This splitter tries to split on paragraphs, then sentences, then words to keep
 * related content together while respecting the chunk size.
 *
 * @param state - Pipeline state; `markdown` must be set.
 * @returns Partial state update with `textChunks` (trimmed chunk strings).
 */
export declare function markdownChunker(state: PipelineState): Promise<Partial<PipelineState>>;
//# sourceMappingURL=markdownChunker.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"markdownChunker.d.ts","sourceRoot":"","sources":["../../src/nodes/markdownChunker.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAGtD;;;;GAIG;AACH,wBAAsB,eAAe,CACnC,KAAK,EAAE,aAAa,GACnB,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CAmBjC"}
@@ -0,0 +1,24 @@
1
+ import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
2
+ import { pipelineConfig, requireInit } from "../core/config.js";
3
+ import { logger, LogSource } from "../core/logger.js";
4
+ /**
5
+ * Splits markdown text into semantic chunks using LangChain's RecursiveCharacterTextSplitter.
6
+ * This splitter tries to split on paragraphs, then sentences, then words to keep
7
+ * related content together while respecting the chunk size.
8
+ */
9
+ export async function markdownChunker(state) {
10
+ requireInit();
11
+ const { markdown } = state;
12
+ logger.info(LogSource.MARKDOWN_CHUNKER, `Input: ${markdown.length} chars`);
13
+ const splitter = new RecursiveCharacterTextSplitter({
14
+ chunkSize: pipelineConfig.chunkSize,
15
+ chunkOverlap: pipelineConfig.chunkOverlap,
16
+ // Optimal separators for Markdown
17
+ separators: ["\n\n", "\n", " ", ""],
18
+ });
19
+ const docs = await splitter.createDocuments([markdown]);
20
+ const textChunks = docs.map((doc) => doc.pageContent.trim());
21
+ logger.info(LogSource.MARKDOWN_CHUNKER, `Created ${textChunks.length} chunks`);
22
+ return { textChunks };
23
+ }
24
+ //# sourceMappingURL=markdownChunker.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"markdownChunker.js","sourceRoot":"","sources":["../../src/nodes/markdownChunker.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,8BAA8B,EAAE,MAAM,0BAA0B,CAAC;AAC1E,OAAO,EAAE,cAAc,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAEhE,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAEtD;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,KAAoB;IAEpB,WAAW,EAAE,CAAC;IACd,MAAM,EAAE,QAAQ,EAAE,GAAG,KAAK,CAAC;IAE3B,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,gBAAgB,EAAE,UAAU,QAAQ,CAAC,MAAM,QAAQ,CAAC,CAAC;IAE3E,MAAM,QAAQ,GAAG,IAAI,8BAA8B,CAAC;QAClD,SAAS,EAAE,cAAc,CAAC,SAAS;QACnC,YAAY,EAAE,cAAc,CAAC,YAAY;QACzC,kCAAkC;QAClC,UAAU,EAAE,CAAC,MAAM,EAAE,IAAI,EAAE,GAAG,EAAE,EAAE,CAAC;KACpC,CAAC,CAAC;IAEH,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,eAAe,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC;IACxD,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC,CAAC;IAE7D,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,gBAAgB,EAAE,WAAW,UAAU,CAAC,MAAM,SAAS,CAAC,CAAC;IAE/E,OAAO,EAAE,UAAU,EAAE,CAAC;AACxB,CAAC"}
@@ -0,0 +1,9 @@
1
import type { PipelineState } from "../core/state.js";
/**
 * Joins markdownParts from the PDF branch into a single markdown string.
 * Performs basic boundary-seam fixes:
 * - Removes duplicate headings at chunk boundaries
 * - Trims excessive whitespace between chunks
 *
 * @param state - Pipeline state; `markdownParts` must hold the per-chunk output.
 * @returns Partial state update with the merged `markdown`.
 */
export declare function markdownMerger(state: PipelineState): Promise<Partial<PipelineState>>;
//# sourceMappingURL=markdownMerger.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"markdownMerger.d.ts","sourceRoot":"","sources":["../../src/nodes/markdownMerger.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAGtD;;;;;GAKG;AACH,wBAAsB,cAAc,CAClC,KAAK,EAAE,aAAa,GACnB,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CAoCjC"}
@@ -0,0 +1,33 @@
1
+ import { logger, LogSource } from "../core/logger.js";
2
+ /**
3
+ * Joins markdownParts from the PDF branch into a single markdown string.
4
+ * Performs basic boundary-seam fixes:
5
+ * - Removes duplicate headings at chunk boundaries
6
+ * - Trims excessive whitespace between chunks
7
+ */
8
+ export async function markdownMerger(state) {
9
+ const { markdownParts } = state;
10
+ logger.info(LogSource.MARKDOWN_MERGER, `Merging ${markdownParts.length} markdown part(s)`);
11
+ let merged = "";
12
+ for (let i = 0; i < markdownParts.length; i++) {
13
+ let part = markdownParts[i].trim();
14
+ if (i > 0) {
15
+ // Check if the previous chunk ended with a heading that this chunk starts with
16
+ const prevLines = merged.trimEnd().split("\n");
17
+ const lastPrevLine = prevLines[prevLines.length - 1]?.trim() ?? "";
18
+ const firstLine = part.split("\n")[0]?.trim() ?? "";
19
+ if (lastPrevLine === firstLine &&
20
+ firstLine.startsWith("#")) {
21
+ // Duplicate heading at boundary — skip it in current chunk
22
+ part = part.split("\n").slice(1).join("\n").trim();
23
+ }
24
+ merged += "\n\n";
25
+ }
26
+ merged += part;
27
+ }
28
+ // Collapse triple+ newlines into double
29
+ merged = merged.replace(/\n{3,}/g, "\n\n");
30
+ logger.info(LogSource.MARKDOWN_MERGER, `Merged markdown: ${merged.length} chars`);
31
+ return { markdown: merged };
32
+ }
33
+ //# sourceMappingURL=markdownMerger.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"markdownMerger.js","sourceRoot":"","sources":["../../src/nodes/markdownMerger.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAEtD;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,KAAoB;IAEpB,MAAM,EAAE,aAAa,EAAE,GAAG,KAAK,CAAC;IAEhC,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,eAAe,EAAE,WAAW,aAAa,CAAC,MAAM,mBAAmB,CAAC,CAAC;IAE3F,IAAI,MAAM,GAAG,EAAE,CAAC;IAEhB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,aAAa,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC9C,IAAI,IAAI,GAAG,aAAa,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QAEnC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;YACV,+EAA+E;YAC/E,MAAM,SAAS,GAAG,MAAM,CAAC,OAAO,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;YAC/C,MAAM,YAAY,GAAG,SAAS,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;YACnE,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;YAEpD,IACE,YAAY,KAAK,SAAS;gBAC1B,SAAS,CAAC,UAAU,CAAC,GAAG,CAAC,EACzB,CAAC;gBACD,2DAA2D;gBAC3D,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;YACrD,CAAC;YAED,MAAM,IAAI,MAAM,CAAC;QACnB,CAAC;QAED,MAAM,IAAI,IAAI,CAAC;IACjB,CAAC;IAED,wCAAwC;IACxC,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IAE3C,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,eAAe,EAAE,oBAAoB,MAAM,CAAC,MAAM,QAAQ,CAAC,CAAC;IAElF,OAAO,EAAE,QAAQ,EAAE,MAAM,EAAE,CAAC;AAC9B,CAAC"}
@@ -0,0 +1,10 @@
1
+ import type { PipelineState } from "../core/state.js";
2
+ /**
3
+ * Normalizes the merged/extracted markdown:
4
+ * - Strips residual HTML tags
5
+ * - Fixes broken Markdown table alignment
6
+ * - Deduplicates repeated headers/footers
7
+ * - Trims excessive whitespace
8
+ */
9
+ export declare function markdownNormalizer(state: PipelineState): Promise<Partial<PipelineState>>;
10
+ //# sourceMappingURL=markdownNormalizer.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"markdownNormalizer.d.ts","sourceRoot":"","sources":["../../src/nodes/markdownNormalizer.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAGtD;;;;;;GAMG;AACH,wBAAsB,kBAAkB,CACtC,KAAK,EAAE,aAAa,GACnB,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CA8CjC"}
@@ -0,0 +1,46 @@
1
+ import { logger, LogSource } from "../core/logger.js";
2
/**
 * Normalizes the merged/extracted markdown:
 * - Strips residual HTML tags
 * - Fixes broken Markdown table alignment
 * - Deduplicates repeated headers/footers
 * - Trims excessive whitespace
 *
 * @param state Pipeline state; reads `state.markdown`.
 * @returns Partial state carrying the normalized `markdown` string.
 */
export async function markdownNormalizer(state) {
    let md = state.markdown;
    logger.info(LogSource.MARKDOWN_NORMALIZER, `Input: ${md.length} chars`);
    // 1. Strip HTML tags (but keep content)
    md = md.replace(/<\/?[^>]+(>|$)/g, "");
    // 2. Fix table alignment: trim whitespace inside each cell and rebuild the row.
    // Interior empty cells are preserved — dropping them (the old filter(Boolean)
    // behavior) shifted columns and misaligned rows against the header. Only the
    // empty strings produced by splitting on the outermost pipes are removed.
    md = md.replace(/^\|(.+)\|$/gm, (match) => {
        const cells = match
            .split("|")
            .slice(1, -1) // the leading/trailing "|" always yield empty edge entries
            .map((cell) => cell.trim());
        return `| ${cells.join(" | ")} |`;
    });
    // 3. Deduplicate consecutive identical headings
    const lines = md.split("\n");
    const deduped = [];
    for (const line of lines) {
        const prev = deduped[deduped.length - 1];
        if (line.startsWith("#") && prev === line) {
            continue; // skip duplicate heading
        }
        deduped.push(line);
    }
    md = deduped.join("\n");
    // 4. Collapse excessive blank lines
    md = md.replace(/\n{3,}/g, "\n\n");
    // 5. Trim
    md = md.trim();
    logger.info(LogSource.MARKDOWN_NORMALIZER, `Output: ${md.length} chars`);
    return { markdown: md };
}
46
+ //# sourceMappingURL=markdownNormalizer.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"markdownNormalizer.js","sourceRoot":"","sources":["../../src/nodes/markdownNormalizer.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAEtD;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,kBAAkB,CACtC,KAAoB;IAEpB,IAAI,EAAE,GAAG,KAAK,CAAC,QAAQ,CAAC;IAExB,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,mBAAmB,EAAE,UAAU,EAAE,CAAC,MAAM,QAAQ,CAAC,CAAC;IAExE,wCAAwC;IACxC,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,iBAAiB,EAAE,EAAE,CAAC,CAAC;IAEvC,oDAAoD;IACpD,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,cAAc,EAAE,CAAC,KAAK,EAAE,EAAE;QACxC,gDAAgD;QAChD,OAAO,KAAK;aACT,KAAK,CAAC,GAAG,CAAC;aACV,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;aAC1B,MAAM,CAAC,OAAO,CAAC;aACf,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,IAAI,GAAG,CAAC;aAC1B,IAAI,CAAC,GAAG,CAAC;aACT,OAAO,CAAC,GAAG,EAAE,GAAG,CAAC;aACjB,OAAO,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC;IACvB,CAAC,CAAC,CAAC;IAEH,gDAAgD;IAChD,MAAM,KAAK,GAAG,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAC7B,MAAM,OAAO,GAAa,EAAE,CAAC;IAC7B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QACtB,MAAM,IAAI,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QACzC,IACE,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC;YACpB,IAAI,KAAK,IAAI,EACb,CAAC;YACD,SAAS,CAAC,yBAAyB;QACrC,CAAC;QACD,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACrB,CAAC;IACD,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAExB,oCAAoC;IACpC,EAAE,GAAG,EAAE,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IAEnC,UAAU;IACV,EAAE,GAAG,EAAE,CAAC,IAAI,EAAE,CAAC;IAEf,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,mBAAmB,EAAE,WAAW,EAAE,CAAC,MAAM,QAAQ,CAAC,CAAC;IAEzE,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,CAAC;AAC1B,CAAC"}
@@ -0,0 +1,7 @@
1
+ import type { PipelineState } from "../state.js";
2
+ /**
3
+ * Embeds all textChunks using the embedding model configured in pipelineConfig.embeddingModel.
4
+ * Processes in batches of BATCH_SIZE to stay within API limits.
5
+ */
6
+ export declare function openrouterEmbedder(state: PipelineState): Promise<Partial<PipelineState>>;
7
+ //# sourceMappingURL=openrouterEmbedder.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"openrouterEmbedder.d.ts","sourceRoot":"","sources":["../../src/nodes/openrouterEmbedder.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AAMjD;;;GAGG;AACH,wBAAsB,kBAAkB,CACtC,KAAK,EAAE,aAAa,GACnB,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CA6BjC"}
@@ -0,0 +1,31 @@
1
+ import { openrouter, pipelineConfig, requireInit } from "../config.js";
2
+ import { logger, LogSource } from "../logger.js";
3
+ /** Maximum chunks to embed in a single OpenAI API call */
4
+ const BATCH_SIZE = 50;
5
/**
 * Embeds all textChunks with the embedding model configured in
 * pipelineConfig.embeddingModel, requesting 1536-dimensional vectors.
 * Processes in batches of BATCH_SIZE to stay within API limits.
 */
export async function openrouterEmbedder(state) {
    requireInit();
    const { textChunks } = state;
    logger.info(LogSource.OPENROUTER_EMBEDDER, `Embedding ${textChunks.length} chunks with ${pipelineConfig.embeddingModel}`);
    const vectors = [];
    for (let offset = 0; offset < textChunks.length; offset += BATCH_SIZE) {
        const slice = textChunks.slice(offset, offset + BATCH_SIZE);
        logger.info(LogSource.OPENROUTER_EMBEDDER, `Batch ${Math.floor(offset / BATCH_SIZE) + 1}: ${slice.length} chunk(s)`);
        // NOTE: 'dimensions' may be missing from the basic OpenAI typings in
        // some SDK versions, hence the loose typing of this request object.
        const response = await openrouter.embeddings.create({
            model: pipelineConfig.embeddingModel,
            input: slice,
            dimensions: 1536,
        });
        // Sort by `index` so each embedding lines up with its source chunk.
        response.data
            .sort((a, b) => a.index - b.index)
            .forEach((entry) => vectors.push(entry.embedding));
    }
    logger.info(LogSource.OPENROUTER_EMBEDDER, `Generated ${vectors.length} vectors (${vectors[0]?.length ?? 0}d)`);
    return { vectors };
}
31
+ //# sourceMappingURL=openrouterEmbedder.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"openrouterEmbedder.js","sourceRoot":"","sources":["../../src/nodes/openrouterEmbedder.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,cAAc,EAAE,WAAW,EAAE,MAAM,cAAc,CAAC;AAEvE,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AAEjD,0DAA0D;AAC1D,MAAM,UAAU,GAAG,EAAE,CAAC;AAEtB;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,kBAAkB,CACtC,KAAoB;IAEpB,WAAW,EAAE,CAAC;IACd,MAAM,EAAE,UAAU,EAAE,GAAG,KAAK,CAAC;IAE7B,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,mBAAmB,EAAE,aAAa,UAAU,CAAC,MAAM,gBAAgB,cAAc,CAAC,cAAc,EAAE,CAAC,CAAC;IAE1H,MAAM,UAAU,GAAe,EAAE,CAAC;IAElC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,IAAI,UAAU,EAAE,CAAC;QACvD,MAAM,KAAK,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,CAAC;QAElD,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,mBAAmB,EAAE,SAAS,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,UAAU,CAAC,GAAG,CAAC,KAAK,KAAK,CAAC,MAAM,WAAW,CAAC,CAAC;QAEhH,MAAM,QAAQ,GAAG,MAAM,UAAU,CAAC,UAAU,CAAC,MAAM,CAAC;YAClD,KAAK,EAAE,cAAc,CAAC,cAAc;YACpC,KAAK,EAAE,KAAK;YACZ,UAAU,EAAE,IAAI;SACV,CAAC,CAAC,CAAC,yFAAyF;QAEpG,2CAA2C;QAC3C,MAAM,MAAM,GAAG,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAM,EAAE,CAAM,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;QACzE,KAAK,MAAM,IAAI,IAAI,MAAM,EAAE,CAAC;YAC1B,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;QAClC,CAAC;IACH,CAAC;IAED,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,mBAAmB,EAAE,aAAa,UAAU,CAAC,MAAM,aAAa,UAAU,CAAC,CAAC,CAAC,EAAE,MAAM,IAAI,CAAC,IAAI,CAAC,CAAC;IAEtH,OAAO,EAAE,OAAO,EAAE,UAAU,EAAE,CAAC;AACjC,CAAC"}
@@ -0,0 +1,7 @@
1
+ import type { PipelineState } from "../core/state.js";
2
+ /**
3
+ * Splits a PDF into sub-documents of PDF_PAGES_PER_CHUNK pages each.
4
+ * Each sub-document is serialised back to a Buffer for downstream Gemini processing.
5
+ */
6
+ export declare function pdfSplitter(state: PipelineState): Promise<Partial<PipelineState>>;
7
+ //# sourceMappingURL=pdfSplitter.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pdfSplitter.d.ts","sourceRoot":"","sources":["../../src/nodes/pdfSplitter.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAGtD;;;GAGG;AACH,wBAAsB,WAAW,CAC/B,KAAK,EAAE,aAAa,GACnB,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CAwCjC"}
@@ -0,0 +1,41 @@
1
+ import fs from "node:fs/promises";
2
+ import path from "node:path";
3
+ import { PDFDocument } from "pdf-lib";
4
+ import { pipelineConfig, requireInit } from "../core/config.js";
5
+ import { logger, LogSource } from "../core/logger.js";
6
/**
 * Splits a PDF into sub-documents of PDF_PAGES_PER_CHUNK pages each.
 * Each sub-document is serialised to a base64 string for downstream Gemini processing.
 */
export async function pdfSplitter(state) {
    requireInit();
    if (!state.filePath)
        throw new Error("[pdfSplitter] filePath is missing");
    const fullPath = path.resolve(process.cwd(), state.filePath);
    logger.info(LogSource.PDF_SPLITTER, `Reading file at: ${fullPath}`);
    let fileBuffer;
    try {
        fileBuffer = await fs.readFile(fullPath);
    }
    catch (err) {
        throw new Error(`Failed to read file at ${fullPath}: ${err.message}`);
    }
    const sourceDoc = await PDFDocument.load(fileBuffer);
    const pageCount = sourceDoc.getPageCount();
    logger.info(LogSource.PDF_SPLITTER, `Total pages: ${pageCount}`);
    logger.info(LogSource.PDF_SPLITTER, `Splitting into chunks of ${pipelineConfig.pdfPagesPerChunk} pages`);
    const encoded = [];
    for (let first = 0; first < pageCount; first += pipelineConfig.pdfPagesPerChunk) {
        const last = Math.min(first + pipelineConfig.pdfPagesPerChunk, pageCount);
        // Copy pages [first, last) from the source into a fresh document.
        const indices = Array.from({ length: last - first }, (_, k) => first + k);
        const subDoc = await PDFDocument.create();
        const pages = await subDoc.copyPages(sourceDoc, indices);
        pages.forEach((page) => subDoc.addPage(page));
        const bytes = await subDoc.save();
        encoded.push(Buffer.from(bytes).toString("base64"));
    }
    logger.info(LogSource.PDF_SPLITTER, `Created ${encoded.length} PDF chunk(s)`);
    return { pdfChunks: encoded };
}
41
+ //# sourceMappingURL=pdfSplitter.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pdfSplitter.js","sourceRoot":"","sources":["../../src/nodes/pdfSplitter.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,kBAAkB,CAAC;AAClC,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,EAAE,WAAW,EAAE,MAAM,SAAS,CAAC;AACtC,OAAO,EAAE,cAAc,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAEhE,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAEtD;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,KAAoB;IAEpB,WAAW,EAAE,CAAC;IACd,IAAI,CAAC,KAAK,CAAC,QAAQ;QAAE,MAAM,IAAI,KAAK,CAAC,mCAAmC,CAAC,CAAC;IAC1E,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,KAAK,CAAC,QAAQ,CAAC,CAAC;IAC7D,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,YAAY,EAAE,oBAAoB,QAAQ,EAAE,CAAC,CAAC;IAEpE,IAAI,UAAU,CAAC;IACf,IAAI,CAAC;QACH,UAAU,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAC3C,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,MAAM,IAAI,KAAK,CAAC,0BAA0B,QAAQ,KAAK,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC;IACxE,CAAC;IACD,MAAM,MAAM,GAAG,MAAM,WAAW,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;IAClD,MAAM,UAAU,GAAG,MAAM,CAAC,YAAY,EAAE,CAAC;IAEzC,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,YAAY,EAAE,gBAAgB,UAAU,EAAE,CAAC,CAAC;IAClE,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,YAAY,EAAE,4BAA4B,cAAc,CAAC,gBAAgB,QAAQ,CAAC,CAAC;IAEzG,MAAM,MAAM,GAAa,EAAE,CAAC;IAE5B,KAAK,IAAI,KAAK,GAAG,CAAC,EAAE,KAAK,GAAG,UAAU,EAAE,KAAK,IAAI,cAAc,CAAC,gBAAgB,EAAE,CAAC;QACjF,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,GAAG,cAAc,CAAC,gBAAgB,EAAE,UAAU,CAAC,CAAC;QAC1E,MAAM,MAAM,GAAG,MAAM,WAAW,CAAC,MAAM,EAAE,CAAC;QAE1C,MAAM,WAAW,GAAG,MAAM,MAAM,CAAC,SAAS,CACxC,MAAM,EACN,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,GAAG,GAAG,KAAK,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,KAAK,GAAG,CAAC,CAAC,CACzD,CAAC;QAEF,KAAK,MAAM,IAAI,IAAI,WAAW,EAAE,CAAC;YAC/B,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;QACvB,CAAC;QAED,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,IAAI,EAAE,CAAC;QACrC,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC,CAAC;IACxD,CAAC;IAED,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,YAAY,EAAE,WAAW,MAAM,CAAC,MAAM,eAAe,CAAC,CAAC;IAE7E,OAAO,EAAE,SAAS,EAAE,MAAM,EAAE,CAAC;AAC/B,CAAC"}
@@ -0,0 +1,7 @@
1
+ import type { PipelineState } from "../core/state.js";
2
+ /**
3
+ * Saves the final normalized markdown to a local file.
4
+ * Creates a folder named after the document (with a unique hash) in the 'outputs' directory.
5
+ */
6
+ export declare function saveMarkdown(state: PipelineState): Promise<Partial<PipelineState>>;
7
+ //# sourceMappingURL=saveMarkdown.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"saveMarkdown.d.ts","sourceRoot":"","sources":["../../src/nodes/saveMarkdown.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAGtD;;;GAGG;AACH,wBAAsB,YAAY,CAChC,KAAK,EAAE,aAAa,GACnB,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CAyBjC"}
@@ -0,0 +1,28 @@
1
+ import fs from "node:fs/promises";
2
+ import path from "node:path";
3
+ import crypto from "node:crypto";
4
+ import { logger, LogSource } from "../core/logger.js";
5
/**
 * Saves the final normalized markdown to a local file.
 * Creates a folder named after the document (with a unique hash) in the 'outputs' directory.
 *
 * @param state Pipeline state; reads `filePath` (may be unset for pasted text) and `markdown`.
 * @returns Empty partial state — this node only performs a filesystem side effect.
 */
export async function saveMarkdown(state) {
    const { filePath, markdown } = state;
    // Short md5 prefix used purely as a stable folder-name disambiguator
    // (non-cryptographic use): same input path → same output folder.
    const fileHash = crypto
        .createHash("md5")
        .update(filePath || "pasted_text")
        .digest("hex")
        .slice(0, 16);
    const baseName = filePath ? path.parse(filePath).name : "pasted_text";
    const outputDir = path.resolve(process.cwd(), "outputs", `${baseName}_${fileHash}`);
    const outputPath = path.join(outputDir, "full_content.md");
    logger.info(LogSource.SAVE_MARKDOWN, `Saving to: ${outputPath}`);
    // Create directory (and parents)
    await fs.mkdir(outputDir, { recursive: true });
    // Write markdown
    await fs.writeFile(outputPath, markdown, "utf-8");
    logger.success(LogSource.SAVE_MARKDOWN, `Markdown saved successfully`);
    return {};
}
28
+ //# sourceMappingURL=saveMarkdown.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"saveMarkdown.js","sourceRoot":"","sources":["../../src/nodes/saveMarkdown.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,kBAAkB,CAAC;AAClC,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,MAAM,MAAM,aAAa,CAAC;AAEjC,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAEtD;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,YAAY,CAChC,KAAoB;IAEpB,MAAM,EAAE,QAAQ,EAAE,QAAQ,EAAE,GAAG,KAAK,CAAC;IAErC,6CAA6C;IAC7C,MAAM,QAAQ,GAAG,MAAM;SACpB,UAAU,CAAC,KAAK,CAAC;SACjB,MAAM,CAAC,KAAK,CAAC,QAAQ,IAAI,aAAa,CAAC;SACvC,MAAM,CAAC,KAAK,CAAC;SACb,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IAEhB,MAAM,QAAQ,GAAG,KAAK,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,aAAa,CAAC;IAClF,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,SAAS,EAAE,GAAG,QAAQ,IAAI,QAAQ,EAAE,CAAC,CAAC;IACpF,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,iBAAiB,CAAC,CAAC;IAE3D,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,aAAa,EAAE,cAAc,UAAU,EAAE,CAAC,CAAC;IAEjE,iCAAiC;IACjC,MAAM,EAAE,CAAC,KAAK,CAAC,SAAS,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAE/C,iBAAiB;IACjB,MAAM,EAAE,CAAC,SAAS,CAAC,UAAU,EAAE,QAAQ,EAAE,OAAO,CAAC,CAAC;IAElD,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,aAAa,EAAE,6BAA6B,CAAC,CAAC;IAEvE,OAAO,EAAE,CAAC;AACZ,CAAC"}
@@ -0,0 +1,7 @@
1
+ import type { PipelineState } from "../core/state.js";
2
+ /**
3
+ * Extracts raw text from office documents (DOCX, PPTX, XLSX) using officeparser,
4
+ * CSV files using csv-parse, and TXT files via direct read.
5
+ */
6
+ export declare function textExtractorNode(state: PipelineState): Promise<Partial<PipelineState>>;
7
+ //# sourceMappingURL=textExtractorNode.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"textExtractorNode.d.ts","sourceRoot":"","sources":["../../src/nodes/textExtractorNode.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAGtD;;;GAGG;AACH,wBAAsB,iBAAiB,CACrC,KAAK,EAAE,aAAa,GACnB,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CA+BjC"}