@virstack/doc-ingest 1.0.0
- package/README.md +203 -0
- package/dist/adapters/aiAdapters.d.ts +25 -0
- package/dist/adapters/aiAdapters.d.ts.map +1 -0
- package/dist/adapters/aiAdapters.js +73 -0
- package/dist/adapters/aiAdapters.js.map +1 -0
- package/dist/adapters/vectorStore.d.ts +24 -0
- package/dist/adapters/vectorStore.d.ts.map +1 -0
- package/dist/adapters/vectorStore.js +22 -0
- package/dist/adapters/vectorStore.js.map +1 -0
- package/dist/aiAdapters.d.ts +25 -0
- package/dist/aiAdapters.d.ts.map +1 -0
- package/dist/aiAdapters.js +50 -0
- package/dist/aiAdapters.js.map +1 -0
- package/dist/assets/logo.png +0 -0
- package/dist/batchPipeline.d.ts +52 -0
- package/dist/batchPipeline.d.ts.map +1 -0
- package/dist/batchPipeline.js +81 -0
- package/dist/batchPipeline.js.map +1 -0
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +217 -0
- package/dist/cli.js.map +1 -0
- package/dist/config.d.ts +26 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +97 -0
- package/dist/config.js.map +1 -0
- package/dist/core/config.d.ts +26 -0
- package/dist/core/config.d.ts.map +1 -0
- package/dist/core/config.js +106 -0
- package/dist/core/config.js.map +1 -0
- package/dist/core/logger.d.ts +31 -0
- package/dist/core/logger.d.ts.map +1 -0
- package/dist/core/logger.js +42 -0
- package/dist/core/logger.js.map +1 -0
- package/dist/core/state.d.ts +52 -0
- package/dist/core/state.d.ts.map +1 -0
- package/dist/core/state.js +27 -0
- package/dist/core/state.js.map +1 -0
- package/dist/graphs/batchProcessor.d.ts +72 -0
- package/dist/graphs/batchProcessor.d.ts.map +1 -0
- package/dist/graphs/batchProcessor.js +94 -0
- package/dist/graphs/batchProcessor.js.map +1 -0
- package/dist/graphs/singleDocument.d.ts +303 -0
- package/dist/graphs/singleDocument.d.ts.map +1 -0
- package/dist/graphs/singleDocument.js +93 -0
- package/dist/graphs/singleDocument.js.map +1 -0
- package/dist/index.d.ts +8 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +10 -0
- package/dist/index.js.map +1 -0
- package/dist/logger.d.ts +24 -0
- package/dist/logger.d.ts.map +1 -0
- package/dist/logger.js +36 -0
- package/dist/logger.js.map +1 -0
- package/dist/logo.d.ts +2 -0
- package/dist/logo.d.ts.map +1 -0
- package/dist/logo.js +3 -0
- package/dist/logo.js.map +1 -0
- package/dist/nodes/fileTypeRouter.d.ts +16 -0
- package/dist/nodes/fileTypeRouter.d.ts.map +1 -0
- package/dist/nodes/fileTypeRouter.js +72 -0
- package/dist/nodes/fileTypeRouter.js.map +1 -0
- package/dist/nodes/geminiExtraction.d.ts +19 -0
- package/dist/nodes/geminiExtraction.d.ts.map +1 -0
- package/dist/nodes/geminiExtraction.js +87 -0
- package/dist/nodes/geminiExtraction.js.map +1 -0
- package/dist/nodes/libreOfficeToPdf.d.ts +8 -0
- package/dist/nodes/libreOfficeToPdf.d.ts.map +1 -0
- package/dist/nodes/libreOfficeToPdf.js +61 -0
- package/dist/nodes/libreOfficeToPdf.js.map +1 -0
- package/dist/nodes/llmExtractionNode.d.ts +19 -0
- package/dist/nodes/llmExtractionNode.d.ts.map +1 -0
- package/dist/nodes/llmExtractionNode.js +68 -0
- package/dist/nodes/llmExtractionNode.js.map +1 -0
- package/dist/nodes/markdownChunker.d.ts +8 -0
- package/dist/nodes/markdownChunker.d.ts.map +1 -0
- package/dist/nodes/markdownChunker.js +24 -0
- package/dist/nodes/markdownChunker.js.map +1 -0
- package/dist/nodes/markdownMerger.d.ts +9 -0
- package/dist/nodes/markdownMerger.d.ts.map +1 -0
- package/dist/nodes/markdownMerger.js +33 -0
- package/dist/nodes/markdownMerger.js.map +1 -0
- package/dist/nodes/markdownNormalizer.d.ts +10 -0
- package/dist/nodes/markdownNormalizer.d.ts.map +1 -0
- package/dist/nodes/markdownNormalizer.js +46 -0
- package/dist/nodes/markdownNormalizer.js.map +1 -0
- package/dist/nodes/openrouterEmbedder.d.ts +7 -0
- package/dist/nodes/openrouterEmbedder.d.ts.map +1 -0
- package/dist/nodes/openrouterEmbedder.js +31 -0
- package/dist/nodes/openrouterEmbedder.js.map +1 -0
- package/dist/nodes/pdfSplitter.d.ts +7 -0
- package/dist/nodes/pdfSplitter.d.ts.map +1 -0
- package/dist/nodes/pdfSplitter.js +41 -0
- package/dist/nodes/pdfSplitter.js.map +1 -0
- package/dist/nodes/saveMarkdown.d.ts +7 -0
- package/dist/nodes/saveMarkdown.d.ts.map +1 -0
- package/dist/nodes/saveMarkdown.js +28 -0
- package/dist/nodes/saveMarkdown.js.map +1 -0
- package/dist/nodes/textExtractorNode.d.ts +7 -0
- package/dist/nodes/textExtractorNode.d.ts.map +1 -0
- package/dist/nodes/textExtractorNode.js +39 -0
- package/dist/nodes/textExtractorNode.js.map +1 -0
- package/dist/nodes/upstashUpsert.d.ts +7 -0
- package/dist/nodes/upstashUpsert.d.ts.map +1 -0
- package/dist/nodes/upstashUpsert.js +45 -0
- package/dist/nodes/upstashUpsert.js.map +1 -0
- package/dist/nodes/vectorEmbedderNode.d.ts +7 -0
- package/dist/nodes/vectorEmbedderNode.d.ts.map +1 -0
- package/dist/nodes/vectorEmbedderNode.js +23 -0
- package/dist/nodes/vectorEmbedderNode.js.map +1 -0
- package/dist/nodes/vectorUpsertNode.d.ts +7 -0
- package/dist/nodes/vectorUpsertNode.d.ts.map +1 -0
- package/dist/nodes/vectorUpsertNode.js +45 -0
- package/dist/nodes/vectorUpsertNode.js.map +1 -0
- package/dist/pipeline.d.ts +303 -0
- package/dist/pipeline.d.ts.map +1 -0
- package/dist/pipeline.js +93 -0
- package/dist/pipeline.js.map +1 -0
- package/dist/state.d.ts +52 -0
- package/dist/state.d.ts.map +1 -0
- package/dist/state.js +27 -0
- package/dist/state.js.map +1 -0
- package/dist/vectorStore.d.ts +24 -0
- package/dist/vectorStore.d.ts.map +1 -0
- package/dist/vectorStore.js +22 -0
- package/dist/vectorStore.js.map +1 -0
- package/package.json +55 -0

package/dist/nodes/textExtractorNode.js
ADDED
@@ -0,0 +1,39 @@
+import fs from "node:fs/promises";
+import officeparser from "officeparser";
+import { parse } from "csv-parse/sync";
+import { logger, LogSource } from "../core/logger.js";
+/**
+ * Extracts raw text from office documents (DOCX, PPTX, XLSX) using officeparser,
+ * CSV files using csv-parse, and TXT files via direct read.
+ */
+export async function textExtractorNode(state) {
+    const { filePath, mimeType } = state;
+    logger.info(LogSource.TEXT_EXTRACTOR, `Parsing: ${filePath} (${mimeType})`);
+    let rawText;
+    if (mimeType === "text/plain") {
+        // Plain text — just read directly
+        rawText = filePath ? await fs.readFile(filePath, "utf-8") : state.rawText;
+    }
+    else if (mimeType === "text/csv") {
+        // CSV — parse and convert to a readable text table
+        if (!filePath)
+            throw new Error("filePath required for CSV parsing");
+        const csvBuffer = await fs.readFile(filePath, "utf-8");
+        const records = parse(csvBuffer, {
+            skip_empty_lines: true,
+        });
+        // Convert to a simple text representation
+        rawText = records
+            .map((row) => row.join(" | "))
+            .join("\n");
+    }
+    else {
+        // DOCX, PPTX, XLSX — use officeparser
+        if (!filePath)
+            throw new Error("filePath required for office document parsing");
+        rawText = await officeparser.parseOfficeAsync(filePath);
+    }
+    logger.info(LogSource.TEXT_EXTRACTOR, `Extracted ${rawText.length} chars of raw text`);
+    return { rawText };
+}
+//# sourceMappingURL=textExtractorNode.js.map

package/dist/nodes/textExtractorNode.js.map
ADDED
@@ -0,0 +1 @@
+{"version":3,"file":"textExtractorNode.js","sourceRoot":"","sources":["../../src/nodes/textExtractorNode.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,kBAAkB,CAAC;AAClC,OAAO,YAAY,MAAM,cAAc,CAAC;AACxC,OAAO,EAAE,KAAK,EAAE,MAAM,gBAAgB,CAAC;AAEvC,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAEtD;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,KAAoB;IAEpB,MAAM,EAAE,QAAQ,EAAE,QAAQ,EAAE,GAAG,KAAK,CAAC;IAErC,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,YAAY,QAAQ,KAAK,QAAQ,GAAG,CAAC,CAAC;IAE5E,IAAI,OAAe,CAAC;IAEpB,IAAI,QAAQ,KAAK,YAAY,EAAE,CAAC;QAC9B,kCAAkC;QAClC,OAAO,GAAG,QAAQ,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC;IAC5E,CAAC;SAAM,IAAI,QAAQ,KAAK,UAAU,EAAE,CAAC;QACnC,mDAAmD;QACnD,IAAI,CAAC,QAAQ;YAAE,MAAM,IAAI,KAAK,CAAC,mCAAmC,CAAC,CAAC;QACpE,MAAM,SAAS,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QACvD,MAAM,OAAO,GAAe,KAAK,CAAC,SAAS,EAAE;YAC3C,gBAAgB,EAAE,IAAI;SACvB,CAAC,CAAC;QAEH,0CAA0C;QAC1C,OAAO,GAAG,OAAO;aACd,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;aAC7B,IAAI,CAAC,IAAI,CAAC,CAAC;IAChB,CAAC;SAAM,CAAC;QACN,sCAAsC;QACtC,IAAI,CAAC,QAAQ;YAAE,MAAM,IAAI,KAAK,CAAC,+CAA+C,CAAC,CAAC;QAChF,OAAO,GAAG,MAAM,YAAY,CAAC,gBAAgB,CAAC,QAAQ,CAAW,CAAC;IACpE,CAAC;IAED,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,aAAa,OAAO,CAAC,MAAM,oBAAoB,CAAC,CAAC;IAEvF,OAAO,EAAE,OAAO,EAAE,CAAC;AACrB,CAAC"}
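
Note: every node in dist/nodes follows the contract visible above: it receives the pipeline state and resolves to a partial state update. A minimal invocation sketch, not taken from the package itself; the import path and sample file are hypothetical:

import { textExtractorNode } from "@virstack/doc-ingest"; // assumed export; a deep import may be required

// A CSV file takes the "extract" branch; rows come back joined with " | ".
const update = await textExtractorNode({
    filePath: "./samples/table.csv", // hypothetical file
    mimeType: "text/csv",
});
console.log(update.rawText);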

package/dist/nodes/upstashUpsert.d.ts
ADDED
@@ -0,0 +1,7 @@
+import type { PipelineState } from "../state.js";
+/**
+ * Upserts text chunks + their embedding vectors into Upstash Vector.
+ * Each chunk is stored with rich metadata for RAG filtering.
+ */
+export declare function upstashUpsert(state: PipelineState): Promise<Partial<PipelineState>>;
+//# sourceMappingURL=upstashUpsert.d.ts.map

package/dist/nodes/upstashUpsert.d.ts.map
ADDED
@@ -0,0 +1 @@
+{"version":3,"file":"upstashUpsert.d.ts","sourceRoot":"","sources":["../../src/nodes/upstashUpsert.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AAGjD;;;GAGG;AACH,wBAAsB,aAAa,CACjC,KAAK,EAAE,aAAa,GACnB,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CA0CjC"}

package/dist/nodes/upstashUpsert.js
ADDED
@@ -0,0 +1,45 @@
+import path from "node:path";
+import crypto from "node:crypto";
+import { vectorIndex, requireInit } from "../config.js";
+import { logger, LogSource } from "../logger.js";
+/**
+ * Upserts text chunks + their embedding vectors into Upstash Vector.
+ * Each chunk is stored with rich metadata for RAG filtering.
+ */
+export async function upstashUpsert(state) {
+    requireInit();
+    const { filePath, mimeType, textChunks, vectors } = state;
+    // Generate a stable document ID from the file path
+    const docId = crypto
+        .createHash("sha256")
+        .update(filePath || "pasted_text")
+        .digest("hex")
+        .slice(0, 8);
+    logger.info(LogSource.UPSTASH_UPSERT, `Upserting ${textChunks.length} chunks for doc ${docId}`);
+    // Upstash Vector supports batch upserts
+    const upsertPayload = textChunks.map((chunk, i) => ({
+        id: `${docId}-chunk-${i}`,
+        vector: vectors[i],
+        data: chunk,
+        metadata: {
+            text: chunk,
+            source: state.filePath ? path.basename(state.filePath) : "pasted_text",
+            sourcePath: filePath,
+            mimeType: mimeType,
+            chunkIndex: i,
+            totalChunks: textChunks.length,
+            docId: docId,
+            ingestedAt: new Date().toISOString(),
+        },
+    }));
+    // Upsert in batches of 100 (Upstash limit)
+    const BATCH_SIZE = 100;
+    for (let i = 0; i < upsertPayload.length; i += BATCH_SIZE) {
+        const batch = upsertPayload.slice(i, i + BATCH_SIZE);
+        await vectorIndex.upsert(batch);
+        logger.info(LogSource.UPSTASH_UPSERT, `Upserted batch ${Math.floor(i / BATCH_SIZE) + 1}: ${batch.length} vectors`);
+    }
+    logger.success(LogSource.UPSTASH_UPSERT, `All ${textChunks.length} chunks upserted`);
+    return {};
+}
+//# sourceMappingURL=upstashUpsert.js.map

package/dist/nodes/upstashUpsert.js.map
ADDED
@@ -0,0 +1 @@
+{"version":3,"file":"upstashUpsert.js","sourceRoot":"","sources":["../../src/nodes/upstashUpsert.ts"],"names":[],"mappings":"AAAA,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,MAAM,MAAM,aAAa,CAAC;AACjC,OAAO,EAAE,WAAW,EAAE,WAAW,EAAE,MAAM,cAAc,CAAC;AAExD,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AAEjD;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,KAAoB;IAEpB,WAAW,EAAE,CAAC;IACd,MAAM,EAAE,QAAQ,EAAE,QAAQ,EAAE,UAAU,EAAE,OAAO,EAAE,GAAG,KAAK,CAAC;IAE1D,mDAAmD;IACnD,MAAM,KAAK,GAAG,MAAM;SACjB,UAAU,CAAC,QAAQ,CAAC;SACpB,MAAM,CAAC,QAAQ,IAAI,aAAa,CAAC;SACjC,MAAM,CAAC,KAAK,CAAC;SACb,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IAEf,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,aAAa,UAAU,CAAC,MAAM,mBAAmB,KAAK,EAAE,CAAC,CAAC;IAEhG,wCAAwC;IACxC,MAAM,aAAa,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;QAClD,EAAE,EAAE,GAAG,KAAK,UAAU,CAAC,EAAE;QACzB,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC;QAClB,IAAI,EAAE,KAAK;QACX,QAAQ,EAAE;YACR,IAAI,EAAE,KAAK;YACX,MAAM,EAAE,KAAK,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,aAAa;YACtE,UAAU,EAAE,QAAQ;YACpB,QAAQ,EAAE,QAAQ;YAClB,UAAU,EAAE,CAAC;YACb,WAAW,EAAE,UAAU,CAAC,MAAM;YAC9B,KAAK,EAAE,KAAK;YACZ,UAAU,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;SACrC;KACF,CAAC,CAAC,CAAC;IAEJ,2CAA2C;IAC3C,MAAM,UAAU,GAAG,GAAG,CAAC;IACvB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,aAAa,CAAC,MAAM,EAAE,CAAC,IAAI,UAAU,EAAE,CAAC;QAC1D,MAAM,KAAK,GAAG,aAAa,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,CAAC;QACrD,MAAM,WAAW,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;QAEhC,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,kBAAkB,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,UAAU,CAAC,GAAG,CAAC,KAAK,KAAK,CAAC,MAAM,UAAU,CAAC,CAAC;IACrH,CAAC;IAED,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,cAAc,EAAE,OAAO,UAAU,CAAC,MAAM,kBAAkB,CAAC,CAAC;IAErF,OAAO,EAAE,CAAC;AACZ,CAAC"}
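
Note: the metadata that upstashUpsert stores (docId, mimeType, chunkIndex, source) is what makes filtered retrieval possible later. A query-side sketch using the @upstash/vector SDK; the credentials, query embedding, and docId value are assumptions, and the filter string uses Upstash's SQL-like metadata filter syntax:

import { Index } from "@upstash/vector";

const index = new Index({
    url: process.env.UPSTASH_VECTOR_REST_URL!,
    token: process.env.UPSTASH_VECTOR_REST_TOKEN!,
});

const queryEmbedding: number[] = []; // produce this with the same embedder used at ingest time

const hits = await index.query({
    vector: queryEmbedding,
    topK: 5,
    includeMetadata: true,
    // Restrict retrieval to a single ingested document via its stored docId.
    filter: "docId = 'a1b2c3d4'", // hypothetical 8-char id
});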

package/dist/nodes/vectorEmbedderNode.d.ts
ADDED
@@ -0,0 +1,7 @@
+import type { PipelineState } from "../core/state.js";
+/**
+ * Embeds all textChunks using the injected EmbeddingAdapter.
+ * Processes in batches to stay within API limits.
+ */
+export declare function vectorEmbedderNode(state: PipelineState): Promise<Partial<PipelineState>>;
+//# sourceMappingURL=vectorEmbedderNode.d.ts.map

package/dist/nodes/vectorEmbedderNode.d.ts.map
ADDED
@@ -0,0 +1 @@
+{"version":3,"file":"vectorEmbedderNode.d.ts","sourceRoot":"","sources":["../../src/nodes/vectorEmbedderNode.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAGtD;;;GAGG;AACH,wBAAsB,kBAAkB,CACtC,KAAK,EAAE,aAAa,GACnB,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CAsBjC"}

package/dist/nodes/vectorEmbedderNode.js
ADDED
@@ -0,0 +1,23 @@
+import { pipelineConfig, requireInit } from "../core/config.js";
+import { logger, LogSource } from "../core/logger.js";
+/**
+ * Embeds all textChunks using the injected EmbeddingAdapter.
+ * Processes in batches to stay within API limits.
+ */
+export async function vectorEmbedderNode(state) {
+    requireInit();
+    const { textChunks } = state;
+    logger.info(LogSource.VECTOR_EMBEDDER, `Embedding ${textChunks.length} chunks via injected Embedder Node`);
+    const allVectors = [];
+    const BATCH_SIZE = 50; // Common safe default, though adapters might handle their own batching internally
+    for (let i = 0; i < textChunks.length; i += BATCH_SIZE) {
+        const batch = textChunks.slice(i, i + BATCH_SIZE);
+        logger.info(LogSource.VECTOR_EMBEDDER, `Batch ${Math.floor(i / BATCH_SIZE) + 1}: ${batch.length} chunk(s)`);
+        // Call the injected Embedding adapter!
+        const vectors = await pipelineConfig.embedder.embed(batch);
+        allVectors.push(...vectors);
+    }
+    logger.info(LogSource.VECTOR_EMBEDDER, `Generated ${allVectors.length} vectors (${allVectors[0]?.length ?? 0}d)`);
+    return { vectors: allVectors };
+}
+//# sourceMappingURL=vectorEmbedderNode.js.map

package/dist/nodes/vectorEmbedderNode.js.map
ADDED
@@ -0,0 +1 @@
+{"version":3,"file":"vectorEmbedderNode.js","sourceRoot":"","sources":["../../src/nodes/vectorEmbedderNode.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,cAAc,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAEhE,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAEtD;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,kBAAkB,CACtC,KAAoB;IAEpB,WAAW,EAAE,CAAC;IACd,MAAM,EAAE,UAAU,EAAE,GAAG,KAAK,CAAC;IAE7B,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,eAAe,EAAE,aAAa,UAAU,CAAC,MAAM,oCAAoC,CAAC,CAAC;IAE3G,MAAM,UAAU,GAAe,EAAE,CAAC;IAClC,MAAM,UAAU,GAAG,EAAE,CAAC,CAAC,kFAAkF;IAEzG,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,IAAI,UAAU,EAAE,CAAC;QACvD,MAAM,KAAK,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,CAAC;QAElD,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,eAAe,EAAE,SAAS,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,UAAU,CAAC,GAAG,CAAC,KAAK,KAAK,CAAC,MAAM,WAAW,CAAC,CAAC;QAE5G,uCAAuC;QACvC,MAAM,OAAO,GAAG,MAAM,cAAc,CAAC,QAAQ,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;QAC3D,UAAU,CAAC,IAAI,CAAC,GAAG,OAAO,CAAC,CAAC;IAC9B,CAAC;IAED,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,eAAe,EAAE,aAAa,UAAU,CAAC,MAAM,aAAa,UAAU,CAAC,CAAC,CAAC,EAAE,MAAM,IAAI,CAAC,IAAI,CAAC,CAAC;IAElH,OAAO,EAAE,OAAO,EAAE,UAAU,EAAE,CAAC;AACjC,CAAC"}
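
Note: vectorEmbedderNode only assumes an injected object exposing an embed(texts) method that returns one vector per input string. A minimal adapter sketch satisfying that contract, using the OpenAI SDK; the model name is an assumption, not the package's default:

import OpenAI from "openai";

const openai = new OpenAI(); // reads OPENAI_API_KEY from the environment

export const embedder = {
    // Must return one number[] per input string, in order.
    async embed(chunks: string[]): Promise<number[][]> {
        const res = await openai.embeddings.create({
            model: "text-embedding-3-small", // assumed model
            input: chunks,
        });
        return res.data.map((d) => d.embedding);
    },
};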

package/dist/nodes/vectorUpsertNode.d.ts
ADDED
@@ -0,0 +1,7 @@
+import type { PipelineState } from "../core/state.js";
+/**
+ * Upserts text chunks + their embedding vectors into a generic Vector Store Adapter.
+ * Each chunk is stored with rich metadata for vector filtering.
+ */
+export declare function vectorUpsertNode(state: PipelineState): Promise<Partial<PipelineState>>;
+//# sourceMappingURL=vectorUpsertNode.d.ts.map

package/dist/nodes/vectorUpsertNode.d.ts.map
ADDED
@@ -0,0 +1 @@
+{"version":3,"file":"vectorUpsertNode.d.ts","sourceRoot":"","sources":["../../src/nodes/vectorUpsertNode.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAItD;;;GAGG;AACH,wBAAsB,gBAAgB,CACpC,KAAK,EAAE,aAAa,GACnB,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CA2CjC"}

package/dist/nodes/vectorUpsertNode.js
ADDED
@@ -0,0 +1,45 @@
+import path from "node:path";
+import crypto from "node:crypto";
+import { pipelineConfig, requireInit } from "../core/config.js";
+import { logger, LogSource } from "../core/logger.js";
+/**
+ * Upserts text chunks + their embedding vectors into a generic Vector Store Adapter.
+ * Each chunk is stored with rich metadata for vector filtering.
+ */
+export async function vectorUpsertNode(state) {
+    requireInit();
+    const { filePath, mimeType, textChunks, vectors } = state;
+    // Generate a stable document ID from the file path
+    const docId = crypto
+        .createHash("sha256")
+        .update(filePath || "pasted_text")
+        .digest("hex")
+        .slice(0, 8);
+    logger.info(LogSource.VECTOR_UPSERT, `Upserting ${textChunks.length} chunks via Vector Store Adapter for doc ${docId}`);
+    // Format the data into our standard contract
+    const records = textChunks.map((chunk, i) => ({
+        id: `${docId}-chunk-${i}`,
+        vector: vectors[i],
+        metadata: {
+            text: chunk,
+            source: state.filePath ? path.basename(state.filePath) : "pasted_text",
+            sourcePath: filePath,
+            mimeType: mimeType,
+            chunkIndex: i,
+            totalChunks: textChunks.length,
+            docId: docId,
+            ingestedAt: new Date().toISOString(),
+        },
+    }));
+    // Upsert in batches of 100 (Common limit for many vector DBs)
+    const BATCH_SIZE = 100;
+    for (let i = 0; i < records.length; i += BATCH_SIZE) {
+        const batch = records.slice(i, i + BATCH_SIZE);
+        // Call the user's database adapter instead of Upstash directly!
+        await pipelineConfig.vectorStore.upsert(batch);
+        logger.info(LogSource.VECTOR_UPSERT, `Upserted batch ${Math.floor(i / BATCH_SIZE) + 1} (${batch.length} vectors)`);
+    }
+    logger.success(LogSource.VECTOR_UPSERT, `All ${textChunks.length} chunks upserted`);
+    return {};
+}
+//# sourceMappingURL=vectorUpsertNode.js.map

package/dist/nodes/vectorUpsertNode.js.map
ADDED
@@ -0,0 +1 @@
+{"version":3,"file":"vectorUpsertNode.js","sourceRoot":"","sources":["../../src/nodes/vectorUpsertNode.ts"],"names":[],"mappings":"AAAA,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,MAAM,MAAM,aAAa,CAAC;AACjC,OAAO,EAAE,cAAc,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAEhE,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAGtD;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,KAAoB;IAEpB,WAAW,EAAE,CAAC;IACd,MAAM,EAAE,QAAQ,EAAE,QAAQ,EAAE,UAAU,EAAE,OAAO,EAAE,GAAG,KAAK,CAAC;IAE1D,mDAAmD;IACnD,MAAM,KAAK,GAAG,MAAM;SACjB,UAAU,CAAC,QAAQ,CAAC;SACpB,MAAM,CAAC,QAAQ,IAAI,aAAa,CAAC;SACjC,MAAM,CAAC,KAAK,CAAC;SACb,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IAEf,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,aAAa,EAAE,aAAa,UAAU,CAAC,MAAM,4CAA4C,KAAK,EAAE,CAAC,CAAC;IAExH,6CAA6C;IAC7C,MAAM,OAAO,GAAmB,UAAU,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;QAC5D,EAAE,EAAE,GAAG,KAAK,UAAU,CAAC,EAAE;QACzB,MAAM,EAAE,OAAO,CAAC,CAAC,CAAa;QAC9B,QAAQ,EAAE;YACR,IAAI,EAAE,KAAK;YACX,MAAM,EAAE,KAAK,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,aAAa;YACtE,UAAU,EAAE,QAAQ;YACpB,QAAQ,EAAE,QAAQ;YAClB,UAAU,EAAE,CAAC;YACb,WAAW,EAAE,UAAU,CAAC,MAAM;YAC9B,KAAK,EAAE,KAAK;YACZ,UAAU,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;SACrC;KACF,CAAC,CAAC,CAAC;IAEJ,8DAA8D;IAC9D,MAAM,UAAU,GAAG,GAAG,CAAC;IACvB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,IAAI,UAAU,EAAE,CAAC;QACpD,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,CAAC;QAE/C,gEAAgE;QAChE,MAAM,cAAc,CAAC,WAAW,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;QAE/C,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,aAAa,EAAE,kBAAkB,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,UAAU,CAAC,GAAG,CAAC,KAAK,KAAK,CAAC,MAAM,WAAW,CAAC,CAAC;IACrH,CAAC;IAED,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,aAAa,EAAE,OAAO,UAAU,CAAC,MAAM,kBAAkB,CAAC,CAAC;IAEpF,OAAO,EAAE,CAAC;AACZ,CAAC"}
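
Note: vectorUpsertNode decouples the pipeline from Upstash entirely: any object with an upsert(records) method accepting { id, vector, metadata } records will do. An in-memory sketch of that contract; the VectorRecord type below is inferred from the code above, not imported from the package:

type VectorRecord = {
    id: string;
    vector: number[];
    metadata: Record<string, unknown>;
};

export const inMemoryStore = {
    records: new Map<string, VectorRecord>(),
    async upsert(batch: VectorRecord[]): Promise<void> {
        for (const rec of batch) {
            // Re-ingesting the same file yields the same docId-derived ids,
            // so upserts overwrite rather than duplicate.
            this.records.set(rec.id, rec);
        }
    },
};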

package/dist/pipeline.d.ts
ADDED
@@ -0,0 +1,303 @@
+export declare function buildPipeline(): import("@langchain/langgraph").CompiledStateGraph<import("@langchain/langgraph").StateType<{
+    filePath: {
+        (): import("@langchain/langgraph").LastValue<string | undefined>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    mimeType: {
+        (): import("@langchain/langgraph").LastValue<string | undefined>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    rawText: {
+        (): import("@langchain/langgraph").LastValue<string>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    pdfChunks: {
+        (): import("@langchain/langgraph").LastValue<string[]>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    markdownParts: import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+    markdown: {
+        (): import("@langchain/langgraph").LastValue<string>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    textChunks: {
+        (): import("@langchain/langgraph").LastValue<string[]>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    vectors: {
+        (): import("@langchain/langgraph").LastValue<number[][]>;
+        (annotation: import("@langchain/langgraph").SingleReducer<number[][], number[][]>): import("@langchain/langgraph").BinaryOperatorAggregate<number[][], number[][]>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+}>, import("@langchain/langgraph").UpdateType<{
+    filePath: {
+        (): import("@langchain/langgraph").LastValue<string | undefined>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    mimeType: {
+        (): import("@langchain/langgraph").LastValue<string | undefined>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    rawText: {
+        (): import("@langchain/langgraph").LastValue<string>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    pdfChunks: {
+        (): import("@langchain/langgraph").LastValue<string[]>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    markdownParts: import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+    markdown: {
+        (): import("@langchain/langgraph").LastValue<string>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    textChunks: {
+        (): import("@langchain/langgraph").LastValue<string[]>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    vectors: {
+        (): import("@langchain/langgraph").LastValue<number[][]>;
+        (annotation: import("@langchain/langgraph").SingleReducer<number[][], number[][]>): import("@langchain/langgraph").BinaryOperatorAggregate<number[][], number[][]>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+}>, "markdownMerger" | "markdownNormalizer" | "llmExtractionNode" | "__start__" | "fileTypeRouter" | "libreOfficeToPdf" | "pdfSplitter" | "textExtractorNode" | "saveMarkdown" | "markdownChunker" | "vectorEmbedderNode" | "vectorUpsertNode", {
+    filePath: {
+        (): import("@langchain/langgraph").LastValue<string | undefined>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    mimeType: {
+        (): import("@langchain/langgraph").LastValue<string | undefined>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    rawText: {
+        (): import("@langchain/langgraph").LastValue<string>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    pdfChunks: {
+        (): import("@langchain/langgraph").LastValue<string[]>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    markdownParts: import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+    markdown: {
+        (): import("@langchain/langgraph").LastValue<string>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    textChunks: {
+        (): import("@langchain/langgraph").LastValue<string[]>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    vectors: {
+        (): import("@langchain/langgraph").LastValue<number[][]>;
+        (annotation: import("@langchain/langgraph").SingleReducer<number[][], number[][]>): import("@langchain/langgraph").BinaryOperatorAggregate<number[][], number[][]>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+}, {
+    filePath: {
+        (): import("@langchain/langgraph").LastValue<string | undefined>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    mimeType: {
+        (): import("@langchain/langgraph").LastValue<string | undefined>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    rawText: {
+        (): import("@langchain/langgraph").LastValue<string>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    pdfChunks: {
+        (): import("@langchain/langgraph").LastValue<string[]>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    markdownParts: import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+    markdown: {
+        (): import("@langchain/langgraph").LastValue<string>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    textChunks: {
+        (): import("@langchain/langgraph").LastValue<string[]>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    vectors: {
+        (): import("@langchain/langgraph").LastValue<number[][]>;
+        (annotation: import("@langchain/langgraph").SingleReducer<number[][], number[][]>): import("@langchain/langgraph").BinaryOperatorAggregate<number[][], number[][]>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+}, import("@langchain/langgraph").StateDefinition>;
+/**
+ * The compiled graph instance.
+ * Exported specifically for LangGraph Studio and the LangGraph CLI.
+ */
+export declare const graph: import("@langchain/langgraph").CompiledStateGraph<import("@langchain/langgraph").StateType<{
+    filePath: {
+        (): import("@langchain/langgraph").LastValue<string | undefined>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    mimeType: {
+        (): import("@langchain/langgraph").LastValue<string | undefined>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    rawText: {
+        (): import("@langchain/langgraph").LastValue<string>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    pdfChunks: {
+        (): import("@langchain/langgraph").LastValue<string[]>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    markdownParts: import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+    markdown: {
+        (): import("@langchain/langgraph").LastValue<string>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    textChunks: {
+        (): import("@langchain/langgraph").LastValue<string[]>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    vectors: {
+        (): import("@langchain/langgraph").LastValue<number[][]>;
+        (annotation: import("@langchain/langgraph").SingleReducer<number[][], number[][]>): import("@langchain/langgraph").BinaryOperatorAggregate<number[][], number[][]>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+}>, import("@langchain/langgraph").UpdateType<{
+    filePath: {
+        (): import("@langchain/langgraph").LastValue<string | undefined>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    mimeType: {
+        (): import("@langchain/langgraph").LastValue<string | undefined>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    rawText: {
+        (): import("@langchain/langgraph").LastValue<string>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    pdfChunks: {
+        (): import("@langchain/langgraph").LastValue<string[]>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    markdownParts: import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+    markdown: {
+        (): import("@langchain/langgraph").LastValue<string>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    textChunks: {
+        (): import("@langchain/langgraph").LastValue<string[]>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    vectors: {
+        (): import("@langchain/langgraph").LastValue<number[][]>;
+        (annotation: import("@langchain/langgraph").SingleReducer<number[][], number[][]>): import("@langchain/langgraph").BinaryOperatorAggregate<number[][], number[][]>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+}>, "markdownMerger" | "markdownNormalizer" | "llmExtractionNode" | "__start__" | "fileTypeRouter" | "libreOfficeToPdf" | "pdfSplitter" | "textExtractorNode" | "saveMarkdown" | "markdownChunker" | "vectorEmbedderNode" | "vectorUpsertNode", {
+    filePath: {
+        (): import("@langchain/langgraph").LastValue<string | undefined>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    mimeType: {
+        (): import("@langchain/langgraph").LastValue<string | undefined>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    rawText: {
+        (): import("@langchain/langgraph").LastValue<string>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    pdfChunks: {
+        (): import("@langchain/langgraph").LastValue<string[]>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    markdownParts: import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+    markdown: {
+        (): import("@langchain/langgraph").LastValue<string>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    textChunks: {
+        (): import("@langchain/langgraph").LastValue<string[]>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    vectors: {
+        (): import("@langchain/langgraph").LastValue<number[][]>;
+        (annotation: import("@langchain/langgraph").SingleReducer<number[][], number[][]>): import("@langchain/langgraph").BinaryOperatorAggregate<number[][], number[][]>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+}, {
+    filePath: {
+        (): import("@langchain/langgraph").LastValue<string | undefined>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    mimeType: {
+        (): import("@langchain/langgraph").LastValue<string | undefined>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    rawText: {
+        (): import("@langchain/langgraph").LastValue<string>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    pdfChunks: {
+        (): import("@langchain/langgraph").LastValue<string[]>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    markdownParts: import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+    markdown: {
+        (): import("@langchain/langgraph").LastValue<string>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    textChunks: {
+        (): import("@langchain/langgraph").LastValue<string[]>;
+        (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+    vectors: {
+        (): import("@langchain/langgraph").LastValue<number[][]>;
+        (annotation: import("@langchain/langgraph").SingleReducer<number[][], number[][]>): import("@langchain/langgraph").BinaryOperatorAggregate<number[][], number[][]>;
+        Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+    };
+}, import("@langchain/langgraph").StateDefinition>;
+//# sourceMappingURL=pipeline.d.ts.map

package/dist/pipeline.d.ts.map
ADDED
@@ -0,0 +1 @@
+{"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../src/pipeline.ts"],"names":[],"mappings":"AA0CA,wBAAgB,aAAa;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;mDA4D5B;AAED;;;GAGG;AACH,eAAO,MAAM,KAAK;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;kDAAkB,CAAC"}
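
Note: the verbose pipeline.d.ts above is simply what tsc emits for a LangGraph Annotation-based state. The one field typed as BinaryOperatorAggregate, markdownParts, is the one carrying a reducer, which is what lets the parallel llmExtractionNode fan-out append markdown parts instead of overwriting them. A sketch of a state definition that compiles to this shape; the field names and types come from the declaration, while the reducer body is an assumption:

import { Annotation } from "@langchain/langgraph";

const PipelineStateAnnotation = Annotation.Root({
    filePath: Annotation<string | undefined>,
    mimeType: Annotation<string | undefined>,
    rawText: Annotation<string>,
    pdfChunks: Annotation<string[]>,
    // The reducer is what produces BinaryOperatorAggregate in the .d.ts:
    // concurrent updates merge by concatenation rather than last-write-wins.
    markdownParts: Annotation<string[]>({
        reducer: (acc, next) => acc.concat(next),
        default: () => [],
    }),
    markdown: Annotation<string>,
    textChunks: Annotation<string[]>,
    vectors: Annotation<number[][]>,
});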

package/dist/pipeline.js
ADDED
@@ -0,0 +1,93 @@
+import { StateGraph, END, Send } from "@langchain/langgraph";
+import { PipelineStateAnnotation } from "./state.js";
+import { fileTypeRouter, routeByMimeType } from "./nodes/fileTypeRouter.js";
+import { pdfSplitter } from "./nodes/pdfSplitter.js";
+import { llmExtractionNode, routeAfterLlm } from "./nodes/llmExtractionNode.js";
+import { markdownMerger } from "./nodes/markdownMerger.js";
+import { textExtractorNode } from "./nodes/textExtractorNode.js";
+import { markdownNormalizer } from "./nodes/markdownNormalizer.js";
+import { markdownChunker } from "./nodes/markdownChunker.js";
+import { vectorEmbedderNode } from "./nodes/vectorEmbedderNode.js";
+import { vectorUpsertNode } from "./nodes/vectorUpsertNode.js";
+import { saveMarkdown } from "./nodes/saveMarkdown.js";
+import { libreOfficeToPdf } from "./nodes/libreOfficeToPdf.js";
+/**
+ * Builds and compiles the RAG ingestion pipeline as a LangGraph StateGraph.
+ *
+ * Flow:
+ * START → fileTypeRouter
+ *   ├─ "pdf" → pdfSplitter → [llmExtractionNode (Parallel)] → markdownMerger → markdownNormalizer
+ *   ├─ "convert" → libreOfficeToPdf → pdfSplitter → (same as pdf branch)
+ *   └─ "extract" → textExtractorNode → llmExtractionNode → markdownNormalizer
+ * markdownNormalizer → saveMarkdown → markdownChunker → vectorEmbedderNode → vectorUpsertNode → END
+ */
+/**
+ * Returns an array of 'Send' objects to process each PDF chunk in parallel.
+ */
+function dispatchPdfChunks(state) {
+    if (!state.pdfChunks || state.pdfChunks.length === 0) {
+        console.warn("[dispatchPdfChunks] No PDF chunks found to process.");
+        return [];
+    }
+    return state.pdfChunks.map((chunk, index) => {
+        return new Send("llmExtractionNode", {
+            chunk,
+            index,
+            totalChunks: state.pdfChunks.length,
+        });
+    });
+}
+export function buildPipeline() {
+    const graph = new StateGraph(PipelineStateAnnotation)
+        // ── Phase 1: Routing ──
+        .addNode("fileTypeRouter", fileTypeRouter)
+        // ── Phase 2a: PDF Branch ──
+        .addNode("libreOfficeToPdf", libreOfficeToPdf)
+        .addNode("pdfSplitter", pdfSplitter)
+        .addNode("markdownMerger", markdownMerger)
+        // ── Phase 2b: Text / Data Extraction Branch ──
+        .addNode("textExtractorNode", textExtractorNode)
+        .addNode("llmExtractionNode", llmExtractionNode)
+        // ── Phase 3: Normalization & Chunking ──
+        .addNode("markdownNormalizer", markdownNormalizer)
+        .addNode("saveMarkdown", saveMarkdown)
+        .addNode("markdownChunker", markdownChunker)
+        // ── Phase 4: Embedding & Indexing ──
+        .addNode("vectorEmbedderNode", vectorEmbedderNode)
+        .addNode("vectorUpsertNode", vectorUpsertNode)
+        // ── Edges ──
+        // Start → Router
+        .addEdge("__start__", "fileTypeRouter")
+        // Router → conditional branch
+        .addConditionalEdges("fileTypeRouter", routeByMimeType, {
+            pdf: "pdfSplitter",
+            convert: "libreOfficeToPdf",
+            extract: "textExtractorNode",
+        })
+        // Convert branch: LibreOffice → pdfSplitter → (joins PDF branch)
+        .addEdge("libreOfficeToPdf", "pdfSplitter")
+        // PDF branch dispatcher
+        .addConditionalEdges("pdfSplitter", dispatchPdfChunks, ["llmExtractionNode"])
+        // Unified Document/Text branch flow
+        .addEdge("textExtractorNode", "llmExtractionNode")
+        // After llmExtractionNode, conditionally merge PDF chunks or normalize Text
+        .addConditionalEdges("llmExtractionNode", routeAfterLlm, {
+            markdownMerger: "markdownMerger",
+            markdownNormalizer: "markdownNormalizer",
+        })
+        // If PDF branch, finish merger
+        .addEdge("markdownMerger", "markdownNormalizer")
+        // Shared tail: normalize → save → chunk → embed → upsert → end
+        .addEdge("markdownNormalizer", "saveMarkdown")
+        .addEdge("saveMarkdown", "markdownChunker")
+        .addEdge("markdownChunker", "vectorEmbedderNode")
+        .addEdge("vectorEmbedderNode", "vectorUpsertNode")
+        .addEdge("vectorUpsertNode", END);
+    return graph.compile();
+}
+/**
+ * The compiled graph instance.
+ * Exported specifically for LangGraph Studio and the LangGraph CLI.
+ */
+export const graph = buildPipeline();
+//# sourceMappingURL=pipeline.js.map
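
Note: end to end, the compiled graph behaves like any LangGraph app: build it, then invoke it with the initial state keys. A usage sketch; the export path is assumed, and, per the requireInit() calls in the nodes above, the injected adapters (embedder, vectorStore) must be configured before invoking:

import { buildPipeline } from "@virstack/doc-ingest"; // assumed export

const app = buildPipeline();
const finalState = await app.invoke({
    filePath: "./docs/report.pdf", // hypothetical input
    mimeType: "application/pdf",
});
console.log(`${finalState.textChunks.length} chunks embedded and upserted`);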