@virstack/doc-ingest 1.0.0

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
Files changed (127)
  1. package/README.md +203 -0
  2. package/dist/adapters/aiAdapters.d.ts +25 -0
  3. package/dist/adapters/aiAdapters.d.ts.map +1 -0
  4. package/dist/adapters/aiAdapters.js +73 -0
  5. package/dist/adapters/aiAdapters.js.map +1 -0
  6. package/dist/adapters/vectorStore.d.ts +24 -0
  7. package/dist/adapters/vectorStore.d.ts.map +1 -0
  8. package/dist/adapters/vectorStore.js +22 -0
  9. package/dist/adapters/vectorStore.js.map +1 -0
  10. package/dist/aiAdapters.d.ts +25 -0
  11. package/dist/aiAdapters.d.ts.map +1 -0
  12. package/dist/aiAdapters.js +50 -0
  13. package/dist/aiAdapters.js.map +1 -0
  14. package/dist/assets/logo.png +0 -0
  15. package/dist/batchPipeline.d.ts +52 -0
  16. package/dist/batchPipeline.d.ts.map +1 -0
  17. package/dist/batchPipeline.js +81 -0
  18. package/dist/batchPipeline.js.map +1 -0
  19. package/dist/cli.d.ts +3 -0
  20. package/dist/cli.d.ts.map +1 -0
  21. package/dist/cli.js +217 -0
  22. package/dist/cli.js.map +1 -0
  23. package/dist/config.d.ts +26 -0
  24. package/dist/config.d.ts.map +1 -0
  25. package/dist/config.js +97 -0
  26. package/dist/config.js.map +1 -0
  27. package/dist/core/config.d.ts +26 -0
  28. package/dist/core/config.d.ts.map +1 -0
  29. package/dist/core/config.js +106 -0
  30. package/dist/core/config.js.map +1 -0
  31. package/dist/core/logger.d.ts +31 -0
  32. package/dist/core/logger.d.ts.map +1 -0
  33. package/dist/core/logger.js +42 -0
  34. package/dist/core/logger.js.map +1 -0
  35. package/dist/core/state.d.ts +52 -0
  36. package/dist/core/state.d.ts.map +1 -0
  37. package/dist/core/state.js +27 -0
  38. package/dist/core/state.js.map +1 -0
  39. package/dist/graphs/batchProcessor.d.ts +72 -0
  40. package/dist/graphs/batchProcessor.d.ts.map +1 -0
  41. package/dist/graphs/batchProcessor.js +94 -0
  42. package/dist/graphs/batchProcessor.js.map +1 -0
  43. package/dist/graphs/singleDocument.d.ts +303 -0
  44. package/dist/graphs/singleDocument.d.ts.map +1 -0
  45. package/dist/graphs/singleDocument.js +93 -0
  46. package/dist/graphs/singleDocument.js.map +1 -0
  47. package/dist/index.d.ts +8 -0
  48. package/dist/index.d.ts.map +1 -0
  49. package/dist/index.js +10 -0
  50. package/dist/index.js.map +1 -0
  51. package/dist/logger.d.ts +24 -0
  52. package/dist/logger.d.ts.map +1 -0
  53. package/dist/logger.js +36 -0
  54. package/dist/logger.js.map +1 -0
  55. package/dist/logo.d.ts +2 -0
  56. package/dist/logo.d.ts.map +1 -0
  57. package/dist/logo.js +3 -0
  58. package/dist/logo.js.map +1 -0
  59. package/dist/nodes/fileTypeRouter.d.ts +16 -0
  60. package/dist/nodes/fileTypeRouter.d.ts.map +1 -0
  61. package/dist/nodes/fileTypeRouter.js +72 -0
  62. package/dist/nodes/fileTypeRouter.js.map +1 -0
  63. package/dist/nodes/geminiExtraction.d.ts +19 -0
  64. package/dist/nodes/geminiExtraction.d.ts.map +1 -0
  65. package/dist/nodes/geminiExtraction.js +87 -0
  66. package/dist/nodes/geminiExtraction.js.map +1 -0
  67. package/dist/nodes/libreOfficeToPdf.d.ts +8 -0
  68. package/dist/nodes/libreOfficeToPdf.d.ts.map +1 -0
  69. package/dist/nodes/libreOfficeToPdf.js +61 -0
  70. package/dist/nodes/libreOfficeToPdf.js.map +1 -0
  71. package/dist/nodes/llmExtractionNode.d.ts +19 -0
  72. package/dist/nodes/llmExtractionNode.d.ts.map +1 -0
  73. package/dist/nodes/llmExtractionNode.js +68 -0
  74. package/dist/nodes/llmExtractionNode.js.map +1 -0
  75. package/dist/nodes/markdownChunker.d.ts +8 -0
  76. package/dist/nodes/markdownChunker.d.ts.map +1 -0
  77. package/dist/nodes/markdownChunker.js +24 -0
  78. package/dist/nodes/markdownChunker.js.map +1 -0
  79. package/dist/nodes/markdownMerger.d.ts +9 -0
  80. package/dist/nodes/markdownMerger.d.ts.map +1 -0
  81. package/dist/nodes/markdownMerger.js +33 -0
  82. package/dist/nodes/markdownMerger.js.map +1 -0
  83. package/dist/nodes/markdownNormalizer.d.ts +10 -0
  84. package/dist/nodes/markdownNormalizer.d.ts.map +1 -0
  85. package/dist/nodes/markdownNormalizer.js +46 -0
  86. package/dist/nodes/markdownNormalizer.js.map +1 -0
  87. package/dist/nodes/openrouterEmbedder.d.ts +7 -0
  88. package/dist/nodes/openrouterEmbedder.d.ts.map +1 -0
  89. package/dist/nodes/openrouterEmbedder.js +31 -0
  90. package/dist/nodes/openrouterEmbedder.js.map +1 -0
  91. package/dist/nodes/pdfSplitter.d.ts +7 -0
  92. package/dist/nodes/pdfSplitter.d.ts.map +1 -0
  93. package/dist/nodes/pdfSplitter.js +41 -0
  94. package/dist/nodes/pdfSplitter.js.map +1 -0
  95. package/dist/nodes/saveMarkdown.d.ts +7 -0
  96. package/dist/nodes/saveMarkdown.d.ts.map +1 -0
  97. package/dist/nodes/saveMarkdown.js +28 -0
  98. package/dist/nodes/saveMarkdown.js.map +1 -0
  99. package/dist/nodes/textExtractorNode.d.ts +7 -0
  100. package/dist/nodes/textExtractorNode.d.ts.map +1 -0
  101. package/dist/nodes/textExtractorNode.js +39 -0
  102. package/dist/nodes/textExtractorNode.js.map +1 -0
  103. package/dist/nodes/upstashUpsert.d.ts +7 -0
  104. package/dist/nodes/upstashUpsert.d.ts.map +1 -0
  105. package/dist/nodes/upstashUpsert.js +45 -0
  106. package/dist/nodes/upstashUpsert.js.map +1 -0
  107. package/dist/nodes/vectorEmbedderNode.d.ts +7 -0
  108. package/dist/nodes/vectorEmbedderNode.d.ts.map +1 -0
  109. package/dist/nodes/vectorEmbedderNode.js +23 -0
  110. package/dist/nodes/vectorEmbedderNode.js.map +1 -0
  111. package/dist/nodes/vectorUpsertNode.d.ts +7 -0
  112. package/dist/nodes/vectorUpsertNode.d.ts.map +1 -0
  113. package/dist/nodes/vectorUpsertNode.js +45 -0
  114. package/dist/nodes/vectorUpsertNode.js.map +1 -0
  115. package/dist/pipeline.d.ts +303 -0
  116. package/dist/pipeline.d.ts.map +1 -0
  117. package/dist/pipeline.js +93 -0
  118. package/dist/pipeline.js.map +1 -0
  119. package/dist/state.d.ts +52 -0
  120. package/dist/state.d.ts.map +1 -0
  121. package/dist/state.js +27 -0
  122. package/dist/state.js.map +1 -0
  123. package/dist/vectorStore.d.ts +24 -0
  124. package/dist/vectorStore.d.ts.map +1 -0
  125. package/dist/vectorStore.js +22 -0
  126. package/dist/vectorStore.js.map +1 -0
  127. package/package.json +55 -0
package/dist/nodes/textExtractorNode.js
@@ -0,0 +1,39 @@
+ import fs from "node:fs/promises";
+ import officeparser from "officeparser";
+ import { parse } from "csv-parse/sync";
+ import { logger, LogSource } from "../core/logger.js";
+ /**
+  * Extracts raw text from office documents (DOCX, PPTX, XLSX) using officeparser,
+  * CSV files using csv-parse, and TXT files via direct read.
+  */
+ export async function textExtractorNode(state) {
+     const { filePath, mimeType } = state;
+     logger.info(LogSource.TEXT_EXTRACTOR, `Parsing: ${filePath} (${mimeType})`);
+     let rawText;
+     if (mimeType === "text/plain") {
+         // Plain text — just read directly
+         rawText = filePath ? await fs.readFile(filePath, "utf-8") : state.rawText;
+     }
+     else if (mimeType === "text/csv") {
+         // CSV — parse and convert to a readable text table
+         if (!filePath)
+             throw new Error("filePath required for CSV parsing");
+         const csvBuffer = await fs.readFile(filePath, "utf-8");
+         const records = parse(csvBuffer, {
+             skip_empty_lines: true,
+         });
+         // Convert to a simple text representation
+         rawText = records
+             .map((row) => row.join(" | "))
+             .join("\n");
+     }
+     else {
+         // DOCX, PPTX, XLSX — use officeparser
+         if (!filePath)
+             throw new Error("filePath required for office document parsing");
+         rawText = await officeparser.parseOfficeAsync(filePath);
+     }
+     logger.info(LogSource.TEXT_EXTRACTOR, `Extracted ${rawText.length} chars of raw text`);
+     return { rawText };
+ }
+ //# sourceMappingURL=textExtractorNode.js.map
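
For orientation, a minimal sketch of calling this node on its own. The import path, sample file, and top-level await context are assumptions for illustration, not part of the published package:

    import { textExtractorNode } from "./dist/nodes/textExtractorNode.js";

    // A CSV file goes through the csv-parse branch and comes back as
    // one " | "-joined line per row.
    const result = await textExtractorNode({
        filePath: "./example.csv", // hypothetical input file
        mimeType: "text/csv",
    });
    console.log(result.rawText);
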
package/dist/nodes/textExtractorNode.js.map
@@ -0,0 +1 @@
+ {"version":3,"file":"textExtractorNode.js","sourceRoot":"","sources":["../../src/nodes/textExtractorNode.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,kBAAkB,CAAC;AAClC,OAAO,YAAY,MAAM,cAAc,CAAC;AACxC,OAAO,EAAE,KAAK,EAAE,MAAM,gBAAgB,CAAC;AAEvC,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAEtD;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,KAAoB;IAEpB,MAAM,EAAE,QAAQ,EAAE,QAAQ,EAAE,GAAG,KAAK,CAAC;IAErC,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,YAAY,QAAQ,KAAK,QAAQ,GAAG,CAAC,CAAC;IAE5E,IAAI,OAAe,CAAC;IAEpB,IAAI,QAAQ,KAAK,YAAY,EAAE,CAAC;QAC9B,kCAAkC;QAClC,OAAO,GAAG,QAAQ,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC;IAC5E,CAAC;SAAM,IAAI,QAAQ,KAAK,UAAU,EAAE,CAAC;QACnC,mDAAmD;QACnD,IAAI,CAAC,QAAQ;YAAE,MAAM,IAAI,KAAK,CAAC,mCAAmC,CAAC,CAAC;QACpE,MAAM,SAAS,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QACvD,MAAM,OAAO,GAAe,KAAK,CAAC,SAAS,EAAE;YAC3C,gBAAgB,EAAE,IAAI;SACvB,CAAC,CAAC;QAEH,0CAA0C;QAC1C,OAAO,GAAG,OAAO;aACd,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;aAC7B,IAAI,CAAC,IAAI,CAAC,CAAC;IAChB,CAAC;SAAM,CAAC;QACN,sCAAsC;QACtC,IAAI,CAAC,QAAQ;YAAE,MAAM,IAAI,KAAK,CAAC,+CAA+C,CAAC,CAAC;QAChF,OAAO,GAAG,MAAM,YAAY,CAAC,gBAAgB,CAAC,QAAQ,CAAW,CAAC;IACpE,CAAC;IAED,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,aAAa,OAAO,CAAC,MAAM,oBAAoB,CAAC,CAAC;IAEvF,OAAO,EAAE,OAAO,EAAE,CAAC;AACrB,CAAC"}
package/dist/nodes/upstashUpsert.d.ts
@@ -0,0 +1,7 @@
+ import type { PipelineState } from "../state.js";
+ /**
+  * Upserts text chunks + their embedding vectors into Upstash Vector.
+  * Each chunk is stored with rich metadata for RAG filtering.
+  */
+ export declare function upstashUpsert(state: PipelineState): Promise<Partial<PipelineState>>;
+ //# sourceMappingURL=upstashUpsert.d.ts.map
package/dist/nodes/upstashUpsert.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"upstashUpsert.d.ts","sourceRoot":"","sources":["../../src/nodes/upstashUpsert.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AAGjD;;;GAGG;AACH,wBAAsB,aAAa,CACjC,KAAK,EAAE,aAAa,GACnB,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CA0CjC"}
package/dist/nodes/upstashUpsert.js
@@ -0,0 +1,45 @@
+ import path from "node:path";
+ import crypto from "node:crypto";
+ import { vectorIndex, requireInit } from "../config.js";
+ import { logger, LogSource } from "../logger.js";
+ /**
+  * Upserts text chunks + their embedding vectors into Upstash Vector.
+  * Each chunk is stored with rich metadata for RAG filtering.
+  */
+ export async function upstashUpsert(state) {
+     requireInit();
+     const { filePath, mimeType, textChunks, vectors } = state;
+     // Generate a stable document ID from the file path
+     const docId = crypto
+         .createHash("sha256")
+         .update(filePath || "pasted_text")
+         .digest("hex")
+         .slice(0, 8);
+     logger.info(LogSource.UPSTASH_UPSERT, `Upserting ${textChunks.length} chunks for doc ${docId}`);
+     // Upstash Vector supports batch upserts
+     const upsertPayload = textChunks.map((chunk, i) => ({
+         id: `${docId}-chunk-${i}`,
+         vector: vectors[i],
+         data: chunk,
+         metadata: {
+             text: chunk,
+             source: state.filePath ? path.basename(state.filePath) : "pasted_text",
+             sourcePath: filePath,
+             mimeType: mimeType,
+             chunkIndex: i,
+             totalChunks: textChunks.length,
+             docId: docId,
+             ingestedAt: new Date().toISOString(),
+         },
+     }));
+     // Upsert in batches of 100 (Upstash limit)
+     const BATCH_SIZE = 100;
+     for (let i = 0; i < upsertPayload.length; i += BATCH_SIZE) {
+         const batch = upsertPayload.slice(i, i + BATCH_SIZE);
+         await vectorIndex.upsert(batch);
+         logger.info(LogSource.UPSTASH_UPSERT, `Upserted batch ${Math.floor(i / BATCH_SIZE) + 1}: ${batch.length} vectors`);
+     }
+     logger.success(LogSource.UPSTASH_UPSERT, `All ${textChunks.length} chunks upserted`);
+     return {};
+ }
+ //# sourceMappingURL=upstashUpsert.js.map
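
Since the node writes queryable metadata, the read side is worth seeing once. A sketch using the @upstash/vector client; the env var names, filter string, and placeholder query vector are assumptions, and the package wires up its own index via config.js rather than this ad-hoc one:

    import { Index } from "@upstash/vector";

    const index = new Index({
        url: process.env.UPSTASH_VECTOR_REST_URL,
        token: process.env.UPSTASH_VECTOR_REST_TOKEN,
    });

    // Placeholder embedding; in practice, embed the user's question with the
    // same model used at ingest time.
    const queryVector = new Array(1024).fill(0);

    const matches = await index.query({
        vector: queryVector,
        topK: 5,
        includeMetadata: true,
        // Filter on the metadata fields written by upstashUpsert above.
        filter: "mimeType = 'text/csv'",
    });
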
package/dist/nodes/upstashUpsert.js.map
@@ -0,0 +1 @@
+ {"version":3,"file":"upstashUpsert.js","sourceRoot":"","sources":["../../src/nodes/upstashUpsert.ts"],"names":[],"mappings":"AAAA,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,MAAM,MAAM,aAAa,CAAC;AACjC,OAAO,EAAE,WAAW,EAAE,WAAW,EAAE,MAAM,cAAc,CAAC;AAExD,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AAEjD;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,KAAoB;IAEpB,WAAW,EAAE,CAAC;IACd,MAAM,EAAE,QAAQ,EAAE,QAAQ,EAAE,UAAU,EAAE,OAAO,EAAE,GAAG,KAAK,CAAC;IAE1D,mDAAmD;IACnD,MAAM,KAAK,GAAG,MAAM;SACjB,UAAU,CAAC,QAAQ,CAAC;SACpB,MAAM,CAAC,QAAQ,IAAI,aAAa,CAAC;SACjC,MAAM,CAAC,KAAK,CAAC;SACb,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IAEf,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,aAAa,UAAU,CAAC,MAAM,mBAAmB,KAAK,EAAE,CAAC,CAAC;IAEhG,wCAAwC;IACxC,MAAM,aAAa,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;QAClD,EAAE,EAAE,GAAG,KAAK,UAAU,CAAC,EAAE;QACzB,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC;QAClB,IAAI,EAAE,KAAK;QACX,QAAQ,EAAE;YACR,IAAI,EAAE,KAAK;YACX,MAAM,EAAE,KAAK,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,aAAa;YACtE,UAAU,EAAE,QAAQ;YACpB,QAAQ,EAAE,QAAQ;YAClB,UAAU,EAAE,CAAC;YACb,WAAW,EAAE,UAAU,CAAC,MAAM;YAC9B,KAAK,EAAE,KAAK;YACZ,UAAU,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;SACrC;KACF,CAAC,CAAC,CAAC;IAEJ,2CAA2C;IAC3C,MAAM,UAAU,GAAG,GAAG,CAAC;IACvB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,aAAa,CAAC,MAAM,EAAE,CAAC,IAAI,UAAU,EAAE,CAAC;QAC1D,MAAM,KAAK,GAAG,aAAa,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,CAAC;QACrD,MAAM,WAAW,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;QAEhC,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,kBAAkB,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,UAAU,CAAC,GAAG,CAAC,KAAK,KAAK,CAAC,MAAM,UAAU,CAAC,CAAC;IACrH,CAAC;IAED,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,cAAc,EAAE,OAAO,UAAU,CAAC,MAAM,kBAAkB,CAAC,CAAC;IAErF,OAAO,EAAE,CAAC;AACZ,CAAC"}
package/dist/nodes/vectorEmbedderNode.d.ts
@@ -0,0 +1,7 @@
+ import type { PipelineState } from "../core/state.js";
+ /**
+  * Embeds all textChunks using the injected EmbeddingAdapter.
+  * Processes in batches to stay within API limits.
+  */
+ export declare function vectorEmbedderNode(state: PipelineState): Promise<Partial<PipelineState>>;
+ //# sourceMappingURL=vectorEmbedderNode.d.ts.map
package/dist/nodes/vectorEmbedderNode.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"vectorEmbedderNode.d.ts","sourceRoot":"","sources":["../../src/nodes/vectorEmbedderNode.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAGtD;;;GAGG;AACH,wBAAsB,kBAAkB,CACtC,KAAK,EAAE,aAAa,GACnB,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CAsBjC"}
package/dist/nodes/vectorEmbedderNode.js
@@ -0,0 +1,23 @@
+ import { pipelineConfig, requireInit } from "../core/config.js";
+ import { logger, LogSource } from "../core/logger.js";
+ /**
+  * Embeds all textChunks using the injected EmbeddingAdapter.
+  * Processes in batches to stay within API limits.
+  */
+ export async function vectorEmbedderNode(state) {
+     requireInit();
+     const { textChunks } = state;
+     logger.info(LogSource.VECTOR_EMBEDDER, `Embedding ${textChunks.length} chunks via injected Embedder Node`);
+     const allVectors = [];
+     const BATCH_SIZE = 50; // Common safe default, though adapters might handle their own batching internally
+     for (let i = 0; i < textChunks.length; i += BATCH_SIZE) {
+         const batch = textChunks.slice(i, i + BATCH_SIZE);
+         logger.info(LogSource.VECTOR_EMBEDDER, `Batch ${Math.floor(i / BATCH_SIZE) + 1}: ${batch.length} chunk(s)`);
+         // Call the injected Embedding adapter!
+         const vectors = await pipelineConfig.embedder.embed(batch);
+         allVectors.push(...vectors);
+     }
+     logger.info(LogSource.VECTOR_EMBEDDER, `Generated ${allVectors.length} vectors (${allVectors[0]?.length ?? 0}d)`);
+     return { vectors: allVectors };
+ }
+ //# sourceMappingURL=vectorEmbedderNode.js.map
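
The embedder here is injected rather than hard-coded. Inferred from the call site above (pipelineConfig.embedder.embed(batch)), the contract reduces to a single method; the interface name and the stub adapter below are illustrative, not the package's published typings:

    interface EmbeddingAdapter {
        embed(texts: string[]): Promise<number[][]>;
    }

    // Trivial stand-in: returns one zero vector per chunk. Any real adapter
    // (OpenAI, OpenRouter, a local model) just has to match this shape.
    const stubEmbedder: EmbeddingAdapter = {
        async embed(texts) {
            return texts.map(() => new Array(1024).fill(0));
        },
    };
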
package/dist/nodes/vectorEmbedderNode.js.map
@@ -0,0 +1 @@
+ {"version":3,"file":"vectorEmbedderNode.js","sourceRoot":"","sources":["../../src/nodes/vectorEmbedderNode.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,cAAc,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAEhE,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAEtD;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,kBAAkB,CACtC,KAAoB;IAEpB,WAAW,EAAE,CAAC;IACd,MAAM,EAAE,UAAU,EAAE,GAAG,KAAK,CAAC;IAE7B,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,eAAe,EAAE,aAAa,UAAU,CAAC,MAAM,oCAAoC,CAAC,CAAC;IAE3G,MAAM,UAAU,GAAe,EAAE,CAAC;IAClC,MAAM,UAAU,GAAG,EAAE,CAAC,CAAC,kFAAkF;IAEzG,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,IAAI,UAAU,EAAE,CAAC;QACvD,MAAM,KAAK,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,CAAC;QAElD,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,eAAe,EAAE,SAAS,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,UAAU,CAAC,GAAG,CAAC,KAAK,KAAK,CAAC,MAAM,WAAW,CAAC,CAAC;QAE5G,uCAAuC;QACvC,MAAM,OAAO,GAAG,MAAM,cAAc,CAAC,QAAQ,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;QAC3D,UAAU,CAAC,IAAI,CAAC,GAAG,OAAO,CAAC,CAAC;IAC9B,CAAC;IAED,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,eAAe,EAAE,aAAa,UAAU,CAAC,MAAM,aAAa,UAAU,CAAC,CAAC,CAAC,EAAE,MAAM,IAAI,CAAC,IAAI,CAAC,CAAC;IAElH,OAAO,EAAE,OAAO,EAAE,UAAU,EAAE,CAAC;AACjC,CAAC"}
package/dist/nodes/vectorUpsertNode.d.ts
@@ -0,0 +1,7 @@
+ import type { PipelineState } from "../core/state.js";
+ /**
+  * Upserts text chunks + their embedding vectors into a generic Vector Store Adapter.
+  * Each chunk is stored with rich metadata for vector filtering.
+  */
+ export declare function vectorUpsertNode(state: PipelineState): Promise<Partial<PipelineState>>;
+ //# sourceMappingURL=vectorUpsertNode.d.ts.map
package/dist/nodes/vectorUpsertNode.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"vectorUpsertNode.d.ts","sourceRoot":"","sources":["../../src/nodes/vectorUpsertNode.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAItD;;;GAGG;AACH,wBAAsB,gBAAgB,CACpC,KAAK,EAAE,aAAa,GACnB,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CA2CjC"}
package/dist/nodes/vectorUpsertNode.js
@@ -0,0 +1,45 @@
+ import path from "node:path";
+ import crypto from "node:crypto";
+ import { pipelineConfig, requireInit } from "../core/config.js";
+ import { logger, LogSource } from "../core/logger.js";
+ /**
+  * Upserts text chunks + their embedding vectors into a generic Vector Store Adapter.
+  * Each chunk is stored with rich metadata for vector filtering.
+  */
+ export async function vectorUpsertNode(state) {
+     requireInit();
+     const { filePath, mimeType, textChunks, vectors } = state;
+     // Generate a stable document ID from the file path
+     const docId = crypto
+         .createHash("sha256")
+         .update(filePath || "pasted_text")
+         .digest("hex")
+         .slice(0, 8);
+     logger.info(LogSource.VECTOR_UPSERT, `Upserting ${textChunks.length} chunks via Vector Store Adapter for doc ${docId}`);
+     // Format the data into our standard contract
+     const records = textChunks.map((chunk, i) => ({
+         id: `${docId}-chunk-${i}`,
+         vector: vectors[i],
+         metadata: {
+             text: chunk,
+             source: state.filePath ? path.basename(state.filePath) : "pasted_text",
+             sourcePath: filePath,
+             mimeType: mimeType,
+             chunkIndex: i,
+             totalChunks: textChunks.length,
+             docId: docId,
+             ingestedAt: new Date().toISOString(),
+         },
+     }));
+     // Upsert in batches of 100 (Common limit for many vector DBs)
+     const BATCH_SIZE = 100;
+     for (let i = 0; i < records.length; i += BATCH_SIZE) {
+         const batch = records.slice(i, i + BATCH_SIZE);
+         // Call the user's database adapter instead of Upstash directly!
+         await pipelineConfig.vectorStore.upsert(batch);
+         logger.info(LogSource.VECTOR_UPSERT, `Upserted batch ${Math.floor(i / BATCH_SIZE) + 1} (${batch.length} vectors)`);
+     }
+     logger.success(LogSource.VECTOR_UPSERT, `All ${textChunks.length} chunks upserted`);
+     return {};
+ }
+ //# sourceMappingURL=vectorUpsertNode.js.map
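
The same dependency-injection pattern applies to storage: the node only assumes an object with an upsert method taking the record shape built above. A sketch with an in-memory store; the type names are assumptions inferred from this file, not the package's exported types:

    interface VectorRecord {
        id: string;
        vector: number[];
        metadata: Record<string, unknown>;
    }

    interface VectorStoreAdapter {
        upsert(records: VectorRecord[]): Promise<void>;
    }

    const memory = new Map<string, VectorRecord>();
    const memoryStore: VectorStoreAdapter = {
        async upsert(records) {
            // Keyed by id, so re-ingesting a document overwrites its chunks.
            for (const record of records) memory.set(record.id, record);
        },
    };
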
package/dist/nodes/vectorUpsertNode.js.map
@@ -0,0 +1 @@
+ {"version":3,"file":"vectorUpsertNode.js","sourceRoot":"","sources":["../../src/nodes/vectorUpsertNode.ts"],"names":[],"mappings":"AAAA,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,MAAM,MAAM,aAAa,CAAC;AACjC,OAAO,EAAE,cAAc,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAEhE,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAGtD;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,KAAoB;IAEpB,WAAW,EAAE,CAAC;IACd,MAAM,EAAE,QAAQ,EAAE,QAAQ,EAAE,UAAU,EAAE,OAAO,EAAE,GAAG,KAAK,CAAC;IAE1D,mDAAmD;IACnD,MAAM,KAAK,GAAG,MAAM;SACjB,UAAU,CAAC,QAAQ,CAAC;SACpB,MAAM,CAAC,QAAQ,IAAI,aAAa,CAAC;SACjC,MAAM,CAAC,KAAK,CAAC;SACb,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IAEf,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,aAAa,EAAE,aAAa,UAAU,CAAC,MAAM,4CAA4C,KAAK,EAAE,CAAC,CAAC;IAExH,6CAA6C;IAC7C,MAAM,OAAO,GAAmB,UAAU,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;QAC5D,EAAE,EAAE,GAAG,KAAK,UAAU,CAAC,EAAE;QACzB,MAAM,EAAE,OAAO,CAAC,CAAC,CAAa;QAC9B,QAAQ,EAAE;YACR,IAAI,EAAE,KAAK;YACX,MAAM,EAAE,KAAK,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,aAAa;YACtE,UAAU,EAAE,QAAQ;YACpB,QAAQ,EAAE,QAAQ;YAClB,UAAU,EAAE,CAAC;YACb,WAAW,EAAE,UAAU,CAAC,MAAM;YAC9B,KAAK,EAAE,KAAK;YACZ,UAAU,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;SACrC;KACF,CAAC,CAAC,CAAC;IAEJ,8DAA8D;IAC9D,MAAM,UAAU,GAAG,GAAG,CAAC;IACvB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,IAAI,UAAU,EAAE,CAAC;QACpD,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,CAAC;QAE/C,gEAAgE;QAChE,MAAM,cAAc,CAAC,WAAW,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;QAE/C,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,aAAa,EAAE,kBAAkB,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,UAAU,CAAC,GAAG,CAAC,KAAK,KAAK,CAAC,MAAM,WAAW,CAAC,CAAC;IACrH,CAAC;IAED,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,aAAa,EAAE,OAAO,UAAU,CAAC,MAAM,kBAAkB,CAAC,CAAC;IAEpF,OAAO,EAAE,CAAC;AACZ,CAAC"}
package/dist/pipeline.d.ts
@@ -0,0 +1,303 @@
+ export declare function buildPipeline(): import("@langchain/langgraph").CompiledStateGraph<import("@langchain/langgraph").StateType<{
+     filePath: {
+         (): import("@langchain/langgraph").LastValue<string | undefined>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     mimeType: {
+         (): import("@langchain/langgraph").LastValue<string | undefined>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     rawText: {
+         (): import("@langchain/langgraph").LastValue<string>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     pdfChunks: {
+         (): import("@langchain/langgraph").LastValue<string[]>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     markdownParts: import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+     markdown: {
+         (): import("@langchain/langgraph").LastValue<string>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     textChunks: {
+         (): import("@langchain/langgraph").LastValue<string[]>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     vectors: {
+         (): import("@langchain/langgraph").LastValue<number[][]>;
+         (annotation: import("@langchain/langgraph").SingleReducer<number[][], number[][]>): import("@langchain/langgraph").BinaryOperatorAggregate<number[][], number[][]>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+ }>, import("@langchain/langgraph").UpdateType<{
+     filePath: {
+         (): import("@langchain/langgraph").LastValue<string | undefined>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     mimeType: {
+         (): import("@langchain/langgraph").LastValue<string | undefined>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     rawText: {
+         (): import("@langchain/langgraph").LastValue<string>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     pdfChunks: {
+         (): import("@langchain/langgraph").LastValue<string[]>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     markdownParts: import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+     markdown: {
+         (): import("@langchain/langgraph").LastValue<string>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     textChunks: {
+         (): import("@langchain/langgraph").LastValue<string[]>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     vectors: {
+         (): import("@langchain/langgraph").LastValue<number[][]>;
+         (annotation: import("@langchain/langgraph").SingleReducer<number[][], number[][]>): import("@langchain/langgraph").BinaryOperatorAggregate<number[][], number[][]>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+ }>, "markdownMerger" | "markdownNormalizer" | "llmExtractionNode" | "__start__" | "fileTypeRouter" | "libreOfficeToPdf" | "pdfSplitter" | "textExtractorNode" | "saveMarkdown" | "markdownChunker" | "vectorEmbedderNode" | "vectorUpsertNode", {
+     filePath: {
+         (): import("@langchain/langgraph").LastValue<string | undefined>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     mimeType: {
+         (): import("@langchain/langgraph").LastValue<string | undefined>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     rawText: {
+         (): import("@langchain/langgraph").LastValue<string>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     pdfChunks: {
+         (): import("@langchain/langgraph").LastValue<string[]>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     markdownParts: import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+     markdown: {
+         (): import("@langchain/langgraph").LastValue<string>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     textChunks: {
+         (): import("@langchain/langgraph").LastValue<string[]>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     vectors: {
+         (): import("@langchain/langgraph").LastValue<number[][]>;
+         (annotation: import("@langchain/langgraph").SingleReducer<number[][], number[][]>): import("@langchain/langgraph").BinaryOperatorAggregate<number[][], number[][]>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+ }, {
+     filePath: {
+         (): import("@langchain/langgraph").LastValue<string | undefined>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     mimeType: {
+         (): import("@langchain/langgraph").LastValue<string | undefined>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     rawText: {
+         (): import("@langchain/langgraph").LastValue<string>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     pdfChunks: {
+         (): import("@langchain/langgraph").LastValue<string[]>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     markdownParts: import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+     markdown: {
+         (): import("@langchain/langgraph").LastValue<string>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     textChunks: {
+         (): import("@langchain/langgraph").LastValue<string[]>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     vectors: {
+         (): import("@langchain/langgraph").LastValue<number[][]>;
+         (annotation: import("@langchain/langgraph").SingleReducer<number[][], number[][]>): import("@langchain/langgraph").BinaryOperatorAggregate<number[][], number[][]>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+ }, import("@langchain/langgraph").StateDefinition>;
+ /**
+  * The compiled graph instance.
+  * Exported specifically for LangGraph Studio and the LangGraph CLI.
+  */
+ export declare const graph: import("@langchain/langgraph").CompiledStateGraph<import("@langchain/langgraph").StateType<{
+     filePath: {
+         (): import("@langchain/langgraph").LastValue<string | undefined>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     mimeType: {
+         (): import("@langchain/langgraph").LastValue<string | undefined>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     rawText: {
+         (): import("@langchain/langgraph").LastValue<string>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     pdfChunks: {
+         (): import("@langchain/langgraph").LastValue<string[]>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     markdownParts: import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+     markdown: {
+         (): import("@langchain/langgraph").LastValue<string>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     textChunks: {
+         (): import("@langchain/langgraph").LastValue<string[]>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     vectors: {
+         (): import("@langchain/langgraph").LastValue<number[][]>;
+         (annotation: import("@langchain/langgraph").SingleReducer<number[][], number[][]>): import("@langchain/langgraph").BinaryOperatorAggregate<number[][], number[][]>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+ }>, import("@langchain/langgraph").UpdateType<{
+     filePath: {
+         (): import("@langchain/langgraph").LastValue<string | undefined>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     mimeType: {
+         (): import("@langchain/langgraph").LastValue<string | undefined>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     rawText: {
+         (): import("@langchain/langgraph").LastValue<string>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     pdfChunks: {
+         (): import("@langchain/langgraph").LastValue<string[]>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     markdownParts: import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+     markdown: {
+         (): import("@langchain/langgraph").LastValue<string>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     textChunks: {
+         (): import("@langchain/langgraph").LastValue<string[]>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     vectors: {
+         (): import("@langchain/langgraph").LastValue<number[][]>;
+         (annotation: import("@langchain/langgraph").SingleReducer<number[][], number[][]>): import("@langchain/langgraph").BinaryOperatorAggregate<number[][], number[][]>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+ }>, "markdownMerger" | "markdownNormalizer" | "llmExtractionNode" | "__start__" | "fileTypeRouter" | "libreOfficeToPdf" | "pdfSplitter" | "textExtractorNode" | "saveMarkdown" | "markdownChunker" | "vectorEmbedderNode" | "vectorUpsertNode", {
+     filePath: {
+         (): import("@langchain/langgraph").LastValue<string | undefined>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     mimeType: {
+         (): import("@langchain/langgraph").LastValue<string | undefined>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     rawText: {
+         (): import("@langchain/langgraph").LastValue<string>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     pdfChunks: {
+         (): import("@langchain/langgraph").LastValue<string[]>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     markdownParts: import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+     markdown: {
+         (): import("@langchain/langgraph").LastValue<string>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     textChunks: {
+         (): import("@langchain/langgraph").LastValue<string[]>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     vectors: {
+         (): import("@langchain/langgraph").LastValue<number[][]>;
+         (annotation: import("@langchain/langgraph").SingleReducer<number[][], number[][]>): import("@langchain/langgraph").BinaryOperatorAggregate<number[][], number[][]>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+ }, {
+     filePath: {
+         (): import("@langchain/langgraph").LastValue<string | undefined>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     mimeType: {
+         (): import("@langchain/langgraph").LastValue<string | undefined>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     rawText: {
+         (): import("@langchain/langgraph").LastValue<string>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     pdfChunks: {
+         (): import("@langchain/langgraph").LastValue<string[]>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     markdownParts: import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+     markdown: {
+         (): import("@langchain/langgraph").LastValue<string>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string, string>): import("@langchain/langgraph").BinaryOperatorAggregate<string, string>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     textChunks: {
+         (): import("@langchain/langgraph").LastValue<string[]>;
+         (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+     vectors: {
+         (): import("@langchain/langgraph").LastValue<number[][]>;
+         (annotation: import("@langchain/langgraph").SingleReducer<number[][], number[][]>): import("@langchain/langgraph").BinaryOperatorAggregate<number[][], number[][]>;
+         Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
+     };
+ }, import("@langchain/langgraph").StateDefinition>;
+ //# sourceMappingURL=pipeline.d.ts.map
package/dist/pipeline.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../src/pipeline.ts"],"names":[],"mappings":"AA0CA,wBAAgB,aAAa;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;mDA4D5B;AAED;;;GAGG;AACH,eAAO,MAAM,KAAK;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;kDAAkB,CAAC"}
package/dist/pipeline.js
@@ -0,0 +1,93 @@
+ import { StateGraph, END, Send } from "@langchain/langgraph";
+ import { PipelineStateAnnotation } from "./state.js";
+ import { fileTypeRouter, routeByMimeType } from "./nodes/fileTypeRouter.js";
+ import { pdfSplitter } from "./nodes/pdfSplitter.js";
+ import { llmExtractionNode, routeAfterLlm } from "./nodes/llmExtractionNode.js";
+ import { markdownMerger } from "./nodes/markdownMerger.js";
+ import { textExtractorNode } from "./nodes/textExtractorNode.js";
+ import { markdownNormalizer } from "./nodes/markdownNormalizer.js";
+ import { markdownChunker } from "./nodes/markdownChunker.js";
+ import { vectorEmbedderNode } from "./nodes/vectorEmbedderNode.js";
+ import { vectorUpsertNode } from "./nodes/vectorUpsertNode.js";
+ import { saveMarkdown } from "./nodes/saveMarkdown.js";
+ import { libreOfficeToPdf } from "./nodes/libreOfficeToPdf.js";
+ /**
+  * Builds and compiles the RAG ingestion pipeline as a LangGraph StateGraph.
+  *
+  * Flow:
+  * START → fileTypeRouter
+  *   ├─ "pdf" → pdfSplitter → [llmExtractionNode (Parallel)] → markdownMerger → markdownNormalizer
+  *   ├─ "convert" → libreOfficeToPdf → pdfSplitter → (same as pdf branch)
+  *   └─ "extract" → textExtractorNode → llmExtractionNode → markdownNormalizer
+  * markdownNormalizer → saveMarkdown → markdownChunker → vectorEmbedderNode → vectorUpsertNode → END
+  */
+ /**
+  * Returns an array of 'Send' objects to process each PDF chunk in parallel.
+  */
+ function dispatchPdfChunks(state) {
+     if (!state.pdfChunks || state.pdfChunks.length === 0) {
+         console.warn("[dispatchPdfChunks] No PDF chunks found to process.");
+         return [];
+     }
+     return state.pdfChunks.map((chunk, index) => {
+         return new Send("llmExtractionNode", {
+             chunk,
+             index,
+             totalChunks: state.pdfChunks.length,
+         });
+     });
+ }
+ export function buildPipeline() {
+     const graph = new StateGraph(PipelineStateAnnotation)
+         // ── Phase 1: Routing ──
+         .addNode("fileTypeRouter", fileTypeRouter)
+         // ── Phase 2a: PDF Branch ──
+         .addNode("libreOfficeToPdf", libreOfficeToPdf)
+         .addNode("pdfSplitter", pdfSplitter)
+         .addNode("markdownMerger", markdownMerger)
+         // ── Phase 2b: Text / Data Extraction Branch ──
+         .addNode("textExtractorNode", textExtractorNode)
+         .addNode("llmExtractionNode", llmExtractionNode)
+         // ── Phase 3: Normalization & Chunking ──
+         .addNode("markdownNormalizer", markdownNormalizer)
+         .addNode("saveMarkdown", saveMarkdown)
+         .addNode("markdownChunker", markdownChunker)
+         // ── Phase 4: Embedding & Indexing ──
+         .addNode("vectorEmbedderNode", vectorEmbedderNode)
+         .addNode("vectorUpsertNode", vectorUpsertNode)
+         // ── Edges ──
+         // Start → Router
+         .addEdge("__start__", "fileTypeRouter")
+         // Router → conditional branch
+         .addConditionalEdges("fileTypeRouter", routeByMimeType, {
+             pdf: "pdfSplitter",
+             convert: "libreOfficeToPdf",
+             extract: "textExtractorNode",
+         })
+         // Convert branch: LibreOffice → pdfSplitter → (joins PDF branch)
+         .addEdge("libreOfficeToPdf", "pdfSplitter")
+         // PDF branch dispatcher
+         .addConditionalEdges("pdfSplitter", dispatchPdfChunks, ["llmExtractionNode"])
+         // Unified Document/Text branch flow
+         .addEdge("textExtractorNode", "llmExtractionNode")
+         // After llmExtractionNode, conditionally merge PDF chunks or normalize Text
+         .addConditionalEdges("llmExtractionNode", routeAfterLlm, {
+             markdownMerger: "markdownMerger",
+             markdownNormalizer: "markdownNormalizer",
+         })
+         // If PDF branch, finish merger
+         .addEdge("markdownMerger", "markdownNormalizer")
+         // Shared tail: normalize → save → chunk → embed → upsert → end
+         .addEdge("markdownNormalizer", "saveMarkdown")
+         .addEdge("saveMarkdown", "markdownChunker")
+         .addEdge("markdownChunker", "vectorEmbedderNode")
+         .addEdge("vectorEmbedderNode", "vectorUpsertNode")
+         .addEdge("vectorUpsertNode", END);
+     return graph.compile();
+ }
+ /**
+  * The compiled graph instance.
+  * Exported specifically for LangGraph Studio and the LangGraph CLI.
+  */
+ export const graph = buildPipeline();
+ //# sourceMappingURL=pipeline.js.map
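
To close the loop, a sketch of driving the compiled graph directly. The document path is hypothetical, and any adapter/config initialization the package requires (see core/config.js in this diff) is omitted here:

    import { graph } from "./dist/pipeline.js";

    const finalState = await graph.invoke({
        filePath: "./docs/report.docx", // hypothetical input
        mimeType: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    });
    // The state fields correspond to the annotation defined in state.js.
    console.log(`${finalState.textChunks.length} chunks embedded and upserted`);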