@virstack/doc-ingest 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. package/README.md +203 -0
  2. package/dist/adapters/aiAdapters.d.ts +25 -0
  3. package/dist/adapters/aiAdapters.d.ts.map +1 -0
  4. package/dist/adapters/aiAdapters.js +73 -0
  5. package/dist/adapters/aiAdapters.js.map +1 -0
  6. package/dist/adapters/vectorStore.d.ts +24 -0
  7. package/dist/adapters/vectorStore.d.ts.map +1 -0
  8. package/dist/adapters/vectorStore.js +22 -0
  9. package/dist/adapters/vectorStore.js.map +1 -0
  10. package/dist/aiAdapters.d.ts +25 -0
  11. package/dist/aiAdapters.d.ts.map +1 -0
  12. package/dist/aiAdapters.js +50 -0
  13. package/dist/aiAdapters.js.map +1 -0
  14. package/dist/assets/logo.png +0 -0
  15. package/dist/batchPipeline.d.ts +52 -0
  16. package/dist/batchPipeline.d.ts.map +1 -0
  17. package/dist/batchPipeline.js +81 -0
  18. package/dist/batchPipeline.js.map +1 -0
  19. package/dist/cli.d.ts +3 -0
  20. package/dist/cli.d.ts.map +1 -0
  21. package/dist/cli.js +217 -0
  22. package/dist/cli.js.map +1 -0
  23. package/dist/config.d.ts +26 -0
  24. package/dist/config.d.ts.map +1 -0
  25. package/dist/config.js +97 -0
  26. package/dist/config.js.map +1 -0
  27. package/dist/core/config.d.ts +26 -0
  28. package/dist/core/config.d.ts.map +1 -0
  29. package/dist/core/config.js +106 -0
  30. package/dist/core/config.js.map +1 -0
  31. package/dist/core/logger.d.ts +31 -0
  32. package/dist/core/logger.d.ts.map +1 -0
  33. package/dist/core/logger.js +42 -0
  34. package/dist/core/logger.js.map +1 -0
  35. package/dist/core/state.d.ts +52 -0
  36. package/dist/core/state.d.ts.map +1 -0
  37. package/dist/core/state.js +27 -0
  38. package/dist/core/state.js.map +1 -0
  39. package/dist/graphs/batchProcessor.d.ts +72 -0
  40. package/dist/graphs/batchProcessor.d.ts.map +1 -0
  41. package/dist/graphs/batchProcessor.js +94 -0
  42. package/dist/graphs/batchProcessor.js.map +1 -0
  43. package/dist/graphs/singleDocument.d.ts +303 -0
  44. package/dist/graphs/singleDocument.d.ts.map +1 -0
  45. package/dist/graphs/singleDocument.js +93 -0
  46. package/dist/graphs/singleDocument.js.map +1 -0
  47. package/dist/index.d.ts +8 -0
  48. package/dist/index.d.ts.map +1 -0
  49. package/dist/index.js +10 -0
  50. package/dist/index.js.map +1 -0
  51. package/dist/logger.d.ts +24 -0
  52. package/dist/logger.d.ts.map +1 -0
  53. package/dist/logger.js +36 -0
  54. package/dist/logger.js.map +1 -0
  55. package/dist/logo.d.ts +2 -0
  56. package/dist/logo.d.ts.map +1 -0
  57. package/dist/logo.js +3 -0
  58. package/dist/logo.js.map +1 -0
  59. package/dist/nodes/fileTypeRouter.d.ts +16 -0
  60. package/dist/nodes/fileTypeRouter.d.ts.map +1 -0
  61. package/dist/nodes/fileTypeRouter.js +72 -0
  62. package/dist/nodes/fileTypeRouter.js.map +1 -0
  63. package/dist/nodes/geminiExtraction.d.ts +19 -0
  64. package/dist/nodes/geminiExtraction.d.ts.map +1 -0
  65. package/dist/nodes/geminiExtraction.js +87 -0
  66. package/dist/nodes/geminiExtraction.js.map +1 -0
  67. package/dist/nodes/libreOfficeToPdf.d.ts +8 -0
  68. package/dist/nodes/libreOfficeToPdf.d.ts.map +1 -0
  69. package/dist/nodes/libreOfficeToPdf.js +61 -0
  70. package/dist/nodes/libreOfficeToPdf.js.map +1 -0
  71. package/dist/nodes/llmExtractionNode.d.ts +19 -0
  72. package/dist/nodes/llmExtractionNode.d.ts.map +1 -0
  73. package/dist/nodes/llmExtractionNode.js +68 -0
  74. package/dist/nodes/llmExtractionNode.js.map +1 -0
  75. package/dist/nodes/markdownChunker.d.ts +8 -0
  76. package/dist/nodes/markdownChunker.d.ts.map +1 -0
  77. package/dist/nodes/markdownChunker.js +24 -0
  78. package/dist/nodes/markdownChunker.js.map +1 -0
  79. package/dist/nodes/markdownMerger.d.ts +9 -0
  80. package/dist/nodes/markdownMerger.d.ts.map +1 -0
  81. package/dist/nodes/markdownMerger.js +33 -0
  82. package/dist/nodes/markdownMerger.js.map +1 -0
  83. package/dist/nodes/markdownNormalizer.d.ts +10 -0
  84. package/dist/nodes/markdownNormalizer.d.ts.map +1 -0
  85. package/dist/nodes/markdownNormalizer.js +46 -0
  86. package/dist/nodes/markdownNormalizer.js.map +1 -0
  87. package/dist/nodes/openrouterEmbedder.d.ts +7 -0
  88. package/dist/nodes/openrouterEmbedder.d.ts.map +1 -0
  89. package/dist/nodes/openrouterEmbedder.js +31 -0
  90. package/dist/nodes/openrouterEmbedder.js.map +1 -0
  91. package/dist/nodes/pdfSplitter.d.ts +7 -0
  92. package/dist/nodes/pdfSplitter.d.ts.map +1 -0
  93. package/dist/nodes/pdfSplitter.js +41 -0
  94. package/dist/nodes/pdfSplitter.js.map +1 -0
  95. package/dist/nodes/saveMarkdown.d.ts +7 -0
  96. package/dist/nodes/saveMarkdown.d.ts.map +1 -0
  97. package/dist/nodes/saveMarkdown.js +28 -0
  98. package/dist/nodes/saveMarkdown.js.map +1 -0
  99. package/dist/nodes/textExtractorNode.d.ts +7 -0
  100. package/dist/nodes/textExtractorNode.d.ts.map +1 -0
  101. package/dist/nodes/textExtractorNode.js +39 -0
  102. package/dist/nodes/textExtractorNode.js.map +1 -0
  103. package/dist/nodes/upstashUpsert.d.ts +7 -0
  104. package/dist/nodes/upstashUpsert.d.ts.map +1 -0
  105. package/dist/nodes/upstashUpsert.js +45 -0
  106. package/dist/nodes/upstashUpsert.js.map +1 -0
  107. package/dist/nodes/vectorEmbedderNode.d.ts +7 -0
  108. package/dist/nodes/vectorEmbedderNode.d.ts.map +1 -0
  109. package/dist/nodes/vectorEmbedderNode.js +23 -0
  110. package/dist/nodes/vectorEmbedderNode.js.map +1 -0
  111. package/dist/nodes/vectorUpsertNode.d.ts +7 -0
  112. package/dist/nodes/vectorUpsertNode.d.ts.map +1 -0
  113. package/dist/nodes/vectorUpsertNode.js +45 -0
  114. package/dist/nodes/vectorUpsertNode.js.map +1 -0
  115. package/dist/pipeline.d.ts +303 -0
  116. package/dist/pipeline.d.ts.map +1 -0
  117. package/dist/pipeline.js +93 -0
  118. package/dist/pipeline.js.map +1 -0
  119. package/dist/state.d.ts +52 -0
  120. package/dist/state.d.ts.map +1 -0
  121. package/dist/state.js +27 -0
  122. package/dist/state.js.map +1 -0
  123. package/dist/vectorStore.d.ts +24 -0
  124. package/dist/vectorStore.d.ts.map +1 -0
  125. package/dist/vectorStore.js +22 -0
  126. package/dist/vectorStore.js.map +1 -0
  127. package/package.json +55 -0
package/README.md ADDED
@@ -0,0 +1,203 @@
1
+ # 🚀 Virstack Doc Ingest
2
+
3
+ **Virstack Doc Ingest** is a high-performance, parallelized document ingestion and vectorization pipeline designed for scalable Retrieval-Augmented Generation (RAG) applications.
4
+
5
+ Powered by **LangGraph** for resilient orchestration, **OpenRouter / Gemini** for advanced vision/text extraction, and natively supporting **Upstash Vector** (with easily injectable custom adapters), this library acts as a universal bridge between your raw documents and your AI applications.
6
+
7
+ ---
8
+
9
+ ## ✨ Key Features
10
+
11
+ - **Universal Multi-Format Support:** Natively processes PDF, DOCX, XLSX, PPTX, CSV, TXT, HTML, and EPUB files.
12
+ - **Dual-Tier Parallelism:** Concurrently processes multiple files while simultaneously splitting and routing large PDFs into parallel Vision-API execution nodes.
13
+ - **Smart Type Routing:** Automatically identifies MIME types and dynamically routes files to the most suitable, parser-specific extraction graph.
14
+ - **Provider Agnostic Architecture:** Built entirely on Dependency Injection. Easily swap out LLMs, Embeddings, and Vector Databases (Pinecone, Qdrant, etc.) to fit your specific stack.
15
+ - **Gorgeous TUI (Text User Interface):** Features a beautiful, interactive command-line interface with guided menus and live, non-tearing spinners.
16
+
17
+ ---
18
+
19
+ ## 🛠️ System Prerequisites
20
+
21
+ **IMPORTANT:** For parsing complex Office documents (e.g., `.docx`, `.pptx`, `.xlsx`, `.epub`), the pipeline relies on **LibreOffice** for high-fidelity conversion.
22
+
23
+ If you are only parsing PDFs, TXT, or CSV files, LibreOffice is **not** required.
24
+
25
+ ### Installing LibreOffice:
26
+
27
+ - **macOS:** `brew install --cask libreoffice`
28
+ - **Ubuntu/Debian:** `sudo apt-get install libreoffice`
29
+ - **Windows:** Download the installer from [libreoffice.org](https://www.libreoffice.org/)
30
+
31
+ ---
32
+
33
+ ## 📦 Installation
34
+
35
+ You can install Virstack Doc Ingest globally to use as a standalone CLI tool, or locally to utilize its powerful API in your custom Node.js applications.
36
+
37
+ ### Global Install (CLI Usage)
38
+
39
+ ```bash
40
+ npm install -g virstack-doc-ingest
41
+ ```
42
+
43
+ ### Local Install (Library Usage)
44
+
45
+ ```bash
46
+ npm install virstack-doc-ingest
47
+ ```
48
+
49
+ ---
50
+
51
+ ## 💻 Usage Mode 1: Interactive CLI
52
+
53
+ The CLI offers a completely interactive, wizard-based experience.
54
+
55
+ ### 1. Environment Configuration
56
+
57
+ Create a `.env` file in the directory where you plan to run the command:
58
+
59
+ ```env
60
+ OPENROUTER_API_KEY=sk-or-v1-...
61
+ UPSTASH_VECTOR_URL=https://...
62
+ UPSTASH_VECTOR_TOKEN=...
63
+ LLM_MODEL=google/gemini-2.0-flash-001
64
+ EMBEDDING_MODEL=text-embedding-3-large
65
+ MAX_CONCURRENT_FILES=3
66
+ MAX_CONCURRENT_API_CALLS=15
67
+ ```
68
+
69
+ ### 2. Running the Tool
70
+
71
+ To launch the interactive wizard (which allows you to select files, folders, or paste raw text):
72
+
73
+ ```bash
74
+ virstack-doc-ingest
75
+ ```
76
+
77
+ To bypass the wizard and directly ingest a specific file or directory:
78
+
79
+ ```bash
80
+ # Process a single contract
81
+ virstack-doc-ingest ./documents/contract.pdf
82
+
83
+ # Process all documents in a directory
84
+ virstack-doc-ingest ./company-knowledge-base/
85
+
86
+ # Run with verbose, node-level diagnostics
87
+ virstack-doc-ingest ./documents/ --verbose
88
+ ```
89
+
90
+ ### Example Output
91
+
92
+ ```text
93
+ __ __ _ _ _ ____ ___ _
94
+ \ \ / / (_) _ __ ___ | |_ __ _ ___ | | __ | _ \ ___ ___ |_ _| _ __ __ _ ___ ___ | |_
95
+ \ \ / / | | | '__| / __| | __| / _` | / __| | |/ / | | | | / _ \ / __| | | | '_ \ / _` | / _ \ / __| | __|
96
+ \ V / | | | | \__ \ | |_ | (_| | | (__ | < | |_| | | (_) | | (__ | | | | | | | (_| | | __/ \__ \ | |_
97
+ \_/ |_| |_| |___/ \__| \__,_| \___| |_|\_\ |____/ \___/ \___| |___| |_| |_| \__, | \___| |___/ \__|
98
+ |___/
99
+ ┌ Welcome to Virstack Doc Ingest
100
+
101
+ ◇ What file or directory would you like to process?
102
+ │ ./docs
103
+
104
+ ◇ Found 2 file(s). Ready to process?
105
+ │ Yes, start ingestion
106
+
107
+ ◇ ✔ Processing complete in 41.8s!
108
+
109
+ ◇ Final Results: 2 succeeded, 0 failed
110
+
111
+ │ ✔ PRES1 CIS 6006-Updated Assessment.p │ 28 chunks │ 28 vectors │ 41.7s
112
+
113
+ │ ✔ VAI-020-021-Webhook-Implementation. │ 12 chunks │ 12 vectors │ 27.9s
114
+
115
+ └ Pipeline Finished Successfully!
116
+ ```
117
+
118
+ ---
119
+
120
+ ## 🛠️ Usage Mode 2: Node.js Library (100% Provider Agnostic)
121
+
122
+ Virstack Doc Ingest is designed to be fully embedded into your own SaaS backends or ETL pipelines. It is rigidly decoupled from concrete implementations.
123
+
124
+ ### Default Built-In Adapters
125
+
126
+ The package exports fully functional adapters for typical stacks:
127
+
128
+ - `OpenRouterLlmAdapter`
129
+ - `OpenRouterEmbeddingAdapter`
130
+ - `UpstashAdapter`
131
+
132
+ ### Custom Adapter Example (Pinecone & Local LLM)
133
+
134
+ Here is how you inject your own custom logic into the LangGraph pipeline:
135
+
136
+ ```typescript
137
+ import {
138
+ initializeConfig,
139
+ batchGraph,
140
+ type VectorStoreAdapter,
141
+ type LlmAdapter,
142
+ type EmbeddingAdapter,
143
+ OpenRouterEmbeddingAdapter,
144
+ } from "virstack-doc-ingest";
145
+ import { Pinecone } from "@pinecone-database/pinecone";
146
+
147
+ // 1. Define your own Vector Store connection
148
+ class CustomPineconeAdapter implements VectorStoreAdapter {
149
+ async upsert(records: any[]) {
150
+ /* ... */
151
+ }
152
+ }
153
+
154
+ // 2. Define a custom Local AI processor (e.g. Ollama)
155
+ class LocalLLMAdapter implements LlmAdapter {
156
+ async extractText(image: Buffer, mime: string) {
157
+ return "extracted text";
158
+ }
159
+ }
160
+
161
+ // 3. Mount the adapters to the global configuration
162
+ initializeConfig({
163
+ llm: new LocalLLMAdapter(),
164
+ embedder: new OpenRouterEmbeddingAdapter(
165
+ process.env.OPENROUTER_API_KEY!,
166
+ "text-embedding-3-large",
167
+ ),
168
+ vectorStore: new CustomPineconeAdapter(),
169
+ maxConcurrentFiles: 5,
170
+ });
171
+
172
+ // 4. Invoke the ingestion orchestrator
173
+ async function processData() {
174
+ const files = ["./uploads/report_2024.pdf", "./uploads/financials.xlsx"];
175
+
176
+ console.log("Orchestrating batch ingestion...");
177
+ const result = await batchGraph.invoke({ files });
178
+ console.log("Success! Extracted documents:", result.results.length);
179
+ }
180
+
181
+ processData();
182
+ ```
183
+
184
+ ---
185
+
186
+ ## ⚙️ Configuration Reference
187
+
188
+ When invoking `initializeConfig(options)`, the `VirstackDocIngestConfig` interface accepts the following properties:
189
+
190
+ | Property | Type | Default | Description |
191
+ | :-------------------- | :------------------- | :----------- | :-------------------------------------------------------------------------------- |
192
+ | `llm` | `LlmAdapter` | **Required** | Provider for extracting text (especially from PDF images via Vision APIs). |
193
+ | `embedder` | `EmbeddingAdapter` | **Required** | Provider for transforming text chunks into vector arrays. |
194
+ | `vectorStore`         | `VectorStoreAdapter` | **Required** | Provider targeting your vector database for final persistence.                     |
195
+ | `openRouterApiKey` | `string` | `undefined` | Required if utilizing the built-in OpenRouter adapters. |
196
+ | `maxConcurrentFiles` | `number` | `3` | Maximum files mapped into the parallel processing queue simultaneously. |
197
+ | `maxConcurrentApi` | `number` | `15` | Global connection limit to prevent 429 Rate Limit errors across all active nodes. |
198
+ | `maxTokens` | `number` | `16384` | Maximum allowable context window for the Vision LLM extraction. |
199
+ | `embeddingDimensions` | `number` | `1536` | Target dimensions for the output vectors. |
200
+ | `chunkSize` | `number` | `1000` | Target character length for Markdown recursive section chunking. |
201
+ | `chunkOverlap` | `number` | `100` | Overlapping character padding between contiguous chunk segments. |
202
+ | `pdfPagesPerChunk` | `number` | `10` | Number of PDF pages grouped together before a parallel Vision evaluation. |
203
+ | `systemPrompt` | `string` | _(default)_ | Injection of custom instructions overriding the default extraction constraints. |
@@ -0,0 +1,25 @@
1
+ export interface LlmInput {
2
+ systemPrompt: string;
3
+ userText: string;
4
+ base64PdfChunk?: string;
5
+ }
6
+ export interface LlmAdapter {
7
+ generateMarkdown(input: LlmInput): Promise<string>;
8
+ }
9
+ export interface EmbeddingAdapter {
10
+ embed(chunks: string[]): Promise<number[][]>;
11
+ }
12
+ export declare class OpenRouterLlmAdapter implements LlmAdapter {
13
+ private client;
14
+ private model;
15
+ constructor(apiKey: string, model: string);
16
+ generateMarkdown(input: LlmInput): Promise<string>;
17
+ }
18
+ export declare class OpenRouterEmbeddingAdapter implements EmbeddingAdapter {
19
+ private client;
20
+ private model;
21
+ private dimensions;
22
+ constructor(apiKey: string, model: string, dimensions?: number);
23
+ embed(chunks: string[]): Promise<number[][]>;
24
+ }
25
+ //# sourceMappingURL=aiAdapters.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"aiAdapters.d.ts","sourceRoot":"","sources":["../../src/adapters/aiAdapters.ts"],"names":[],"mappings":"AAIA,MAAM,WAAW,QAAQ;IACvB,YAAY,EAAE,MAAM,CAAC;IACrB,QAAQ,EAAE,MAAM,CAAC;IACjB,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AAED,MAAM,WAAW,UAAU;IACzB,gBAAgB,CAAC,KAAK,EAAE,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;CACpD;AAED,MAAM,WAAW,gBAAgB;IAC/B,KAAK,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;CAC9C;AAID,qBAAa,oBAAqB,YAAW,UAAU;IACrD,OAAO,CAAC,MAAM,CAAa;IAC3B,OAAO,CAAC,KAAK,CAAS;gBAEV,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM;IAKnC,gBAAgB,CAAC,KAAK,EAAE,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC;CAgCzD;AAED,qBAAa,0BAA2B,YAAW,gBAAgB;IACjE,OAAO,CAAC,MAAM,CAAa;IAC3B,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,UAAU,CAAS;gBAEf,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,UAAU,GAAE,MAAa;IAM9D,KAAK,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC;CA4BnD"}
@@ -0,0 +1,73 @@
1
+ import { OpenRouter } from "@openrouter/sdk";
2
+ // --- BUILT-IN ADAPTERS (For CLI to use by default) ---
3
+ export class OpenRouterLlmAdapter {
4
+ client;
5
+ model;
6
+ constructor(apiKey, model) {
7
+ this.client = new OpenRouter({ apiKey });
8
+ this.model = model;
9
+ }
10
+ async generateMarkdown(input) {
11
+ const userContent = [];
12
+ if (input.base64PdfChunk) {
13
+ userContent.push({
14
+ type: "image_url",
15
+ imageUrl: { url: `data:application/pdf;base64,${input.base64PdfChunk}` },
16
+ });
17
+ }
18
+ userContent.push({ type: "text", text: input.userText });
19
+ const response = await this.client.chat.send({
20
+ chatGenerationParams: {
21
+ model: this.model,
22
+ messages: [
23
+ { role: "system", content: input.systemPrompt },
24
+ { role: "user", content: userContent },
25
+ ],
26
+ temperature: 0,
27
+ }
28
+ });
29
+ // The SDK returns ChatResponse when not streaming
30
+ const chatResponse = response;
31
+ const content = chatResponse.choices?.[0]?.message?.content;
32
+ if (Array.isArray(content)) {
33
+ return content.map(item => (item.type === 'text' ? item.text : '')).join('').trim();
34
+ }
35
+ return (typeof content === "string" ? content.trim() : "");
36
+ }
37
+ }
38
+ export class OpenRouterEmbeddingAdapter {
39
+ client;
40
+ model;
41
+ dimensions;
42
+ constructor(apiKey, model, dimensions = 1536) {
43
+ this.client = new OpenRouter({ apiKey });
44
+ this.model = model;
45
+ this.dimensions = dimensions;
46
+ }
47
+ async embed(chunks) {
48
+ const response = await this.client.embeddings.generate({
49
+ requestBody: {
50
+ model: this.model,
51
+ input: chunks,
52
+ dimensions: this.dimensions,
53
+ }
54
+ });
55
+ if (typeof response === "string") {
56
+ throw new Error(`OpenRouter Embeddings API returned unexpected string response: ${response}`);
57
+ }
58
+ // Maintain chunk order based on OpenRouter response structure
59
+ let embeddingsList = response.data;
60
+ if (embeddingsList.length > 0 && typeof embeddingsList[0].index === "number") {
61
+ embeddingsList = embeddingsList.sort((a, b) => a.index - b.index);
62
+ }
63
+ return embeddingsList.map((item) => {
64
+ const emb = item.embedding;
65
+ if (typeof emb === "string") {
66
+ // Some models might return base64 if requested, but we expect float arrays
67
+ throw new Error("Received unexpected string embedding from OpenRouter");
68
+ }
69
+ return emb;
70
+ });
71
+ }
72
+ }
73
+ //# sourceMappingURL=aiAdapters.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"aiAdapters.js","sourceRoot":"","sources":["../../src/adapters/aiAdapters.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AAkB7C,wDAAwD;AAExD,MAAM,OAAO,oBAAoB;IACvB,MAAM,CAAa;IACnB,KAAK,CAAS;IAEtB,YAAY,MAAc,EAAE,KAAa;QACvC,IAAI,CAAC,MAAM,GAAG,IAAI,UAAU,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;QACzC,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;IACrB,CAAC;IAED,KAAK,CAAC,gBAAgB,CAAC,KAAe;QACpC,MAAM,WAAW,GAAU,EAAE,CAAC;QAE9B,IAAI,KAAK,CAAC,cAAc,EAAE,CAAC;YACzB,WAAW,CAAC,IAAI,CAAC;gBACf,IAAI,EAAE,WAAW;gBACjB,QAAQ,EAAE,EAAE,GAAG,EAAE,+BAA+B,KAAK,CAAC,cAAc,EAAE,EAAE;aACzE,CAAC,CAAC;QACL,CAAC;QACD,WAAW,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,CAAC,QAAQ,EAAE,CAAC,CAAC;QAEzD,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC;YAC3C,oBAAoB,EAAE;gBACpB,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,QAAQ,EAAE;oBACR,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,KAAK,CAAC,YAAY,EAAE;oBAC/C,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,WAAkB,EAAE;iBAC9C;gBACD,WAAW,EAAE,CAAC;aACf;SACF,CAAC,CAAC;QAEH,kDAAkD;QAClD,MAAM,YAAY,GAAG,QAAe,CAAC;QACrC,MAAM,OAAO,GAAG,YAAY,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,OAAO,CAAC;QAE5D,IAAI,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC;YAC3B,OAAO,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,KAAK,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QACtF,CAAC;QAED,OAAO,CAAC,OAAO,OAAO,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;IAC7D,CAAC;CACF;AAED,MAAM,OAAO,0BAA0B;IAC7B,MAAM,CAAa;IACnB,KAAK,CAAS;IACd,UAAU,CAAS;IAE3B,YAAY,MAAc,EAAE,KAAa,EAAE,aAAqB,IAAI;QAClE,IAAI,CAAC,MAAM,GAAG,IAAI,UAAU,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;QACzC,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,UAAU,GAAG,UAAU,CAAC;IAC/B,CAAC;IAED,KAAK,CAAC,KAAK,CAAC,MAAgB;QAC1B,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC,QAAQ,CAAC;YACrD,WAAW,EAAE;gBACX,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,KAAK,EAAE,MAAM;gBACb,UAAU,EAAE,IAAI,CAAC,UAAU;aAC5B;SACF,CAAC,CAAC;QAEH,IAAI,OAAO,QAAQ,KAAK,QAAQ,EAAE,CAAC
;YACjC,MAAM,IAAI,KAAK,CAAC,kEAAkE,QAAQ,EAAE,CAAC,CAAC;QAChG,CAAC;QAED,8DAA8D;QAC9D,IAAI,cAAc,GAAG,QAAQ,CAAC,IAAI,CAAC;QACnC,IAAI,cAAc,CAAC,MAAM,GAAG,CAAC,IAAI,OAAO,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,KAAK,QAAQ,EAAE,CAAC;YAC7E,cAAc,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC,CAAM,EAAE,CAAM,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;QAC9E,CAAC;QAED,OAAO,cAAc,CAAC,GAAG,CAAC,CAAC,IAAS,EAAE,EAAE;YACtC,MAAM,GAAG,GAAG,IAAI,CAAC,SAAS,CAAC;YAC3B,IAAI,OAAO,GAAG,KAAK,QAAQ,EAAE,CAAC;gBAC3B,2EAA2E;gBAC3E,MAAM,IAAI,KAAK,CAAC,sDAAsD,CAAC,CAAC;YAC3E,CAAC;YACD,OAAO,GAAG,CAAC;QACb,CAAC,CAAC,CAAC;IACL,CAAC;CACF"}
@@ -0,0 +1,24 @@
1
+ /**
2
+ * The standard shape of a record that the pipeline will produce.
3
+ */
4
+ export interface VectorRecord {
5
+ id: string;
6
+ vector: number[];
7
+ metadata: Record<string, any>;
8
+ }
9
+ /**
10
+ * The contract that any vector database adapter must follow.
11
+ */
12
+ export interface VectorStoreAdapter {
13
+ upsert(records: VectorRecord[]): Promise<void>;
14
+ }
15
+ /**
16
+ * Built-in adapter for Upstash Vector.
17
+ * Used by default when running via the CLI.
18
+ */
19
+ export declare class UpstashAdapter implements VectorStoreAdapter {
20
+ private index;
21
+ constructor(url: string, token: string);
22
+ upsert(records: VectorRecord[]): Promise<void>;
23
+ }
24
+ //# sourceMappingURL=vectorStore.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"vectorStore.d.ts","sourceRoot":"","sources":["../../src/adapters/vectorStore.ts"],"names":[],"mappings":"AAEA;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,EAAE,EAAE,MAAM,CAAC;IACX,MAAM,EAAE,MAAM,EAAE,CAAC;IACjB,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;CAC/B;AAED;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,MAAM,CAAC,OAAO,EAAE,YAAY,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;CAChD;AAED;;;GAGG;AACH,qBAAa,cAAe,YAAW,kBAAkB;IACvD,OAAO,CAAC,KAAK,CAAQ;gBAET,GAAG,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM;IAIhC,MAAM,CAAC,OAAO,EAAE,YAAY,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC;CAWrD"}
@@ -0,0 +1,22 @@
1
+ import { Index } from "@upstash/vector";
2
+ /**
3
+ * Built-in adapter for Upstash Vector.
4
+ * Used by default when running via the CLI.
5
+ */
6
+ export class UpstashAdapter {
7
+ index;
8
+ constructor(url, token) {
9
+ this.index = new Index({ url, token });
10
+ }
11
+ async upsert(records) {
12
+ const upstashRecords = records.map((r) => ({
13
+ id: r.id,
14
+ vector: r.vector,
15
+ metadata: r.metadata,
16
+ // For Upstash, the string payload goes in 'data' usually, but metadata is fine.
17
+ data: r.metadata.text || "",
18
+ }));
19
+ await this.index.upsert(upstashRecords);
20
+ }
21
+ }
22
+ //# sourceMappingURL=vectorStore.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"vectorStore.js","sourceRoot":"","sources":["../../src/adapters/vectorStore.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,EAAE,MAAM,iBAAiB,CAAC;AAkBxC;;;GAGG;AACH,MAAM,OAAO,cAAc;IACjB,KAAK,CAAQ;IAErB,YAAY,GAAW,EAAE,KAAa;QACpC,IAAI,CAAC,KAAK,GAAG,IAAI,KAAK,CAAC,EAAE,GAAG,EAAE,KAAK,EAAE,CAAC,CAAC;IACzC,CAAC;IAED,KAAK,CAAC,MAAM,CAAC,OAAuB;QAClC,MAAM,cAAc,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACzC,EAAE,EAAE,CAAC,CAAC,EAAE;YACR,MAAM,EAAE,CAAC,CAAC,MAAM;YAChB,QAAQ,EAAE,CAAC,CAAC,QAAQ;YACpB,gFAAgF;YAChF,IAAI,EAAE,CAAC,CAAC,QAAQ,CAAC,IAAI,IAAI,EAAE;SAC5B,CAAC,CAAC,CAAC;QAEJ,MAAM,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC;IAC1C,CAAC;CACF"}
@@ -0,0 +1,25 @@
1
+ export interface LlmInput {
2
+ systemPrompt: string;
3
+ userText: string;
4
+ base64PdfChunk?: string;
5
+ }
6
+ export interface LlmAdapter {
7
+ generateMarkdown(input: LlmInput): Promise<string>;
8
+ }
9
+ export interface EmbeddingAdapter {
10
+ embed(chunks: string[]): Promise<number[][]>;
11
+ }
12
+ export declare class OpenRouterLlmAdapter implements LlmAdapter {
13
+ private client;
14
+ private model;
15
+ constructor(apiKey: string, model: string);
16
+ generateMarkdown(input: LlmInput): Promise<string>;
17
+ }
18
+ export declare class OpenRouterEmbeddingAdapter implements EmbeddingAdapter {
19
+ private client;
20
+ private model;
21
+ private dimensions;
22
+ constructor(apiKey: string, model: string, dimensions?: number);
23
+ embed(chunks: string[]): Promise<number[][]>;
24
+ }
25
+ //# sourceMappingURL=aiAdapters.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"aiAdapters.d.ts","sourceRoot":"","sources":["../src/aiAdapters.ts"],"names":[],"mappings":"AAIA,MAAM,WAAW,QAAQ;IACvB,YAAY,EAAE,MAAM,CAAC;IACrB,QAAQ,EAAE,MAAM,CAAC;IACjB,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AAED,MAAM,WAAW,UAAU;IACzB,gBAAgB,CAAC,KAAK,EAAE,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;CACpD;AAED,MAAM,WAAW,gBAAgB;IAC/B,KAAK,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;CAC9C;AAID,qBAAa,oBAAqB,YAAW,UAAU;IACrD,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,KAAK,CAAS;gBAEV,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM;IAKnC,gBAAgB,CAAC,KAAK,EAAE,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC;CAsBzD;AAED,qBAAa,0BAA2B,YAAW,gBAAgB;IACjE,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,UAAU,CAAS;gBAEf,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,UAAU,GAAE,MAAa;IAM9D,KAAK,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC;CAWnD"}
@@ -0,0 +1,50 @@
1
+ import { OpenAI } from "openai";
2
+ // --- BUILT-IN ADAPTERS (For CLI to use by default) ---
3
+ export class OpenRouterLlmAdapter {
4
+ client;
5
+ model;
6
+ constructor(apiKey, model) {
7
+ this.client = new OpenAI({ baseURL: "https://openrouter.ai/api/v1", apiKey });
8
+ this.model = model;
9
+ }
10
+ async generateMarkdown(input) {
11
+ const userContent = [];
12
+ if (input.base64PdfChunk) {
13
+ userContent.push({
14
+ type: "file",
15
+ file: { filename: "chunk.pdf", file_data: `data:application/pdf;base64,${input.base64PdfChunk}` },
16
+ });
17
+ }
18
+ userContent.push({ type: "text", text: input.userText });
19
+ const response = await this.client.chat.completions.create({
20
+ model: this.model,
21
+ messages: [
22
+ { role: "system", content: input.systemPrompt },
23
+ { role: "user", content: userContent },
24
+ ],
25
+ temperature: 0,
26
+ });
27
+ return response.choices[0]?.message?.content?.trim() ?? "";
28
+ }
29
+ }
30
+ export class OpenRouterEmbeddingAdapter {
31
+ client;
32
+ model;
33
+ dimensions;
34
+ constructor(apiKey, model, dimensions = 1536) {
35
+ this.client = new OpenAI({ baseURL: "https://openrouter.ai/api/v1", apiKey });
36
+ this.model = model;
37
+ this.dimensions = dimensions;
38
+ }
39
+ async embed(chunks) {
40
+ const response = await this.client.embeddings.create({
41
+ model: this.model,
42
+ input: chunks,
43
+ dimensions: this.dimensions,
44
+ });
45
+ // Sort to maintain chunk order
46
+ const sorted = response.data.sort((a, b) => a.index - b.index);
47
+ return sorted.map((item) => item.embedding);
48
+ }
49
+ }
50
+ //# sourceMappingURL=aiAdapters.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"aiAdapters.js","sourceRoot":"","sources":["../src/aiAdapters.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAkBhC,wDAAwD;AAExD,MAAM,OAAO,oBAAoB;IACvB,MAAM,CAAS;IACf,KAAK,CAAS;IAEtB,YAAY,MAAc,EAAE,KAAa;QACvC,IAAI,CAAC,MAAM,GAAG,IAAI,MAAM,CAAC,EAAE,OAAO,EAAE,8BAA8B,EAAE,MAAM,EAAE,CAAC,CAAC;QAC9E,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;IACrB,CAAC;IAED,KAAK,CAAC,gBAAgB,CAAC,KAAe;QACpC,MAAM,WAAW,GAAU,EAAE,CAAC;QAE9B,IAAI,KAAK,CAAC,cAAc,EAAE,CAAC;YACzB,WAAW,CAAC,IAAI,CAAC;gBACf,IAAI,EAAE,MAAM;gBACZ,IAAI,EAAE,EAAE,QAAQ,EAAE,WAAW,EAAE,SAAS,EAAE,+BAA+B,KAAK,CAAC,cAAc,EAAE,EAAE;aAClG,CAAC,CAAC;QACL,CAAC;QACD,WAAW,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,CAAC,QAAQ,EAAE,CAAC,CAAC;QAEzD,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC;YACzD,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,QAAQ,EAAE;gBACR,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,KAAK,CAAC,YAAY,EAAE;gBAC/C,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,WAAkB,EAAE;aAC9C;YACD,WAAW,EAAE,CAAC;SACf,CAAC,CAAC;QAEH,OAAO,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IAC7D,CAAC;CACF;AAED,MAAM,OAAO,0BAA0B;IAC7B,MAAM,CAAS;IACf,KAAK,CAAS;IACd,UAAU,CAAS;IAE3B,YAAY,MAAc,EAAE,KAAa,EAAE,aAAqB,IAAI;QAClE,IAAI,CAAC,MAAM,GAAG,IAAI,MAAM,CAAC,EAAE,OAAO,EAAE,8BAA8B,EAAE,MAAM,EAAE,CAAC,CAAC;QAC9E,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,UAAU,GAAG,UAAU,CAAC;IAC/B,CAAC;IAED,KAAK,CAAC,KAAK,CAAC,MAAgB;QAC1B,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC,MAAM,CAAC;YACnD,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,KAAK,EAAE,MAAM;YACb,UAAU,EAAE,IAAI,CAAC,UAAU;SACrB,CAAC,CAAC;QAEV,+BAA+B;QAC/B,MAAM,MAAM,GAAG,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAM,EAAE,CAAM,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;QACzE,OAAO,MAAM,CAAC,GAAG,CAAC,CAAC,IAAS,EAAE,EAAE,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IACnD,CAAC;CACF"}
Binary file
@@ -0,0 +1,52 @@
1
+ /**
2
+ * State for the batch document processing graph.
3
+ */
4
+ export declare const BatchStateAnnotation: import("@langchain/langgraph").AnnotationRoot<{
5
+ /** Input: List of absolute file paths to process */
6
+ files: {
7
+ (): import("@langchain/langgraph").LastValue<string[]>;
8
+ (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
9
+ Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
10
+ };
11
+ /** Output: Collection of results from each individual document run */
12
+ results: import("@langchain/langgraph").BinaryOperatorAggregate<any[], any[]>;
13
+ }>;
14
+ export type BatchState = typeof BatchStateAnnotation.State;
15
+ export declare const graph: import("@langchain/langgraph").CompiledStateGraph<import("@langchain/langgraph").StateType<{
16
+ /** Input: List of absolute file paths to process */
17
+ files: {
18
+ (): import("@langchain/langgraph").LastValue<string[]>;
19
+ (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
20
+ Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
21
+ };
22
+ /** Output: Collection of results from each individual document run */
23
+ results: import("@langchain/langgraph").BinaryOperatorAggregate<any[], any[]>;
24
+ }>, import("@langchain/langgraph").UpdateType<{
25
+ /** Input: List of absolute file paths to process */
26
+ files: {
27
+ (): import("@langchain/langgraph").LastValue<string[]>;
28
+ (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
29
+ Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
30
+ };
31
+ /** Output: Collection of results from each individual document run */
32
+ results: import("@langchain/langgraph").BinaryOperatorAggregate<any[], any[]>;
33
+ }>, "__start__" | "workerNode" | "orchestrator" | "summaryNode", {
34
+ /** Input: List of absolute file paths to process */
35
+ files: {
36
+ (): import("@langchain/langgraph").LastValue<string[]>;
37
+ (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
38
+ Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
39
+ };
40
+ /** Output: Collection of results from each individual document run */
41
+ results: import("@langchain/langgraph").BinaryOperatorAggregate<any[], any[]>;
42
+ }, {
43
+ /** Input: List of absolute file paths to process */
44
+ files: {
45
+ (): import("@langchain/langgraph").LastValue<string[]>;
46
+ (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
47
+ Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
48
+ };
49
+ /** Output: Collection of results from each individual document run */
50
+ results: import("@langchain/langgraph").BinaryOperatorAggregate<any[], any[]>;
51
+ }, import("@langchain/langgraph").StateDefinition>;
52
+ //# sourceMappingURL=batchPipeline.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"batchPipeline.d.ts","sourceRoot":"","sources":["../src/batchPipeline.ts"],"names":[],"mappings":"AAKA;;GAEG;AACH,eAAO,MAAM,oBAAoB;IAC/B,oDAAoD;;;;;;IAGpD,sEAAsE;;EAKtE,CAAC;AAEH,MAAM,MAAM,UAAU,GAAG,OAAO,oBAAoB,CAAC,KAAK,CAAC;AAyE3D,eAAO,MAAM,KAAK;IAnFhB,oDAAoD;;;;;;IAGpD,sEAAsE;;;IAHtE,oDAAoD;;;;;;IAGpD,sEAAsE;;;IAHtE,oDAAoD;;;;;;IAGpD,sEAAsE;;;IAHtE,oDAAoD;;;;;;IAGpD,sEAAsE;;kDAgF/B,CAAC"}
@@ -0,0 +1,81 @@
1
+ import { Annotation, StateGraph, Send, END } from "@langchain/langgraph";
2
+ import { graph as singleDocGraph } from "./pipeline.js";
3
+ import path from "node:path";
4
+ import { logger, LogSource } from "./logger.js";
5
/**
 * State definition for the batch document-processing graph.
 * Carries the input file list and accumulates per-document results.
 */
export const BatchStateAnnotation = Annotation.Root({
    /** Input: List of absolute file paths to process */
    files: (Annotation),
    /** Output: Collection of results from each individual document run */
    results: Annotation({
        // Each worker emits a one-element array; append it to the running list.
        reducer: (accumulated, incoming) => accumulated.concat(incoming),
        default: () => [],
    }),
});
17
/**
 * Orchestrator node: logs the size of the incoming batch before the
 * conditional edge fans work out to parallel workers.
 * Returns an empty update — no state channels are modified here.
 */
function orchestrator(state) {
    const total = state.files.length;
    logger.info(LogSource.BATCH, `Starting processing of ${total} documents.`);
    return {};
}
24
/**
 * Conditional edge: uses the Send API to spawn one parallel worker
 * invocation per input file, preserving the original file order.
 */
function distributeFiles(state) {
    const dispatches = [];
    for (const filePath of state.files) {
        dispatches.push(new Send("workerNode", { filePath }));
    }
    return dispatches;
}
30
/**
 * Worker node: invokes the original single-document pipeline for one file
 * and reports a uniform result record whether it succeeds or fails.
 *
 * @param {{ filePath: string }} state - Send payload with the absolute file path.
 * @returns {Promise<{ results: object[] }>} One-element results array,
 *   merged into batch state by the `results` reducer.
 */
async function workerNode(state) {
    const fileName = path.basename(state.filePath);
    const startTime = Date.now();
    // Elapsed wall-clock seconds to one decimal place; shared by both
    // branches so the duration logic is written exactly once.
    const elapsedSec = () => ((Date.now() - startTime) / 1000).toFixed(1);
    try {
        // Invoke the existing compiled single-document graph
        const result = await singleDocGraph.invoke({ filePath: state.filePath });
        return {
            results: [{
                file: fileName,
                status: "success",
                chunks: result.textChunks?.length ?? 0,
                vectors: result.vectors?.length ?? 0,
                durationSec: elapsedSec(),
            }]
        };
    }
    catch (error) {
        return {
            results: [{
                file: fileName,
                status: "error",
                chunks: 0,
                vectors: 0,
                durationSec: elapsedSec(),
                // Non-Error throwables are stringified so the report never
                // shows `undefined` for the error field.
                error: error instanceof Error ? error.message : String(error),
            }]
        };
    }
}
64
/**
 * Final node: announces that every document in the batch has finished.
 * Produces no state changes of its own.
 */
function summaryNode(_state) {
    logger.success(LogSource.BATCH, "All documents processed.");
    return {};
}
71
// Assemble the batch graph: orchestrator fans out one worker per file via
// the conditional edge, and all workers converge on the summary node.
const batchGraph = new StateGraph(BatchStateAnnotation);
batchGraph.addNode("orchestrator", orchestrator);
batchGraph.addNode("workerNode", workerNode);
batchGraph.addNode("summaryNode", summaryNode);
batchGraph.addEdge("__start__", "orchestrator");
batchGraph.addConditionalEdges("orchestrator", distributeFiles, ["workerNode"]);
batchGraph.addEdge("workerNode", "summaryNode");
batchGraph.addEdge("summaryNode", END);
export const graph = batchGraph.compile();
81
+ //# sourceMappingURL=batchPipeline.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"batchPipeline.js","sourceRoot":"","sources":["../src/batchPipeline.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,UAAU,EAAE,IAAI,EAAE,GAAG,EAAE,MAAM,sBAAsB,CAAC;AACzE,OAAO,EAAE,KAAK,IAAI,cAAc,EAAE,MAAM,eAAe,CAAC;AACxD,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAEhD;;GAEG;AACH,MAAM,CAAC,MAAM,oBAAoB,GAAG,UAAU,CAAC,IAAI,CAAC;IAClD,oDAAoD;IACpD,KAAK,EAAE,CAAA,UAAoB,CAAA;IAE3B,sEAAsE;IACtE,OAAO,EAAE,UAAU,CAAQ;QACzB,OAAO,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC;QAC9B,OAAO,EAAE,GAAG,EAAE,CAAC,EAAE;KAClB,CAAC;CACH,CAAC,CAAC;AAIH;;GAEG;AACH,SAAS,YAAY,CAAC,KAAiB;IACrC,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,KAAK,EAAE,0BAA0B,KAAK,CAAC,KAAK,CAAC,MAAM,aAAa,CAAC,CAAC;IACxF,OAAO,EAAE,CAAC;AACZ,CAAC;AAED;;GAEG;AACH,SAAS,eAAe,CAAC,KAAiB;IACxC,OAAO,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAC9B,IAAI,IAAI,CAAC,YAAY,EAAE,EAAE,QAAQ,EAAE,IAAI,EAAE,CAAC,CAC3C,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,UAAU,CAAC,KAA2B;IACnD,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;IAC/C,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAE7B,IAAI,CAAC;QACH,qDAAqD;QACrD,MAAM,MAAM,GAAG,MAAM,cAAc,CAAC,MAAM,CAAC,EAAE,QAAQ,EAAE,KAAK,CAAC,QAAQ,EAAE,CAAC,CAAC;QACzE,MAAM,OAAO,GAAG,CAAC,CAAC,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;QAE7D,OAAO;YACL,OAAO,EAAE,CAAC;oBACR,IAAI,EAAE,QAAQ;oBACd,MAAM,EAAE,SAAS;oBACjB,MAAM,EAAE,MAAM,CAAC,UAAU,EAAE,MAAM,IAAI,CAAC;oBACtC,OAAO,EAAE,MAAM,CAAC,OAAO,EAAE,MAAM,IAAI,CAAC;oBACpC,WAAW,EAAE,OAAO;iBACrB,CAAC;SACH,CAAC;IACJ,CAAC;IAAC,OAAO,KAAU,EAAE,CAAC;QACpB,MAAM,OAAO,GAAG,CAAC,CAAC,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;QAC7D,OAAO;YACL,OAAO,EAAE,CAAC;oBACR,IAAI,EAAE,QAAQ;oBACd,MAAM,EAAE,OAAO;oBACf,MAAM,EAAE,CAAC;oBACT,OAAO,EAAE,CAAC;oBACV,WAAW,EAAE,OAAO;oBACpB,KAAK,EAAE,KAAK,CAAC,OAAO;iBACrB,CAAC;SACH,CAAC;IACJ,CAAC;AACH,CAAC;AAED;;GAEG;AACH,SAAS,WAAW,CAAC,KAAiB;IACpC,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,KAAK,EAAE,0
BAA0B,CAAC,CAAC;IAC5D,OAAO,EAAE,CAAC;AACZ,CAAC;AAED,wBAAwB;AACxB,MAAM,UAAU,GAAG,IAAI,UAAU,CAAC,oBAAoB,CAAC;KACpD,OAAO,CAAC,cAAc,EAAE,YAAY,CAAC;KACrC,OAAO,CAAC,YAAY,EAAE,UAAU,CAAC;KACjC,OAAO,CAAC,aAAa,EAAE,WAAW,CAAC;KACnC,OAAO,CAAC,WAAW,EAAE,cAAc,CAAC;KACpC,mBAAmB,CAAC,cAAc,EAAE,eAAe,EAAE,CAAC,YAAY,CAAC,CAAC;KACpE,OAAO,CAAC,YAAY,EAAE,aAAa,CAAC;KACpC,OAAO,CAAC,aAAa,EAAE,GAAG,CAAC,CAAC;AAE/B,MAAM,CAAC,MAAM,KAAK,GAAG,UAAU,CAAC,OAAO,EAAE,CAAC"}
package/dist/cli.d.ts ADDED
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env node
2
+ import "dotenv/config";
3
+ //# sourceMappingURL=cli.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AACA,OAAO,eAAe,CAAC"}