@virstack/doc-ingest 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +203 -0
- package/dist/adapters/aiAdapters.d.ts +25 -0
- package/dist/adapters/aiAdapters.d.ts.map +1 -0
- package/dist/adapters/aiAdapters.js +73 -0
- package/dist/adapters/aiAdapters.js.map +1 -0
- package/dist/adapters/vectorStore.d.ts +24 -0
- package/dist/adapters/vectorStore.d.ts.map +1 -0
- package/dist/adapters/vectorStore.js +22 -0
- package/dist/adapters/vectorStore.js.map +1 -0
- package/dist/aiAdapters.d.ts +25 -0
- package/dist/aiAdapters.d.ts.map +1 -0
- package/dist/aiAdapters.js +50 -0
- package/dist/aiAdapters.js.map +1 -0
- package/dist/assets/logo.png +0 -0
- package/dist/batchPipeline.d.ts +52 -0
- package/dist/batchPipeline.d.ts.map +1 -0
- package/dist/batchPipeline.js +81 -0
- package/dist/batchPipeline.js.map +1 -0
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +217 -0
- package/dist/cli.js.map +1 -0
- package/dist/config.d.ts +26 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +97 -0
- package/dist/config.js.map +1 -0
- package/dist/core/config.d.ts +26 -0
- package/dist/core/config.d.ts.map +1 -0
- package/dist/core/config.js +106 -0
- package/dist/core/config.js.map +1 -0
- package/dist/core/logger.d.ts +31 -0
- package/dist/core/logger.d.ts.map +1 -0
- package/dist/core/logger.js +42 -0
- package/dist/core/logger.js.map +1 -0
- package/dist/core/state.d.ts +52 -0
- package/dist/core/state.d.ts.map +1 -0
- package/dist/core/state.js +27 -0
- package/dist/core/state.js.map +1 -0
- package/dist/graphs/batchProcessor.d.ts +72 -0
- package/dist/graphs/batchProcessor.d.ts.map +1 -0
- package/dist/graphs/batchProcessor.js +94 -0
- package/dist/graphs/batchProcessor.js.map +1 -0
- package/dist/graphs/singleDocument.d.ts +303 -0
- package/dist/graphs/singleDocument.d.ts.map +1 -0
- package/dist/graphs/singleDocument.js +93 -0
- package/dist/graphs/singleDocument.js.map +1 -0
- package/dist/index.d.ts +8 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +10 -0
- package/dist/index.js.map +1 -0
- package/dist/logger.d.ts +24 -0
- package/dist/logger.d.ts.map +1 -0
- package/dist/logger.js +36 -0
- package/dist/logger.js.map +1 -0
- package/dist/logo.d.ts +2 -0
- package/dist/logo.d.ts.map +1 -0
- package/dist/logo.js +3 -0
- package/dist/logo.js.map +1 -0
- package/dist/nodes/fileTypeRouter.d.ts +16 -0
- package/dist/nodes/fileTypeRouter.d.ts.map +1 -0
- package/dist/nodes/fileTypeRouter.js +72 -0
- package/dist/nodes/fileTypeRouter.js.map +1 -0
- package/dist/nodes/geminiExtraction.d.ts +19 -0
- package/dist/nodes/geminiExtraction.d.ts.map +1 -0
- package/dist/nodes/geminiExtraction.js +87 -0
- package/dist/nodes/geminiExtraction.js.map +1 -0
- package/dist/nodes/libreOfficeToPdf.d.ts +8 -0
- package/dist/nodes/libreOfficeToPdf.d.ts.map +1 -0
- package/dist/nodes/libreOfficeToPdf.js +61 -0
- package/dist/nodes/libreOfficeToPdf.js.map +1 -0
- package/dist/nodes/llmExtractionNode.d.ts +19 -0
- package/dist/nodes/llmExtractionNode.d.ts.map +1 -0
- package/dist/nodes/llmExtractionNode.js +68 -0
- package/dist/nodes/llmExtractionNode.js.map +1 -0
- package/dist/nodes/markdownChunker.d.ts +8 -0
- package/dist/nodes/markdownChunker.d.ts.map +1 -0
- package/dist/nodes/markdownChunker.js +24 -0
- package/dist/nodes/markdownChunker.js.map +1 -0
- package/dist/nodes/markdownMerger.d.ts +9 -0
- package/dist/nodes/markdownMerger.d.ts.map +1 -0
- package/dist/nodes/markdownMerger.js +33 -0
- package/dist/nodes/markdownMerger.js.map +1 -0
- package/dist/nodes/markdownNormalizer.d.ts +10 -0
- package/dist/nodes/markdownNormalizer.d.ts.map +1 -0
- package/dist/nodes/markdownNormalizer.js +46 -0
- package/dist/nodes/markdownNormalizer.js.map +1 -0
- package/dist/nodes/openrouterEmbedder.d.ts +7 -0
- package/dist/nodes/openrouterEmbedder.d.ts.map +1 -0
- package/dist/nodes/openrouterEmbedder.js +31 -0
- package/dist/nodes/openrouterEmbedder.js.map +1 -0
- package/dist/nodes/pdfSplitter.d.ts +7 -0
- package/dist/nodes/pdfSplitter.d.ts.map +1 -0
- package/dist/nodes/pdfSplitter.js +41 -0
- package/dist/nodes/pdfSplitter.js.map +1 -0
- package/dist/nodes/saveMarkdown.d.ts +7 -0
- package/dist/nodes/saveMarkdown.d.ts.map +1 -0
- package/dist/nodes/saveMarkdown.js +28 -0
- package/dist/nodes/saveMarkdown.js.map +1 -0
- package/dist/nodes/textExtractorNode.d.ts +7 -0
- package/dist/nodes/textExtractorNode.d.ts.map +1 -0
- package/dist/nodes/textExtractorNode.js +39 -0
- package/dist/nodes/textExtractorNode.js.map +1 -0
- package/dist/nodes/upstashUpsert.d.ts +7 -0
- package/dist/nodes/upstashUpsert.d.ts.map +1 -0
- package/dist/nodes/upstashUpsert.js +45 -0
- package/dist/nodes/upstashUpsert.js.map +1 -0
- package/dist/nodes/vectorEmbedderNode.d.ts +7 -0
- package/dist/nodes/vectorEmbedderNode.d.ts.map +1 -0
- package/dist/nodes/vectorEmbedderNode.js +23 -0
- package/dist/nodes/vectorEmbedderNode.js.map +1 -0
- package/dist/nodes/vectorUpsertNode.d.ts +7 -0
- package/dist/nodes/vectorUpsertNode.d.ts.map +1 -0
- package/dist/nodes/vectorUpsertNode.js +45 -0
- package/dist/nodes/vectorUpsertNode.js.map +1 -0
- package/dist/pipeline.d.ts +303 -0
- package/dist/pipeline.d.ts.map +1 -0
- package/dist/pipeline.js +93 -0
- package/dist/pipeline.js.map +1 -0
- package/dist/state.d.ts +52 -0
- package/dist/state.d.ts.map +1 -0
- package/dist/state.js +27 -0
- package/dist/state.js.map +1 -0
- package/dist/vectorStore.d.ts +24 -0
- package/dist/vectorStore.d.ts.map +1 -0
- package/dist/vectorStore.js +22 -0
- package/dist/vectorStore.js.map +1 -0
- package/package.json +55 -0
package/README.md
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
# 🚀 Virstack Doc Ingest
|
|
2
|
+
|
|
3
|
+
**Virstack Doc Ingest** is a high-performance, parallelized document ingestion and vectorization pipeline designed for scalable Retrieval-Augmented Generation (RAG) applications.
|
|
4
|
+
|
|
5
|
+
Powered by **LangGraph** for resilient orchestration, **OpenRouter / Gemini** for advanced vision/text extraction, and natively supporting **Upstash Vector** (with easily injectable custom adapters), this library acts as a universal bridge between your raw documents and your AI applications.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## ✨ Key Features
|
|
10
|
+
|
|
11
|
+
- **Universal Multi-Format Support:** Natively processes PDF, DOCX, XLSX, PPTX, CSV, TXT, HTML, and EPUB files.
|
|
12
|
+
- **Dual-Tier Parallelism:** Concurrently processes multiple files while simultaneously splitting and routing large PDFs into parallel Vision-API execution nodes.
|
|
13
|
+
- **Smart Type Routing:** Automatically identifies MIME types and dynamically routes files to the most optimal, parser-specific extraction graph.
|
|
14
|
+
- **Provider Agnostic Architecture:** Built entirely on Dependency Injection. Easily swap out LLMs, Embeddings, and Vector Databases (Pinecone, Qdrant, etc.) to fit your specific stack.
|
|
15
|
+
- **Gorgeous TUI (Text User Interface):** Features a polished command-line interface with interactive menus and live, non-tearing spinners.
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## 🛠️ System Prerequisites
|
|
20
|
+
|
|
21
|
+
**IMPORTANT:** For parsing complex Office documents (e.g., `.docx`, `.pptx`, `.xlsx`, `.epub`), the pipeline relies on **LibreOffice** for high-fidelity conversion.
|
|
22
|
+
|
|
23
|
+
If you are only parsing PDFs, TXT, or CSV files, LibreOffice is **not** required.
|
|
24
|
+
|
|
25
|
+
### Installing LibreOffice:
|
|
26
|
+
|
|
27
|
+
- **macOS:** `brew install --cask libreoffice`
|
|
28
|
+
- **Ubuntu/Debian:** `sudo apt-get install libreoffice`
|
|
29
|
+
- **Windows:** Download the installer from [libreoffice.org](https://www.libreoffice.org/)
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## 📦 Installation
|
|
34
|
+
|
|
35
|
+
You can install Virstack Doc Ingest globally to use as a standalone CLI tool, or locally to utilize its powerful API in your custom Node.js applications.
|
|
36
|
+
|
|
37
|
+
### Global Install (CLI Usage)
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
npm install -g @virstack/doc-ingest
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
### Local Install (Library Usage)
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
npm install @virstack/doc-ingest
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
---
|
|
50
|
+
|
|
51
|
+
## 💻 Usage Mode 1: Interactive CLI
|
|
52
|
+
|
|
53
|
+
The CLI offers a completely interactive, wizard-based experience.
|
|
54
|
+
|
|
55
|
+
### 1. Environment Configuration
|
|
56
|
+
|
|
57
|
+
Create a `.env` file in the directory where you plan to run the command:
|
|
58
|
+
|
|
59
|
+
```env
|
|
60
|
+
OPENROUTER_API_KEY=sk-or-v1-...
|
|
61
|
+
UPSTASH_VECTOR_URL=https://...
|
|
62
|
+
UPSTASH_VECTOR_TOKEN=...
|
|
63
|
+
LLM_MODEL=google/gemini-2.0-flash-001
|
|
64
|
+
EMBEDDING_MODEL=text-embedding-3-large
|
|
65
|
+
MAX_CONCURRENT_FILES=3
|
|
66
|
+
MAX_CONCURRENT_API_CALLS=15
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### 2. Running the Tool
|
|
70
|
+
|
|
71
|
+
To launch the interactive wizard (which allows you to select files, folders, or paste raw text):
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
virstack-doc-ingest
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
To bypass the wizard and directly ingest a specific file or directory:
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
# Process a single contract
|
|
81
|
+
virstack-doc-ingest ./documents/contract.pdf
|
|
82
|
+
|
|
83
|
+
# Process all documents in a directory
|
|
84
|
+
virstack-doc-ingest ./company-knowledge-base/
|
|
85
|
+
|
|
86
|
+
# Run with verbose, node-level diagnostics
|
|
87
|
+
virstack-doc-ingest ./documents/ --verbose
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### Example Output
|
|
91
|
+
|
|
92
|
+
```text
|
|
93
|
+
__ __ _ _ _ ____ ___ _
|
|
94
|
+
\ \ / / (_) _ __ ___ | |_ __ _ ___ | | __ | _ \ ___ ___ |_ _| _ __ __ _ ___ ___ | |_
|
|
95
|
+
\ \ / / | | | '__| / __| | __| / _` | / __| | |/ / | | | | / _ \ / __| | | | '_ \ / _` | / _ \ / __| | __|
|
|
96
|
+
\ V / | | | | \__ \ | |_ | (_| | | (__ | < | |_| | | (_) | | (__ | | | | | | | (_| | | __/ \__ \ | |_
|
|
97
|
+
\_/ |_| |_| |___/ \__| \__,_| \___| |_|\_\ |____/ \___/ \___| |___| |_| |_| \__, | \___| |___/ \__|
|
|
98
|
+
|___/
|
|
99
|
+
┌ Welcome to Virstack Doc Ingest
|
|
100
|
+
│
|
|
101
|
+
◇ What file or directory would you like to process?
|
|
102
|
+
│ ./docs
|
|
103
|
+
│
|
|
104
|
+
◇ Found 2 file(s). Ready to process?
|
|
105
|
+
│ Yes, start ingestion
|
|
106
|
+
│
|
|
107
|
+
◇ ✔ Processing complete in 41.8s!
|
|
108
|
+
│
|
|
109
|
+
◇ Final Results: 2 succeeded, 0 failed
|
|
110
|
+
│
|
|
111
|
+
│ ✔ PRES1 CIS 6006-Updated Assessment.p │ 28 chunks │ 28 vectors │ 41.7s
|
|
112
|
+
│
|
|
113
|
+
│ ✔ VAI-020-021-Webhook-Implementation. │ 12 chunks │ 12 vectors │ 27.9s
|
|
114
|
+
│
|
|
115
|
+
└ Pipeline Finished Successfully!
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
---
|
|
119
|
+
|
|
120
|
+
## 🛠️ Usage Mode 2: Node.js Library (100% Provider Agnostic)
|
|
121
|
+
|
|
122
|
+
Virstack Doc Ingest is designed to be fully embedded into your own SaaS backends or ETL pipelines. It is rigidly decoupled from concrete implementations.
|
|
123
|
+
|
|
124
|
+
### Default Built-In Adapters
|
|
125
|
+
|
|
126
|
+
The package exports fully functional adapters for typical stacks:
|
|
127
|
+
|
|
128
|
+
- `OpenRouterLlmAdapter`
|
|
129
|
+
- `OpenRouterEmbeddingAdapter`
|
|
130
|
+
- `UpstashAdapter`
|
|
131
|
+
|
|
132
|
+
### Custom Adapter Example (Pinecone & Local LLM)
|
|
133
|
+
|
|
134
|
+
Here is how you inject your own custom logic into the LangGraph pipeline:
|
|
135
|
+
|
|
136
|
+
```typescript
|
|
137
|
+
import {
|
|
138
|
+
initializeConfig,
|
|
139
|
+
batchGraph,
|
|
140
|
+
type VectorStoreAdapter,
|
|
141
|
+
type LlmAdapter,
|
|
142
|
+
type EmbeddingAdapter,
|
|
143
|
+
OpenRouterEmbeddingAdapter,
|
|
144
|
+
} from "@virstack/doc-ingest";
|
|
145
|
+
import { Pinecone } from "@pinecone-database/pinecone";
|
|
146
|
+
|
|
147
|
+
// 1. Define your own Vector Store connection
|
|
148
|
+
class CustomPineconeAdapter implements VectorStoreAdapter {
|
|
149
|
+
async upsert(records: any[]) {
|
|
150
|
+
/* ... */
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
// 2. Define a custom Local AI processor (e.g. Ollama)
|
|
155
|
+
class LocalLLMAdapter implements LlmAdapter {
|
|
156
|
+
async generateMarkdown(input: { systemPrompt: string; userText: string; base64PdfChunk?: string }) {
|
|
157
|
+
return "extracted text";
|
|
158
|
+
}
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
// 3. Mount the adapters to the global configuration
|
|
162
|
+
initializeConfig({
|
|
163
|
+
llm: new LocalLLMAdapter(),
|
|
164
|
+
embedder: new OpenRouterEmbeddingAdapter(
|
|
165
|
+
process.env.OPENROUTER_API_KEY!,
|
|
166
|
+
"text-embedding-3-large",
|
|
167
|
+
),
|
|
168
|
+
vectorStore: new CustomPineconeAdapter(),
|
|
169
|
+
maxConcurrentFiles: 5,
|
|
170
|
+
});
|
|
171
|
+
|
|
172
|
+
// 4. Invoke the ingestion orchestrator
|
|
173
|
+
async function processData() {
|
|
174
|
+
const files = ["./uploads/report_2024.pdf", "./uploads/financials.xlsx"];
|
|
175
|
+
|
|
176
|
+
console.log("Orchestrating batch ingestion...");
|
|
177
|
+
const result = await batchGraph.invoke({ files });
|
|
178
|
+
console.log("Success! Extracted documents:", result.results.length);
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
processData();
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
---
|
|
185
|
+
|
|
186
|
+
## ⚙️ Configuration Reference
|
|
187
|
+
|
|
188
|
+
When invoking `initializeConfig(options)`, the `VirstackDocIngestConfig` interface accepts the following properties:
|
|
189
|
+
|
|
190
|
+
| Property | Type | Default | Description |
|
|
191
|
+
| :-------------------- | :------------------- | :----------- | :-------------------------------------------------------------------------------- |
|
|
192
|
+
| `llm` | `LlmAdapter` | **Required** | Provider for extracting text (especially from PDF images via Vision APIs). |
|
|
193
|
+
| `embedder` | `EmbeddingAdapter` | **Required** | Provider for transforming text chunks into vector arrays. |
|
|
194
|
+
| `vectorStore` | `VectorStoreAdapter` | **Required** | Provider that writes the final vector records to your vector database. |
|
|
195
|
+
| `openRouterApiKey` | `string` | `undefined` | Required if utilizing the built-in OpenRouter adapters. |
|
|
196
|
+
| `maxConcurrentFiles` | `number` | `3` | Maximum files mapped into the parallel processing queue simultaneously. |
|
|
197
|
+
| `maxConcurrentApi` | `number` | `15` | Global connection limit to prevent 429 Rate Limit errors across all active nodes. |
|
|
198
|
+
| `maxTokens` | `number` | `16384` | Maximum allowable context window for the Vision LLM extraction. |
|
|
199
|
+
| `embeddingDimensions` | `number` | `1536` | Target dimensions for the output vectors. |
|
|
200
|
+
| `chunkSize` | `number` | `1000` | Target character length for Markdown recursive section chunking. |
|
|
201
|
+
| `chunkOverlap` | `number` | `100` | Overlapping character padding between contiguous chunk segments. |
|
|
202
|
+
| `pdfPagesPerChunk` | `number` | `10` | Number of PDF pages grouped together before a parallel Vision evaluation. |
|
|
203
|
+
| `systemPrompt` | `string` | _(default)_ | Injection of custom instructions overriding the default extraction constraints. |
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/**
 * Input handed to an LlmAdapter for a single Markdown-generation request.
 */
export interface LlmInput {
|
|
2
|
+
/** Instructions the built-in adapter sends as the "system" message. */
systemPrompt: string;
|
|
3
|
+
/** Text the built-in adapter sends as the text part of the "user" message. */
userText: string;
|
|
4
|
+
/** Optional base64 PDF chunk; the built-in adapter wraps it in a `data:application/pdf;base64,` URL. */
base64PdfChunk?: string;
|
|
5
|
+
}
|
|
6
|
+
/**
 * Contract for any LLM provider plugged into the pipeline.
 */
export interface LlmAdapter {
|
|
7
|
+
/** Produces text for the given prompt/attachment input and resolves with it as a string. */
generateMarkdown(input: LlmInput): Promise<string>;
|
|
8
|
+
}
|
|
9
|
+
/**
 * Contract for any embedding provider plugged into the pipeline.
 */
export interface EmbeddingAdapter {
|
|
10
|
+
/** Turns text chunks into numeric vectors and resolves with an array of vectors. */
embed(chunks: string[]): Promise<number[][]>;
|
|
11
|
+
}
|
|
12
|
+
/**
 * Built-in LlmAdapter implementation that talks to OpenRouter.
 */
export declare class OpenRouterLlmAdapter implements LlmAdapter {
|
|
13
|
+
/** API client instance created in the constructor from the API key. */
private client;
|
|
14
|
+
/** Model identifier sent with every chat request. */
private model;
|
|
15
|
+
/**
 * @param apiKey API key used to create the client.
 * @param model Model identifier used for chat requests.
 */
constructor(apiKey: string, model: string);
|
|
16
|
+
/** Sends the input as a system + user message pair and resolves with the trimmed text reply ("" when none). */
generateMarkdown(input: LlmInput): Promise<string>;
|
|
17
|
+
}
|
|
18
|
+
/**
 * Built-in EmbeddingAdapter implementation that talks to OpenRouter.
 */
export declare class OpenRouterEmbeddingAdapter implements EmbeddingAdapter {
|
|
19
|
+
/** API client instance created in the constructor from the API key. */
private client;
|
|
20
|
+
/** Embedding model identifier sent with every request. */
private model;
|
|
21
|
+
/** Vector size requested from the API. */
private dimensions;
|
|
22
|
+
/**
 * @param apiKey API key used to create the client.
 * @param model Embedding model identifier.
 * @param dimensions Requested vector size; the implementation defaults it to 1536.
 */
constructor(apiKey: string, model: string, dimensions?: number);
|
|
23
|
+
/** Requests embeddings for all chunks in one call and resolves with them ordered by the response `index`. */
embed(chunks: string[]): Promise<number[][]>;
|
|
24
|
+
}
|
|
25
|
+
//# sourceMappingURL=aiAdapters.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"aiAdapters.d.ts","sourceRoot":"","sources":["../../src/adapters/aiAdapters.ts"],"names":[],"mappings":"AAIA,MAAM,WAAW,QAAQ;IACvB,YAAY,EAAE,MAAM,CAAC;IACrB,QAAQ,EAAE,MAAM,CAAC;IACjB,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AAED,MAAM,WAAW,UAAU;IACzB,gBAAgB,CAAC,KAAK,EAAE,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;CACpD;AAED,MAAM,WAAW,gBAAgB;IAC/B,KAAK,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;CAC9C;AAID,qBAAa,oBAAqB,YAAW,UAAU;IACrD,OAAO,CAAC,MAAM,CAAa;IAC3B,OAAO,CAAC,KAAK,CAAS;gBAEV,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM;IAKnC,gBAAgB,CAAC,KAAK,EAAE,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC;CAgCzD;AAED,qBAAa,0BAA2B,YAAW,gBAAgB;IACjE,OAAO,CAAC,MAAM,CAAa;IAC3B,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,UAAU,CAAS;gBAEf,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,UAAU,GAAE,MAAa;IAM9D,KAAK,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC;CA4BnD"}
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import { OpenRouter } from "@openrouter/sdk";
|
|
2
|
+
// --- BUILT-IN ADAPTERS (For CLI to use by default) ---
|
|
3
|
+
/**
 * Built-in LLM adapter: sends a system prompt plus user content (text and an
 * optional PDF chunk) through the OpenRouter SDK and returns the text reply.
 */
export class OpenRouterLlmAdapter {
    /** OpenRouter SDK client created from the API key. */
    client;
    /** Model identifier sent with every chat request. */
    model;

    /**
     * @param {string} apiKey - API key handed to the OpenRouter SDK client.
     * @param {string} model - Model identifier used for chat requests.
     */
    constructor(apiKey, model) {
        this.client = new OpenRouter({ apiKey });
        this.model = model;
    }

    /**
     * Sends one non-streaming chat request and extracts its text.
     *
     * @param {{systemPrompt: string, userText: string, base64PdfChunk?: string}} input
     * @returns {Promise<string>} Trimmed text of the first choice; "" when the
     *   reply carries neither a string nor an array content.
     */
    async generateMarkdown(input) {
        // The PDF chunk, when present, goes first as a data URL part.
        const attachment = input.base64PdfChunk
            ? [{ type: "image_url", imageUrl: { url: `data:application/pdf;base64,${input.base64PdfChunk}` } }]
            : [];
        const userParts = [...attachment, { type: "text", text: input.userText }];
        const request = {
            chatGenerationParams: {
                model: this.model,
                messages: [
                    { role: "system", content: input.systemPrompt },
                    { role: "user", content: userParts },
                ],
                temperature: 0,
            },
        };
        const reply = await this.client.chat.send(request);
        const body = reply.choices?.[0]?.message?.content;
        if (typeof body === "string") {
            return body.trim();
        }
        if (!Array.isArray(body)) {
            return "";
        }
        // Keep only text parts; join() renders a missing `text` as "".
        return body.flatMap((part) => (part.type === 'text' ? [part.text] : [])).join('').trim();
    }
}
|
|
38
|
+
/**
 * Built-in embedding adapter: requests vectors for text chunks through the
 * OpenRouter SDK embeddings endpoint.
 */
export class OpenRouterEmbeddingAdapter {
    /** OpenRouter SDK client created from the API key. */
    client;
    /** Embedding model identifier sent with every request. */
    model;
    /** Vector size requested from the API. */
    dimensions;

    /**
     * @param {string} apiKey - API key handed to the OpenRouter SDK client.
     * @param {string} model - Embedding model identifier.
     * @param {number} [dimensions=1536] - Requested vector size.
     */
    constructor(apiKey, model, dimensions = 1536) {
        this.client = new OpenRouter({ apiKey });
        this.model = model;
        this.dimensions = dimensions;
    }

    /**
     * Embeds all chunks in a single request.
     *
     * @param {string[]} chunks - Text chunks to embed.
     * @returns {Promise<number[][]>} Embeddings ordered by the response `index`
     *   (response order when items carry no numeric index). An empty `chunks`
     *   array resolves to `[]` without calling the API.
     * @throws {Error} When the API returns a string, a payload without a `data`
     *   array, or a string-encoded embedding.
     */
    async embed(chunks) {
        // Nothing to embed: skip the network round trip entirely.
        if (Array.isArray(chunks) && chunks.length === 0) {
            return [];
        }
        const response = await this.client.embeddings.generate({
            requestBody: {
                model: this.model,
                input: chunks,
                dimensions: this.dimensions,
            },
        });
        if (typeof response === "string") {
            throw new Error(`OpenRouter Embeddings API returned unexpected string response: ${response}`);
        }
        const items = response?.data;
        if (!Array.isArray(items)) {
            throw new Error("OpenRouter Embeddings API response is missing the `data` array");
        }
        // Restore chunk order from `index`; sort a copy so the SDK response is not mutated.
        const ordered = items.length > 0 && typeof items[0].index === "number"
            ? [...items].sort((a, b) => a.index - b.index)
            : items;
        return ordered.map((item) => {
            const emb = item.embedding;
            if (typeof emb === "string") {
                // Some models might return base64 if requested, but we expect float arrays
                throw new Error("Received unexpected string embedding from OpenRouter");
            }
            return emb;
        });
    }
}
|
|
73
|
+
//# sourceMappingURL=aiAdapters.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"aiAdapters.js","sourceRoot":"","sources":["../../src/adapters/aiAdapters.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AAkB7C,wDAAwD;AAExD,MAAM,OAAO,oBAAoB;IACvB,MAAM,CAAa;IACnB,KAAK,CAAS;IAEtB,YAAY,MAAc,EAAE,KAAa;QACvC,IAAI,CAAC,MAAM,GAAG,IAAI,UAAU,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;QACzC,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;IACrB,CAAC;IAED,KAAK,CAAC,gBAAgB,CAAC,KAAe;QACpC,MAAM,WAAW,GAAU,EAAE,CAAC;QAE9B,IAAI,KAAK,CAAC,cAAc,EAAE,CAAC;YACzB,WAAW,CAAC,IAAI,CAAC;gBACf,IAAI,EAAE,WAAW;gBACjB,QAAQ,EAAE,EAAE,GAAG,EAAE,+BAA+B,KAAK,CAAC,cAAc,EAAE,EAAE;aACzE,CAAC,CAAC;QACL,CAAC;QACD,WAAW,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,CAAC,QAAQ,EAAE,CAAC,CAAC;QAEzD,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC;YAC3C,oBAAoB,EAAE;gBACpB,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,QAAQ,EAAE;oBACR,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,KAAK,CAAC,YAAY,EAAE;oBAC/C,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,WAAkB,EAAE;iBAC9C;gBACD,WAAW,EAAE,CAAC;aACf;SACF,CAAC,CAAC;QAEH,kDAAkD;QAClD,MAAM,YAAY,GAAG,QAAe,CAAC;QACrC,MAAM,OAAO,GAAG,YAAY,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,OAAO,CAAC;QAE5D,IAAI,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC;YAC3B,OAAO,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,KAAK,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QACtF,CAAC;QAED,OAAO,CAAC,OAAO,OAAO,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;IAC7D,CAAC;CACF;AAED,MAAM,OAAO,0BAA0B;IAC7B,MAAM,CAAa;IACnB,KAAK,CAAS;IACd,UAAU,CAAS;IAE3B,YAAY,MAAc,EAAE,KAAa,EAAE,aAAqB,IAAI;QAClE,IAAI,CAAC,MAAM,GAAG,IAAI,UAAU,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;QACzC,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,UAAU,GAAG,UAAU,CAAC;IAC/B,CAAC;IAED,KAAK,CAAC,KAAK,CAAC,MAAgB;QAC1B,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC,QAAQ,CAAC;YACrD,WAAW,EAAE;gBACX,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,KAAK,EAAE,MAAM;gBACb,UAAU,EAAE,IAAI,CAAC,UAAU;aAC5B;SACF,CAAC,CAAC;QAEH,IAAI,OAAO,QAAQ,KAAK,QAAQ,EAAE,CAAC;Y
ACjC,MAAM,IAAI,KAAK,CAAC,kEAAkE,QAAQ,EAAE,CAAC,CAAC;QAChG,CAAC;QAED,8DAA8D;QAC9D,IAAI,cAAc,GAAG,QAAQ,CAAC,IAAI,CAAC;QACnC,IAAI,cAAc,CAAC,MAAM,GAAG,CAAC,IAAI,OAAO,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,KAAK,QAAQ,EAAE,CAAC;YAC7E,cAAc,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC,CAAM,EAAE,CAAM,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;QAC9E,CAAC;QAED,OAAO,cAAc,CAAC,GAAG,CAAC,CAAC,IAAS,EAAE,EAAE;YACtC,MAAM,GAAG,GAAG,IAAI,CAAC,SAAS,CAAC;YAC3B,IAAI,OAAO,GAAG,KAAK,QAAQ,EAAE,CAAC;gBAC3B,2EAA2E;gBAC3E,MAAM,IAAI,KAAK,CAAC,sDAAsD,CAAC,CAAC;YAC3E,CAAC;YACD,OAAO,GAAG,CAAC;QACb,CAAC,CAAC,CAAC;IACL,CAAC;CACF"}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* The standard shape of a record that the pipeline will produce.
|
|
3
|
+
*/
|
|
4
|
+
export interface VectorRecord {
|
|
5
|
+
/** Identifier of the record. */
id: string;
|
|
6
|
+
/** Embedding values for the record. */
vector: number[];
|
|
7
|
+
/** Free-form metadata; the built-in Upstash adapter reads `metadata.text` as the record's data payload. */
metadata: Record<string, any>;
|
|
8
|
+
}
|
|
9
|
+
/**
|
|
10
|
+
* The contract that any vector database adapter must follow.
|
|
11
|
+
*/
|
|
12
|
+
export interface VectorStoreAdapter {
|
|
13
|
+
/** Writes the given records to the backing store; resolves with no value. */
upsert(records: VectorRecord[]): Promise<void>;
|
|
14
|
+
}
|
|
15
|
+
/**
|
|
16
|
+
* Built-in adapter for Upstash Vector.
|
|
17
|
+
* Used by default when running via the CLI.
|
|
18
|
+
*/
|
|
19
|
+
export declare class UpstashAdapter implements VectorStoreAdapter {
|
|
20
|
+
/** Upstash `Index` client created in the constructor. */
private index;
|
|
21
|
+
/**
 * @param url Upstash Vector endpoint URL passed to the `Index` client.
 * @param token Upstash Vector token passed to the `Index` client.
 */
constructor(url: string, token: string);
|
|
22
|
+
/** Maps each record to `{ id, vector, metadata, data }` and upserts the batch in one call. */
upsert(records: VectorRecord[]): Promise<void>;
|
|
23
|
+
}
|
|
24
|
+
//# sourceMappingURL=vectorStore.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"vectorStore.d.ts","sourceRoot":"","sources":["../../src/adapters/vectorStore.ts"],"names":[],"mappings":"AAEA;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,EAAE,EAAE,MAAM,CAAC;IACX,MAAM,EAAE,MAAM,EAAE,CAAC;IACjB,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;CAC/B;AAED;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,MAAM,CAAC,OAAO,EAAE,YAAY,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;CAChD;AAED;;;GAGG;AACH,qBAAa,cAAe,YAAW,kBAAkB;IACvD,OAAO,CAAC,KAAK,CAAQ;gBAET,GAAG,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM;IAIhC,MAAM,CAAC,OAAO,EAAE,YAAY,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC;CAWrD"}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import { Index } from "@upstash/vector";
|
|
2
|
+
/**
 * Built-in adapter for Upstash Vector.
 * Used by default when running via the CLI.
 */
export class UpstashAdapter {
    /** Upstash `Index` client used for all writes. */
    index;

    /**
     * @param {string} url - Upstash Vector endpoint URL.
     * @param {string} token - Upstash Vector access token.
     */
    constructor(url, token) {
        this.index = new Index({ url, token });
    }

    /**
     * Upserts a batch of pipeline records in a single call.
     *
     * @param {{id: string, vector: number[], metadata: Record<string, any>}[]} records
     * @returns {Promise<void>} Resolves once the batch has been written. An
     *   empty batch resolves immediately without calling Upstash.
     */
    async upsert(records) {
        // Nothing to write: avoid a pointless network request.
        if (Array.isArray(records) && records.length === 0) {
            return;
        }
        const upstashRecords = records.map((r) => ({
            id: r.id,
            vector: r.vector,
            metadata: r.metadata,
            // The chunk text is duplicated into Upstash's 'data' field; tolerate
            // records that carry no metadata instead of throwing a TypeError.
            data: r.metadata?.text || "",
        }));
        await this.index.upsert(upstashRecords);
    }
}
|
|
22
|
+
//# sourceMappingURL=vectorStore.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"vectorStore.js","sourceRoot":"","sources":["../../src/adapters/vectorStore.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,EAAE,MAAM,iBAAiB,CAAC;AAkBxC;;;GAGG;AACH,MAAM,OAAO,cAAc;IACjB,KAAK,CAAQ;IAErB,YAAY,GAAW,EAAE,KAAa;QACpC,IAAI,CAAC,KAAK,GAAG,IAAI,KAAK,CAAC,EAAE,GAAG,EAAE,KAAK,EAAE,CAAC,CAAC;IACzC,CAAC;IAED,KAAK,CAAC,MAAM,CAAC,OAAuB;QAClC,MAAM,cAAc,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACzC,EAAE,EAAE,CAAC,CAAC,EAAE;YACR,MAAM,EAAE,CAAC,CAAC,MAAM;YAChB,QAAQ,EAAE,CAAC,CAAC,QAAQ;YACpB,gFAAgF;YAChF,IAAI,EAAE,CAAC,CAAC,QAAQ,CAAC,IAAI,IAAI,EAAE;SAC5B,CAAC,CAAC,CAAC;QAEJ,MAAM,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC;IAC1C,CAAC;CACF"}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/**
 * Input handed to an LlmAdapter for a single Markdown-generation request.
 */
export interface LlmInput {
|
|
2
|
+
/** Instructions the built-in adapter sends as the "system" message. */
systemPrompt: string;
|
|
3
|
+
/** Text the built-in adapter sends as the text part of the "user" message. */
userText: string;
|
|
4
|
+
/** Optional base64 PDF chunk; the built-in adapter wraps it in a `data:application/pdf;base64,` URL. */
base64PdfChunk?: string;
|
|
5
|
+
}
|
|
6
|
+
/**
 * Contract for any LLM provider plugged into the pipeline.
 */
export interface LlmAdapter {
|
|
7
|
+
/** Produces text for the given prompt/attachment input and resolves with it as a string. */
generateMarkdown(input: LlmInput): Promise<string>;
|
|
8
|
+
}
|
|
9
|
+
/**
 * Contract for any embedding provider plugged into the pipeline.
 */
export interface EmbeddingAdapter {
|
|
10
|
+
/** Turns text chunks into numeric vectors and resolves with an array of vectors. */
embed(chunks: string[]): Promise<number[][]>;
|
|
11
|
+
}
|
|
12
|
+
/**
 * Built-in LlmAdapter implementation that talks to OpenRouter.
 */
export declare class OpenRouterLlmAdapter implements LlmAdapter {
|
|
13
|
+
/** API client instance created in the constructor from the API key. */
private client;
|
|
14
|
+
/** Model identifier sent with every chat request. */
private model;
|
|
15
|
+
/**
 * @param apiKey API key used to create the client.
 * @param model Model identifier used for chat requests.
 */
constructor(apiKey: string, model: string);
|
|
16
|
+
/** Sends the input as a system + user message pair and resolves with the trimmed text reply ("" when none). */
generateMarkdown(input: LlmInput): Promise<string>;
|
|
17
|
+
}
|
|
18
|
+
/**
 * Built-in EmbeddingAdapter implementation that talks to OpenRouter.
 */
export declare class OpenRouterEmbeddingAdapter implements EmbeddingAdapter {
|
|
19
|
+
/** API client instance created in the constructor from the API key. */
private client;
|
|
20
|
+
/** Embedding model identifier sent with every request. */
private model;
|
|
21
|
+
/** Vector size requested from the API. */
private dimensions;
|
|
22
|
+
/**
 * @param apiKey API key used to create the client.
 * @param model Embedding model identifier.
 * @param dimensions Requested vector size; the implementation defaults it to 1536.
 */
constructor(apiKey: string, model: string, dimensions?: number);
|
|
23
|
+
/** Requests embeddings for all chunks in one call and resolves with them ordered by the response `index`. */
embed(chunks: string[]): Promise<number[][]>;
|
|
24
|
+
}
|
|
25
|
+
//# sourceMappingURL=aiAdapters.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"aiAdapters.d.ts","sourceRoot":"","sources":["../src/aiAdapters.ts"],"names":[],"mappings":"AAIA,MAAM,WAAW,QAAQ;IACvB,YAAY,EAAE,MAAM,CAAC;IACrB,QAAQ,EAAE,MAAM,CAAC;IACjB,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AAED,MAAM,WAAW,UAAU;IACzB,gBAAgB,CAAC,KAAK,EAAE,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;CACpD;AAED,MAAM,WAAW,gBAAgB;IAC/B,KAAK,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;CAC9C;AAID,qBAAa,oBAAqB,YAAW,UAAU;IACrD,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,KAAK,CAAS;gBAEV,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM;IAKnC,gBAAgB,CAAC,KAAK,EAAE,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC;CAsBzD;AAED,qBAAa,0BAA2B,YAAW,gBAAgB;IACjE,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,UAAU,CAAS;gBAEf,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,UAAU,GAAE,MAAa;IAM9D,KAAK,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC;CAWnD"}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import { OpenAI } from "openai";
|
|
2
|
+
// --- BUILT-IN ADAPTERS (For CLI to use by default) ---
|
|
3
|
+
/**
 * Built-in LLM adapter: reaches OpenRouter through the OpenAI client pointed
 * at the OpenRouter base URL and returns the text of the first choice.
 */
export class OpenRouterLlmAdapter {
    /** OpenAI client configured with the OpenRouter base URL. */
    client;
    /** Model identifier sent with every chat completion request. */
    model;

    /**
     * @param {string} apiKey - API key handed to the client.
     * @param {string} model - Model identifier used for chat completions.
     */
    constructor(apiKey, model) {
        this.client = new OpenAI({ baseURL: "https://openrouter.ai/api/v1", apiKey });
        this.model = model;
    }

    /**
     * Sends one chat completion request and extracts its text.
     *
     * @param {{systemPrompt: string, userText: string, base64PdfChunk?: string}} input
     * @returns {Promise<string>} Trimmed content of the first choice, or "" when
     *   there is no first choice or its content is null/undefined.
     */
    async generateMarkdown(input) {
        // The PDF chunk, when present, goes first as a "file" part.
        const attachment = input.base64PdfChunk
            ? [{ type: "file", file: { filename: "chunk.pdf", file_data: `data:application/pdf;base64,${input.base64PdfChunk}` } }]
            : [];
        const userParts = [...attachment, { type: "text", text: input.userText }];
        const request = {
            model: this.model,
            messages: [
                { role: "system", content: input.systemPrompt },
                { role: "user", content: userParts },
            ],
            temperature: 0,
        };
        const completion = await this.client.chat.completions.create(request);
        const markdown = completion.choices[0]?.message?.content;
        if (markdown === undefined || markdown === null) {
            return "";
        }
        return markdown.trim() ?? "";
    }
}
|
|
30
|
+
/**
 * Built-in embedding adapter: reaches OpenRouter through the OpenAI client
 * pointed at the OpenRouter base URL.
 */
export class OpenRouterEmbeddingAdapter {
    /** OpenAI client configured with the OpenRouter base URL. */
    client;
    /** Embedding model identifier sent with every request. */
    model;
    /** Vector size requested from the API. */
    dimensions;

    /**
     * @param {string} apiKey - API key handed to the client.
     * @param {string} model - Embedding model identifier.
     * @param {number} [dimensions=1536] - Requested vector size.
     */
    constructor(apiKey, model, dimensions = 1536) {
        this.client = new OpenAI({ baseURL: "https://openrouter.ai/api/v1", apiKey });
        this.model = model;
        this.dimensions = dimensions;
    }

    /**
     * Embeds all chunks in a single request.
     *
     * @param {string[]} chunks - Text chunks to embed.
     * @returns {Promise<number[][]>} Embeddings ordered by the response `index`.
     *   An empty `chunks` array resolves to `[]` without calling the API.
     * @throws {Error} When the response carries no `data` array.
     */
    async embed(chunks) {
        // Nothing to embed: skip the network round trip entirely.
        if (Array.isArray(chunks) && chunks.length === 0) {
            return [];
        }
        const response = await this.client.embeddings.create({
            model: this.model,
            input: chunks,
            dimensions: this.dimensions,
        });
        const items = response?.data;
        if (!Array.isArray(items)) {
            throw new Error("OpenRouter Embeddings API response is missing the `data` array");
        }
        // Sort a copy to maintain chunk order without mutating the client's response.
        const sorted = [...items].sort((a, b) => a.index - b.index);
        return sorted.map((item) => item.embedding);
    }
}
|
|
50
|
+
//# sourceMappingURL=aiAdapters.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"aiAdapters.js","sourceRoot":"","sources":["../src/aiAdapters.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAkBhC,wDAAwD;AAExD,MAAM,OAAO,oBAAoB;IACvB,MAAM,CAAS;IACf,KAAK,CAAS;IAEtB,YAAY,MAAc,EAAE,KAAa;QACvC,IAAI,CAAC,MAAM,GAAG,IAAI,MAAM,CAAC,EAAE,OAAO,EAAE,8BAA8B,EAAE,MAAM,EAAE,CAAC,CAAC;QAC9E,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;IACrB,CAAC;IAED,KAAK,CAAC,gBAAgB,CAAC,KAAe;QACpC,MAAM,WAAW,GAAU,EAAE,CAAC;QAE9B,IAAI,KAAK,CAAC,cAAc,EAAE,CAAC;YACzB,WAAW,CAAC,IAAI,CAAC;gBACf,IAAI,EAAE,MAAM;gBACZ,IAAI,EAAE,EAAE,QAAQ,EAAE,WAAW,EAAE,SAAS,EAAE,+BAA+B,KAAK,CAAC,cAAc,EAAE,EAAE;aAClG,CAAC,CAAC;QACL,CAAC;QACD,WAAW,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,CAAC,QAAQ,EAAE,CAAC,CAAC;QAEzD,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC;YACzD,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,QAAQ,EAAE;gBACR,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,KAAK,CAAC,YAAY,EAAE;gBAC/C,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,WAAkB,EAAE;aAC9C;YACD,WAAW,EAAE,CAAC;SACf,CAAC,CAAC;QAEH,OAAO,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IAC7D,CAAC;CACF;AAED,MAAM,OAAO,0BAA0B;IAC7B,MAAM,CAAS;IACf,KAAK,CAAS;IACd,UAAU,CAAS;IAE3B,YAAY,MAAc,EAAE,KAAa,EAAE,aAAqB,IAAI;QAClE,IAAI,CAAC,MAAM,GAAG,IAAI,MAAM,CAAC,EAAE,OAAO,EAAE,8BAA8B,EAAE,MAAM,EAAE,CAAC,CAAC;QAC9E,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,UAAU,GAAG,UAAU,CAAC;IAC/B,CAAC;IAED,KAAK,CAAC,KAAK,CAAC,MAAgB;QAC1B,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC,MAAM,CAAC;YACnD,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,KAAK,EAAE,MAAM;YACb,UAAU,EAAE,IAAI,CAAC,UAAU;SACrB,CAAC,CAAC;QAEV,+BAA+B;QAC/B,MAAM,MAAM,GAAG,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAM,EAAE,CAAM,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;QACzE,OAAO,MAAM,CAAC,GAAG,CAAC,CAAC,IAAS,EAAE,EAAE,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IACnD,CAAC;CACF"}
|
|
Binary file
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* State for the batch document processing graph: a `files` input channel and a `results` output channel.
|
|
3
|
+
*/
|
|
4
|
+
export declare const BatchStateAnnotation: import("@langchain/langgraph").AnnotationRoot<{
|
|
5
|
+
/** Input: List of absolute file paths to process */
|
|
6
|
+
files: {
|
|
7
|
+
(): import("@langchain/langgraph").LastValue<string[]>;
|
|
8
|
+
(annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
|
|
9
|
+
Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
|
|
10
|
+
};
|
|
11
|
+
/** Output: Collection of results from each individual document run (declared as a reducer-backed `any[]` channel) */
|
|
12
|
+
results: import("@langchain/langgraph").BinaryOperatorAggregate<any[], any[]>;
|
|
13
|
+
}>;
|
|
14
|
+
/** Batch graph state type, derived from the `State` member of `BatchStateAnnotation`. */
export type BatchState = typeof BatchStateAnnotation.State;
|
|
15
|
+
/**
 * Compiled batch-processing graph.
 *
 * The type arguments repeat the `files` / `results` channel shapes declared
 * on `BatchStateAnnotation`; the node-name union is
 * "__start__" | "workerNode" | "orchestrator" | "summaryNode".
 */
export declare const graph: import("@langchain/langgraph").CompiledStateGraph<import("@langchain/langgraph").StateType<{
|
|
16
|
+
/** Input: List of absolute file paths to process */
|
|
17
|
+
files: {
|
|
18
|
+
(): import("@langchain/langgraph").LastValue<string[]>;
|
|
19
|
+
(annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
|
|
20
|
+
Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
|
|
21
|
+
};
|
|
22
|
+
/** Output: Collection of results from each individual document run */
|
|
23
|
+
results: import("@langchain/langgraph").BinaryOperatorAggregate<any[], any[]>;
|
|
24
|
+
}>, import("@langchain/langgraph").UpdateType<{
|
|
25
|
+
/** Input: List of absolute file paths to process */
|
|
26
|
+
files: {
|
|
27
|
+
(): import("@langchain/langgraph").LastValue<string[]>;
|
|
28
|
+
(annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
|
|
29
|
+
Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
|
|
30
|
+
};
|
|
31
|
+
/** Output: Collection of results from each individual document run */
|
|
32
|
+
results: import("@langchain/langgraph").BinaryOperatorAggregate<any[], any[]>;
|
|
33
|
+
}>, "__start__" | "workerNode" | "orchestrator" | "summaryNode", {
|
|
34
|
+
/** Input: List of absolute file paths to process */
|
|
35
|
+
files: {
|
|
36
|
+
(): import("@langchain/langgraph").LastValue<string[]>;
|
|
37
|
+
(annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
|
|
38
|
+
Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
|
|
39
|
+
};
|
|
40
|
+
/** Output: Collection of results from each individual document run */
|
|
41
|
+
results: import("@langchain/langgraph").BinaryOperatorAggregate<any[], any[]>;
|
|
42
|
+
}, {
|
|
43
|
+
/** Input: List of absolute file paths to process */
|
|
44
|
+
files: {
|
|
45
|
+
(): import("@langchain/langgraph").LastValue<string[]>;
|
|
46
|
+
(annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
|
|
47
|
+
Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
|
|
48
|
+
};
|
|
49
|
+
/** Output: Collection of results from each individual document run */
|
|
50
|
+
results: import("@langchain/langgraph").BinaryOperatorAggregate<any[], any[]>;
|
|
51
|
+
}, import("@langchain/langgraph").StateDefinition>;
|
|
52
|
+
//# sourceMappingURL=batchPipeline.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"batchPipeline.d.ts","sourceRoot":"","sources":["../src/batchPipeline.ts"],"names":[],"mappings":"AAKA;;GAEG;AACH,eAAO,MAAM,oBAAoB;IAC/B,oDAAoD;;;;;;IAGpD,sEAAsE;;EAKtE,CAAC;AAEH,MAAM,MAAM,UAAU,GAAG,OAAO,oBAAoB,CAAC,KAAK,CAAC;AAyE3D,eAAO,MAAM,KAAK;IAnFhB,oDAAoD;;;;;;IAGpD,sEAAsE;;;IAHtE,oDAAoD;;;;;;IAGpD,sEAAsE;;;IAHtE,oDAAoD;;;;;;IAGpD,sEAAsE;;;IAHtE,oDAAoD;;;;;;IAGpD,sEAAsE;;kDAgF/B,CAAC"}
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import { Annotation, StateGraph, Send, END } from "@langchain/langgraph";
|
|
2
|
+
import { graph as singleDocGraph } from "./pipeline.js";
|
|
3
|
+
import path from "node:path";
|
|
4
|
+
import { logger, LogSource } from "./logger.js";
|
|
5
|
+
/**
 * Channel definitions for the batch document processing graph.
 *
 * - `files`: input channel, declared with the bare `Annotation` helper.
 * - `results`: accumulating channel; every update is concatenated onto the
 *   entries gathered so far, starting from an empty array.
 */
export const BatchStateAnnotation = Annotation.Root({
    /** Input: List of absolute file paths to process */
    files: Annotation,
    /** Output: Collection of results from each individual document run */
    results: Annotation({
        default: () => [],
        reducer: (collected, incoming) => collected.concat(incoming),
    }),
});
|
|
17
|
+
/**
 * Orchestrator node: logs how many documents the batch contains.
 *
 * @param state Batch state; only `state.files.length` is read.
 * @returns An empty object, i.e. no state update.
 */
function orchestrator(state) {
    const documentCount = state.files.length;
    logger.info(LogSource.BATCH, `Starting processing of ${documentCount} documents.`);
    return {};
}
|
|
24
|
+
/**
 * Conditional edge: builds one `Send` targeting "workerNode" for every entry
 * in `state.files`, each carrying that entry as `filePath`.
 *
 * @param state Batch state; only `state.files` is read.
 * @returns An array of `Send` instances, one per file, in input order.
 */
function distributeFiles(state) {
    const { files } = state;
    const toWorkerDispatch = (filePath) => new Send("workerNode", { filePath });
    return files.map((file) => toWorkerDispatch(file));
}
|
|
30
|
+
/**
 * Worker node: runs the single-document pipeline for one file and reports the
 * outcome as a one-element `results` update.
 *
 * Anything thrown by the pipeline invocation is converted into an `"error"`
 * entry instead of being rethrown.
 *
 * @param state Worker input; `state.filePath` is the file to process.
 * @returns `{ results: [entry] }` where `entry` has `file` (base name of the
 *   path), `status` ("success" or "error"), `chunks`, `vectors`,
 *   `durationSec` (elapsed seconds as a one-decimal string) and, on failure,
 *   `error` (a description of what was thrown).
 */
async function workerNode(state) {
    const fileName = path.basename(state.filePath);
    const startTime = Date.now();
    // Elapsed wall-clock seconds since the worker started, one decimal place.
    const elapsedSeconds = () => ((Date.now() - startTime) / 1000).toFixed(1);
    try {
        // Invoke the existing compiled single-document graph
        const result = await singleDocGraph.invoke({ filePath: state.filePath });
        return {
            results: [{
                    file: fileName,
                    status: "success",
                    chunks: result.textChunks?.length ?? 0,
                    vectors: result.vectors?.length ?? 0,
                    durationSec: elapsedSeconds(),
                }],
        };
    }
    catch (error) {
        // A thrown value is not guaranteed to be an Error: it may be a string,
        // or even null/undefined, in which case reading `.message` directly
        // would throw from inside this handler. Fall back to its string form.
        const message = error?.message ?? String(error);
        return {
            results: [{
                    file: fileName,
                    status: "error",
                    chunks: 0,
                    vectors: 0,
                    durationSec: elapsedSeconds(),
                    error: message,
                }],
        };
    }
}
|
|
64
|
+
/**
 * Final node: logs a completion message for the batch.
 *
 * @param state Batch state (not read).
 * @returns An empty object, i.e. no state update.
 */
function summaryNode(state) {
    const noStateUpdate = {};
    logger.success(LogSource.BATCH, "All documents processed.");
    return noStateUpdate;
}
|
|
71
|
+
/**
 * Assembles the batch graph: registers the three nodes, then wires
 * "__start__" -> orchestrator, the `distributeFiles` conditional edge from
 * orchestrator to workerNode, workerNode -> summaryNode, and summaryNode -> END.
 */
function buildBatchGraph() {
    const builder = new StateGraph(BatchStateAnnotation);
    return builder
        .addNode("orchestrator", orchestrator)
        .addNode("workerNode", workerNode)
        .addNode("summaryNode", summaryNode)
        .addEdge("__start__", "orchestrator")
        .addConditionalEdges("orchestrator", distributeFiles, ["workerNode"])
        .addEdge("workerNode", "summaryNode")
        .addEdge("summaryNode", END);
}
// Build the batch graph
const batchGraph = buildBatchGraph();
export const graph = batchGraph.compile();
|
|
81
|
+
//# sourceMappingURL=batchPipeline.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"batchPipeline.js","sourceRoot":"","sources":["../src/batchPipeline.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,UAAU,EAAE,IAAI,EAAE,GAAG,EAAE,MAAM,sBAAsB,CAAC;AACzE,OAAO,EAAE,KAAK,IAAI,cAAc,EAAE,MAAM,eAAe,CAAC;AACxD,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAEhD;;GAEG;AACH,MAAM,CAAC,MAAM,oBAAoB,GAAG,UAAU,CAAC,IAAI,CAAC;IAClD,oDAAoD;IACpD,KAAK,EAAE,CAAA,UAAoB,CAAA;IAE3B,sEAAsE;IACtE,OAAO,EAAE,UAAU,CAAQ;QACzB,OAAO,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC;QAC9B,OAAO,EAAE,GAAG,EAAE,CAAC,EAAE;KAClB,CAAC;CACH,CAAC,CAAC;AAIH;;GAEG;AACH,SAAS,YAAY,CAAC,KAAiB;IACrC,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,KAAK,EAAE,0BAA0B,KAAK,CAAC,KAAK,CAAC,MAAM,aAAa,CAAC,CAAC;IACxF,OAAO,EAAE,CAAC;AACZ,CAAC;AAED;;GAEG;AACH,SAAS,eAAe,CAAC,KAAiB;IACxC,OAAO,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAC9B,IAAI,IAAI,CAAC,YAAY,EAAE,EAAE,QAAQ,EAAE,IAAI,EAAE,CAAC,CAC3C,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,UAAU,CAAC,KAA2B;IACnD,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;IAC/C,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAE7B,IAAI,CAAC;QACH,qDAAqD;QACrD,MAAM,MAAM,GAAG,MAAM,cAAc,CAAC,MAAM,CAAC,EAAE,QAAQ,EAAE,KAAK,CAAC,QAAQ,EAAE,CAAC,CAAC;QACzE,MAAM,OAAO,GAAG,CAAC,CAAC,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;QAE7D,OAAO;YACL,OAAO,EAAE,CAAC;oBACR,IAAI,EAAE,QAAQ;oBACd,MAAM,EAAE,SAAS;oBACjB,MAAM,EAAE,MAAM,CAAC,UAAU,EAAE,MAAM,IAAI,CAAC;oBACtC,OAAO,EAAE,MAAM,CAAC,OAAO,EAAE,MAAM,IAAI,CAAC;oBACpC,WAAW,EAAE,OAAO;iBACrB,CAAC;SACH,CAAC;IACJ,CAAC;IAAC,OAAO,KAAU,EAAE,CAAC;QACpB,MAAM,OAAO,GAAG,CAAC,CAAC,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;QAC7D,OAAO;YACL,OAAO,EAAE,CAAC;oBACR,IAAI,EAAE,QAAQ;oBACd,MAAM,EAAE,OAAO;oBACf,MAAM,EAAE,CAAC;oBACT,OAAO,EAAE,CAAC;oBACV,WAAW,EAAE,OAAO;oBACpB,KAAK,EAAE,KAAK,CAAC,OAAO;iBACrB,CAAC;SACH,CAAC;IACJ,CAAC;AACH,CAAC;AAED;;GAEG;AACH,SAAS,WAAW,CAAC,KAAiB;IACpC,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,KAAK,EAAE,0BA
A0B,CAAC,CAAC;IAC5D,OAAO,EAAE,CAAC;AACZ,CAAC;AAED,wBAAwB;AACxB,MAAM,UAAU,GAAG,IAAI,UAAU,CAAC,oBAAoB,CAAC;KACpD,OAAO,CAAC,cAAc,EAAE,YAAY,CAAC;KACrC,OAAO,CAAC,YAAY,EAAE,UAAU,CAAC;KACjC,OAAO,CAAC,aAAa,EAAE,WAAW,CAAC;KACnC,OAAO,CAAC,WAAW,EAAE,cAAc,CAAC;KACpC,mBAAmB,CAAC,cAAc,EAAE,eAAe,EAAE,CAAC,YAAY,CAAC,CAAC;KACpE,OAAO,CAAC,YAAY,EAAE,aAAa,CAAC;KACpC,OAAO,CAAC,aAAa,EAAE,GAAG,CAAC,CAAC;AAE/B,MAAM,CAAC,MAAM,KAAK,GAAG,UAAU,CAAC,OAAO,EAAE,CAAC"}
|
package/dist/cli.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AACA,OAAO,eAAe,CAAC"}
|