@virstack/doc-ingest 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -1
- package/dist/adapters/aiAdapters.d.ts +3 -0
- package/dist/adapters/aiAdapters.d.ts.map +1 -1
- package/dist/adapters/aiAdapters.js +4 -2
- package/dist/adapters/aiAdapters.js.map +1 -1
- package/dist/core/constants.d.ts +6 -0
- package/dist/core/constants.d.ts.map +1 -0
- package/dist/core/constants.js +33 -0
- package/dist/core/constants.js.map +1 -0
- package/dist/graphs/singleDocument.d.ts +2 -2
- package/dist/graphs/singleDocument.d.ts.map +1 -1
- package/dist/graphs/singleDocument.js +7 -1
- package/dist/graphs/singleDocument.js.map +1 -1
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -0
- package/dist/index.js.map +1 -1
- package/dist/nodes/fileTypeRouter.d.ts.map +1 -1
- package/dist/nodes/fileTypeRouter.js +11 -0
- package/dist/nodes/fileTypeRouter.js.map +1 -1
- package/dist/nodes/imageReaderNode.d.ts +8 -0
- package/dist/nodes/imageReaderNode.d.ts.map +1 -0
- package/dist/nodes/imageReaderNode.js +28 -0
- package/dist/nodes/imageReaderNode.js.map +1 -0
- package/dist/nodes/llmExtractionNode.d.ts +1 -0
- package/dist/nodes/llmExtractionNode.d.ts.map +1 -1
- package/dist/nodes/llmExtractionNode.js +4 -3
- package/dist/nodes/llmExtractionNode.js.map +1 -1
- package/package.json +6 -2
package/README.md
CHANGED
|
@@ -8,7 +8,7 @@ Powered by **LangGraph** for resilient orchestration, **OpenRouter / Gemini** fo
|
|
|
8
8
|
|
|
9
9
|
## ✨ Key Features
|
|
10
10
|
|
|
11
|
-
- **Universal Multi-Format Support:** Natively processes PDF, DOCX, XLSX, PPTX, CSV, TXT, HTML, and
|
|
11
|
+
- **Universal Multi-Format Support:** Natively processes PDF, DOCX, XLSX, PPTX, CSV, TXT, HTML, EPUB, and Images (JPG, JPEG, PNG, GIF, WEBP, SVG).
|
|
12
12
|
- **Dual-Tier Parallelism:** Concurrently processes multiple files while simultaneously splitting and routing large PDFs into parallel Vision-API execution nodes.
|
|
13
13
|
- **Smart Type Routing:** Automatically identifies MIME types and dynamically routes files to the most optimal, parser-specific extraction graph.
|
|
14
14
|
- **Provider Agnostic Architecture:** Built entirely on Dependency Injection. Easily swap out LLMs, Embeddings, and Vector Databases (Pinecone, Qdrant, etc.) to fit your specific stack.
|
|
@@ -121,6 +121,21 @@ virstack-doc-ingest ./documents/ --verbose
|
|
|
121
121
|
|
|
122
122
|
Virstack Doc Ingest is designed to be fully embedded into your own SaaS backends or ETL pipelines. It is rigidly decoupled from concrete implementations.
|
|
123
123
|
|
|
124
|
+
### Validating Supported File Types
|
|
125
|
+
|
|
126
|
+
You can import the list of natively supported file extensions directly from the library to validate user uploads before sending them to the ingestion pipeline.
|
|
127
|
+
|
|
128
|
+
```typescript
|
|
129
|
+
import { SUPPORTED_FILE_EXTENSIONS, batchGraph } from "@virstack/doc-ingest";
|
|
130
|
+
|
|
131
|
+
const fileExt = ".jpg"; // e.g. path.extname(file)
|
|
132
|
+
|
|
133
|
+
if (!SUPPORTED_FILE_EXTENSIONS.includes(fileExt.toLowerCase())) {
|
|
134
|
+
console.error(`Unsupported file type: ${fileExt}`);
|
|
135
|
+
// Return a 400 Bad Request to the user
|
|
136
|
+
}
|
|
137
|
+
```
|
|
138
|
+
|
|
124
139
|
### Default Built-In Adapters
|
|
125
140
|
|
|
126
141
|
The package exports fully functional adapters for typical stacks:
|
|
@@ -1,7 +1,10 @@
|
|
|
1
1
|
export interface LlmInput {
|
|
2
2
|
systemPrompt: string;
|
|
3
3
|
userText: string;
|
|
4
|
+
/** @deprecated Use `base64Data` (together with `mimeType`) instead. */
|
|
4
5
|
base64PdfChunk?: string;
|
|
6
|
+
base64Data?: string;
|
|
7
|
+
mimeType?: string;
|
|
5
8
|
}
|
|
6
9
|
export interface LlmAdapter {
|
|
7
10
|
generateMarkdown(input: LlmInput): Promise<string>;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"aiAdapters.d.ts","sourceRoot":"","sources":["../../src/adapters/aiAdapters.ts"],"names":[],"mappings":"AAIA,MAAM,WAAW,QAAQ;IACvB,YAAY,EAAE,MAAM,CAAC;IACrB,QAAQ,EAAE,MAAM,CAAC;IACjB,cAAc,CAAC,EAAE,MAAM,CAAC;
|
|
1
|
+
{"version":3,"file":"aiAdapters.d.ts","sourceRoot":"","sources":["../../src/adapters/aiAdapters.ts"],"names":[],"mappings":"AAIA,MAAM,WAAW,QAAQ;IACvB,YAAY,EAAE,MAAM,CAAC;IACrB,QAAQ,EAAE,MAAM,CAAC;IACjB,yCAAyC;IACzC,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,UAAU;IACzB,gBAAgB,CAAC,KAAK,EAAE,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;CACpD;AAED,MAAM,WAAW,gBAAgB;IAC/B,KAAK,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;CAC9C;AAID,qBAAa,oBAAqB,YAAW,UAAU;IACrD,OAAO,CAAC,MAAM,CAAa;IAC3B,OAAO,CAAC,KAAK,CAAS;gBAEV,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM;IAKnC,gBAAgB,CAAC,KAAK,EAAE,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC;CAmCzD;AAED,qBAAa,0BAA2B,YAAW,gBAAgB;IACjE,OAAO,CAAC,MAAM,CAAa;IAC3B,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,UAAU,CAAS;gBAEf,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,UAAU,GAAE,MAAa;IAM9D,KAAK,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC;CA4BnD"}
|
|
@@ -9,10 +9,12 @@ export class OpenRouterLlmAdapter {
|
|
|
9
9
|
}
|
|
10
10
|
async generateMarkdown(input) {
|
|
11
11
|
const userContent = [];
|
|
12
|
-
|
|
12
|
+
const mediaObj = input.base64Data || input.base64PdfChunk;
|
|
13
|
+
if (mediaObj) {
|
|
14
|
+
const mime = input.mimeType || "application/pdf";
|
|
13
15
|
userContent.push({
|
|
14
16
|
type: "image_url",
|
|
15
|
-
imageUrl: { url: `data
|
|
17
|
+
imageUrl: { url: `data:${mime};base64,${mediaObj}` },
|
|
16
18
|
});
|
|
17
19
|
}
|
|
18
20
|
userContent.push({ type: "text", text: input.userText });
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"aiAdapters.js","sourceRoot":"","sources":["../../src/adapters/aiAdapters.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;
|
|
1
|
+
{"version":3,"file":"aiAdapters.js","sourceRoot":"","sources":["../../src/adapters/aiAdapters.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AAqB7C,wDAAwD;AAExD,MAAM,OAAO,oBAAoB;IACvB,MAAM,CAAa;IACnB,KAAK,CAAS;IAEtB,YAAY,MAAc,EAAE,KAAa;QACvC,IAAI,CAAC,MAAM,GAAG,IAAI,UAAU,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;QACzC,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;IACrB,CAAC;IAED,KAAK,CAAC,gBAAgB,CAAC,KAAe;QACpC,MAAM,WAAW,GAAU,EAAE,CAAC;QAE9B,MAAM,QAAQ,GAAG,KAAK,CAAC,UAAU,IAAI,KAAK,CAAC,cAAc,CAAC;QAE1D,IAAI,QAAQ,EAAE,CAAC;YACb,MAAM,IAAI,GAAG,KAAK,CAAC,QAAQ,IAAI,iBAAiB,CAAC;YACjD,WAAW,CAAC,IAAI,CAAC;gBACf,IAAI,EAAE,WAAW;gBACjB,QAAQ,EAAE,EAAE,GAAG,EAAE,QAAQ,IAAI,WAAW,QAAQ,EAAE,EAAE;aACrD,CAAC,CAAC;QACL,CAAC;QACD,WAAW,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,CAAC,QAAQ,EAAE,CAAC,CAAC;QAEzD,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC;YAC3C,oBAAoB,EAAE;gBACpB,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,QAAQ,EAAE;oBACR,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,KAAK,CAAC,YAAY,EAAE;oBAC/C,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,WAAkB,EAAE;iBAC9C;gBACD,WAAW,EAAE,CAAC;aACf;SACF,CAAC,CAAC;QAEH,kDAAkD;QAClD,MAAM,YAAY,GAAG,QAAe,CAAC;QACrC,MAAM,OAAO,GAAG,YAAY,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,OAAO,CAAC;QAE5D,IAAI,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC;YAC3B,OAAO,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,KAAK,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QACtF,CAAC;QAED,OAAO,CAAC,OAAO,OAAO,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;IAC7D,CAAC;CACF;AAED,MAAM,OAAO,0BAA0B;IAC7B,MAAM,CAAa;IACnB,KAAK,CAAS;IACd,UAAU,CAAS;IAE3B,YAAY,MAAc,EAAE,KAAa,EAAE,aAAqB,IAAI;QAClE,IAAI,CAAC,MAAM,GAAG,IAAI,UAAU,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;QACzC,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,UAAU,GAAG,UAAU,CAAC;IAC/B,CAAC;IAED,KAAK,CAAC,KAAK,CAAC,MAAgB;QAC1B,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC,QAAQ,CAAC;YACrD,WAAW,EAAE;gBACX,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,KAAK,EA
AE,MAAM;gBACb,UAAU,EAAE,IAAI,CAAC,UAAU;aAC5B;SACF,CAAC,CAAC;QAEH,IAAI,OAAO,QAAQ,KAAK,QAAQ,EAAE,CAAC;YACjC,MAAM,IAAI,KAAK,CAAC,kEAAkE,QAAQ,EAAE,CAAC,CAAC;QAChG,CAAC;QAED,8DAA8D;QAC9D,IAAI,cAAc,GAAG,QAAQ,CAAC,IAAI,CAAC;QACnC,IAAI,cAAc,CAAC,MAAM,GAAG,CAAC,IAAI,OAAO,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,KAAK,QAAQ,EAAE,CAAC;YAC7E,cAAc,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC,CAAM,EAAE,CAAM,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;QAC9E,CAAC;QAED,OAAO,cAAc,CAAC,GAAG,CAAC,CAAC,IAAS,EAAE,EAAE;YACtC,MAAM,GAAG,GAAG,IAAI,CAAC,SAAS,CAAC;YAC3B,IAAI,OAAO,GAAG,KAAK,QAAQ,EAAE,CAAC;gBAC3B,2EAA2E;gBAC3E,MAAM,IAAI,KAAK,CAAC,sDAAsD,CAAC,CAAC;YAC3E,CAAC;YACD,OAAO,GAAG,CAAC;QACb,CAAC,CAAC,CAAC;IACL,CAAC;CACF"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"constants.d.ts","sourceRoot":"","sources":["../../src/core/constants.ts"],"names":[],"mappings":"AAAA;;;GAGG;AACH,eAAO,MAAM,yBAAyB,UA2BrC,CAAC"}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Supported file extensions for document ingestion.
|
|
3
|
+
* This array can be used externally to validate files before sending them to the pipeline.
|
|
4
|
+
*/
|
|
5
|
+
export const SUPPORTED_FILE_EXTENSIONS = [
|
|
6
|
+
// PDFs
|
|
7
|
+
".pdf",
|
|
8
|
+
// Word processing
|
|
9
|
+
".docx",
|
|
10
|
+
".doc",
|
|
11
|
+
".rtf",
|
|
12
|
+
".odt",
|
|
13
|
+
".epub",
|
|
14
|
+
// Presentations
|
|
15
|
+
".pptx",
|
|
16
|
+
".ppt",
|
|
17
|
+
".odp",
|
|
18
|
+
// Spreadsheets and data
|
|
19
|
+
".xlsx",
|
|
20
|
+
".xls",
|
|
21
|
+
".csv",
|
|
22
|
+
// Text & web
|
|
23
|
+
".txt",
|
|
24
|
+
".html",
|
|
25
|
+
// Images
|
|
26
|
+
".jpg",
|
|
27
|
+
".jpeg",
|
|
28
|
+
".png",
|
|
29
|
+
".gif",
|
|
30
|
+
".webp",
|
|
31
|
+
".svg",
|
|
32
|
+
];
|
|
33
|
+
//# sourceMappingURL=constants.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"constants.js","sourceRoot":"","sources":["../../src/core/constants.ts"],"names":[],"mappings":"AAAA;;;GAGG;AACH,MAAM,CAAC,MAAM,yBAAyB,GAAG;IACvC,OAAO;IACP,MAAM;IACN,kBAAkB;IAClB,OAAO;IACP,MAAM;IACN,MAAM;IACN,MAAM;IACN,OAAO;IACP,gBAAgB;IAChB,OAAO;IACP,MAAM;IACN,MAAM;IACN,wBAAwB;IACxB,OAAO;IACP,MAAM;IACN,MAAM;IACN,aAAa;IACb,MAAM;IACN,OAAO;IACP,SAAS;IACT,MAAM;IACN,OAAO;IACP,MAAM;IACN,MAAM;IACN,OAAO;IACP,MAAM;CACP,CAAC"}
|
|
@@ -72,7 +72,7 @@ export declare function buildPipeline(): import("@langchain/langgraph").Compiled
|
|
|
72
72
|
(annotation: import("@langchain/langgraph").SingleReducer<number[][], number[][]>): import("@langchain/langgraph").BinaryOperatorAggregate<number[][], number[][]>;
|
|
73
73
|
Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
|
|
74
74
|
};
|
|
75
|
-
}>, "markdownMerger" | "markdownNormalizer" | "llmExtractionNode" | "__start__" | "fileTypeRouter" | "libreOfficeToPdf" | "pdfSplitter" | "textExtractorNode" | "saveMarkdown" | "markdownChunker" | "vectorEmbedderNode" | "vectorUpsertNode", {
|
|
75
|
+
}>, "markdownMerger" | "markdownNormalizer" | "llmExtractionNode" | "__start__" | "fileTypeRouter" | "libreOfficeToPdf" | "pdfSplitter" | "textExtractorNode" | "imageReaderNode" | "saveMarkdown" | "markdownChunker" | "vectorEmbedderNode" | "vectorUpsertNode", {
|
|
76
76
|
filePath: {
|
|
77
77
|
(): import("@langchain/langgraph").LastValue<string | undefined>;
|
|
78
78
|
(annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
|
|
@@ -225,7 +225,7 @@ export declare const graph: import("@langchain/langgraph").CompiledStateGraph<im
|
|
|
225
225
|
(annotation: import("@langchain/langgraph").SingleReducer<number[][], number[][]>): import("@langchain/langgraph").BinaryOperatorAggregate<number[][], number[][]>;
|
|
226
226
|
Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
|
|
227
227
|
};
|
|
228
|
-
}>, "markdownMerger" | "markdownNormalizer" | "llmExtractionNode" | "__start__" | "fileTypeRouter" | "libreOfficeToPdf" | "pdfSplitter" | "textExtractorNode" | "saveMarkdown" | "markdownChunker" | "vectorEmbedderNode" | "vectorUpsertNode", {
|
|
228
|
+
}>, "markdownMerger" | "markdownNormalizer" | "llmExtractionNode" | "__start__" | "fileTypeRouter" | "libreOfficeToPdf" | "pdfSplitter" | "textExtractorNode" | "imageReaderNode" | "saveMarkdown" | "markdownChunker" | "vectorEmbedderNode" | "vectorUpsertNode", {
|
|
229
229
|
filePath: {
|
|
230
230
|
(): import("@langchain/langgraph").LastValue<string | undefined>;
|
|
231
231
|
(annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"singleDocument.d.ts","sourceRoot":"","sources":["../../src/graphs/singleDocument.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"singleDocument.d.ts","sourceRoot":"","sources":["../../src/graphs/singleDocument.ts"],"names":[],"mappings":"AA4CA,wBAAgB,aAAa;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;mDAiE5B;AAED;;;GAGG;AACH,eAAO,MAAM,KAAK;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;kDAAkB,CAAC"}
|
|
@@ -11,6 +11,7 @@ import { vectorEmbedderNode } from "../nodes/vectorEmbedderNode.js";
|
|
|
11
11
|
import { vectorUpsertNode } from "../nodes/vectorUpsertNode.js";
|
|
12
12
|
import { saveMarkdown } from "../nodes/saveMarkdown.js";
|
|
13
13
|
import { libreOfficeToPdf } from "../nodes/libreOfficeToPdf.js";
|
|
14
|
+
import { imageReaderNode } from "../nodes/imageReaderNode.js";
|
|
14
15
|
/**
|
|
15
16
|
* Builds and compiles the Virstack Doc Ingest pipeline as a LangGraph StateGraph.
|
|
16
17
|
*
|
|
@@ -34,6 +35,7 @@ function dispatchPdfChunks(state) {
|
|
|
34
35
|
chunk,
|
|
35
36
|
index,
|
|
36
37
|
totalChunks: state.pdfChunks.length,
|
|
38
|
+
mimeType: state.mimeType,
|
|
37
39
|
});
|
|
38
40
|
});
|
|
39
41
|
}
|
|
@@ -48,6 +50,8 @@ export function buildPipeline() {
|
|
|
48
50
|
// ── Phase 2b: Text / Data Extraction Branch ──
|
|
49
51
|
.addNode("textExtractorNode", textExtractorNode)
|
|
50
52
|
.addNode("llmExtractionNode", llmExtractionNode)
|
|
53
|
+
// ── Phase 2c: Image Branch ──
|
|
54
|
+
.addNode("imageReaderNode", imageReaderNode)
|
|
51
55
|
// ── Phase 3: Normalization & Chunking ──
|
|
52
56
|
.addNode("markdownNormalizer", markdownNormalizer)
|
|
53
57
|
.addNode("saveMarkdown", saveMarkdown)
|
|
@@ -63,11 +67,13 @@ export function buildPipeline() {
|
|
|
63
67
|
pdf: "pdfSplitter",
|
|
64
68
|
convert: "libreOfficeToPdf",
|
|
65
69
|
extract: "textExtractorNode",
|
|
70
|
+
image: "imageReaderNode",
|
|
66
71
|
})
|
|
67
72
|
// Convert branch: LibreOffice → pdfSplitter → (joins PDF branch)
|
|
68
73
|
.addEdge("libreOfficeToPdf", "pdfSplitter")
|
|
69
|
-
// PDF
|
|
74
|
+
// PDF/Image unified dispatcher
|
|
70
75
|
.addConditionalEdges("pdfSplitter", dispatchPdfChunks, ["llmExtractionNode"])
|
|
76
|
+
.addConditionalEdges("imageReaderNode", dispatchPdfChunks, ["llmExtractionNode"])
|
|
71
77
|
// Unified Document/Text branch flow
|
|
72
78
|
.addEdge("textExtractorNode", "llmExtractionNode")
|
|
73
79
|
// After llmExtractionNode, conditionally merge PDF chunks or normalize Text
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"singleDocument.js","sourceRoot":"","sources":["../../src/graphs/singleDocument.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,GAAG,EAAE,IAAI,EAAE,MAAM,sBAAsB,CAAC;AAC7D,OAAO,EAAE,uBAAuB,EAAsB,MAAM,kBAAkB,CAAC;AAC/E,OAAO,EAAE,cAAc,EAAE,eAAe,EAAE,MAAM,4BAA4B,CAAC;AAC7E,OAAO,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAC;AACtD,OAAO,EAAE,iBAAiB,EAAE,aAAa,EAAE,MAAM,+BAA+B,CAAC;AACjF,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAE5D,OAAO,EAAE,iBAAiB,EAAE,MAAM,+BAA+B,CAAC;AAClE,OAAO,EAAE,kBAAkB,EAAE,MAAM,gCAAgC,CAAC;AACpE,OAAO,EAAE,eAAe,EAAE,MAAM,6BAA6B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,gCAAgC,CAAC;AACpE,OAAO,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAChE,OAAO,EAAE,YAAY,EAAE,MAAM,0BAA0B,CAAC;AACxD,OAAO,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;
|
|
1
|
+
{"version":3,"file":"singleDocument.js","sourceRoot":"","sources":["../../src/graphs/singleDocument.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,GAAG,EAAE,IAAI,EAAE,MAAM,sBAAsB,CAAC;AAC7D,OAAO,EAAE,uBAAuB,EAAsB,MAAM,kBAAkB,CAAC;AAC/E,OAAO,EAAE,cAAc,EAAE,eAAe,EAAE,MAAM,4BAA4B,CAAC;AAC7E,OAAO,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAC;AACtD,OAAO,EAAE,iBAAiB,EAAE,aAAa,EAAE,MAAM,+BAA+B,CAAC;AACjF,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAE5D,OAAO,EAAE,iBAAiB,EAAE,MAAM,+BAA+B,CAAC;AAClE,OAAO,EAAE,kBAAkB,EAAE,MAAM,gCAAgC,CAAC;AACpE,OAAO,EAAE,eAAe,EAAE,MAAM,6BAA6B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,gCAAgC,CAAC;AACpE,OAAO,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAChE,OAAO,EAAE,YAAY,EAAE,MAAM,0BAA0B,CAAC;AACxD,OAAO,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAChE,OAAO,EAAE,eAAe,EAAE,MAAM,6BAA6B,CAAC;AAE9D;;;;;;;;;GASG;AAEH;;GAEG;AACH,SAAS,iBAAiB,CAAC,KAAoB;IAC7C,IAAI,CAAC,KAAK,CAAC,SAAS,IAAI,KAAK,CAAC,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACrD,OAAO,CAAC,IAAI,CAAC,qDAAqD,CAAC,CAAC;QACpE,OAAO,EAAE,CAAC;IACZ,CAAC;IACD,OAAO,KAAK,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,KAAK,EAAE,EAAE;QAC1C,OAAO,IAAI,IAAI,CAAC,mBAAmB,EAAE;YACnC,KAAK;YACL,KAAK;YACL,WAAW,EAAE,KAAK,CAAC,SAAS,CAAC,MAAM;YACnC,QAAQ,EAAE,KAAK,CAAC,QAAQ;SACzB,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC;AACD,MAAM,UAAU,aAAa;IAC3B,MAAM,KAAK,GAAG,IAAI,UAAU,CAAC,uBAAuB,CAAC;QACnD,yBAAyB;SACxB,OAAO,CAAC,gBAAgB,EAAE,cAAc,CAAC;QAE1C,6BAA6B;SAC5B,OAAO,CAAC,kBAAkB,EAAE,gBAAgB,CAAC;SAC7C,OAAO,CAAC,aAAa,EAAE,WAAW,CAAC;SACnC,OAAO,CAAC,gBAAgB,EAAE,cAAc,CAAC;QAE1C,gDAAgD;SAC/C,OAAO,CAAC,mBAAmB,EAAE,iBAAiB,CAAC;SAC/C,OAAO,CAAC,mBAAmB,EAAE,iBAAiB,CAAC;QAEhD,+BAA+B;SAC9B,OAAO,CAAC,iBAAiB,EAAE,eAAe,CAAC;QAE5C,0CAA0C;SACzC,OAAO,CAAC,oBAAoB,EAAE,kBAAkB,CAAC;SACjD,OAAO,CAAC,cAAc,EAAE,YAAY,CAAC;SACrC,OAAO,CAAC,iBAAiB,EAAE,eAAe,CAAC;QAE5C,sCAAsC;SACrC,OAAO,CAAC,oBAAoB,EAAE,kBAAkB,CAAC;SACjD,OAAO,CAAC,kBAAkB,EAAE,gBAAgB,CAAC;QAE9C,cAAc;QACd,iBAAiB;SAChB,OAAO,CAAC,WAAW,EAAE,gBAAgB,CAAC;QAEvC,8BAA8B;SAC7B,mBAAmB,CAAC,gBAAgB,EAAE,eAAe,EAAE;QACtD,GAAG,EAAE,aAAa;QAClB,OAAO,E
AAE,kBAAkB;QAC3B,OAAO,EAAE,mBAAmB;QAC5B,KAAK,EAAE,iBAAiB;KACzB,CAAC;QAEF,iEAAiE;SAChE,OAAO,CAAC,kBAAkB,EAAE,aAAa,CAAC;QAE3C,+BAA+B;SAC9B,mBAAmB,CAAC,aAAa,EAAE,iBAAiB,EAAE,CAAC,mBAAmB,CAAC,CAAC;SAC5E,mBAAmB,CAAC,iBAAiB,EAAE,iBAAiB,EAAE,CAAC,mBAAmB,CAAC,CAAC;QAEjF,oCAAoC;SACnC,OAAO,CAAC,mBAAmB,EAAE,mBAAmB,CAAC;QAElD,4EAA4E;SAC3E,mBAAmB,CAAC,mBAAmB,EAAE,aAAa,EAAE;QACvD,cAAc,EAAE,gBAAgB;QAChC,kBAAkB,EAAE,oBAAoB;KACzC,CAAC;QAEF,+BAA+B;SAC9B,OAAO,CAAC,gBAAgB,EAAE,oBAAoB,CAAC;QAEhD,+DAA+D;SAC9D,OAAO,CAAC,oBAAoB,EAAE,cAAc,CAAC;SAC7C,OAAO,CAAC,cAAc,EAAE,iBAAiB,CAAC;SAC1C,OAAO,CAAC,iBAAiB,EAAE,oBAAoB,CAAC;SAChD,OAAO,CAAC,oBAAoB,EAAE,kBAAkB,CAAC;SACjD,OAAO,CAAC,kBAAkB,EAAE,GAAG,CAAC,CAAC;IAEpC,OAAO,KAAK,CAAC,OAAO,EAAE,CAAC;AACzB,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,MAAM,KAAK,GAAG,aAAa,EAAE,CAAC"}
|
package/dist/index.d.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
export { initializeConfig, type VirstackDocIngestConfig, } from "./core/config.js";
|
|
2
2
|
export { graph as batchGraph, BatchStateAnnotation, } from "./graphs/batchProcessor.js";
|
|
3
3
|
export { buildPipeline, graph as singleDocGraph, } from "./graphs/singleDocument.js";
|
|
4
|
+
export { SUPPORTED_FILE_EXTENSIONS } from "./core/constants.js";
|
|
4
5
|
export type { PipelineState } from "./core/state.js";
|
|
5
6
|
export type { BatchState } from "./graphs/batchProcessor.js";
|
|
6
7
|
export { type VectorStoreAdapter, type VectorRecord, UpstashAdapter, } from "./adapters/vectorStore.js";
|
package/dist/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,OAAO,EACL,gBAAgB,EAChB,KAAK,uBAAuB,GAC7B,MAAM,kBAAkB,CAAC;AAG1B,OAAO,EACL,KAAK,IAAI,UAAU,EACnB,oBAAoB,GACrB,MAAM,4BAA4B,CAAC;AACpC,OAAO,EACL,aAAa,EACb,KAAK,IAAI,cAAc,GACxB,MAAM,4BAA4B,CAAC;AAGpC,YAAY,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AACrD,YAAY,EAAE,UAAU,EAAE,MAAM,4BAA4B,CAAC;AAG7D,OAAO,EACL,KAAK,kBAAkB,EACvB,KAAK,YAAY,EACjB,cAAc,GACf,MAAM,2BAA2B,CAAC;AAGnC,OAAO,EACL,KAAK,UAAU,EACf,KAAK,QAAQ,EACb,KAAK,gBAAgB,EACrB,oBAAoB,EACpB,0BAA0B,GAC3B,MAAM,0BAA0B,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,OAAO,EACL,gBAAgB,EAChB,KAAK,uBAAuB,GAC7B,MAAM,kBAAkB,CAAC;AAG1B,OAAO,EACL,KAAK,IAAI,UAAU,EACnB,oBAAoB,GACrB,MAAM,4BAA4B,CAAC;AACpC,OAAO,EACL,aAAa,EACb,KAAK,IAAI,cAAc,GACxB,MAAM,4BAA4B,CAAC;AAGpC,OAAO,EAAE,yBAAyB,EAAE,MAAM,qBAAqB,CAAC;AAGhE,YAAY,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AACrD,YAAY,EAAE,UAAU,EAAE,MAAM,4BAA4B,CAAC;AAG7D,OAAO,EACL,KAAK,kBAAkB,EACvB,KAAK,YAAY,EACjB,cAAc,GACf,MAAM,2BAA2B,CAAC;AAGnC,OAAO,EACL,KAAK,UAAU,EACf,KAAK,QAAQ,EACb,KAAK,gBAAgB,EACrB,oBAAoB,EACpB,0BAA0B,GAC3B,MAAM,0BAA0B,CAAC"}
|
package/dist/index.js
CHANGED
|
@@ -3,6 +3,8 @@ export { initializeConfig, } from "./core/config.js";
|
|
|
3
3
|
// Export the processing graphs
|
|
4
4
|
export { graph as batchGraph, BatchStateAnnotation, } from "./graphs/batchProcessor.js";
|
|
5
5
|
export { buildPipeline, graph as singleDocGraph, } from "./graphs/singleDocument.js";
|
|
6
|
+
// Export the core constants
|
|
7
|
+
export { SUPPORTED_FILE_EXTENSIONS } from "./core/constants.js";
|
|
6
8
|
// Export vector store injection types and built-in adapters
|
|
7
9
|
export { UpstashAdapter, } from "./adapters/vectorStore.js";
|
|
8
10
|
// Export AI injection types and built-in adapter
|
package/dist/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,wBAAwB;AACxB,OAAO,EACL,gBAAgB,GAEjB,MAAM,kBAAkB,CAAC;AAE1B,+BAA+B;AAC/B,OAAO,EACL,KAAK,IAAI,UAAU,EACnB,oBAAoB,GACrB,MAAM,4BAA4B,CAAC;AACpC,OAAO,EACL,aAAa,EACb,KAAK,IAAI,cAAc,GACxB,MAAM,4BAA4B,CAAC;
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,wBAAwB;AACxB,OAAO,EACL,gBAAgB,GAEjB,MAAM,kBAAkB,CAAC;AAE1B,+BAA+B;AAC/B,OAAO,EACL,KAAK,IAAI,UAAU,EACnB,oBAAoB,GACrB,MAAM,4BAA4B,CAAC;AACpC,OAAO,EACL,aAAa,EACb,KAAK,IAAI,cAAc,GACxB,MAAM,4BAA4B,CAAC;AAEpC,4BAA4B;AAC5B,OAAO,EAAE,yBAAyB,EAAE,MAAM,qBAAqB,CAAC;AAMhE,4DAA4D;AAC5D,OAAO,EAGL,cAAc,GACf,MAAM,2BAA2B,CAAC;AAEnC,iDAAiD;AACjD,OAAO,EAIL,oBAAoB,EACpB,0BAA0B,GAC3B,MAAM,0BAA0B,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"fileTypeRouter.d.ts","sourceRoot":"","sources":["../../src/nodes/fileTypeRouter.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAGtD;;;GAGG;AACH,wBAAsB,cAAc,CAClC,KAAK,EAAE,aAAa,GACnB,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CAiBjC;AAED;;;;;;;GAOG;AACH,wBAAgB,eAAe,CAAC,KAAK,EAAE,aAAa,GAAG,MAAM,
|
|
1
|
+
{"version":3,"file":"fileTypeRouter.d.ts","sourceRoot":"","sources":["../../src/nodes/fileTypeRouter.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAGtD;;;GAGG;AACH,wBAAsB,cAAc,CAClC,KAAK,EAAE,aAAa,GACnB,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CAiBjC;AAED;;;;;;;GAOG;AACH,wBAAgB,eAAe,CAAC,KAAK,EAAE,aAAa,GAAG,MAAM,CA8D5D"}
|
|
@@ -65,6 +65,17 @@ export function routeByMimeType(state) {
|
|
|
65
65
|
if (mime === "text/plain" || mime === "text/html") {
|
|
66
66
|
return "extract";
|
|
67
67
|
}
|
|
68
|
+
// Images
|
|
69
|
+
const imageTypes = [
|
|
70
|
+
"image/jpeg",
|
|
71
|
+
"image/png",
|
|
72
|
+
"image/gif",
|
|
73
|
+
"image/webp",
|
|
74
|
+
"image/svg+xml"
|
|
75
|
+
];
|
|
76
|
+
if (mime && imageTypes.includes(mime)) {
|
|
77
|
+
return "image";
|
|
78
|
+
}
|
|
68
79
|
// Fallback: try to treat as text
|
|
69
80
|
logger.warn(LogSource.FILE_ROUTER, `Unknown MIME "${mime}", falling back to extract branch`);
|
|
70
81
|
return "extract";
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"fileTypeRouter.js","sourceRoot":"","sources":["../../src/nodes/fileTypeRouter.ts"],"names":[],"mappings":"AAAA,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,IAAI,MAAM,YAAY,CAAC;AAE9B,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAEtD;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,KAAoB;IAEpB,IAAI,KAAK,CAAC,OAAO,IAAI,CAAC,KAAK,CAAC,QAAQ,EAAE,CAAC;QACrC,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,WAAW,EAAE,sCAAsC,CAAC,CAAC;QAC3E,OAAO,EAAE,QAAQ,EAAE,YAAY,EAAE,CAAC;IACpC,CAAC;IAED,IAAI,CAAC,KAAK,CAAC,QAAQ,EAAE,CAAC;QACpB,MAAM,IAAI,KAAK,CAAC,+DAA+D,CAAC,CAAC;IACnF,CAAC;IAED,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAC;IACvD,MAAM,QAAQ,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,0BAA0B,CAAC;IAEhE,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,WAAW,EAAE,SAAS,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC;IAC7E,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,WAAW,EAAE,kBAAkB,QAAQ,EAAE,CAAC,CAAC;IAEjE,OAAO,EAAE,QAAQ,EAAE,QAAQ,EAAE,CAAC;AAChC,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,eAAe,CAAC,KAAoB;IAClD,sDAAsD;IACtD,IAAI,KAAK,CAAC,OAAO,IAAI,CAAC,KAAK,CAAC,QAAQ,EAAE,CAAC;QACrC,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,MAAM,IAAI,GAAG,KAAK,CAAC,QAAQ,CAAC;IAE5B,IAAI,IAAI,KAAK,iBAAiB,EAAE,CAAC;QAC/B,OAAO,KAAK,CAAC;IACf,CAAC;IAED,uDAAuD;IACvD,MAAM,gBAAgB,GAAG;QACvB,kBAAkB;QAClB,yEAAyE,EAAE,OAAO;QAClF,oBAAoB,EAAyD,MAAM;QACnF,iBAAiB,EAA4D,MAAM;QACnF,UAAU,EAAmE,gBAAgB;QAC7F,yCAAyC,EAAmC,MAAM;QAClF,sBAAsB,EAAuD,OAAO;QACpF,gBAAgB;QAChB,2EAA2E,EAAE,OAAO;QACpF,+BAA+B,EAA8C,MAAM;QACnF,iDAAiD,EAA2B,MAAM;KACnF,CAAC;IAEF,IAAI,IAAI,IAAI,gBAAgB,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;QAC5C,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,yDAAyD;IACzD,MAAM,WAAW,GAAG;QAClB,mEAAmE,EAAE,OAAO;QAC5E,0BAA0B,EAA4C,MAAM;QAC5E,UAAU;KACX,CAAC;IAEF,IAAI,IAAI,IAAI,WAAW,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;QACvC,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,IAAI,IAAI,KAAK,YAAY,IAAI,IAAI,KAAK,WAAW,EAAE,CAAC;QAClD,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,iCAAiC;IACjC,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,WAAW,EAAE,iBAAiB,IAAI,mCAAmC,CAAC,CAAC;IAC7F,OAAO,SAAS,CAAC;AACnB,CAAC"}
|
|
1
|
+
{"version":3,"file":"fileTypeRouter.js","sourceRoot":"","sources":["../../src/nodes/fileTypeRouter.ts"],"names":[],"mappings":"AAAA,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,IAAI,MAAM,YAAY,CAAC;AAE9B,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAEtD;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,KAAoB;IAEpB,IAAI,KAAK,CAAC,OAAO,IAAI,CAAC,KAAK,CAAC,QAAQ,EAAE,CAAC;QACrC,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,WAAW,EAAE,sCAAsC,CAAC,CAAC;QAC3E,OAAO,EAAE,QAAQ,EAAE,YAAY,EAAE,CAAC;IACpC,CAAC;IAED,IAAI,CAAC,KAAK,CAAC,QAAQ,EAAE,CAAC;QACpB,MAAM,IAAI,KAAK,CAAC,+DAA+D,CAAC,CAAC;IACnF,CAAC;IAED,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAC;IACvD,MAAM,QAAQ,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,0BAA0B,CAAC;IAEhE,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,WAAW,EAAE,SAAS,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC;IAC7E,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,WAAW,EAAE,kBAAkB,QAAQ,EAAE,CAAC,CAAC;IAEjE,OAAO,EAAE,QAAQ,EAAE,QAAQ,EAAE,CAAC;AAChC,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,eAAe,CAAC,KAAoB;IAClD,sDAAsD;IACtD,IAAI,KAAK,CAAC,OAAO,IAAI,CAAC,KAAK,CAAC,QAAQ,EAAE,CAAC;QACrC,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,MAAM,IAAI,GAAG,KAAK,CAAC,QAAQ,CAAC;IAE5B,IAAI,IAAI,KAAK,iBAAiB,EAAE,CAAC;QAC/B,OAAO,KAAK,CAAC;IACf,CAAC;IAED,uDAAuD;IACvD,MAAM,gBAAgB,GAAG;QACvB,kBAAkB;QAClB,yEAAyE,EAAE,OAAO;QAClF,oBAAoB,EAAyD,MAAM;QACnF,iBAAiB,EAA4D,MAAM;QACnF,UAAU,EAAmE,gBAAgB;QAC7F,yCAAyC,EAAmC,MAAM;QAClF,sBAAsB,EAAuD,OAAO;QACpF,gBAAgB;QAChB,2EAA2E,EAAE,OAAO;QACpF,+BAA+B,EAA8C,MAAM;QACnF,iDAAiD,EAA2B,MAAM;KACnF,CAAC;IAEF,IAAI,IAAI,IAAI,gBAAgB,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;QAC5C,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,yDAAyD;IACzD,MAAM,WAAW,GAAG;QAClB,mEAAmE,EAAE,OAAO;QAC5E,0BAA0B,EAA4C,MAAM;QAC5E,UAAU;KACX,CAAC;IAEF,IAAI,IAAI,IAAI,WAAW,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;QACvC,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,IAAI,IAAI,KAAK,YAAY,IAAI,IAAI,KAAK,WAAW,EAAE,CAAC;QAClD,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,SAAS;IACT,MAAM,UAAU,GAAG;QACjB,YAAY;QACZ,WAAW;QACX,WAAW;QACX,YAAY;QACZ,eAAe;KAChB,CAAC;IAEF,IAAI,IAAI,IAAI,UAAU,CAAC,QAAQ,CAAC,I
AAI,CAAC,EAAE,CAAC;QACtC,OAAO,OAAO,CAAC;IACjB,CAAC;IAED,iCAAiC;IACjC,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,WAAW,EAAE,iBAAiB,IAAI,mCAAmC,CAAC,CAAC;IAC7F,OAAO,SAAS,CAAC;AACnB,CAAC"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import type { PipelineState } from "../core/state.js";
|
|
2
|
+
/**
|
|
3
|
+
* Reads an image file and converts it into a base64 chunk.
|
|
4
|
+
* The resulting chunk is stored in `state.pdfChunks` so it can be
|
|
5
|
+
* processed generically by the same parallel LLM dispatch logic.
|
|
6
|
+
*/
|
|
7
|
+
export declare function imageReaderNode(state: PipelineState): Promise<Partial<PipelineState>>;
|
|
8
|
+
//# sourceMappingURL=imageReaderNode.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"imageReaderNode.d.ts","sourceRoot":"","sources":["../../src/nodes/imageReaderNode.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAItD;;;;GAIG;AACH,wBAAsB,eAAe,CACnC,KAAK,EAAE,aAAa,GACnB,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CAoBjC"}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import fs from "node:fs/promises";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { logger, LogSource } from "../core/logger.js";
|
|
4
|
+
import { requireInit } from "../core/config.js";
|
|
5
|
+
/**
|
|
6
|
+
* Reads an image file and converts it into a base64 chunk.
|
|
7
|
+
* The resulting chunk is stored in `state.pdfChunks` so it can be
|
|
8
|
+
* processed generically by the same parallel LLM dispatch logic.
|
|
9
|
+
*/
|
|
10
|
+
export async function imageReaderNode(state) {
|
|
11
|
+
requireInit();
|
|
12
|
+
if (!state.filePath)
|
|
13
|
+
throw new Error("[imageReaderNode] filePath is missing");
|
|
14
|
+
const fullPath = path.resolve(process.cwd(), state.filePath);
|
|
15
|
+
logger.info(LogSource.PDF_SPLITTER, `Reading image at: ${fullPath}`); // NOTE: reuses the PDF_SPLITTER log source for image reads; TODO: consider adding a dedicated image entry to LogSource.
|
|
16
|
+
let fileBuffer;
|
|
17
|
+
try {
|
|
18
|
+
fileBuffer = await fs.readFile(fullPath);
|
|
19
|
+
}
|
|
20
|
+
catch (err) {
|
|
21
|
+
throw new Error(`Failed to read image at ${fullPath}: ${err.message}`);
|
|
22
|
+
}
|
|
23
|
+
const base64Data = fileBuffer.toString("base64");
|
|
24
|
+
// We place it in pdfChunks so it uses the exact same parallel mapping logic
|
|
25
|
+
logger.info(LogSource.PDF_SPLITTER, `Created 1 image chunk from ${state.mimeType}`);
|
|
26
|
+
return { pdfChunks: [base64Data] };
|
|
27
|
+
}
|
|
28
|
+
//# sourceMappingURL=imageReaderNode.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"imageReaderNode.js","sourceRoot":"","sources":["../../src/nodes/imageReaderNode.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,kBAAkB,CAAC;AAClC,OAAO,IAAI,MAAM,WAAW,CAAC;AAE7B,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AACtD,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAEhD;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,KAAoB;IAEpB,WAAW,EAAE,CAAC;IAEd,IAAI,CAAC,KAAK,CAAC,QAAQ;QAAE,MAAM,IAAI,KAAK,CAAC,uCAAuC,CAAC,CAAC;IAC9E,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,KAAK,CAAC,QAAQ,CAAC,CAAC;IAC7D,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,YAAY,EAAE,qBAAqB,QAAQ,EAAE,CAAC,CAAC,CAAC,0FAA0F;IAEhK,IAAI,UAAU,CAAC;IACf,IAAI,CAAC;QACH,UAAU,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAC3C,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,MAAM,IAAI,KAAK,CAAC,2BAA2B,QAAQ,KAAK,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC;IACzE,CAAC;IAED,MAAM,UAAU,GAAG,UAAU,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAEjD,4EAA4E;IAC5E,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,YAAY,EAAE,8BAA8B,KAAK,CAAC,QAAQ,EAAE,CAAC,CAAC;IAEpF,OAAO,EAAE,SAAS,EAAE,CAAC,UAAU,CAAC,EAAE,CAAC;AACrC,CAAC"}
|
|
@@ -9,6 +9,7 @@ export declare function llmExtractionNode(state: Partial<PipelineState> & {
|
|
|
9
9
|
chunk?: string;
|
|
10
10
|
index?: number;
|
|
11
11
|
totalChunks?: number;
|
|
12
|
+
mimeType?: string;
|
|
12
13
|
}): Promise<Partial<PipelineState>>;
|
|
13
14
|
/**
|
|
14
15
|
* Conditional router to determine what happens after llmExtractionNode.
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"llmExtractionNode.d.ts","sourceRoot":"","sources":["../../src/nodes/llmExtractionNode.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAuBtD;;;;;GAKG;AACH,wBAAsB,iBAAiB,CACrC,KAAK,EAAE,OAAO,CAAC,aAAa,CAAC,GAAG;IAAE,KAAK,CAAC,EAAE,MAAM,CAAC;IAAC,KAAK,CAAC,EAAE,MAAM,CAAC;IAAC,WAAW,CAAC,EAAE,MAAM,CAAA;CAAE,
|
|
1
|
+
{"version":3,"file":"llmExtractionNode.d.ts","sourceRoot":"","sources":["../../src/nodes/llmExtractionNode.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAuBtD;;;;;GAKG;AACH,wBAAsB,iBAAiB,CACrC,KAAK,EAAE,OAAO,CAAC,aAAa,CAAC,GAAG;IAAE,KAAK,CAAC,EAAE,MAAM,CAAC;IAAC,KAAK,CAAC,EAAE,MAAM,CAAC;IAAC,WAAW,CAAC,EAAE,MAAM,CAAC;IAAC,QAAQ,CAAC,EAAE,MAAM,CAAA;CAAE,GAC1G,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CAwCjC;AAED;;;;GAIG;AACH,wBAAgB,aAAa,CAAC,KAAK,EAAE,aAAa,GAAG,MAAM,CAK1D"}
|
|
@@ -35,12 +35,13 @@ export async function llmExtractionNode(state) {
|
|
|
35
35
|
const promptInput = {
|
|
36
36
|
systemPrompt: finalSystemPrompt,
|
|
37
37
|
userText: isChunkFlow
|
|
38
|
-
? `Extract all content from this
|
|
38
|
+
? `Extract all content from this document/image (chunk ${state.index + 1} of ${state.totalChunks}) into clean Markdown.`
|
|
39
39
|
: `Convert the following extracted document text into clean Markdown:\n\n${state.rawText}`,
|
|
40
|
-
|
|
40
|
+
base64Data: isChunkFlow ? state.chunk : undefined,
|
|
41
|
+
mimeType: state.mimeType
|
|
41
42
|
};
|
|
42
43
|
if (isChunkFlow) {
|
|
43
|
-
logger.info(LogSource.LLM_EXTRACTION, `Processing
|
|
44
|
+
logger.info(LogSource.LLM_EXTRACTION, `Processing chunk ${state.index + 1}/${state.totalChunks} (${((state.chunk.length * 0.75) / 1024).toFixed(0)} KB)`);
|
|
44
45
|
}
|
|
45
46
|
else {
|
|
46
47
|
logger.info(LogSource.LLM_EXTRACTION, `Sending ${state.rawText.length} chars to generic LLM Adapter`);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"llmExtractionNode.js","sourceRoot":"","sources":["../../src/nodes/llmExtractionNode.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,cAAc,EAAE,QAAQ,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAE1E,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAGtD,MAAM,qBAAqB,GAAG;;;;;;;;;;;;;;;;;4GAiB8E,CAAC;AAE7G;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,
|
|
1
|
+
{"version":3,"file":"llmExtractionNode.js","sourceRoot":"","sources":["../../src/nodes/llmExtractionNode.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,cAAc,EAAE,QAAQ,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAE1E,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAGtD,MAAM,qBAAqB,GAAG;;;;;;;;;;;;;;;;;4GAiB8E,CAAC;AAE7G;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,KAA2G;IAG3G,WAAW,EAAE,CAAC;IAEd,MAAM,WAAW,GAAG,KAAK,CAAC,KAAK,KAAK,SAAS,IAAI,KAAK,CAAC,KAAK,KAAK,SAAS,IAAI,KAAK,CAAC,WAAW,KAAK,SAAS,CAAC;IAC9G,MAAM,UAAU,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC;IAEnC,IAAI,CAAC,WAAW,IAAI,CAAC,UAAU,EAAE,CAAC;QAChC,MAAM,IAAI,KAAK,CAAC,0EAA0E,CAAC,CAAC;IAC9F,CAAC;IAED,MAAM,iBAAiB,GAAG,cAAc,CAAC,YAAY,IAAI,qBAAqB,CAAC;IAE/E,MAAM,WAAW,GAAa;QAC5B,YAAY,EAAE,iBAAiB;QAC/B,QAAQ,EAAE,WAAW;YACnB,CAAC,CAAC,uDAAuD,KAAK,CAAC,KAAM,GAAG,CAAC,OAAO,KAAK,CAAC,WAAW,wBAAwB;YACzH,CAAC,CAAC,yEAAyE,KAAK,CAAC,OAAO,EAAE;QAC5F,UAAU,EAAE,WAAW,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;QACjD,QAAQ,EAAE,KAAK,CAAC,QAAQ;KACzB,CAAC;IAEF,IAAI,WAAW,EAAE,CAAC;QAChB,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,oBAAoB,KAAK,CAAC,KAAM,GAAG,CAAC,IAAI,KAAK,CAAC,WAAW,KAAK,CAAC,CAAC,KAAK,CAAC,KAAM,CAAC,MAAM,GAAG,IAAI,CAAC,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;IAC9J,CAAC;SAAM,CAAC;QACN,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,WAAW,KAAK,CAAC,OAAQ,CAAC,MAAM,+BAA+B,CAAC,CAAC;IACzG,CAAC;IAED,8DAA8D;IAC9D,MAAM,QAAQ,GAAG,MAAM,QAAQ,CAAC,GAAG,EAAE,CACnC,cAAc,CAAC,GAAG,CAAC,gBAAgB,CAAC,WAAW,CAAC,CACjD,CAAC;IAEF,IAAI,WAAW,EAAE,CAAC;QAChB,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,SAAS,KAAK,CAAC,KAAM,GAAG,CAAC,IAAI,KAAK,CAAC,WAAW,eAAe,QAAQ,CAAC,MAAM,SAAS,CAAC,CAAC;QAC7H,OAAO,EAAE,aAAa,EAAE,CAAC,QAAQ,CAAC,EAAE,CAAC;IACvC,CAAC;IAED,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,uBAAuB,QAAQ,CAAC,MAAM,QAAQ,CAAC,CAAC;IACtF,OAAO,EAAE,QAAQ,EAAE,CAAC;AACtB,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,aAAa,CAAC,KAAoB;IAChD,IAAI,KAAK,CAAC,aAAa,IAAI,KAAK,CAAC,aAAa,CAAC,MAAM,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,QAAQ,EAAE,CAAC;QAC7E,OAAO,gBAAgB,CAAC;IAC1B,CAAC;IACD,OAAO,oBAA
oB,CAAC;AAC9B,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,7 +1,11 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@virstack/doc-ingest",
|
|
3
|
-
"version": "1.0.1",
|
|
3
|
+
"version": "1.0.2",
|
|
4
4
|
"description": "A high-performance, parallelized document ingestion and vectorization pipeline.",
|
|
5
|
+
"repository": {
|
|
6
|
+
"type": "git",
|
|
7
|
+
"url": "https://github.com/virstack/virstack-doc-ingest"
|
|
8
|
+
},
|
|
5
9
|
"main": "./dist/index.js",
|
|
6
10
|
"types": "./dist/index.d.ts",
|
|
7
11
|
"type": "module",
|
|
@@ -52,4 +56,4 @@
|
|
|
52
56
|
"tsx": "^4.19.0",
|
|
53
57
|
"typescript": "^5.6.0"
|
|
54
58
|
}
|
|
55
|
-
}
|
|
59
|
+
}
|