@virstack/doc-ingest 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/README.md +16 -1
  2. package/dist/adapters/aiAdapters.d.ts +3 -0
  3. package/dist/adapters/aiAdapters.d.ts.map +1 -1
  4. package/dist/adapters/aiAdapters.js +4 -2
  5. package/dist/adapters/aiAdapters.js.map +1 -1
  6. package/dist/cli.js +0 -0
  7. package/dist/core/constants.d.ts +6 -0
  8. package/dist/core/constants.d.ts.map +1 -0
  9. package/dist/core/constants.js +33 -0
  10. package/dist/core/constants.js.map +1 -0
  11. package/dist/graphs/singleDocument.d.ts +2 -2
  12. package/dist/graphs/singleDocument.d.ts.map +1 -1
  13. package/dist/graphs/singleDocument.js +7 -1
  14. package/dist/graphs/singleDocument.js.map +1 -1
  15. package/dist/index.d.ts +1 -0
  16. package/dist/index.d.ts.map +1 -1
  17. package/dist/index.js +2 -0
  18. package/dist/index.js.map +1 -1
  19. package/dist/nodes/fileTypeRouter.d.ts.map +1 -1
  20. package/dist/nodes/fileTypeRouter.js +11 -0
  21. package/dist/nodes/fileTypeRouter.js.map +1 -1
  22. package/dist/nodes/imageReaderNode.d.ts +8 -0
  23. package/dist/nodes/imageReaderNode.d.ts.map +1 -0
  24. package/dist/nodes/imageReaderNode.js +28 -0
  25. package/dist/nodes/imageReaderNode.js.map +1 -0
  26. package/dist/nodes/llmExtractionNode.d.ts +1 -0
  27. package/dist/nodes/llmExtractionNode.d.ts.map +1 -1
  28. package/dist/nodes/llmExtractionNode.js +4 -3
  29. package/dist/nodes/llmExtractionNode.js.map +1 -1
  30. package/package.json +9 -4
  31. package/dist/aiAdapters.d.ts +0 -25
  32. package/dist/aiAdapters.d.ts.map +0 -1
  33. package/dist/aiAdapters.js +0 -50
  34. package/dist/aiAdapters.js.map +0 -1
  35. package/dist/assets/logo.png +0 -0
  36. package/dist/batchPipeline.d.ts +0 -52
  37. package/dist/batchPipeline.d.ts.map +0 -1
  38. package/dist/batchPipeline.js +0 -81
  39. package/dist/batchPipeline.js.map +0 -1
  40. package/dist/config.d.ts +0 -26
  41. package/dist/config.d.ts.map +0 -1
  42. package/dist/config.js +0 -97
  43. package/dist/config.js.map +0 -1
  44. package/dist/logger.d.ts +0 -24
  45. package/dist/logger.d.ts.map +0 -1
  46. package/dist/logger.js +0 -36
  47. package/dist/logger.js.map +0 -1
  48. package/dist/logo.d.ts +0 -2
  49. package/dist/logo.d.ts.map +0 -1
  50. package/dist/logo.js +0 -3
  51. package/dist/logo.js.map +0 -1
  52. package/dist/nodes/geminiExtraction.d.ts +0 -19
  53. package/dist/nodes/geminiExtraction.d.ts.map +0 -1
  54. package/dist/nodes/geminiExtraction.js +0 -87
  55. package/dist/nodes/geminiExtraction.js.map +0 -1
  56. package/dist/nodes/openrouterEmbedder.d.ts +0 -7
  57. package/dist/nodes/openrouterEmbedder.d.ts.map +0 -1
  58. package/dist/nodes/openrouterEmbedder.js +0 -31
  59. package/dist/nodes/openrouterEmbedder.js.map +0 -1
  60. package/dist/nodes/upstashUpsert.d.ts +0 -7
  61. package/dist/nodes/upstashUpsert.d.ts.map +0 -1
  62. package/dist/nodes/upstashUpsert.js +0 -45
  63. package/dist/nodes/upstashUpsert.js.map +0 -1
  64. package/dist/pipeline.d.ts +0 -303
  65. package/dist/pipeline.d.ts.map +0 -1
  66. package/dist/pipeline.js +0 -93
  67. package/dist/pipeline.js.map +0 -1
  68. package/dist/state.d.ts +0 -52
  69. package/dist/state.d.ts.map +0 -1
  70. package/dist/state.js +0 -27
  71. package/dist/state.js.map +0 -1
  72. package/dist/vectorStore.d.ts +0 -24
  73. package/dist/vectorStore.d.ts.map +0 -1
  74. package/dist/vectorStore.js +0 -22
  75. package/dist/vectorStore.js.map +0 -1
package/README.md CHANGED
@@ -8,7 +8,7 @@ Powered by **LangGraph** for resilient orchestration, **OpenRouter / Gemini** fo
8
8
 
9
9
  ## ✨ Key Features
10
10
 
11
- - **Universal Multi-Format Support:** Natively processes PDF, DOCX, XLSX, PPTX, CSV, TXT, HTML, and EPUB files.
11
+ - **Universal Multi-Format Support:** Natively processes PDF, DOCX, XLSX, PPTX, CSV, TXT, HTML, EPUB, and Images (JPG, JPEG, PNG, GIF, WEBP, SVG).
12
12
  - **Dual-Tier Parallelism:** Concurrently processes multiple files while simultaneously splitting and routing large PDFs into parallel Vision-API execution nodes.
13
13
  - **Smart Type Routing:** Automatically identifies MIME types and dynamically routes files to the most optimal, parser-specific extraction graph.
14
14
  - **Provider Agnostic Architecture:** Built entirely on Dependency Injection. Easily swap out LLMs, Embeddings, and Vector Databases (Pinecone, Qdrant, etc.) to fit your specific stack.
@@ -121,6 +121,21 @@ virstack-doc-ingest ./documents/ --verbose
121
121
 
122
122
  Virstack Doc Ingest is designed to be fully embedded into your own SaaS backends or ETL pipelines. It is rigidly decoupled from concrete implementations.
123
123
 
124
+ ### Validating Supported File Types
125
+
126
+ You can import the list of natively supported file extensions directly from the library to validate user uploads before sending them to the ingestion pipeline.
127
+
128
+ ```typescript
129
+ import { SUPPORTED_FILE_EXTENSIONS, batchGraph } from "@virstack/doc-ingest";
130
+
131
+ const fileExt = ".jpg"; // e.g. path.extname(file)
132
+
133
+ if (!SUPPORTED_FILE_EXTENSIONS.includes(fileExt.toLowerCase())) {
134
+ console.error(`Unsupported file type: ${fileExt}`);
135
+ // Return a 400 Bad Request to the user
136
+ }
137
+ ```
138
+
124
139
  ### Default Built-In Adapters
125
140
 
126
141
  The package exports fully functional adapters for typical stacks:
@@ -1,7 +1,10 @@
1
1
  export interface LlmInput {
2
2
  systemPrompt: string;
3
3
  userText: string;
4
+ /** @deprecated use base64Data instead */
4
5
  base64PdfChunk?: string;
6
+ base64Data?: string;
7
+ mimeType?: string;
5
8
  }
6
9
  export interface LlmAdapter {
7
10
  generateMarkdown(input: LlmInput): Promise<string>;
@@ -1 +1 @@
1
- {"version":3,"file":"aiAdapters.d.ts","sourceRoot":"","sources":["../../src/adapters/aiAdapters.ts"],"names":[],"mappings":"AAIA,MAAM,WAAW,QAAQ;IACvB,YAAY,EAAE,MAAM,CAAC;IACrB,QAAQ,EAAE,MAAM,CAAC;IACjB,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AAED,MAAM,WAAW,UAAU;IACzB,gBAAgB,CAAC,KAAK,EAAE,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;CACpD;AAED,MAAM,WAAW,gBAAgB;IAC/B,KAAK,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;CAC9C;AAID,qBAAa,oBAAqB,YAAW,UAAU;IACrD,OAAO,CAAC,MAAM,CAAa;IAC3B,OAAO,CAAC,KAAK,CAAS;gBAEV,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM;IAKnC,gBAAgB,CAAC,KAAK,EAAE,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC;CAgCzD;AAED,qBAAa,0BAA2B,YAAW,gBAAgB;IACjE,OAAO,CAAC,MAAM,CAAa;IAC3B,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,UAAU,CAAS;gBAEf,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,UAAU,GAAE,MAAa;IAM9D,KAAK,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC;CA4BnD"}
1
+ {"version":3,"file":"aiAdapters.d.ts","sourceRoot":"","sources":["../../src/adapters/aiAdapters.ts"],"names":[],"mappings":"AAIA,MAAM,WAAW,QAAQ;IACvB,YAAY,EAAE,MAAM,CAAC;IACrB,QAAQ,EAAE,MAAM,CAAC;IACjB,yCAAyC;IACzC,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,UAAU;IACzB,gBAAgB,CAAC,KAAK,EAAE,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;CACpD;AAED,MAAM,WAAW,gBAAgB;IAC/B,KAAK,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;CAC9C;AAID,qBAAa,oBAAqB,YAAW,UAAU;IACrD,OAAO,CAAC,MAAM,CAAa;IAC3B,OAAO,CAAC,KAAK,CAAS;gBAEV,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM;IAKnC,gBAAgB,CAAC,KAAK,EAAE,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC;CAmCzD;AAED,qBAAa,0BAA2B,YAAW,gBAAgB;IACjE,OAAO,CAAC,MAAM,CAAa;IAC3B,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,UAAU,CAAS;gBAEf,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,UAAU,GAAE,MAAa;IAM9D,KAAK,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC;CA4BnD"}
@@ -9,10 +9,12 @@ export class OpenRouterLlmAdapter {
9
9
  }
10
10
  async generateMarkdown(input) {
11
11
  const userContent = [];
12
- if (input.base64PdfChunk) {
12
+ const mediaObj = input.base64Data || input.base64PdfChunk;
13
+ if (mediaObj) {
14
+ const mime = input.mimeType || "application/pdf";
13
15
  userContent.push({
14
16
  type: "image_url",
15
- imageUrl: { url: `data:application/pdf;base64,${input.base64PdfChunk}` },
17
+ imageUrl: { url: `data:${mime};base64,${mediaObj}` },
16
18
  });
17
19
  }
18
20
  userContent.push({ type: "text", text: input.userText });
@@ -1 +1 @@
1
- {"version":3,"file":"aiAdapters.js","sourceRoot":"","sources":["../../src/adapters/aiAdapters.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AAkB7C,wDAAwD;AAExD,MAAM,OAAO,oBAAoB;IACvB,MAAM,CAAa;IACnB,KAAK,CAAS;IAEtB,YAAY,MAAc,EAAE,KAAa;QACvC,IAAI,CAAC,MAAM,GAAG,IAAI,UAAU,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;QACzC,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;IACrB,CAAC;IAED,KAAK,CAAC,gBAAgB,CAAC,KAAe;QACpC,MAAM,WAAW,GAAU,EAAE,CAAC;QAE9B,IAAI,KAAK,CAAC,cAAc,EAAE,CAAC;YACzB,WAAW,CAAC,IAAI,CAAC;gBACf,IAAI,EAAE,WAAW;gBACjB,QAAQ,EAAE,EAAE,GAAG,EAAE,+BAA+B,KAAK,CAAC,cAAc,EAAE,EAAE;aACzE,CAAC,CAAC;QACL,CAAC;QACD,WAAW,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,CAAC,QAAQ,EAAE,CAAC,CAAC;QAEzD,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC;YAC3C,oBAAoB,EAAE;gBACpB,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,QAAQ,EAAE;oBACR,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,KAAK,CAAC,YAAY,EAAE;oBAC/C,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,WAAkB,EAAE;iBAC9C;gBACD,WAAW,EAAE,CAAC;aACf;SACF,CAAC,CAAC;QAEH,kDAAkD;QAClD,MAAM,YAAY,GAAG,QAAe,CAAC;QACrC,MAAM,OAAO,GAAG,YAAY,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,OAAO,CAAC;QAE5D,IAAI,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC;YAC3B,OAAO,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,KAAK,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QACtF,CAAC;QAED,OAAO,CAAC,OAAO,OAAO,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;IAC7D,CAAC;CACF;AAED,MAAM,OAAO,0BAA0B;IAC7B,MAAM,CAAa;IACnB,KAAK,CAAS;IACd,UAAU,CAAS;IAE3B,YAAY,MAAc,EAAE,KAAa,EAAE,aAAqB,IAAI;QAClE,IAAI,CAAC,MAAM,GAAG,IAAI,UAAU,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;QACzC,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,UAAU,GAAG,UAAU,CAAC;IAC/B,CAAC;IAED,KAAK,CAAC,KAAK,CAAC,MAAgB;QAC1B,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC,QAAQ,CAAC;YACrD,WAAW,EAAE;gBACX,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,KAAK,EAAE,MAAM;gBACb,UAAU,EAAE,IAAI,CAAC,UAAU;aAC5B;SACF,CAAC,CAAC;QAEH,IAAI,OAAO,QAAQ,KAAK,QAAQ,EAAE,CAAC
;YACjC,MAAM,IAAI,KAAK,CAAC,kEAAkE,QAAQ,EAAE,CAAC,CAAC;QAChG,CAAC;QAED,8DAA8D;QAC9D,IAAI,cAAc,GAAG,QAAQ,CAAC,IAAI,CAAC;QACnC,IAAI,cAAc,CAAC,MAAM,GAAG,CAAC,IAAI,OAAO,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,KAAK,QAAQ,EAAE,CAAC;YAC7E,cAAc,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC,CAAM,EAAE,CAAM,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;QAC9E,CAAC;QAED,OAAO,cAAc,CAAC,GAAG,CAAC,CAAC,IAAS,EAAE,EAAE;YACtC,MAAM,GAAG,GAAG,IAAI,CAAC,SAAS,CAAC;YAC3B,IAAI,OAAO,GAAG,KAAK,QAAQ,EAAE,CAAC;gBAC3B,2EAA2E;gBAC3E,MAAM,IAAI,KAAK,CAAC,sDAAsD,CAAC,CAAC;YAC3E,CAAC;YACD,OAAO,GAAG,CAAC;QACb,CAAC,CAAC,CAAC;IACL,CAAC;CACF"}
1
+ {"version":3,"file":"aiAdapters.js","sourceRoot":"","sources":["../../src/adapters/aiAdapters.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AAqB7C,wDAAwD;AAExD,MAAM,OAAO,oBAAoB;IACvB,MAAM,CAAa;IACnB,KAAK,CAAS;IAEtB,YAAY,MAAc,EAAE,KAAa;QACvC,IAAI,CAAC,MAAM,GAAG,IAAI,UAAU,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;QACzC,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;IACrB,CAAC;IAED,KAAK,CAAC,gBAAgB,CAAC,KAAe;QACpC,MAAM,WAAW,GAAU,EAAE,CAAC;QAE9B,MAAM,QAAQ,GAAG,KAAK,CAAC,UAAU,IAAI,KAAK,CAAC,cAAc,CAAC;QAE1D,IAAI,QAAQ,EAAE,CAAC;YACb,MAAM,IAAI,GAAG,KAAK,CAAC,QAAQ,IAAI,iBAAiB,CAAC;YACjD,WAAW,CAAC,IAAI,CAAC;gBACf,IAAI,EAAE,WAAW;gBACjB,QAAQ,EAAE,EAAE,GAAG,EAAE,QAAQ,IAAI,WAAW,QAAQ,EAAE,EAAE;aACrD,CAAC,CAAC;QACL,CAAC;QACD,WAAW,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,CAAC,QAAQ,EAAE,CAAC,CAAC;QAEzD,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC;YAC3C,oBAAoB,EAAE;gBACpB,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,QAAQ,EAAE;oBACR,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,KAAK,CAAC,YAAY,EAAE;oBAC/C,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,WAAkB,EAAE;iBAC9C;gBACD,WAAW,EAAE,CAAC;aACf;SACF,CAAC,CAAC;QAEH,kDAAkD;QAClD,MAAM,YAAY,GAAG,QAAe,CAAC;QACrC,MAAM,OAAO,GAAG,YAAY,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,OAAO,CAAC;QAE5D,IAAI,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC;YAC3B,OAAO,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,KAAK,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QACtF,CAAC;QAED,OAAO,CAAC,OAAO,OAAO,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;IAC7D,CAAC;CACF;AAED,MAAM,OAAO,0BAA0B;IAC7B,MAAM,CAAa;IACnB,KAAK,CAAS;IACd,UAAU,CAAS;IAE3B,YAAY,MAAc,EAAE,KAAa,EAAE,aAAqB,IAAI;QAClE,IAAI,CAAC,MAAM,GAAG,IAAI,UAAU,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;QACzC,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,UAAU,GAAG,UAAU,CAAC;IAC/B,CAAC;IAED,KAAK,CAAC,KAAK,CAAC,MAAgB;QAC1B,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC,QAAQ,CAAC;YACrD,WAAW,EAAE;gBACX,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,KAAK,
EAAE,MAAM;gBACb,UAAU,EAAE,IAAI,CAAC,UAAU;aAC5B;SACF,CAAC,CAAC;QAEH,IAAI,OAAO,QAAQ,KAAK,QAAQ,EAAE,CAAC;YACjC,MAAM,IAAI,KAAK,CAAC,kEAAkE,QAAQ,EAAE,CAAC,CAAC;QAChG,CAAC;QAED,8DAA8D;QAC9D,IAAI,cAAc,GAAG,QAAQ,CAAC,IAAI,CAAC;QACnC,IAAI,cAAc,CAAC,MAAM,GAAG,CAAC,IAAI,OAAO,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,KAAK,QAAQ,EAAE,CAAC;YAC7E,cAAc,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC,CAAM,EAAE,CAAM,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;QAC9E,CAAC;QAED,OAAO,cAAc,CAAC,GAAG,CAAC,CAAC,IAAS,EAAE,EAAE;YACtC,MAAM,GAAG,GAAG,IAAI,CAAC,SAAS,CAAC;YAC3B,IAAI,OAAO,GAAG,KAAK,QAAQ,EAAE,CAAC;gBAC3B,2EAA2E;gBAC3E,MAAM,IAAI,KAAK,CAAC,sDAAsD,CAAC,CAAC;YAC3E,CAAC;YACD,OAAO,GAAG,CAAC;QACb,CAAC,CAAC,CAAC;IACL,CAAC;CACF"}
package/dist/cli.js CHANGED
File without changes
@@ -0,0 +1,6 @@
1
+ /**
2
+ * Supported file extensions for document ingestion.
3
+ * This array can be used externally to validate files before sending them to the pipeline.
4
+ */
5
+ export declare const SUPPORTED_FILE_EXTENSIONS: string[];
6
+ //# sourceMappingURL=constants.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"constants.d.ts","sourceRoot":"","sources":["../../src/core/constants.ts"],"names":[],"mappings":"AAAA;;;GAGG;AACH,eAAO,MAAM,yBAAyB,UA2BrC,CAAC"}
@@ -0,0 +1,33 @@
1
+ /**
2
+ * Supported file extensions for document ingestion.
3
+ * This array can be used externally to validate files before sending them to the pipeline.
4
+ */
5
+ export const SUPPORTED_FILE_EXTENSIONS = [
6
+ // PDFs
7
+ ".pdf",
8
+ // Word processing
9
+ ".docx",
10
+ ".doc",
11
+ ".rtf",
12
+ ".odt",
13
+ ".epub",
14
+ // Presentations
15
+ ".pptx",
16
+ ".ppt",
17
+ ".odp",
18
+ // Spreadsheets and data
19
+ ".xlsx",
20
+ ".xls",
21
+ ".csv",
22
+ // Text & web
23
+ ".txt",
24
+ ".html",
25
+ // Images
26
+ ".jpg",
27
+ ".jpeg",
28
+ ".png",
29
+ ".gif",
30
+ ".webp",
31
+ ".svg",
32
+ ];
33
+ //# sourceMappingURL=constants.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"constants.js","sourceRoot":"","sources":["../../src/core/constants.ts"],"names":[],"mappings":"AAAA;;;GAGG;AACH,MAAM,CAAC,MAAM,yBAAyB,GAAG;IACvC,OAAO;IACP,MAAM;IACN,kBAAkB;IAClB,OAAO;IACP,MAAM;IACN,MAAM;IACN,MAAM;IACN,OAAO;IACP,gBAAgB;IAChB,OAAO;IACP,MAAM;IACN,MAAM;IACN,wBAAwB;IACxB,OAAO;IACP,MAAM;IACN,MAAM;IACN,aAAa;IACb,MAAM;IACN,OAAO;IACP,SAAS;IACT,MAAM;IACN,OAAO;IACP,MAAM;IACN,MAAM;IACN,OAAO;IACP,MAAM;CACP,CAAC"}
@@ -72,7 +72,7 @@ export declare function buildPipeline(): import("@langchain/langgraph").Compiled
72
72
  (annotation: import("@langchain/langgraph").SingleReducer<number[][], number[][]>): import("@langchain/langgraph").BinaryOperatorAggregate<number[][], number[][]>;
73
73
  Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
74
74
  };
75
- }>, "markdownMerger" | "markdownNormalizer" | "llmExtractionNode" | "__start__" | "fileTypeRouter" | "libreOfficeToPdf" | "pdfSplitter" | "textExtractorNode" | "saveMarkdown" | "markdownChunker" | "vectorEmbedderNode" | "vectorUpsertNode", {
75
+ }>, "markdownMerger" | "markdownNormalizer" | "llmExtractionNode" | "__start__" | "fileTypeRouter" | "libreOfficeToPdf" | "pdfSplitter" | "textExtractorNode" | "imageReaderNode" | "saveMarkdown" | "markdownChunker" | "vectorEmbedderNode" | "vectorUpsertNode", {
76
76
  filePath: {
77
77
  (): import("@langchain/langgraph").LastValue<string | undefined>;
78
78
  (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
@@ -225,7 +225,7 @@ export declare const graph: import("@langchain/langgraph").CompiledStateGraph<im
225
225
  (annotation: import("@langchain/langgraph").SingleReducer<number[][], number[][]>): import("@langchain/langgraph").BinaryOperatorAggregate<number[][], number[][]>;
226
226
  Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
227
227
  };
228
- }>, "markdownMerger" | "markdownNormalizer" | "llmExtractionNode" | "__start__" | "fileTypeRouter" | "libreOfficeToPdf" | "pdfSplitter" | "textExtractorNode" | "saveMarkdown" | "markdownChunker" | "vectorEmbedderNode" | "vectorUpsertNode", {
228
+ }>, "markdownMerger" | "markdownNormalizer" | "llmExtractionNode" | "__start__" | "fileTypeRouter" | "libreOfficeToPdf" | "pdfSplitter" | "textExtractorNode" | "imageReaderNode" | "saveMarkdown" | "markdownChunker" | "vectorEmbedderNode" | "vectorUpsertNode", {
229
229
  filePath: {
230
230
  (): import("@langchain/langgraph").LastValue<string | undefined>;
231
231
  (annotation: import("@langchain/langgraph").SingleReducer<string | undefined, string | undefined>): import("@langchain/langgraph").BinaryOperatorAggregate<string | undefined, string | undefined>;
@@ -1 +1 @@
1
- {"version":3,"file":"singleDocument.d.ts","sourceRoot":"","sources":["../../src/graphs/singleDocument.ts"],"names":[],"mappings":"AA0CA,wBAAgB,aAAa;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;mDA4D5B;AAED;;;GAGG;AACH,eAAO,MAAM,KAAK;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;kDAAkB,CAAC"}
1
+ {"version":3,"file":"singleDocument.d.ts","sourceRoot":"","sources":["../../src/graphs/singleDocument.ts"],"names":[],"mappings":"AA4CA,wBAAgB,aAAa;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;mDAiE5B;AAED;;;GAGG;AACH,eAAO,MAAM,KAAK;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;kDAAkB,CAAC"}
@@ -11,6 +11,7 @@ import { vectorEmbedderNode } from "../nodes/vectorEmbedderNode.js";
11
11
  import { vectorUpsertNode } from "../nodes/vectorUpsertNode.js";
12
12
  import { saveMarkdown } from "../nodes/saveMarkdown.js";
13
13
  import { libreOfficeToPdf } from "../nodes/libreOfficeToPdf.js";
14
+ import { imageReaderNode } from "../nodes/imageReaderNode.js";
14
15
  /**
15
16
  * Builds and compiles the Virstack Doc Ingest pipeline as a LangGraph StateGraph.
16
17
  *
@@ -34,6 +35,7 @@ function dispatchPdfChunks(state) {
34
35
  chunk,
35
36
  index,
36
37
  totalChunks: state.pdfChunks.length,
38
+ mimeType: state.mimeType,
37
39
  });
38
40
  });
39
41
  }
@@ -48,6 +50,8 @@ export function buildPipeline() {
48
50
  // ── Phase 2b: Text / Data Extraction Branch ──
49
51
  .addNode("textExtractorNode", textExtractorNode)
50
52
  .addNode("llmExtractionNode", llmExtractionNode)
53
+ // ── Phase 2c: Image Branch ──
54
+ .addNode("imageReaderNode", imageReaderNode)
51
55
  // ── Phase 3: Normalization & Chunking ──
52
56
  .addNode("markdownNormalizer", markdownNormalizer)
53
57
  .addNode("saveMarkdown", saveMarkdown)
@@ -63,11 +67,13 @@ export function buildPipeline() {
63
67
  pdf: "pdfSplitter",
64
68
  convert: "libreOfficeToPdf",
65
69
  extract: "textExtractorNode",
70
+ image: "imageReaderNode",
66
71
  })
67
72
  // Convert branch: LibreOffice → pdfSplitter → (joins PDF branch)
68
73
  .addEdge("libreOfficeToPdf", "pdfSplitter")
69
- // PDF branch dispatcher
74
+ // PDF/Image unified dispatcher
70
75
  .addConditionalEdges("pdfSplitter", dispatchPdfChunks, ["llmExtractionNode"])
76
+ .addConditionalEdges("imageReaderNode", dispatchPdfChunks, ["llmExtractionNode"])
71
77
  // Unified Document/Text branch flow
72
78
  .addEdge("textExtractorNode", "llmExtractionNode")
73
79
  // After llmExtractionNode, conditionally merge PDF chunks or normalize Text
@@ -1 +1 @@
1
- {"version":3,"file":"singleDocument.js","sourceRoot":"","sources":["../../src/graphs/singleDocument.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,GAAG,EAAE,IAAI,EAAE,MAAM,sBAAsB,CAAC;AAC7D,OAAO,EAAE,uBAAuB,EAAsB,MAAM,kBAAkB,CAAC;AAC/E,OAAO,EAAE,cAAc,EAAE,eAAe,EAAE,MAAM,4BAA4B,CAAC;AAC7E,OAAO,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAC;AACtD,OAAO,EAAE,iBAAiB,EAAE,aAAa,EAAE,MAAM,+BAA+B,CAAC;AACjF,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAE5D,OAAO,EAAE,iBAAiB,EAAE,MAAM,+BAA+B,CAAC;AAClE,OAAO,EAAE,kBAAkB,EAAE,MAAM,gCAAgC,CAAC;AACpE,OAAO,EAAE,eAAe,EAAE,MAAM,6BAA6B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,gCAAgC,CAAC;AACpE,OAAO,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAChE,OAAO,EAAE,YAAY,EAAE,MAAM,0BAA0B,CAAC;AACxD,OAAO,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAEhE;;;;;;;;;GASG;AAEH;;GAEG;AACH,SAAS,iBAAiB,CAAC,KAAoB;IAC7C,IAAI,CAAC,KAAK,CAAC,SAAS,IAAI,KAAK,CAAC,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACrD,OAAO,CAAC,IAAI,CAAC,qDAAqD,CAAC,CAAC;QACpE,OAAO,EAAE,CAAC;IACZ,CAAC;IACD,OAAO,KAAK,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,KAAK,EAAE,EAAE;QAC1C,OAAO,IAAI,IAAI,CAAC,mBAAmB,EAAE;YACnC,KAAK;YACL,KAAK;YACL,WAAW,EAAE,KAAK,CAAC,SAAS,CAAC,MAAM;SACpC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC;AACD,MAAM,UAAU,aAAa;IAC3B,MAAM,KAAK,GAAG,IAAI,UAAU,CAAC,uBAAuB,CAAC;QACnD,yBAAyB;SACxB,OAAO,CAAC,gBAAgB,EAAE,cAAc,CAAC;QAE1C,6BAA6B;SAC5B,OAAO,CAAC,kBAAkB,EAAE,gBAAgB,CAAC;SAC7C,OAAO,CAAC,aAAa,EAAE,WAAW,CAAC;SACnC,OAAO,CAAC,gBAAgB,EAAE,cAAc,CAAC;QAE1C,gDAAgD;SAC/C,OAAO,CAAC,mBAAmB,EAAE,iBAAiB,CAAC;SAC/C,OAAO,CAAC,mBAAmB,EAAE,iBAAiB,CAAC;QAEhD,0CAA0C;SACzC,OAAO,CAAC,oBAAoB,EAAE,kBAAkB,CAAC;SACjD,OAAO,CAAC,cAAc,EAAE,YAAY,CAAC;SACrC,OAAO,CAAC,iBAAiB,EAAE,eAAe,CAAC;QAE5C,sCAAsC;SACrC,OAAO,CAAC,oBAAoB,EAAE,kBAAkB,CAAC;SACjD,OAAO,CAAC,kBAAkB,EAAE,gBAAgB,CAAC;QAE9C,cAAc;QACd,iBAAiB;SAChB,OAAO,CAAC,WAAW,EAAE,gBAAgB,CAAC;QAEvC,8BAA8B;SAC7B,mBAAmB,CAAC,gBAAgB,EAAE,eAAe,EAAE;QACtD,GAAG,EAAE,aAAa;QAClB,OAAO,EAAE,kBAAkB;QAC3B,OAAO,EAAE,mBAAmB;KAC7B,CAAC;QAEF,iEAAiE;SAChE,OAAO,CAAC,kBAAkB,EAAE,aAAa,CAAC;QAE3C,wBAAwB;SACvB,mBAAmB,CA
AC,aAAa,EAAE,iBAAiB,EAAE,CAAC,mBAAmB,CAAC,CAAC;QAE7E,oCAAoC;SACnC,OAAO,CAAC,mBAAmB,EAAE,mBAAmB,CAAC;QAElD,4EAA4E;SAC3E,mBAAmB,CAAC,mBAAmB,EAAE,aAAa,EAAE;QACvD,cAAc,EAAE,gBAAgB;QAChC,kBAAkB,EAAE,oBAAoB;KACzC,CAAC;QAEF,+BAA+B;SAC9B,OAAO,CAAC,gBAAgB,EAAE,oBAAoB,CAAC;QAEhD,+DAA+D;SAC9D,OAAO,CAAC,oBAAoB,EAAE,cAAc,CAAC;SAC7C,OAAO,CAAC,cAAc,EAAE,iBAAiB,CAAC;SAC1C,OAAO,CAAC,iBAAiB,EAAE,oBAAoB,CAAC;SAChD,OAAO,CAAC,oBAAoB,EAAE,kBAAkB,CAAC;SACjD,OAAO,CAAC,kBAAkB,EAAE,GAAG,CAAC,CAAC;IAEpC,OAAO,KAAK,CAAC,OAAO,EAAE,CAAC;AACzB,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,MAAM,KAAK,GAAG,aAAa,EAAE,CAAC"}
1
+ {"version":3,"file":"singleDocument.js","sourceRoot":"","sources":["../../src/graphs/singleDocument.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,GAAG,EAAE,IAAI,EAAE,MAAM,sBAAsB,CAAC;AAC7D,OAAO,EAAE,uBAAuB,EAAsB,MAAM,kBAAkB,CAAC;AAC/E,OAAO,EAAE,cAAc,EAAE,eAAe,EAAE,MAAM,4BAA4B,CAAC;AAC7E,OAAO,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAC;AACtD,OAAO,EAAE,iBAAiB,EAAE,aAAa,EAAE,MAAM,+BAA+B,CAAC;AACjF,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAE5D,OAAO,EAAE,iBAAiB,EAAE,MAAM,+BAA+B,CAAC;AAClE,OAAO,EAAE,kBAAkB,EAAE,MAAM,gCAAgC,CAAC;AACpE,OAAO,EAAE,eAAe,EAAE,MAAM,6BAA6B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,gCAAgC,CAAC;AACpE,OAAO,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAChE,OAAO,EAAE,YAAY,EAAE,MAAM,0BAA0B,CAAC;AACxD,OAAO,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAChE,OAAO,EAAE,eAAe,EAAE,MAAM,6BAA6B,CAAC;AAE9D;;;;;;;;;GASG;AAEH;;GAEG;AACH,SAAS,iBAAiB,CAAC,KAAoB;IAC7C,IAAI,CAAC,KAAK,CAAC,SAAS,IAAI,KAAK,CAAC,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACrD,OAAO,CAAC,IAAI,CAAC,qDAAqD,CAAC,CAAC;QACpE,OAAO,EAAE,CAAC;IACZ,CAAC;IACD,OAAO,KAAK,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,KAAK,EAAE,EAAE;QAC1C,OAAO,IAAI,IAAI,CAAC,mBAAmB,EAAE;YACnC,KAAK;YACL,KAAK;YACL,WAAW,EAAE,KAAK,CAAC,SAAS,CAAC,MAAM;YACnC,QAAQ,EAAE,KAAK,CAAC,QAAQ;SACzB,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC;AACD,MAAM,UAAU,aAAa;IAC3B,MAAM,KAAK,GAAG,IAAI,UAAU,CAAC,uBAAuB,CAAC;QACnD,yBAAyB;SACxB,OAAO,CAAC,gBAAgB,EAAE,cAAc,CAAC;QAE1C,6BAA6B;SAC5B,OAAO,CAAC,kBAAkB,EAAE,gBAAgB,CAAC;SAC7C,OAAO,CAAC,aAAa,EAAE,WAAW,CAAC;SACnC,OAAO,CAAC,gBAAgB,EAAE,cAAc,CAAC;QAE1C,gDAAgD;SAC/C,OAAO,CAAC,mBAAmB,EAAE,iBAAiB,CAAC;SAC/C,OAAO,CAAC,mBAAmB,EAAE,iBAAiB,CAAC;QAEhD,+BAA+B;SAC9B,OAAO,CAAC,iBAAiB,EAAE,eAAe,CAAC;QAE5C,0CAA0C;SACzC,OAAO,CAAC,oBAAoB,EAAE,kBAAkB,CAAC;SACjD,OAAO,CAAC,cAAc,EAAE,YAAY,CAAC;SACrC,OAAO,CAAC,iBAAiB,EAAE,eAAe,CAAC;QAE5C,sCAAsC;SACrC,OAAO,CAAC,oBAAoB,EAAE,kBAAkB,CAAC;SACjD,OAAO,CAAC,kBAAkB,EAAE,gBAAgB,CAAC;QAE9C,cAAc;QACd,iBAAiB;SAChB,OAAO,CAAC,WAAW,EAAE,gBAAgB,CAAC;QAEvC,8BAA8B;SAC7B,mBAAmB,CAAC,gBAAgB,EAAE,eAAe,EAAE;QACtD,GAAG,EAAE,aAAa;QAClB,OAAO
,EAAE,kBAAkB;QAC3B,OAAO,EAAE,mBAAmB;QAC5B,KAAK,EAAE,iBAAiB;KACzB,CAAC;QAEF,iEAAiE;SAChE,OAAO,CAAC,kBAAkB,EAAE,aAAa,CAAC;QAE3C,+BAA+B;SAC9B,mBAAmB,CAAC,aAAa,EAAE,iBAAiB,EAAE,CAAC,mBAAmB,CAAC,CAAC;SAC5E,mBAAmB,CAAC,iBAAiB,EAAE,iBAAiB,EAAE,CAAC,mBAAmB,CAAC,CAAC;QAEjF,oCAAoC;SACnC,OAAO,CAAC,mBAAmB,EAAE,mBAAmB,CAAC;QAElD,4EAA4E;SAC3E,mBAAmB,CAAC,mBAAmB,EAAE,aAAa,EAAE;QACvD,cAAc,EAAE,gBAAgB;QAChC,kBAAkB,EAAE,oBAAoB;KACzC,CAAC;QAEF,+BAA+B;SAC9B,OAAO,CAAC,gBAAgB,EAAE,oBAAoB,CAAC;QAEhD,+DAA+D;SAC9D,OAAO,CAAC,oBAAoB,EAAE,cAAc,CAAC;SAC7C,OAAO,CAAC,cAAc,EAAE,iBAAiB,CAAC;SAC1C,OAAO,CAAC,iBAAiB,EAAE,oBAAoB,CAAC;SAChD,OAAO,CAAC,oBAAoB,EAAE,kBAAkB,CAAC;SACjD,OAAO,CAAC,kBAAkB,EAAE,GAAG,CAAC,CAAC;IAEpC,OAAO,KAAK,CAAC,OAAO,EAAE,CAAC;AACzB,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,MAAM,KAAK,GAAG,aAAa,EAAE,CAAC"}
package/dist/index.d.ts CHANGED
@@ -1,6 +1,7 @@
1
1
  export { initializeConfig, type VirstackDocIngestConfig, } from "./core/config.js";
2
2
  export { graph as batchGraph, BatchStateAnnotation, } from "./graphs/batchProcessor.js";
3
3
  export { buildPipeline, graph as singleDocGraph, } from "./graphs/singleDocument.js";
4
+ export { SUPPORTED_FILE_EXTENSIONS } from "./core/constants.js";
4
5
  export type { PipelineState } from "./core/state.js";
5
6
  export type { BatchState } from "./graphs/batchProcessor.js";
6
7
  export { type VectorStoreAdapter, type VectorRecord, UpstashAdapter, } from "./adapters/vectorStore.js";
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,OAAO,EACL,gBAAgB,EAChB,KAAK,uBAAuB,GAC7B,MAAM,kBAAkB,CAAC;AAG1B,OAAO,EACL,KAAK,IAAI,UAAU,EACnB,oBAAoB,GACrB,MAAM,4BAA4B,CAAC;AACpC,OAAO,EACL,aAAa,EACb,KAAK,IAAI,cAAc,GACxB,MAAM,4BAA4B,CAAC;AAGpC,YAAY,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AACrD,YAAY,EAAE,UAAU,EAAE,MAAM,4BAA4B,CAAC;AAG7D,OAAO,EACL,KAAK,kBAAkB,EACvB,KAAK,YAAY,EACjB,cAAc,GACf,MAAM,2BAA2B,CAAC;AAGnC,OAAO,EACL,KAAK,UAAU,EACf,KAAK,QAAQ,EACb,KAAK,gBAAgB,EACrB,oBAAoB,EACpB,0BAA0B,GAC3B,MAAM,0BAA0B,CAAC"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,OAAO,EACL,gBAAgB,EAChB,KAAK,uBAAuB,GAC7B,MAAM,kBAAkB,CAAC;AAG1B,OAAO,EACL,KAAK,IAAI,UAAU,EACnB,oBAAoB,GACrB,MAAM,4BAA4B,CAAC;AACpC,OAAO,EACL,aAAa,EACb,KAAK,IAAI,cAAc,GACxB,MAAM,4BAA4B,CAAC;AAGpC,OAAO,EAAE,yBAAyB,EAAE,MAAM,qBAAqB,CAAC;AAGhE,YAAY,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AACrD,YAAY,EAAE,UAAU,EAAE,MAAM,4BAA4B,CAAC;AAG7D,OAAO,EACL,KAAK,kBAAkB,EACvB,KAAK,YAAY,EACjB,cAAc,GACf,MAAM,2BAA2B,CAAC;AAGnC,OAAO,EACL,KAAK,UAAU,EACf,KAAK,QAAQ,EACb,KAAK,gBAAgB,EACrB,oBAAoB,EACpB,0BAA0B,GAC3B,MAAM,0BAA0B,CAAC"}
package/dist/index.js CHANGED
@@ -3,6 +3,8 @@ export { initializeConfig, } from "./core/config.js";
3
3
  // Export the processing graphs
4
4
  export { graph as batchGraph, BatchStateAnnotation, } from "./graphs/batchProcessor.js";
5
5
  export { buildPipeline, graph as singleDocGraph, } from "./graphs/singleDocument.js";
6
+ // Export the core constants
7
+ export { SUPPORTED_FILE_EXTENSIONS } from "./core/constants.js";
6
8
  // Export vector store injection types and built-in adapters
7
9
  export { UpstashAdapter, } from "./adapters/vectorStore.js";
8
10
  // Export AI injection types and built-in adapter
package/dist/index.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,wBAAwB;AACxB,OAAO,EACL,gBAAgB,GAEjB,MAAM,kBAAkB,CAAC;AAE1B,+BAA+B;AAC/B,OAAO,EACL,KAAK,IAAI,UAAU,EACnB,oBAAoB,GACrB,MAAM,4BAA4B,CAAC;AACpC,OAAO,EACL,aAAa,EACb,KAAK,IAAI,cAAc,GACxB,MAAM,4BAA4B,CAAC;AAMpC,4DAA4D;AAC5D,OAAO,EAGL,cAAc,GACf,MAAM,2BAA2B,CAAC;AAEnC,iDAAiD;AACjD,OAAO,EAIL,oBAAoB,EACpB,0BAA0B,GAC3B,MAAM,0BAA0B,CAAC"}
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,wBAAwB;AACxB,OAAO,EACL,gBAAgB,GAEjB,MAAM,kBAAkB,CAAC;AAE1B,+BAA+B;AAC/B,OAAO,EACL,KAAK,IAAI,UAAU,EACnB,oBAAoB,GACrB,MAAM,4BAA4B,CAAC;AACpC,OAAO,EACL,aAAa,EACb,KAAK,IAAI,cAAc,GACxB,MAAM,4BAA4B,CAAC;AAEpC,4BAA4B;AAC5B,OAAO,EAAE,yBAAyB,EAAE,MAAM,qBAAqB,CAAC;AAMhE,4DAA4D;AAC5D,OAAO,EAGL,cAAc,GACf,MAAM,2BAA2B,CAAC;AAEnC,iDAAiD;AACjD,OAAO,EAIL,oBAAoB,EACpB,0BAA0B,GAC3B,MAAM,0BAA0B,CAAC"}
@@ -1 +1 @@
1
- {"version":3,"file":"fileTypeRouter.d.ts","sourceRoot":"","sources":["../../src/nodes/fileTypeRouter.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAGtD;;;GAGG;AACH,wBAAsB,cAAc,CAClC,KAAK,EAAE,aAAa,GACnB,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CAiBjC;AAED;;;;;;;GAOG;AACH,wBAAgB,eAAe,CAAC,KAAK,EAAE,aAAa,GAAG,MAAM,CAiD5D"}
1
+ {"version":3,"file":"fileTypeRouter.d.ts","sourceRoot":"","sources":["../../src/nodes/fileTypeRouter.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAGtD;;;GAGG;AACH,wBAAsB,cAAc,CAClC,KAAK,EAAE,aAAa,GACnB,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CAiBjC;AAED;;;;;;;GAOG;AACH,wBAAgB,eAAe,CAAC,KAAK,EAAE,aAAa,GAAG,MAAM,CA8D5D"}
@@ -65,6 +65,17 @@ export function routeByMimeType(state) {
65
65
  if (mime === "text/plain" || mime === "text/html") {
66
66
  return "extract";
67
67
  }
68
+ // Images
69
+ const imageTypes = [
70
+ "image/jpeg",
71
+ "image/png",
72
+ "image/gif",
73
+ "image/webp",
74
+ "image/svg+xml"
75
+ ];
76
+ if (mime && imageTypes.includes(mime)) {
77
+ return "image";
78
+ }
68
79
  // Fallback: try to treat as text
69
80
  logger.warn(LogSource.FILE_ROUTER, `Unknown MIME "${mime}", falling back to extract branch`);
70
81
  return "extract";
@@ -1 +1 @@
1
- {"version":3,"file":"fileTypeRouter.js","sourceRoot":"","sources":["../../src/nodes/fileTypeRouter.ts"],"names":[],"mappings":"AAAA,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,IAAI,MAAM,YAAY,CAAC;AAE9B,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAEtD;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,KAAoB;IAEpB,IAAI,KAAK,CAAC,OAAO,IAAI,CAAC,KAAK,CAAC,QAAQ,EAAE,CAAC;QACrC,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,WAAW,EAAE,sCAAsC,CAAC,CAAC;QAC3E,OAAO,EAAE,QAAQ,EAAE,YAAY,EAAE,CAAC;IACpC,CAAC;IAED,IAAI,CAAC,KAAK,CAAC,QAAQ,EAAE,CAAC;QACpB,MAAM,IAAI,KAAK,CAAC,+DAA+D,CAAC,CAAC;IACnF,CAAC;IAED,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAC;IACvD,MAAM,QAAQ,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,0BAA0B,CAAC;IAEhE,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,WAAW,EAAE,SAAS,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC;IAC7E,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,WAAW,EAAE,kBAAkB,QAAQ,EAAE,CAAC,CAAC;IAEjE,OAAO,EAAE,QAAQ,EAAE,QAAQ,EAAE,CAAC;AAChC,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,eAAe,CAAC,KAAoB;IAClD,sDAAsD;IACtD,IAAI,KAAK,CAAC,OAAO,IAAI,CAAC,KAAK,CAAC,QAAQ,EAAE,CAAC;QACrC,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,MAAM,IAAI,GAAG,KAAK,CAAC,QAAQ,CAAC;IAE5B,IAAI,IAAI,KAAK,iBAAiB,EAAE,CAAC;QAC/B,OAAO,KAAK,CAAC;IACf,CAAC;IAED,uDAAuD;IACvD,MAAM,gBAAgB,GAAG;QACvB,kBAAkB;QAClB,yEAAyE,EAAE,OAAO;QAClF,oBAAoB,EAAyD,MAAM;QACnF,iBAAiB,EAA4D,MAAM;QACnF,UAAU,EAAmE,gBAAgB;QAC7F,yCAAyC,EAAmC,MAAM;QAClF,sBAAsB,EAAuD,OAAO;QACpF,gBAAgB;QAChB,2EAA2E,EAAE,OAAO;QACpF,+BAA+B,EAA8C,MAAM;QACnF,iDAAiD,EAA2B,MAAM;KACnF,CAAC;IAEF,IAAI,IAAI,IAAI,gBAAgB,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;QAC5C,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,yDAAyD;IACzD,MAAM,WAAW,GAAG;QAClB,mEAAmE,EAAE,OAAO;QAC5E,0BAA0B,EAA4C,MAAM;QAC5E,UAAU;KACX,CAAC;IAEF,IAAI,IAAI,IAAI,WAAW,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;QACvC,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,IAAI,IAAI,KAAK,YAAY,IAAI,IAAI,KAAK,WAAW,EAAE,CAAC;QAClD,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,iCAAiC;IACjC,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,WAAW,EAAE,iBAAiB,IAAI,mCAAmC,CAAC,CAAC;IAC7F,OAAO,SAAS,CAAC;AACnB,CAAC"}
1
+ {"version":3,"file":"fileTypeRouter.js","sourceRoot":"","sources":["../../src/nodes/fileTypeRouter.ts"],"names":[],"mappings":"AAAA,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,IAAI,MAAM,YAAY,CAAC;AAE9B,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAEtD;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,KAAoB;IAEpB,IAAI,KAAK,CAAC,OAAO,IAAI,CAAC,KAAK,CAAC,QAAQ,EAAE,CAAC;QACrC,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,WAAW,EAAE,sCAAsC,CAAC,CAAC;QAC3E,OAAO,EAAE,QAAQ,EAAE,YAAY,EAAE,CAAC;IACpC,CAAC;IAED,IAAI,CAAC,KAAK,CAAC,QAAQ,EAAE,CAAC;QACpB,MAAM,IAAI,KAAK,CAAC,+DAA+D,CAAC,CAAC;IACnF,CAAC;IAED,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAC;IACvD,MAAM,QAAQ,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,0BAA0B,CAAC;IAEhE,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,WAAW,EAAE,SAAS,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC;IAC7E,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,WAAW,EAAE,kBAAkB,QAAQ,EAAE,CAAC,CAAC;IAEjE,OAAO,EAAE,QAAQ,EAAE,QAAQ,EAAE,CAAC;AAChC,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,eAAe,CAAC,KAAoB;IAClD,sDAAsD;IACtD,IAAI,KAAK,CAAC,OAAO,IAAI,CAAC,KAAK,CAAC,QAAQ,EAAE,CAAC;QACrC,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,MAAM,IAAI,GAAG,KAAK,CAAC,QAAQ,CAAC;IAE5B,IAAI,IAAI,KAAK,iBAAiB,EAAE,CAAC;QAC/B,OAAO,KAAK,CAAC;IACf,CAAC;IAED,uDAAuD;IACvD,MAAM,gBAAgB,GAAG;QACvB,kBAAkB;QAClB,yEAAyE,EAAE,OAAO;QAClF,oBAAoB,EAAyD,MAAM;QACnF,iBAAiB,EAA4D,MAAM;QACnF,UAAU,EAAmE,gBAAgB;QAC7F,yCAAyC,EAAmC,MAAM;QAClF,sBAAsB,EAAuD,OAAO;QACpF,gBAAgB;QAChB,2EAA2E,EAAE,OAAO;QACpF,+BAA+B,EAA8C,MAAM;QACnF,iDAAiD,EAA2B,MAAM;KACnF,CAAC;IAEF,IAAI,IAAI,IAAI,gBAAgB,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;QAC5C,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,yDAAyD;IACzD,MAAM,WAAW,GAAG;QAClB,mEAAmE,EAAE,OAAO;QAC5E,0BAA0B,EAA4C,MAAM;QAC5E,UAAU;KACX,CAAC;IAEF,IAAI,IAAI,IAAI,WAAW,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;QACvC,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,IAAI,IAAI,KAAK,YAAY,IAAI,IAAI,KAAK,WAAW,EAAE,CAAC;QAClD,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,SAAS;IACT,MAAM,UAAU,GAAG;QACjB,YAAY;QACZ,WAAW;QACX,WAAW;QACX,YAAY;QACZ,eAAe;KAChB,CAAC;IAEF,IAAI,IAAI,IAAI,UAAU,CAAC,QAAQ,CAAC
,IAAI,CAAC,EAAE,CAAC;QACtC,OAAO,OAAO,CAAC;IACjB,CAAC;IAED,iCAAiC;IACjC,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,WAAW,EAAE,iBAAiB,IAAI,mCAAmC,CAAC,CAAC;IAC7F,OAAO,SAAS,CAAC;AACnB,CAAC"}
@@ -0,0 +1,8 @@
1
+ import type { PipelineState } from "../core/state.js";
2
+ /**
3
+ * Reads an image file and converts it into a base64 chunk.
4
+ * The resulting chunk is stored in `state.pdfChunks` so it can be
5
+ * processed generically by the same parallel LLM dispatch logic.
6
+ */
7
+ export declare function imageReaderNode(state: PipelineState): Promise<Partial<PipelineState>>;
8
+ //# sourceMappingURL=imageReaderNode.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"imageReaderNode.d.ts","sourceRoot":"","sources":["../../src/nodes/imageReaderNode.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAItD;;;;GAIG;AACH,wBAAsB,eAAe,CACnC,KAAK,EAAE,aAAa,GACnB,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CAoBjC"}
@@ -0,0 +1,28 @@
1
+ import fs from "node:fs/promises";
2
+ import path from "node:path";
3
+ import { logger, LogSource } from "../core/logger.js";
4
+ import { requireInit } from "../core/config.js";
5
+ /**
6
+ * Reads an image file and converts it into a base64 chunk.
7
+ * The resulting chunk is stored in `state.pdfChunks` so it can be
8
+ * processed generically by the same parallel LLM dispatch logic.
9
+ */
10
+ export async function imageReaderNode(state) {
11
+ requireInit();
12
+ if (!state.filePath)
13
+ throw new Error("[imageReaderNode] filePath is missing");
14
+ const fullPath = path.resolve(process.cwd(), state.filePath);
15
+ logger.info(LogSource.PDF_SPLITTER, `Reading image at: ${fullPath}`); // Reusing PDF_SPLITTER or maybe we can just use generic logging but LogSource is an enum.
16
+ let fileBuffer;
17
+ try {
18
+ fileBuffer = await fs.readFile(fullPath);
19
+ }
20
+ catch (err) {
21
+ throw new Error(`Failed to read image at ${fullPath}: ${err.message}`);
22
+ }
23
+ const base64Data = fileBuffer.toString("base64");
24
+ // We place it in pdfChunks so it uses the exact same parallel mapping logic
25
+ logger.info(LogSource.PDF_SPLITTER, `Created 1 image chunk from ${state.mimeType}`);
26
+ return { pdfChunks: [base64Data] };
27
+ }
28
+ //# sourceMappingURL=imageReaderNode.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"imageReaderNode.js","sourceRoot":"","sources":["../../src/nodes/imageReaderNode.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,kBAAkB,CAAC;AAClC,OAAO,IAAI,MAAM,WAAW,CAAC;AAE7B,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AACtD,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAEhD;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,KAAoB;IAEpB,WAAW,EAAE,CAAC;IAEd,IAAI,CAAC,KAAK,CAAC,QAAQ;QAAE,MAAM,IAAI,KAAK,CAAC,uCAAuC,CAAC,CAAC;IAC9E,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,KAAK,CAAC,QAAQ,CAAC,CAAC;IAC7D,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,YAAY,EAAE,qBAAqB,QAAQ,EAAE,CAAC,CAAC,CAAC,0FAA0F;IAEhK,IAAI,UAAU,CAAC;IACf,IAAI,CAAC;QACH,UAAU,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAC3C,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,MAAM,IAAI,KAAK,CAAC,2BAA2B,QAAQ,KAAK,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC;IACzE,CAAC;IAED,MAAM,UAAU,GAAG,UAAU,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAEjD,4EAA4E;IAC5E,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,YAAY,EAAE,8BAA8B,KAAK,CAAC,QAAQ,EAAE,CAAC,CAAC;IAEpF,OAAO,EAAE,SAAS,EAAE,CAAC,UAAU,CAAC,EAAE,CAAC;AACrC,CAAC"}
@@ -9,6 +9,7 @@ export declare function llmExtractionNode(state: Partial<PipelineState> & {
9
9
  chunk?: string;
10
10
  index?: number;
11
11
  totalChunks?: number;
12
+ mimeType?: string;
12
13
  }): Promise<Partial<PipelineState>>;
13
14
  /**
14
15
  * Conditional router to determine what happens after llmExtractionNode.
@@ -1 +1 @@
1
- {"version":3,"file":"llmExtractionNode.d.ts","sourceRoot":"","sources":["../../src/nodes/llmExtractionNode.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAuBtD;;;;;GAKG;AACH,wBAAsB,iBAAiB,CACrC,KAAK,EAAE,OAAO,CAAC,aAAa,CAAC,GAAG;IAAE,KAAK,CAAC,EAAE,MAAM,CAAC;IAAC,KAAK,CAAC,EAAE,MAAM,CAAC;IAAC,WAAW,CAAC,EAAE,MAAM,CAAA;CAAE,GACvF,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CAuCjC;AAED;;;;GAIG;AACH,wBAAgB,aAAa,CAAC,KAAK,EAAE,aAAa,GAAG,MAAM,CAK1D"}
1
+ {"version":3,"file":"llmExtractionNode.d.ts","sourceRoot":"","sources":["../../src/nodes/llmExtractionNode.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAuBtD;;;;;GAKG;AACH,wBAAsB,iBAAiB,CACrC,KAAK,EAAE,OAAO,CAAC,aAAa,CAAC,GAAG;IAAE,KAAK,CAAC,EAAE,MAAM,CAAC;IAAC,KAAK,CAAC,EAAE,MAAM,CAAC;IAAC,WAAW,CAAC,EAAE,MAAM,CAAC;IAAC,QAAQ,CAAC,EAAE,MAAM,CAAA;CAAE,GAC1G,OAAO,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CAwCjC;AAED;;;;GAIG;AACH,wBAAgB,aAAa,CAAC,KAAK,EAAE,aAAa,GAAG,MAAM,CAK1D"}
@@ -35,12 +35,13 @@ export async function llmExtractionNode(state) {
35
35
  const promptInput = {
36
36
  systemPrompt: finalSystemPrompt,
37
37
  userText: isChunkFlow
38
- ? `Extract all content from this PDF (chunk ${state.index + 1} of ${state.totalChunks}) into clean Markdown.`
38
+ ? `Extract all content from this document/image (chunk ${state.index + 1} of ${state.totalChunks}) into clean Markdown.`
39
39
  : `Convert the following extracted document text into clean Markdown:\n\n${state.rawText}`,
40
- base64PdfChunk: isChunkFlow ? state.chunk : undefined
40
+ base64Data: isChunkFlow ? state.chunk : undefined,
41
+ mimeType: state.mimeType
41
42
  };
42
43
  if (isChunkFlow) {
43
- logger.info(LogSource.LLM_EXTRACTION, `Processing PDF chunk ${state.index + 1}/${state.totalChunks} (${((state.chunk.length * 0.75) / 1024).toFixed(0)} KB)`);
44
+ logger.info(LogSource.LLM_EXTRACTION, `Processing chunk ${state.index + 1}/${state.totalChunks} (${((state.chunk.length * 0.75) / 1024).toFixed(0)} KB)`);
44
45
  }
45
46
  else {
46
47
  logger.info(LogSource.LLM_EXTRACTION, `Sending ${state.rawText.length} chars to generic LLM Adapter`);
@@ -1 +1 @@
1
- {"version":3,"file":"llmExtractionNode.js","sourceRoot":"","sources":["../../src/nodes/llmExtractionNode.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,cAAc,EAAE,QAAQ,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAE1E,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAGtD,MAAM,qBAAqB,GAAG;;;;;;;;;;;;;;;;;4GAiB8E,CAAC;AAE7G;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,KAAwF;IAGxF,WAAW,EAAE,CAAC;IAEd,MAAM,WAAW,GAAG,KAAK,CAAC,KAAK,KAAK,SAAS,IAAI,KAAK,CAAC,KAAK,KAAK,SAAS,IAAI,KAAK,CAAC,WAAW,KAAK,SAAS,CAAC;IAC9G,MAAM,UAAU,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC;IAEnC,IAAI,CAAC,WAAW,IAAI,CAAC,UAAU,EAAE,CAAC;QAChC,MAAM,IAAI,KAAK,CAAC,0EAA0E,CAAC,CAAC;IAC9F,CAAC;IAED,MAAM,iBAAiB,GAAG,cAAc,CAAC,YAAY,IAAI,qBAAqB,CAAC;IAE/E,MAAM,WAAW,GAAa;QAC5B,YAAY,EAAE,iBAAiB;QAC/B,QAAQ,EAAE,WAAW;YACnB,CAAC,CAAC,4CAA4C,KAAK,CAAC,KAAM,GAAG,CAAC,OAAO,KAAK,CAAC,WAAW,wBAAwB;YAC9G,CAAC,CAAC,yEAAyE,KAAK,CAAC,OAAO,EAAE;QAC5F,cAAc,EAAE,WAAW,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;KACtD,CAAC;IAEF,IAAI,WAAW,EAAE,CAAC;QAChB,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,wBAAwB,KAAK,CAAC,KAAM,GAAG,CAAC,IAAI,KAAK,CAAC,WAAW,KAAK,CAAC,CAAC,KAAK,CAAC,KAAM,CAAC,MAAM,GAAG,IAAI,CAAC,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;IAClK,CAAC;SAAM,CAAC;QACN,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,WAAW,KAAK,CAAC,OAAQ,CAAC,MAAM,+BAA+B,CAAC,CAAC;IACzG,CAAC;IAED,8DAA8D;IAC9D,MAAM,QAAQ,GAAG,MAAM,QAAQ,CAAC,GAAG,EAAE,CACnC,cAAc,CAAC,GAAG,CAAC,gBAAgB,CAAC,WAAW,CAAC,CACjD,CAAC;IAEF,IAAI,WAAW,EAAE,CAAC;QAChB,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,SAAS,KAAK,CAAC,KAAM,GAAG,CAAC,IAAI,KAAK,CAAC,WAAW,eAAe,QAAQ,CAAC,MAAM,SAAS,CAAC,CAAC;QAC7H,OAAO,EAAE,aAAa,EAAE,CAAC,QAAQ,CAAC,EAAE,CAAC;IACvC,CAAC;IAED,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,uBAAuB,QAAQ,CAAC,MAAM,QAAQ,CAAC,CAAC;IACtF,OAAO,EAAE,QAAQ,EAAE,CAAC;AACtB,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,aAAa,CAAC,KAAoB;IAChD,IAAI,KAAK,CAAC,aAAa,IAAI,KAAK,CAAC,aAAa,CAAC,MAAM,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,QAAQ,EAAE,CAAC;QAC7E,OAAO,gBAAgB,CAAC;IAC1B,CAAC;IACD,OAAO,oBAAoB,CAAC;AAC9B,CAAC"}
1
+ {"version":3,"file":"llmExtractionNode.js","sourceRoot":"","sources":["../../src/nodes/llmExtractionNode.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,cAAc,EAAE,QAAQ,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAE1E,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAGtD,MAAM,qBAAqB,GAAG;;;;;;;;;;;;;;;;;4GAiB8E,CAAC;AAE7G;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,KAA2G;IAG3G,WAAW,EAAE,CAAC;IAEd,MAAM,WAAW,GAAG,KAAK,CAAC,KAAK,KAAK,SAAS,IAAI,KAAK,CAAC,KAAK,KAAK,SAAS,IAAI,KAAK,CAAC,WAAW,KAAK,SAAS,CAAC;IAC9G,MAAM,UAAU,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC;IAEnC,IAAI,CAAC,WAAW,IAAI,CAAC,UAAU,EAAE,CAAC;QAChC,MAAM,IAAI,KAAK,CAAC,0EAA0E,CAAC,CAAC;IAC9F,CAAC;IAED,MAAM,iBAAiB,GAAG,cAAc,CAAC,YAAY,IAAI,qBAAqB,CAAC;IAE/E,MAAM,WAAW,GAAa;QAC5B,YAAY,EAAE,iBAAiB;QAC/B,QAAQ,EAAE,WAAW;YACnB,CAAC,CAAC,uDAAuD,KAAK,CAAC,KAAM,GAAG,CAAC,OAAO,KAAK,CAAC,WAAW,wBAAwB;YACzH,CAAC,CAAC,yEAAyE,KAAK,CAAC,OAAO,EAAE;QAC5F,UAAU,EAAE,WAAW,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;QACjD,QAAQ,EAAE,KAAK,CAAC,QAAQ;KACzB,CAAC;IAEF,IAAI,WAAW,EAAE,CAAC;QAChB,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,oBAAoB,KAAK,CAAC,KAAM,GAAG,CAAC,IAAI,KAAK,CAAC,WAAW,KAAK,CAAC,CAAC,KAAK,CAAC,KAAM,CAAC,MAAM,GAAG,IAAI,CAAC,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;IAC9J,CAAC;SAAM,CAAC;QACN,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,WAAW,KAAK,CAAC,OAAQ,CAAC,MAAM,+BAA+B,CAAC,CAAC;IACzG,CAAC;IAED,8DAA8D;IAC9D,MAAM,QAAQ,GAAG,MAAM,QAAQ,CAAC,GAAG,EAAE,CACnC,cAAc,CAAC,GAAG,CAAC,gBAAgB,CAAC,WAAW,CAAC,CACjD,CAAC;IAEF,IAAI,WAAW,EAAE,CAAC;QAChB,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,SAAS,KAAK,CAAC,KAAM,GAAG,CAAC,IAAI,KAAK,CAAC,WAAW,eAAe,QAAQ,CAAC,MAAM,SAAS,CAAC,CAAC;QAC7H,OAAO,EAAE,aAAa,EAAE,CAAC,QAAQ,CAAC,EAAE,CAAC;IACvC,CAAC;IAED,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,uBAAuB,QAAQ,CAAC,MAAM,QAAQ,CAAC,CAAC;IACtF,OAAO,EAAE,QAAQ,EAAE,CAAC;AACtB,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,aAAa,CAAC,KAAoB;IAChD,IAAI,KAAK,CAAC,aAAa,IAAI,KAAK,CAAC,aAAa,CAAC,MAAM,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,QAAQ,EAAE,CAAC;QAC7E,OAAO,gBAAgB,CAAC;IAC1B,CAAC;IACD,OAAO,oB
AAoB,CAAC;AAC9B,CAAC"}
package/package.json CHANGED
@@ -1,7 +1,11 @@
1
1
  {
2
2
  "name": "@virstack/doc-ingest",
3
- "version": "1.0.1",
3
+ "version": "1.0.3",
4
4
  "description": "A high-performance, parallelized document ingestion and vectorization pipeline.",
5
+ "repository": {
6
+ "type": "git",
7
+ "url": "https://github.com/virstack/virstack-doc-ingest"
8
+ },
5
9
  "main": "./dist/index.js",
6
10
  "types": "./dist/index.d.ts",
7
11
  "type": "module",
@@ -9,10 +13,11 @@
9
13
  "virstack-doc-ingest": "./dist/cli.js"
10
14
  },
11
15
  "files": [
12
- "dist"
16
+ "dist",
17
+ "README.md"
13
18
  ],
14
19
  "scripts": {
15
- "build": "tsc",
20
+ "build": "rm -rf dist && tsc",
16
21
  "prepublishOnly": "npm run build",
17
22
  "start": "node dist/cli.js",
18
23
  "dev": "tsx src/cli.ts"
@@ -52,4 +57,4 @@
52
57
  "tsx": "^4.19.0",
53
58
  "typescript": "^5.6.0"
54
59
  }
55
- }
60
+ }
@@ -1,25 +0,0 @@
1
- export interface LlmInput {
2
- systemPrompt: string;
3
- userText: string;
4
- base64PdfChunk?: string;
5
- }
6
- export interface LlmAdapter {
7
- generateMarkdown(input: LlmInput): Promise<string>;
8
- }
9
- export interface EmbeddingAdapter {
10
- embed(chunks: string[]): Promise<number[][]>;
11
- }
12
- export declare class OpenRouterLlmAdapter implements LlmAdapter {
13
- private client;
14
- private model;
15
- constructor(apiKey: string, model: string);
16
- generateMarkdown(input: LlmInput): Promise<string>;
17
- }
18
- export declare class OpenRouterEmbeddingAdapter implements EmbeddingAdapter {
19
- private client;
20
- private model;
21
- private dimensions;
22
- constructor(apiKey: string, model: string, dimensions?: number);
23
- embed(chunks: string[]): Promise<number[][]>;
24
- }
25
- //# sourceMappingURL=aiAdapters.d.ts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"aiAdapters.d.ts","sourceRoot":"","sources":["../src/aiAdapters.ts"],"names":[],"mappings":"AAIA,MAAM,WAAW,QAAQ;IACvB,YAAY,EAAE,MAAM,CAAC;IACrB,QAAQ,EAAE,MAAM,CAAC;IACjB,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AAED,MAAM,WAAW,UAAU;IACzB,gBAAgB,CAAC,KAAK,EAAE,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;CACpD;AAED,MAAM,WAAW,gBAAgB;IAC/B,KAAK,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;CAC9C;AAID,qBAAa,oBAAqB,YAAW,UAAU;IACrD,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,KAAK,CAAS;gBAEV,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM;IAKnC,gBAAgB,CAAC,KAAK,EAAE,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC;CAsBzD;AAED,qBAAa,0BAA2B,YAAW,gBAAgB;IACjE,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,UAAU,CAAS;gBAEf,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,UAAU,GAAE,MAAa;IAM9D,KAAK,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC;CAWnD"}
@@ -1,50 +0,0 @@
1
- import { OpenAI } from "openai";
2
- // --- BUILT-IN ADAPTERS (For CLI to use by default) ---
3
- export class OpenRouterLlmAdapter {
4
- client;
5
- model;
6
- constructor(apiKey, model) {
7
- this.client = new OpenAI({ baseURL: "https://openrouter.ai/api/v1", apiKey });
8
- this.model = model;
9
- }
10
- async generateMarkdown(input) {
11
- const userContent = [];
12
- if (input.base64PdfChunk) {
13
- userContent.push({
14
- type: "file",
15
- file: { filename: "chunk.pdf", file_data: `data:application/pdf;base64,${input.base64PdfChunk}` },
16
- });
17
- }
18
- userContent.push({ type: "text", text: input.userText });
19
- const response = await this.client.chat.completions.create({
20
- model: this.model,
21
- messages: [
22
- { role: "system", content: input.systemPrompt },
23
- { role: "user", content: userContent },
24
- ],
25
- temperature: 0,
26
- });
27
- return response.choices[0]?.message?.content?.trim() ?? "";
28
- }
29
- }
30
- export class OpenRouterEmbeddingAdapter {
31
- client;
32
- model;
33
- dimensions;
34
- constructor(apiKey, model, dimensions = 1536) {
35
- this.client = new OpenAI({ baseURL: "https://openrouter.ai/api/v1", apiKey });
36
- this.model = model;
37
- this.dimensions = dimensions;
38
- }
39
- async embed(chunks) {
40
- const response = await this.client.embeddings.create({
41
- model: this.model,
42
- input: chunks,
43
- dimensions: this.dimensions,
44
- });
45
- // Sort to maintain chunk order
46
- const sorted = response.data.sort((a, b) => a.index - b.index);
47
- return sorted.map((item) => item.embedding);
48
- }
49
- }
50
- //# sourceMappingURL=aiAdapters.js.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"aiAdapters.js","sourceRoot":"","sources":["../src/aiAdapters.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAkBhC,wDAAwD;AAExD,MAAM,OAAO,oBAAoB;IACvB,MAAM,CAAS;IACf,KAAK,CAAS;IAEtB,YAAY,MAAc,EAAE,KAAa;QACvC,IAAI,CAAC,MAAM,GAAG,IAAI,MAAM,CAAC,EAAE,OAAO,EAAE,8BAA8B,EAAE,MAAM,EAAE,CAAC,CAAC;QAC9E,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;IACrB,CAAC;IAED,KAAK,CAAC,gBAAgB,CAAC,KAAe;QACpC,MAAM,WAAW,GAAU,EAAE,CAAC;QAE9B,IAAI,KAAK,CAAC,cAAc,EAAE,CAAC;YACzB,WAAW,CAAC,IAAI,CAAC;gBACf,IAAI,EAAE,MAAM;gBACZ,IAAI,EAAE,EAAE,QAAQ,EAAE,WAAW,EAAE,SAAS,EAAE,+BAA+B,KAAK,CAAC,cAAc,EAAE,EAAE;aAClG,CAAC,CAAC;QACL,CAAC;QACD,WAAW,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,CAAC,QAAQ,EAAE,CAAC,CAAC;QAEzD,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC;YACzD,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,QAAQ,EAAE;gBACR,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,KAAK,CAAC,YAAY,EAAE;gBAC/C,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,WAAkB,EAAE;aAC9C;YACD,WAAW,EAAE,CAAC;SACf,CAAC,CAAC;QAEH,OAAO,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IAC7D,CAAC;CACF;AAED,MAAM,OAAO,0BAA0B;IAC7B,MAAM,CAAS;IACf,KAAK,CAAS;IACd,UAAU,CAAS;IAE3B,YAAY,MAAc,EAAE,KAAa,EAAE,aAAqB,IAAI;QAClE,IAAI,CAAC,MAAM,GAAG,IAAI,MAAM,CAAC,EAAE,OAAO,EAAE,8BAA8B,EAAE,MAAM,EAAE,CAAC,CAAC;QAC9E,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,UAAU,GAAG,UAAU,CAAC;IAC/B,CAAC;IAED,KAAK,CAAC,KAAK,CAAC,MAAgB;QAC1B,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC,MAAM,CAAC;YACnD,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,KAAK,EAAE,MAAM;YACb,UAAU,EAAE,IAAI,CAAC,UAAU;SACrB,CAAC,CAAC;QAEV,+BAA+B;QAC/B,MAAM,MAAM,GAAG,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAM,EAAE,CAAM,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;QACzE,OAAO,MAAM,CAAC,GAAG,CAAC,CAAC,IAAS,EAAE,EAAE,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IACnD,CAAC;CACF"}
Binary file
@@ -1,52 +0,0 @@
1
- /**
2
- * State for the batch document processing graph.
3
- */
4
- export declare const BatchStateAnnotation: import("@langchain/langgraph").AnnotationRoot<{
5
- /** Input: List of absolute file paths to process */
6
- files: {
7
- (): import("@langchain/langgraph").LastValue<string[]>;
8
- (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
9
- Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
10
- };
11
- /** Output: Collection of results from each individual document run */
12
- results: import("@langchain/langgraph").BinaryOperatorAggregate<any[], any[]>;
13
- }>;
14
- export type BatchState = typeof BatchStateAnnotation.State;
15
- export declare const graph: import("@langchain/langgraph").CompiledStateGraph<import("@langchain/langgraph").StateType<{
16
- /** Input: List of absolute file paths to process */
17
- files: {
18
- (): import("@langchain/langgraph").LastValue<string[]>;
19
- (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
20
- Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
21
- };
22
- /** Output: Collection of results from each individual document run */
23
- results: import("@langchain/langgraph").BinaryOperatorAggregate<any[], any[]>;
24
- }>, import("@langchain/langgraph").UpdateType<{
25
- /** Input: List of absolute file paths to process */
26
- files: {
27
- (): import("@langchain/langgraph").LastValue<string[]>;
28
- (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
29
- Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
30
- };
31
- /** Output: Collection of results from each individual document run */
32
- results: import("@langchain/langgraph").BinaryOperatorAggregate<any[], any[]>;
33
- }>, "__start__" | "workerNode" | "orchestrator" | "summaryNode", {
34
- /** Input: List of absolute file paths to process */
35
- files: {
36
- (): import("@langchain/langgraph").LastValue<string[]>;
37
- (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
38
- Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
39
- };
40
- /** Output: Collection of results from each individual document run */
41
- results: import("@langchain/langgraph").BinaryOperatorAggregate<any[], any[]>;
42
- }, {
43
- /** Input: List of absolute file paths to process */
44
- files: {
45
- (): import("@langchain/langgraph").LastValue<string[]>;
46
- (annotation: import("@langchain/langgraph").SingleReducer<string[], string[]>): import("@langchain/langgraph").BinaryOperatorAggregate<string[], string[]>;
47
- Root: <S extends import("@langchain/langgraph").StateDefinition>(sd: S) => import("@langchain/langgraph").AnnotationRoot<S>;
48
- };
49
- /** Output: Collection of results from each individual document run */
50
- results: import("@langchain/langgraph").BinaryOperatorAggregate<any[], any[]>;
51
- }, import("@langchain/langgraph").StateDefinition>;
52
- //# sourceMappingURL=batchPipeline.d.ts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"batchPipeline.d.ts","sourceRoot":"","sources":["../src/batchPipeline.ts"],"names":[],"mappings":"AAKA;;GAEG;AACH,eAAO,MAAM,oBAAoB;IAC/B,oDAAoD;;;;;;IAGpD,sEAAsE;;EAKtE,CAAC;AAEH,MAAM,MAAM,UAAU,GAAG,OAAO,oBAAoB,CAAC,KAAK,CAAC;AAyE3D,eAAO,MAAM,KAAK;IAnFhB,oDAAoD;;;;;;IAGpD,sEAAsE;;;IAHtE,oDAAoD;;;;;;IAGpD,sEAAsE;;;IAHtE,oDAAoD;;;;;;IAGpD,sEAAsE;;;IAHtE,oDAAoD;;;;;;IAGpD,sEAAsE;;kDAgF/B,CAAC"}