@dcyfr/ai-rag 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +124 -2
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -0
- package/dist/index.js.map +1 -1
- package/dist/ingestion/index.d.ts +8 -0
- package/dist/ingestion/index.d.ts.map +1 -0
- package/dist/ingestion/index.js +7 -0
- package/dist/ingestion/index.js.map +1 -0
- package/dist/ingestion/markitdown-bridge.d.ts +37 -0
- package/dist/ingestion/markitdown-bridge.d.ts.map +1 -0
- package/dist/ingestion/markitdown-bridge.js +325 -0
- package/dist/ingestion/markitdown-bridge.js.map +1 -0
- package/dist/ingestion/types.d.ts +158 -0
- package/dist/ingestion/types.d.ts.map +1 -0
- package/dist/ingestion/types.js +42 -0
- package/dist/ingestion/types.js.map +1 -0
- package/dist/pipeline/ingestion/pipeline.d.ts +17 -5
- package/dist/pipeline/ingestion/pipeline.d.ts.map +1 -1
- package/dist/pipeline/ingestion/pipeline.js +176 -33
- package/dist/pipeline/ingestion/pipeline.js.map +1 -1
- package/dist/types/index.d.ts +35 -0
- package/dist/types/index.d.ts.map +1 -1
- package/package.json +6 -7
package/LICENSE
CHANGED
package/README.md
CHANGED
|
@@ -1,13 +1,135 @@
|
|
|
1
1
|
# @dcyfr/ai-rag
|
|
2
2
|
|
|
3
|
+
[Ask DeepWiki](https://deepwiki.com/dcyfr/dcyfr-ai-rag)
|
|
4
|
+
|
|
3
5
|
> **RAG (Retrieval-Augmented Generation) framework for Node.js and TypeScript**
|
|
4
6
|
|
|
5
7
|
Build production-ready RAG systems with document loading, embedding, vector stores, and semantic search.
|
|
6
8
|
|
|
7
9
|
[npm package](https://www.npmjs.com/package/@dcyfr/ai-rag)
|
|
8
|
-
[](https://www.typescriptlang.org/)
|
|
9
11
|
[License: MIT](https://opensource.org/licenses/MIT)
|
|
10
12
|
|
|
13
|
+
---
|
|
14
|
+
- [Document Conversion](#-document-conversion-markitdown)
|
|
15
|
+
- **Document Conversion** - Convert 15+ file formats (PDF, DOCX, PPTX, XLSX, images, etc.) to Markdown via MarkItDown Python library
|
|
16
|
+
---
|
|
17
|
+
|
|
18
|
+
## 📄 Document Conversion (MarkItDown)
|
|
19
|
+
|
|
20
|
+
Convert diverse document formats to LLM-optimized Markdown:
|
|
21
|
+
|
|
22
|
+
```typescript
|
|
23
|
+
import { convertToMarkdown, convertBatch } from '@dcyfr/ai-rag/ingestion';
|
|
24
|
+
|
|
25
|
+
// Single document conversion
|
|
26
|
+
const result = await convertToMarkdown('/path/to/document.pdf', {
|
|
27
|
+
timeout: 45000, // 45 seconds
|
|
28
|
+
maxFileSize: 50 * 1024 * 1024, // 50 MB
|
|
29
|
+
enableLLMDescriptions: true, // Use GPT-4 Vision for image descriptions
|
|
30
|
+
});
|
|
31
|
+
|
|
32
|
+
console.log(result.markdown); // Converted markdown content
|
|
33
|
+
console.log(result.metadata); // File size, duration, page count, etc.
|
|
34
|
+
|
|
35
|
+
// Batch conversion (parallel, concurrency-controlled)
|
|
36
|
+
const files = ['/docs/report.pdf', '/slides/deck.pptx', '/data/sheet.xlsx'];
|
|
37
|
+
const results = await convertBatch(files, { timeout: 60000 });
|
|
38
|
+
|
|
39
|
+
results.forEach((r, i) => {
|
|
40
|
+
if (r.success) {
|
|
41
|
+
console.log(`✅ ${files[i]}: ${r.markdown.length} chars`);
|
|
42
|
+
} else {
|
|
43
|
+
console.error(`❌ ${files[i]}: ${r.error}`);
|
|
44
|
+
}
|
|
45
|
+
});
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
**Supported Formats:**
|
|
49
|
+
- **Documents:** PDF, DOCX, PPTX, XLSX, CSV, TXT, Markdown
|
|
50
|
+
- **Web:** HTML, XML, JSON
|
|
51
|
+
- **Images:** PNG, JPG, JPEG, GIF, WEBP (with optional LLM-powered OCR)
|
|
52
|
+
- **Audio:** MP3, WAV, M4A (transcription)
|
|
53
|
+
- **Archives:** EPUB, ZIP
|
|
54
|
+
|
|
55
|
+
**Installation:**
|
|
56
|
+
```bash
|
|
57
|
+
# Python environment required (workspace already configured)
|
|
58
|
+
pip install "markitdown>=0.1.5"
|
|
59
|
+
|
|
60
|
+
# Or use workspace .venv (pre-configured)
|
|
61
|
+
source /path/to/workspace/.venv/bin/activate
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
**Performance:**
|
|
65
|
+
- **Latency:** 200-500ms per document (PDF/Office), <100ms (text/HTML)
|
|
66
|
+
- **Concurrency:** Max 3 parallel conversions (configurable)
|
|
67
|
+
- **Memory:** ~50-200 MB per conversion (temp files auto-cleaned)
|
|
68
|
+
|
|
69
|
+
**Error Handling:**
|
|
70
|
+
```typescript
|
|
71
|
+
import { convertToMarkdown, ConversionError, ConversionErrorType } from '@dcyfr/ai-rag/ingestion';
|
|
72
|
+
|
|
73
|
+
try {
|
|
74
|
+
const result = await convertToMarkdown('/path/to/file.pdf');
|
|
75
|
+
} catch (error) {
|
|
76
|
+
if (error instanceof ConversionError) {
|
|
77
|
+
switch (error.type) {
|
|
78
|
+
case ConversionErrorType.TIMEOUT:
|
|
79
|
+
console.error('Conversion timed out - file too large?');
|
|
80
|
+
break;
|
|
81
|
+
case ConversionErrorType.FILE_TOO_LARGE:
|
|
82
|
+
console.error(`File exceeds ${error.details?.maxFileSize} bytes`);
|
|
83
|
+
break;
|
|
84
|
+
case ConversionErrorType.UNSUPPORTED_FORMAT:
|
|
85
|
+
console.error('File format not supported by MarkItDown');
|
|
86
|
+
break;
|
|
87
|
+
default:
|
|
88
|
+
console.error(`Conversion failed: ${error.message}`);
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
**LLM Integration (Optional):**
|
|
95
|
+
```typescript
|
|
96
|
+
// Enable GPT-4 Vision or Claude for image descriptions
|
|
97
|
+
const result = await convertToMarkdown('/path/to/presentation.pptx', {
|
|
98
|
+
enableLLMDescriptions: true,
|
|
99
|
+
llmModel: 'gpt-4-vision-preview', // or 'claude-3-opus-20240229'
|
|
100
|
+
});
|
|
101
|
+
|
|
102
|
+
// Requires environment variables:
|
|
103
|
+
// OPENAI_API_KEY=sk-...
|
|
104
|
+
// ANTHROPIC_API_KEY=sk-ant-...
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
## ⚡ 30-Second Quick Start
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
# Install package
|
|
112
|
+
npm install @dcyfr/ai-rag
|
|
113
|
+
|
|
114
|
+
```

```typescript
// Basic usage
import { TextLoader, InMemoryVectorStore } from '@dcyfr/ai-rag';

const loader = new TextLoader();
const store = new InMemoryVectorStore();
// ✅ RAG system ready for document ingestion
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
---
|
|
123
|
+
|
|
124
|
+
## 🧭 Related Packages
|
|
125
|
+
|
|
126
|
+
| Package | Purpose | Type |
|
|
127
|
+
|---------|---------|------|
|
|
128
|
+
| [@dcyfr/ai](../dcyfr-ai) | Core AI harness | npm package |
|
|
129
|
+
| [@dcyfr/ai-agents](../dcyfr-ai-agents) | Autonomous agents | Template |
|
|
130
|
+
| [@dcyfr/ai-chatbot](../dcyfr-ai-chatbot) | Chatbot template | Template |
|
|
131
|
+
| [dcyfr-labs](../dcyfr-labs) | Production Next.js app | Application |
|
|
132
|
+
|
|
11
133
|
---
|
|
12
134
|
|
|
13
135
|
## ✨ Features
|
|
@@ -57,7 +179,7 @@ import {
|
|
|
57
179
|
// 1. Setup components
|
|
58
180
|
const loader = new TextLoader();
|
|
59
181
|
const embedder = new SimpleEmbeddingGenerator({ dimensions: 384 });
|
|
60
|
-
const store = new
|
|
182
|
+
const store = new InMemoryVectorStore({
|
|
61
183
|
collectionName: 'my-docs',
|
|
62
184
|
embeddingDimensions: 384,
|
|
63
185
|
});
|
package/dist/index.d.ts
CHANGED
package/dist/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAGH,mBAAmB,kBAAkB,CAAC;AAGtC,cAAc,oBAAoB,CAAC;AAGnC,cAAc,mBAAmB,CAAC;AAGlC,cAAc,qBAAqB,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAGH,mBAAmB,kBAAkB,CAAC;AAGtC,cAAc,oBAAoB,CAAC;AAGnC,cAAc,mBAAmB,CAAC;AAGlC,cAAc,qBAAqB,CAAC;AAGpC,cAAc,sBAAsB,CAAC"}
|
package/dist/index.js
CHANGED
package/dist/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAKH,UAAU;AACV,cAAc,oBAAoB,CAAC;AAEnC,SAAS;AACT,cAAc,mBAAmB,CAAC;AAElC,YAAY;AACZ,cAAc,qBAAqB,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAKH,UAAU;AACV,cAAc,oBAAoB,CAAC;AAEnC,SAAS;AACT,cAAc,mBAAmB,CAAC;AAElC,YAAY;AACZ,cAAc,qBAAqB,CAAC;AAEpC,kCAAkC;AAClC,cAAc,sBAAsB,CAAC"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Document ingestion module for MarkItDown integration
|
|
3
|
+
* @module @dcyfr/ai-rag/ingestion
|
|
4
|
+
*/
|
|
5
|
+
export { convertToMarkdown, convertBatch, checkMarkItDownInstalled } from './markitdown-bridge.js';
|
|
6
|
+
export type { ConversionOptions, ConversionResult, SupportedFormat, SubprocessMessage, } from './types.js';
|
|
7
|
+
export { ConversionError, ConversionErrorType } from './types.js';
|
|
8
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/ingestion/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,iBAAiB,EAAE,YAAY,EAAE,wBAAwB,EAAE,MAAM,wBAAwB,CAAC;AACnG,YAAY,EACV,iBAAiB,EACjB,gBAAgB,EAChB,eAAe,EACf,iBAAiB,GAClB,MAAM,YAAY,CAAC;AACpB,OAAO,EAAE,eAAe,EAAE,mBAAmB,EAAE,MAAM,YAAY,CAAC"}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Document ingestion module for MarkItDown integration
|
|
3
|
+
* @module @dcyfr/ai-rag/ingestion
|
|
4
|
+
*/
|
|
5
|
+
export { convertToMarkdown, convertBatch, checkMarkItDownInstalled } from './markitdown-bridge.js';
|
|
6
|
+
export { ConversionError, ConversionErrorType } from './types.js';
|
|
7
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/ingestion/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,iBAAiB,EAAE,YAAY,EAAE,wBAAwB,EAAE,MAAM,wBAAwB,CAAC;AAOnG,OAAO,EAAE,eAAe,EAAE,mBAAmB,EAAE,MAAM,YAAY,CAAC"}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* TypeScript bridge to Python MarkItDown document converter
|
|
3
|
+
* @module @dcyfr/ai-rag/ingestion/markitdown-bridge
|
|
4
|
+
*/
|
|
5
|
+
import type { ConversionOptions, ConversionResult } from './types.js';
|
|
6
|
+
/**
|
|
7
|
+
* Convert document to Markdown using Python MarkItDown subprocess
|
|
8
|
+
*
|
|
9
|
+
* @param filePath - Path to the file to convert (relative paths are resolved against the current working directory)
|
|
10
|
+
* @param options - Conversion options
|
|
11
|
+
* @returns Conversion result with markdown and metadata
|
|
12
|
+
*
|
|
13
|
+
* @throws {ConversionError} If conversion fails
|
|
14
|
+
*
|
|
15
|
+
* @example
|
|
16
|
+
* ```typescript
|
|
17
|
+
* const result = await convertToMarkdown('/path/to/document.pdf', {
|
|
18
|
+
* timeout: 45000,
|
|
19
|
+
* enableLLMDescriptions: true
|
|
20
|
+
* });
|
|
21
|
+
* console.log(result.markdown);
|
|
22
|
+
* ```
|
|
23
|
+
*/
|
|
24
|
+
export declare function convertToMarkdown(filePath: string, options?: ConversionOptions): Promise<ConversionResult>;
|
|
25
|
+
/**
|
|
26
|
+
* Batch convert multiple documents
|
|
27
|
+
*
|
|
28
|
+
* @param filePaths - Array of file paths to convert
|
|
29
|
+
* @param options - Shared conversion options
|
|
30
|
+
* @returns Array of conversion results (same order as input)
|
|
31
|
+
*/
|
|
32
|
+
export declare function convertBatch(filePaths: string[], options?: ConversionOptions): Promise<ConversionResult[]>;
|
|
33
|
+
/**
|
|
34
|
+
* Check if Python MarkItDown is installed and accessible
|
|
35
|
+
*/
|
|
36
|
+
export declare function checkMarkItDownInstalled(): Promise<boolean>;
|
|
37
|
+
//# sourceMappingURL=markitdown-bridge.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"markitdown-bridge.d.ts","sourceRoot":"","sources":["../../src/ingestion/markitdown-bridge.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAOH,OAAO,KAAK,EACV,iBAAiB,EACjB,gBAAgB,EAGjB,MAAM,YAAY,CAAC;AAwRpB;;;;;;;;;;;;;;;;;GAiBG;AACH,wBAAsB,iBAAiB,CACrC,QAAQ,EAAE,MAAM,EAChB,OAAO,GAAE,iBAAsB,GAC9B,OAAO,CAAC,gBAAgB,CAAC,CAoE3B;AAED;;;;;;GAMG;AACH,wBAAsB,YAAY,CAChC,SAAS,EAAE,MAAM,EAAE,EACnB,OAAO,GAAE,iBAAsB,GAC9B,OAAO,CAAC,gBAAgB,EAAE,CAAC,CAkC7B;AAED;;GAEG;AACH,wBAAsB,wBAAwB,IAAI,OAAO,CAAC,OAAO,CAAC,CAmBjE"}
|
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* TypeScript bridge to Python MarkItDown document converter
|
|
3
|
+
* @module @dcyfr/ai-rag/ingestion/markitdown-bridge
|
|
4
|
+
*/
|
|
5
|
+
import { spawn } from 'node:child_process';
|
|
6
|
+
import { existsSync, promises as fs } from 'node:fs';
|
|
7
|
+
import { tmpdir } from 'node:os';
|
|
8
|
+
import { join, resolve, basename, extname, dirname } from 'node:path';
|
|
9
|
+
import { fileURLToPath } from 'node:url';
|
|
10
|
+
import { ConversionError, ConversionErrorType } from './types.js';
|
|
11
|
+
// ES module __dirname equivalent
|
|
12
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
13
|
+
const __dirname = dirname(__filename);
|
|
14
|
+
/**
 * Baseline conversion settings.
 * convertToMarkdown spreads caller-supplied options over these values.
 */
const DEFAULT_OPTIONS = {
    timeout: 30 * 1000, // milliseconds (30 seconds)
    maxFileSize: 50 * 1024 * 1024, // bytes (50MB)
    enableLLMDescriptions: false,
    preserveMetadata: true,
};
|
|
23
|
+
/**
 * Lookup table from lower-case file extension (leading dot included) to format type.
 * Each key is the format name prefixed with ".", so the table is derived from the
 * list of format names.
 */
const EXTENSION_MAP = Object.fromEntries([
    'pdf',
    'docx',
    'pptx',
    'xlsx',
    'csv',
    'html',
    'htm',
    'xml',
    'json',
    'png',
    'jpg',
    'jpeg',
    'gif',
    'webp',
    'mp3',
    'wav',
    'm4a',
    'epub',
    'zip',
].map((format) => [`.${format}`, format]));
|
|
47
|
+
/**
 * Map a file path to its format type using the (case-insensitive) extension.
 *
 * @param filePath - Path whose extension is looked up in EXTENSION_MAP
 * @returns The matching format type
 * @throws {ConversionError} UNSUPPORTED_FORMAT when the extension has no entry
 */
function detectFormat(filePath) {
    const extension = extname(filePath).toLowerCase();
    const known = EXTENSION_MAP[extension];
    if (known) {
        return known;
    }
    throw new ConversionError(ConversionErrorType.UNSUPPORTED_FORMAT, `Unsupported file format: ${extension}`, { filePath, extension });
}
|
|
58
|
+
/**
 * Create a fresh temporary directory to use as the conversion workspace.
 *
 * Uses fs.mkdtemp so the directory name is unique and the directory is
 * created in the same call. The previous Date.now()/Math.random() name combined
 * with `mkdir({ recursive: true })` would silently reuse an already-existing
 * directory on a name collision.
 *
 * @returns Path of the newly created directory (inside the system temp directory)
 * @throws {ConversionError} TEMP_DIR_ERROR when the directory cannot be created
 */
async function createTempDir() {
    try {
        // mkdtemp appends random characters to the given prefix.
        return await fs.mkdtemp(join(tmpdir(), 'markitdown-'));
    }
    catch (error) {
        throw new ConversionError(ConversionErrorType.TEMP_DIR_ERROR, 'Failed to create temporary directory', { error: error instanceof Error ? error.message : String(error) });
    }
}
|
|
71
|
+
/**
 * Best-effort removal of a conversion workspace directory and its contents.
 *
 * @param tempDir - Directory to delete recursively
 */
async function cleanupTempDir(tempDir) {
    await fs.rm(tempDir, { recursive: true, force: true }).catch((error) => {
        // A failed cleanup is only reported; it must never fail the conversion itself.
        console.warn(`Failed to cleanup temp directory ${tempDir}:`, error);
    });
}
|
|
83
|
+
/**
 * Validate that a path refers to a regular file within the size limit.
 *
 * @param filePath - Path to check
 * @param maxFileSize - Largest accepted file size in bytes
 * @throws {ConversionError} FILE_NOT_FOUND when the path cannot be stat'ed or is not a regular file
 * @throws {ConversionError} FILE_TOO_LARGE when the file is bigger than maxFileSize
 */
async function validateFile(filePath, maxFileSize) {
    let stats;
    // Only the stat call is guarded, so the checks below can throw their own
    // ConversionError without an instanceof re-throw.
    try {
        stats = await fs.stat(filePath);
    }
    catch (error) {
        // filePath is included in the details, consistent with the other errors raised here.
        throw new ConversionError(ConversionErrorType.FILE_NOT_FOUND, `File not found or inaccessible: ${filePath}`, { filePath, error: error instanceof Error ? error.message : String(error) });
    }
    if (!stats.isFile()) {
        throw new ConversionError(ConversionErrorType.FILE_NOT_FOUND, 'Path is not a file', { filePath });
    }
    if (stats.size > maxFileSize) {
        throw new ConversionError(ConversionErrorType.FILE_TOO_LARGE, `File size ${stats.size} bytes exceeds limit ${maxFileSize} bytes`, { filePath, fileSize: stats.size, maxFileSize });
    }
}
|
|
103
|
+
/**
 * Find the Python executable to run MarkItDown with.
 *
 * Lookup order (first existing path wins):
 *   1. PYTHON_EXECUTABLE environment variable
 *   2. interpreter of the active virtualenv (VIRTUAL_ENV)
 *   3. `.venv` in the directory four levels above this module
 *   4. `.venv` in a sibling `dcyfr-workspace` directory
 *   5. /usr/bin/python3 (non-Windows only)
 *   6. bare command name resolved through PATH
 *
 * Virtualenv interpreters live in `bin/python` on POSIX and `Scripts\python.exe`
 * on Windows, so the layout is chosen per platform.
 *
 * @returns Path (or bare command name) of the Python interpreter
 */
function getPythonExecutable() {
    const explicitPython = process.env.PYTHON_EXECUTABLE;
    if (explicitPython && existsSync(explicitPython)) {
        return explicitPython;
    }
    const isWindows = process.platform === 'win32';
    // Interpreter location inside a virtualenv root, per platform layout.
    const venvPython = (venvRoot) => (isWindows
        ? join(venvRoot, 'Scripts', 'python.exe')
        : join(venvRoot, 'bin', 'python'));
    const candidatePaths = [];
    const activeVenv = process.env.VIRTUAL_ENV;
    if (activeVenv) {
        candidatePaths.push(venvPython(activeVenv));
    }
    const workspaceRoot = resolve(__dirname, '../../../..');
    candidatePaths.push(venvPython(join(workspaceRoot, '.venv')), venvPython(join(workspaceRoot, '..', 'dcyfr-workspace', '.venv')));
    if (!isWindows) {
        candidatePaths.push('/usr/bin/python3');
    }
    const found = candidatePaths.find((candidate) => existsSync(candidate));
    // Last resort: let the OS resolve the interpreter through PATH.
    return found ?? (isWindows ? 'python' : 'python3');
}
|
|
131
|
+
/**
 * Execute Python subprocess with timeout handling
 *
 * @param python - Executable to spawn
 * @param args - Command-line arguments
 * @param options - `cwd` and `env` for the child, `timeout` in milliseconds, and
 *   `filePath` (only used in the details of a timeout error)
 * @returns Collected `stdout` and `stderr` text plus the child's `exitCode`
 *   (as reported by the 'close' event; not 0 when the process did not exit normally)
 * @throws {ConversionError} TIMEOUT when the child runs longer than `options.timeout`
 * @throws {ConversionError} SUBPROCESS_ERROR when the child cannot be spawned
 */
async function executeSubprocess(python, args, options) {
    // The deadline is enforced only by the timer below. Also passing `timeout` to
    // spawn() would race with it and could surface an overrun as a plain
    // "exit code null" failure instead of a TIMEOUT error.
    const child = spawn(python, args, {
        cwd: options.cwd,
        env: options.env,
    });
    let stdout = '';
    let stderr = '';
    // Decode at the stream level so a multi-byte UTF-8 character split across
    // two chunks is not corrupted (per-chunk Buffer#toString() would mangle it).
    child.stdout?.setEncoding('utf8');
    child.stderr?.setEncoding('utf8');
    child.stdout?.on('data', (chunk) => {
        stdout += chunk;
    });
    child.stderr?.on('data', (chunk) => {
        stderr += chunk;
    });
    const exitCode = await new Promise((resolve, reject) => {
        let killTimerId;
        const timeoutId = setTimeout(() => {
            child.kill('SIGTERM');
            // Escalate if the child ignores SIGTERM; cleared once the child is gone.
            killTimerId = setTimeout(() => child.kill('SIGKILL'), 1000);
            reject(new ConversionError(ConversionErrorType.TIMEOUT, `Conversion exceeded timeout of ${options.timeout}ms`, { filePath: options.filePath, timeout: options.timeout }));
        }, options.timeout);
        const clearTimers = () => {
            clearTimeout(timeoutId);
            clearTimeout(killTimerId);
        };
        // 'close' fires after the stdio streams have ended, so all output has been
        // collected; 'exit' can fire while data is still buffered in the pipes.
        child.on('close', (code) => {
            clearTimers();
            resolve(code);
        });
        child.on('error', (error) => {
            clearTimers();
            reject(new ConversionError(ConversionErrorType.SUBPROCESS_ERROR, `Python subprocess failed: ${error.message}`, { error: error.message, python }));
        });
    });
    return { stdout, stderr, exitCode };
}
|
|
165
|
+
/**
 * Assemble the metadata record returned alongside the converted markdown.
 *
 * @param fileName - Base name of the source file
 * @param resolvedPath - Path that is stat'ed to obtain the file size
 * @param format - Detected format type
 * @param durationMs - Conversion duration in milliseconds
 * @param stderr - Subprocess stderr text, scanned for an "<n> page(s)" phrase
 * @param opts - Options in effect; only `enableLLMDescriptions` is read
 * @returns Metadata object; `pageCount` is present only when stderr mentions a page count
 */
async function buildMetadata(fileName, resolvedPath, format, durationMs, stderr, opts) {
    const { size: fileSize } = await fs.stat(resolvedPath);
    const metadata = {
        fileName,
        fileSize,
        format,
        convertedAt: new Date().toISOString(),
        durationMs,
        usedLLMDescriptions: opts.enableLLMDescriptions,
    };
    const pageMatch = /(\p{N}+)\s+pages?/iu.exec(stderr);
    if (pageMatch) {
        const [, pageDigits] = pageMatch;
        metadata.pageCount = Number.parseInt(pageDigits, 10);
    }
    return metadata;
}
|
|
185
|
+
/**
 * Check the subprocess outcome and extract the markdown it produced.
 *
 * @param output - `{ stdout, stderr, exitCode }` collected from the subprocess
 * @param resolvedPath - Source file path, attached to error details
 * @returns stdout with surrounding whitespace trimmed
 * @throws {ConversionError} SUBPROCESS_ERROR on a non-zero exit code or empty output
 */
function validateSubprocessResult(output, resolvedPath) {
    const { exitCode, stdout, stderr } = output;
    if (exitCode !== 0) {
        throw new ConversionError(ConversionErrorType.SUBPROCESS_ERROR, `MarkItDown conversion failed with exit code ${exitCode}`, { exitCode, stderr, stdout, filePath: resolvedPath });
    }
    const trimmed = stdout.trim();
    if (trimmed) {
        return trimmed;
    }
    throw new ConversionError(ConversionErrorType.SUBPROCESS_ERROR, 'MarkItDown returned empty output', { stderr, filePath: resolvedPath });
}
|
|
198
|
+
/**
 * Convert a document to Markdown by running `python -m markitdown` on it.
 *
 * The file is validated (existence, size, supported extension) before the
 * subprocess is started; the subprocess runs with a throw-away temporary
 * directory as its working directory, which is removed afterwards.
 *
 * @param filePath - Path to the file to convert (resolved to an absolute path first)
 * @param options - Conversion options, merged over the defaults
 * @returns Conversion result with markdown, metadata and any stderr output as `warnings`
 *
 * @throws {ConversionError} If conversion fails
 *
 * @example
 * ```typescript
 * const result = await convertToMarkdown('/path/to/document.pdf', {
 *   timeout: 45000,
 *   enableLLMDescriptions: true
 * });
 * console.log(result.markdown);
 * ```
 */
export async function convertToMarkdown(filePath, options = {}) {
    const resolvedPath = resolve(filePath);
    const opts = { ...DEFAULT_OPTIONS, ...options };
    await validateFile(resolvedPath, opts.maxFileSize);
    const format = detectFormat(resolvedPath);
    const fileName = basename(resolvedPath);
    let tempDir = null;
    try {
        tempDir = await createTempDir();
        const python = getPythonExecutable();
        const startMs = Date.now();
        // The child inherits the parent environment; LLM settings are added on request.
        const env = { ...process.env };
        if (opts.enableLLMDescriptions) {
            env.OPENAI_API_KEY = process.env.OPENAI_API_KEY;
            env.ANTHROPIC_API_KEY = process.env.ANTHROPIC_API_KEY;
            env.LLM_MODEL = opts.llmModel || 'gpt-4-vision-preview';
        }
        const output = await executeSubprocess(python, ['-m', 'markitdown', resolvedPath], {
            cwd: tempDir,
            timeout: opts.timeout,
            env,
            filePath: resolvedPath,
        });
        const markdown = validateSubprocessResult(output, resolvedPath);
        const durationMs = Date.now() - startMs;
        const metadata = await buildMetadata(fileName, resolvedPath, format, durationMs, output.stderr, opts);
        const warnings = output.stderr ? [output.stderr] : undefined;
        return { markdown, metadata, success: true, warnings };
    }
    catch (error) {
        // Errors that are already typed pass through; anything else is wrapped.
        if (!(error instanceof ConversionError)) {
            const reason = error instanceof Error ? error.message : String(error);
            throw new ConversionError(ConversionErrorType.SUBPROCESS_ERROR, `Unexpected error during conversion: ${reason}`, { error: reason, filePath: resolvedPath });
        }
        throw error;
    }
    finally {
        if (tempDir) {
            await cleanupTempDir(tempDir);
        }
    }
}
|
|
265
|
+
/**
 * Batch convert multiple documents
 *
 * Files are processed in consecutive groups of up to three; each group is
 * converted in parallel and must finish before the next group starts. A failed
 * conversion does not reject the batch: it yields an entry with
 * `success: false` and the error message in `error`.
 *
 * @param filePaths - Array of file paths to convert
 * @param options - Shared conversion options
 * @returns Array of conversion results (same order as input)
 */
export async function convertBatch(filePaths, options = {}) {
    // Process conversions in parallel with concurrency limit
    const MAX_CONCURRENT = 3;
    const results = [];
    for (let i = 0; i < filePaths.length; i += MAX_CONCURRENT) {
        const batch = filePaths.slice(i, i + MAX_CONCURRENT);
        const batchResults = await Promise.allSettled(batch.map((path) => convertToMarkdown(path, options)));
        batchResults.forEach((result, offset) => {
            if (result.status === 'fulfilled') {
                results.push(result.value);
                return;
            }
            // Convert rejected promise to failed ConversionResult
            const error = result.reason;
            const failedPath = batch[offset];
            results.push({
                markdown: '',
                metadata: {
                    // Identify the failing input; guarded because a non-string
                    // path is itself a possible cause of the failure.
                    fileName: typeof failedPath === 'string' ? basename(failedPath) : '',
                    fileSize: 0,
                    // Placeholder only: no conversion took place, so no format was established.
                    format: 'pdf',
                    convertedAt: new Date().toISOString(),
                    durationMs: 0,
                },
                success: false,
                error: error instanceof Error ? error.message : String(error),
            });
        });
    }
    return results;
}
|
|
303
|
+
/**
 * Probe whether the selected Python interpreter can import `markitdown`.
 *
 * @returns true when the probe process exits with code 0; false when it exits
 *   with any other result, fails to start, or the interpreter lookup throws
 */
export async function checkMarkItDownInstalled() {
    let probe;
    try {
        probe = spawn(getPythonExecutable(), ['-c', 'import markitdown; print("ok")'], {
            timeout: 5000,
        });
    }
    catch {
        return false;
    }
    return new Promise((settle) => {
        probe.on('exit', (exitCode) => {
            settle(exitCode === 0);
        });
        probe.on('error', () => {
            settle(false);
        });
    });
}
|
|
325
|
+
//# sourceMappingURL=markitdown-bridge.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"markitdown-bridge.js","sourceRoot":"","sources":["../../src/ingestion/markitdown-bridge.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,KAAK,EAAE,MAAM,oBAAoB,CAAC;AAC3C,OAAO,EAAE,UAAU,EAAE,QAAQ,IAAI,EAAE,EAAE,MAAM,SAAS,CAAC;AACrD,OAAO,EAAE,MAAM,EAAE,MAAM,SAAS,CAAC;AACjC,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACtE,OAAO,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AAOzC,OAAO,EAAE,eAAe,EAAE,mBAAmB,EAAE,MAAM,YAAY,CAAC;AAElE,iCAAiC;AACjC,MAAM,UAAU,GAAG,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;AAClD,MAAM,SAAS,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC;AAEtC;;GAEG;AACH,MAAM,eAAe,GAA+G;IAClI,OAAO,EAAE,KAAK,EAAE,aAAa;IAC7B,WAAW,EAAE,QAAQ,EAAE,OAAO;IAC9B,qBAAqB,EAAE,KAAK;IAC5B,gBAAgB,EAAE,IAAI;CACvB,CAAC;AAEF;;GAEG;AACH,MAAM,aAAa,GAAoC;IACrD,MAAM,EAAE,KAAK;IACb,OAAO,EAAE,MAAM;IACf,OAAO,EAAE,MAAM;IACf,OAAO,EAAE,MAAM;IACf,MAAM,EAAE,KAAK;IACb,OAAO,EAAE,MAAM;IACf,MAAM,EAAE,KAAK;IACb,MAAM,EAAE,KAAK;IACb,OAAO,EAAE,MAAM;IACf,MAAM,EAAE,KAAK;IACb,MAAM,EAAE,KAAK;IACb,OAAO,EAAE,MAAM;IACf,MAAM,EAAE,KAAK;IACb,OAAO,EAAE,MAAM;IACf,MAAM,EAAE,KAAK;IACb,MAAM,EAAE,KAAK;IACb,MAAM,EAAE,KAAK;IACb,OAAO,EAAE,MAAM;IACf,MAAM,EAAE,KAAK;CACd,CAAC;AAEF;;GAEG;AACH,SAAS,YAAY,CAAC,QAAgB;IACpC,MAAM,GAAG,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAC;IAC5C,MAAM,MAAM,GAAG,aAAa,CAAC,GAAG,CAAC,CAAC;IAClC,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,IAAI,eAAe,CACvB,mBAAmB,CAAC,kBAAkB,EACtC,4BAA4B,GAAG,EAAE,EACjC,EAAE,QAAQ,EAAE,SAAS,EAAE,GAAG,EAAE,CAC7B,CAAC;IACJ,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,aAAa;IAC1B,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,IAAI,CAAC,MAAM,EAAE,EAAE,cAAc,IAAI,CAAC,GAAG,EAAE,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC,CAAC;QACrG,MAAM,EAAE,CAAC,KAAK,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QAC7C,OAAO,OAAO,CAAC;IACjB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,IAAI,eAAe,CACvB,mBAAmB,CAAC,cAAc,EAClC,sCAAsC,EACtC,EAAE,KAAK,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC
,KAAK,CAAC,EAAE,CAClE,CAAC;IACJ,CAAC;AACH,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,cAAc,CAAC,OAAe;IAC3C,IAAI,CAAC;QACH,MAAM,EAAE,CAAC,EAAE,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;IACzD,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,yEAAyE;QACzE,OAAO,CAAC,IAAI,CAAC,oCAAoC,OAAO,GAAG,EAAE,KAAK,CAAC,CAAC;IACtE,CAAC;AACH,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,YAAY,CAAC,QAAgB,EAAE,WAAmB;IAC/D,IAAI,CAAC;QACH,MAAM,KAAK,GAAG,MAAM,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAEtC,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,EAAE,CAAC;YACpB,MAAM,IAAI,eAAe,CACvB,mBAAmB,CAAC,cAAc,EAClC,oBAAoB,EACpB,EAAE,QAAQ,EAAE,CACb,CAAC;QACJ,CAAC;QAED,IAAI,KAAK,CAAC,IAAI,GAAG,WAAW,EAAE,CAAC;YAC7B,MAAM,IAAI,eAAe,CACvB,mBAAmB,CAAC,cAAc,EAClC,aAAa,KAAK,CAAC,IAAI,wBAAwB,WAAW,QAAQ,EAClE,EAAE,QAAQ,EAAE,QAAQ,EAAE,KAAK,CAAC,IAAI,EAAE,WAAW,EAAE,CAChD,CAAC;QACJ,CAAC;IACH,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,KAAK,YAAY,eAAe,EAAE,CAAC;YACrC,MAAM,KAAK,CAAC;QACd,CAAC;QACD,MAAM,IAAI,eAAe,CACvB,mBAAmB,CAAC,cAAc,EAClC,mCAAmC,QAAQ,EAAE,EAC7C,EAAE,KAAK,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,EAAE,CAClE,CAAC;IACJ,CAAC;AACH,CAAC;AAED;;GAEG;AACH,SAAS,mBAAmB;IAC1B,MAAM,cAAc,GAAG,OAAO,CAAC,GAAG,CAAC,iBAAiB,CAAC;IACrD,IAAI,cAAc,IAAI,UAAU,CAAC,cAAc,CAAC,EAAE,CAAC;QACjD,OAAO,cAAc,CAAC;IACxB,CAAC;IAED,MAAM,UAAU,GAAG,OAAO,CAAC,GAAG,CAAC,WAAW,CAAC;IAC3C,IAAI,UAAU,EAAE,CAAC;QACf,MAAM,gBAAgB,GAAG,IAAI,CAAC,UAAU,EAAE,KAAK,EAAE,QAAQ,CAAC,CAAC;QAC3D,IAAI,UAAU,CAAC,gBAAgB,CAAC,EAAE,CAAC;YACjC,OAAO,gBAAgB,CAAC;QAC1B,CAAC;IACH,CAAC;IAED,MAAM,aAAa,GAAG,OAAO,CAAC,SAAS,EAAE,aAAa,CAAC,CAAC;IACxD,MAAM,cAAc,GAAG;QACrB,IAAI,CAAC,aAAa,EAAE,OAAO,EAAE,KAAK,EAAE,QAAQ,CAAC;QAC7C,IAAI,CAAC,aAAa,EAAE,IAAI,EAAE,iBAAiB,EAAE,OAAO,EAAE,KAAK,EAAE,QAAQ,CAAC;QACtE,kBAAkB;KACnB,CAAC;IAEF,KAAK,MAAM,SAAS,IAAI,cAAc,EAAE,CAAC;QACvC,IAAI,UAAU,CAAC,SAAS,CAAC,EAAE,CAAC;YAC1B,OAAO,SAAS,CAAC;QACnB,CAAC;IACH,CAAC;IAED,OAAO,SAAS,CAAC;AACnB,CAAC;AAWD;;GAEG;AACH,KAAK,UAAU,iBAAiB,CAC9B,MAAc,EACd,IAAc,EACd,OAKC;IAED,MAAM,KAAK,GAAG,KAAK,CAAC,MA
AM,EAAE,IAAI,EAAE;QAChC,GAAG,EAAE,OAAO,CAAC,GAAG;QAChB,OAAO,EAAE,OAAO,CAAC,OAAO;QACxB,GAAG,EAAE,OAAO,CAAC,GAAG;KACjB,CAAC,CAAC;IAEH,IAAI,MAAM,GAAG,EAAE,CAAC;IAChB,IAAI,MAAM,GAAG,EAAE,CAAC;IAEhB,KAAK,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,EAAE;QAChC,MAAM,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;IAC5B,CAAC,CAAC,CAAC;IAEH,KAAK,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,EAAE;QAChC,MAAM,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;IAC5B,CAAC,CAAC,CAAC;IAEH,MAAM,QAAQ,GAAG,MAAM,IAAI,OAAO,CAAgB,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;QACpE,MAAM,SAAS,GAAG,UAAU,CAAC,GAAG,EAAE;YAChC,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACtB,UAAU,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,IAAI,CAAC,CAAC;YAC9C,MAAM,CAAC,IAAI,eAAe,CACxB,mBAAmB,CAAC,OAAO,EAC3B,kCAAkC,OAAO,CAAC,OAAO,IAAI,EACrD,EAAE,QAAQ,EAAE,OAAO,CAAC,QAAQ,EAAE,OAAO,EAAE,OAAO,CAAC,OAAO,EAAE,CACzD,CAAC,CAAC;QACL,CAAC,EAAE,OAAO,CAAC,OAAO,CAAC,CAAC;QAEpB,KAAK,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,EAAE;YACxB,YAAY,CAAC,SAAS,CAAC,CAAC;YACxB,OAAO,CAAC,IAAI,CAAC,CAAC;QAChB,CAAC,CAAC,CAAC;QAEH,KAAK,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,KAAK,EAAE,EAAE;YAC1B,YAAY,CAAC,SAAS,CAAC,CAAC;YACxB,MAAM,CAAC,IAAI,eAAe,CACxB,mBAAmB,CAAC,gBAAgB,EACpC,6BAA6B,KAAK,CAAC,OAAO,EAAE,EAC5C,EAAE,KAAK,EAAE,KAAK,CAAC,OAAO,EAAE,MAAM,EAAE,CACjC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,OAAO,EAAE,MAAM,EAAE,MAAM,EAAE,QAAQ,EAAE,CAAC;AACtC,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,aAAa,CAC1B,QAAgB,EAChB,YAAoB,EACpB,MAAuB,EACvB,UAAkB,EAClB,MAAc,EACd,IAAgE;IAEhE,MAAM,KAAK,GAAG,MAAM,EAAE,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;IAE1C,MAAM,QAAQ,GAAqB;QACjC,QAAQ;QACR,QAAQ,EAAE,KAAK,CAAC,IAAI;QACpB,MAAM;QACN,WAAW,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;QACrC,UAAU;QACV,mBAAmB,EAAE,IAAI,CAAC,qBAAqB;KAChD,CAAC;IAEF,MAAM,SAAS,GAAG,qBAAqB,CAAC;IACxC,MAAM,SAAS,GAAG,SAAS,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACzC,IAAI,SAAS,EAAE,CAAC;QACd,QAAQ,CAAC,SAAS,GAAG,MAAM,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IACzD,CAAC;IAED,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,SAAS,wBAAwB,CAC/B,MAAwB,EACxB,YAAoB;IAEpB,I
AAI,MAAM,CAAC,QAAQ,KAAK,CAAC,EAAE,CAAC;QAC1B,MAAM,IAAI,eAAe,CACvB,mBAAmB,CAAC,gBAAgB,EACpC,+CAA+C,MAAM,CAAC,QAAQ,EAAE,EAChE,EAAE,QAAQ,EAAE,MAAM,CAAC,QAAQ,EAAE,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,QAAQ,EAAE,YAAY,EAAE,CACpG,CAAC;IACJ,CAAC;IAED,MAAM,QAAQ,GAAG,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC;IACtC,IAAI,CAAC,QAAQ,EAAE,CAAC;QACd,MAAM,IAAI,eAAe,CACvB,mBAAmB,CAAC,gBAAgB,EACpC,kCAAkC,EAClC,EAAE,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,QAAQ,EAAE,YAAY,EAAE,CAClD,CAAC;IACJ,CAAC;IAED,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;;;;;;;;;;;;;;;;GAiBG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,QAAgB,EAChB,UAA6B,EAAE;IAE/B,MAAM,YAAY,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC;IACvC,MAAM,IAAI,GAAG;QACX,GAAG,eAAe;QAClB,GAAG,OAAO;KACX,CAAC;IAEF,MAAM,YAAY,CAAC,YAAY,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC;IAEnD,MAAM,MAAM,GAAG,YAAY,CAAC,YAAY,CAAC,CAAC;IAC1C,MAAM,QAAQ,GAAG,QAAQ,CAAC,YAAY,CAAC,CAAC;IAExC,IAAI,OAAO,GAAkB,IAAI,CAAC;IAClC,IAAI,CAAC;QACH,OAAO,GAAG,MAAM,aAAa,EAAE,CAAC;QAChC,MAAM,MAAM,GAAG,mBAAmB,EAAE,CAAC;QACrC,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAE3B,MAAM,MAAM,GAAG,MAAM,iBAAiB,CACpC,MAAM,EACN,CAAC,IAAI,EAAE,YAAY,EAAE,YAAY,CAAC,EAClC;YACE,GAAG,EAAE,OAAO;YACZ,OAAO,EAAE,IAAI,CAAC,OAAO;YACrB,GAAG,EAAE;gBACH,GAAG,OAAO,CAAC,GAAG;gBACd,GAAG,CAAC,IAAI,CAAC,qBAAqB,IAAI;oBAChC,cAAc,EAAE,OAAO,CAAC,GAAG,CAAC,cAAc;oBAC1C,iBAAiB,EAAE,OAAO,CAAC,GAAG,CAAC,iBAAiB;oBAChD,SAAS,EAAE,IAAI,CAAC,QAAQ,IAAI,sBAAsB;iBACnD,CAAC;aACH;YACD,QAAQ,EAAE,YAAY;SACvB,CACF,CAAC;QAEF,MAAM,QAAQ,GAAG,wBAAwB,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;QAChE,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,OAAO,CAAC;QACxC,MAAM,QAAQ,GAAG,MAAM,aAAa,CAClC,QAAQ,EACR,YAAY,EACZ,MAAM,EACN,UAAU,EACV,MAAM,CAAC,MAAM,EACb,IAAI,CACL,CAAC;QAEF,OAAO;YACL,QAAQ;YACR,QAAQ;YACR,OAAO,EAAE,IAAI;YACb,QAAQ,EAAE,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,SAAS;SACtD,CAAC;IACJ,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,KAAK,YAAY,eAAe,EAAE,CAAC;YACrC,MAAM,KAAK,CAAC;QACd,CAAC;QAED,MAAM,IAAI,eAAe,CACvB,mBAAmB,CAAC,gBAAgB,EACpC,uCAAuC,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAA
K,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,EAAE,EAC/F,EAAE,KAAK,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,EAAE,QAAQ,EAAE,YAAY,EAAE,CAC1F,CAAC;IACJ,CAAC;YAAS,CAAC;QACT,IAAI,OAAO,EAAE,CAAC;YACZ,MAAM,cAAc,CAAC,OAAO,CAAC,CAAC;QAChC,CAAC;IACH,CAAC;AACH,CAAC;AAED;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,YAAY,CAChC,SAAmB,EACnB,UAA6B,EAAE;IAE/B,yDAAyD;IACzD,MAAM,cAAc,GAAG,CAAC,CAAC;IACzB,MAAM,OAAO,GAAuB,EAAE,CAAC;IAEvC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC,IAAI,cAAc,EAAE,CAAC;QAC1D,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,cAAc,CAAC,CAAC;QACrD,MAAM,YAAY,GAAG,MAAM,OAAO,CAAC,UAAU,CAC3C,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,iBAAiB,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC,CACtD,CAAC;QAEF,KAAK,MAAM,MAAM,IAAI,YAAY,EAAE,CAAC;YAClC,IAAI,MAAM,CAAC,MAAM,KAAK,WAAW,EAAE,CAAC;gBAClC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YAC7B,CAAC;iBAAM,CAAC;gBACN,sDAAsD;gBACtD,MAAM,KAAK,GAAG,MAAM,CAAC,MAAM,CAAC;gBAC5B,OAAO,CAAC,IAAI,CAAC;oBACX,QAAQ,EAAE,EAAE;oBACZ,QAAQ,EAAE;wBACR,QAAQ,EAAE,EAAE;wBACZ,QAAQ,EAAE,CAAC;wBACX,MAAM,EAAE,KAAK;wBACb,WAAW,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;wBACrC,UAAU,EAAE,CAAC;qBACd;oBACD,OAAO,EAAE,KAAK;oBACd,KAAK,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC;iBAC9D,CAAC,CAAC;YACL,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,wBAAwB;IAC5C,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,mBAAmB,EAAE,CAAC;QACrC,MAAM,KAAK,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,gCAAgC,CAAC,EAAE;YACpE,OAAO,EAAE,IAAI;SACd,CAAC,CAAC;QAEH,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE;YAC7B,KAAK,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,EAAE;gBACxB,OAAO,CAAC,IAAI,KAAK,CAAC,CAAC,CAAC;YACtB,CAAC,CAAC,CAAC;YAEH,KAAK,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,EAAE;gBACrB,OAAO,CAAC,KAAK,CAAC,CAAC;YACjB,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;IACL,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MarkItDown document conversion types
|
|
3
|
+
* @module @dcyfr/ai-rag/ingestion
|
|
4
|
+
*/
|
|
5
|
+
/**
|
|
6
|
+
* Supported file formats for conversion
|
|
7
|
+
*/
|
|
8
|
+
export type SupportedFormat = 'pdf' | 'docx' | 'pptx' | 'xlsx' | 'csv' | 'html' | 'htm' | 'xml' | 'json' | 'png' | 'jpg' | 'jpeg' | 'gif' | 'webp' | 'mp3' | 'wav' | 'm4a' | 'epub' | 'zip';
|
|
9
|
+
/**
|
|
10
|
+
* Options for document conversion
|
|
11
|
+
*/
|
|
12
|
+
export interface ConversionOptions {
|
|
13
|
+
/**
|
|
14
|
+
* Maximum time to wait for conversion (milliseconds)
|
|
15
|
+
* @default 30000 (30 seconds)
|
|
16
|
+
*/
|
|
17
|
+
timeout?: number;
|
|
18
|
+
/**
|
|
19
|
+
* Maximum file size to process (bytes)
|
|
20
|
+
* @default 52428800 (50MB)
|
|
21
|
+
*/
|
|
22
|
+
maxFileSize?: number;
|
|
23
|
+
/**
|
|
24
|
+
* Enable LLM-powered image descriptions (requires API key)
|
|
25
|
+
* Supports: OpenAI GPT-4 Vision, Anthropic Claude
|
|
26
|
+
* @default false
|
|
27
|
+
*/
|
|
28
|
+
enableLLMDescriptions?: boolean;
|
|
29
|
+
/**
|
|
30
|
+
* LLM model to use for image descriptions
|
|
31
|
+
* @default "gpt-4-vision-preview" or "claude-3-opus-20240229"
|
|
32
|
+
*/
|
|
33
|
+
llmModel?: string;
|
|
34
|
+
/**
|
|
35
|
+
* Preserve original file metadata in result
|
|
36
|
+
* @default true
|
|
37
|
+
*/
|
|
38
|
+
preserveMetadata?: boolean;
|
|
39
|
+
/**
|
|
40
|
+
* Working directory for temporary files
|
|
41
|
+
* Auto-created and cleaned up after conversion
|
|
42
|
+
* @default system temp directory
|
|
43
|
+
*/
|
|
44
|
+
workDir?: string;
|
|
45
|
+
}
|
|
46
|
+
/**
|
|
47
|
+
* Metadata about the converted document
|
|
48
|
+
*/
|
|
49
|
+
export interface DocumentMetadata {
|
|
50
|
+
/**
|
|
51
|
+
* Original file name
|
|
52
|
+
*/
|
|
53
|
+
fileName: string;
|
|
54
|
+
/**
|
|
55
|
+
* File size in bytes
|
|
56
|
+
*/
|
|
57
|
+
fileSize: number;
|
|
58
|
+
/**
|
|
59
|
+
* Detected or specified file format
|
|
60
|
+
*/
|
|
61
|
+
format: SupportedFormat;
|
|
62
|
+
/**
|
|
63
|
+
* Conversion timestamp (ISO 8601)
|
|
64
|
+
*/
|
|
65
|
+
convertedAt: string;
|
|
66
|
+
/**
|
|
67
|
+
* Conversion duration in milliseconds
|
|
68
|
+
*/
|
|
69
|
+
durationMs: number;
|
|
70
|
+
/**
|
|
71
|
+
* Number of pages (PDF, DOCX, PPTX) or sections
|
|
72
|
+
*/
|
|
73
|
+
pageCount?: number;
|
|
74
|
+
/**
|
|
75
|
+
* Whether LLM descriptions were used
|
|
76
|
+
*/
|
|
77
|
+
usedLLMDescriptions?: boolean;
|
|
78
|
+
/**
|
|
79
|
+
* Additional format-specific metadata
|
|
80
|
+
*/
|
|
81
|
+
[key: string]: unknown;
|
|
82
|
+
}
|
|
83
|
+
/**
|
|
84
|
+
* Result of document conversion
|
|
85
|
+
*/
|
|
86
|
+
export interface ConversionResult {
|
|
87
|
+
/**
|
|
88
|
+
* Converted markdown content
|
|
89
|
+
*/
|
|
90
|
+
markdown: string;
|
|
91
|
+
/**
|
|
92
|
+
* Document metadata
|
|
93
|
+
*/
|
|
94
|
+
metadata: DocumentMetadata;
|
|
95
|
+
/**
|
|
96
|
+
* Conversion success status
|
|
97
|
+
*/
|
|
98
|
+
success: boolean;
|
|
99
|
+
/**
|
|
100
|
+
* Error message if conversion failed
|
|
101
|
+
*/
|
|
102
|
+
error?: string;
|
|
103
|
+
/**
|
|
104
|
+
* Warning messages (non-fatal issues)
|
|
105
|
+
*/
|
|
106
|
+
warnings?: string[];
|
|
107
|
+
}
|
|
108
|
+
/**
|
|
109
|
+
* Error types for conversion failures
|
|
110
|
+
*/
|
|
111
|
+
export declare enum ConversionErrorType {
|
|
112
|
+
/** File not found or inaccessible */
|
|
113
|
+
FILE_NOT_FOUND = "FILE_NOT_FOUND",
|
|
114
|
+
/** File exceeds maximum size limit */
|
|
115
|
+
FILE_TOO_LARGE = "FILE_TOO_LARGE",
|
|
116
|
+
/** File format not supported by MarkItDown */
|
|
117
|
+
UNSUPPORTED_FORMAT = "UNSUPPORTED_FORMAT",
|
|
118
|
+
/** Conversion timeout exceeded */
|
|
119
|
+
TIMEOUT = "TIMEOUT",
|
|
120
|
+
/** Python subprocess failed to start or crashed */
|
|
121
|
+
SUBPROCESS_ERROR = "SUBPROCESS_ERROR",
|
|
122
|
+
/** Temporary directory creation/cleanup failed */
|
|
123
|
+
TEMP_DIR_ERROR = "TEMP_DIR_ERROR",
|
|
124
|
+
/** Invalid conversion options provided */
|
|
125
|
+
INVALID_OPTIONS = "INVALID_OPTIONS",
|
|
126
|
+
/** Python environment or MarkItDown not installed */
|
|
127
|
+
PYTHON_ENV_ERROR = "PYTHON_ENV_ERROR",
|
|
128
|
+
/** LLM API call failed (if enableLLMDescriptions=true) */
|
|
129
|
+
LLM_API_ERROR = "LLM_API_ERROR"
|
|
130
|
+
}
|
|
131
|
+
/**
|
|
132
|
+
* Conversion error with typed error code
|
|
133
|
+
*/
|
|
134
|
+
export declare class ConversionError extends Error {
|
|
135
|
+
readonly type: ConversionErrorType;
|
|
136
|
+
readonly details?: Record<string, unknown> | undefined;
|
|
137
|
+
constructor(type: ConversionErrorType, message: string, details?: Record<string, unknown> | undefined);
|
|
138
|
+
}
|
|
139
|
+
/**
|
|
140
|
+
* Internal subprocess communication message format
|
|
141
|
+
*/
|
|
142
|
+
export interface SubprocessMessage {
|
|
143
|
+
/** Message type: request or response */
|
|
144
|
+
type: 'request' | 'response';
|
|
145
|
+
/** File path to convert */
|
|
146
|
+
filePath?: string;
|
|
147
|
+
/** Conversion options */
|
|
148
|
+
options?: ConversionOptions;
|
|
149
|
+
/** Conversion result */
|
|
150
|
+
result?: ConversionResult;
|
|
151
|
+
/** Error information */
|
|
152
|
+
error?: {
|
|
153
|
+
type: string;
|
|
154
|
+
message: string;
|
|
155
|
+
details?: Record<string, unknown>;
|
|
156
|
+
};
|
|
157
|
+
}
|
|
158
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/ingestion/types.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH;;GAEG;AACH,MAAM,MAAM,eAAe,GACvB,KAAK,GAAG,MAAM,GAAG,MAAM,GAAG,MAAM,GAAG,KAAK,GACxC,MAAM,GAAG,KAAK,GAAG,KAAK,GAAG,MAAM,GAC/B,KAAK,GAAG,KAAK,GAAG,MAAM,GAAG,KAAK,GAAG,MAAM,GACvC,KAAK,GAAG,KAAK,GAAG,KAAK,GACrB,MAAM,GAAG,KAAK,CAAC;AAEnB;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC;;;OAGG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB;;;OAGG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IAErB;;;;OAIG;IACH,qBAAqB,CAAC,EAAE,OAAO,CAAC;IAEhC;;;OAGG;IACH,QAAQ,CAAC,EAAE,MAAM,CAAC;IAElB;;;OAGG;IACH,gBAAgB,CAAC,EAAE,OAAO,CAAC;IAE3B;;;;OAIG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;OAEG;IACH,QAAQ,EAAE,MAAM,CAAC;IAEjB;;OAEG;IACH,QAAQ,EAAE,MAAM,CAAC;IAEjB;;OAEG;IACH,MAAM,EAAE,eAAe,CAAC;IAExB;;OAEG;IACH,WAAW,EAAE,MAAM,CAAC;IAEpB;;OAEG;IACH,UAAU,EAAE,MAAM,CAAC;IAEnB;;OAEG;IACH,SAAS,CAAC,EAAE,MAAM,CAAC;IAEnB;;OAEG;IACH,mBAAmB,CAAC,EAAE,OAAO,CAAC;IAE9B;;OAEG;IACH,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC;CACxB;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;OAEG;IACH,QAAQ,EAAE,MAAM,CAAC;IAEjB;;OAEG;IACH,QAAQ,EAAE,gBAAgB,CAAC;IAE3B;;OAEG;IACH,OAAO,EAAE,OAAO,CAAC;IAEjB;;OAEG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IAEf;;OAEG;IACH,QAAQ,CAAC,EAAE,MAAM,EAAE,CAAC;CACrB;AAED;;GAEG;AACH,oBAAY,mBAAmB;IAC7B,qCAAqC;IACrC,cAAc,mBAAmB;IAEjC,sCAAsC;IACtC,cAAc,mBAAmB;IAEjC,8CAA8C;IAC9C,kBAAkB,uBAAuB;IAEzC,kCAAkC;IAClC,OAAO,YAAY;IAEnB,mDAAmD;IACnD,gBAAgB,qBAAqB;IAErC,kDAAkD;IAClD,cAAc,mBAAmB;IAEjC,0CAA0C;IAC1C,eAAe,oBAAoB;IAEnC,qDAAqD;IACrD,gBAAgB,qBAAqB;IAErC,0DAA0D;IAC1D,aAAa,kBAAkB;CAChC;AAED;;GAEG;AACH,qBAAa,eAAgB,SAAQ,KAAK;aAEtB,IAAI,EAAE,mBAAmB;aAEzB,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC;gBAFjC,IAAI,EAAE,mBAAmB,EACzC,OAAO,EAAE,MAAM,EACC,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,YAAA;CAKpD;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,wCAAwC;IACxC,IAAI,EAAE,SAAS,GAAG,UAAU,CAAC;IAE7B,2BAA2B;IAC3B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAElB,yBAAyB;IACzB,OAAO,CAAC,EAAE,iBAAiB,CAAC;IAE5B,wBAAwB;IACxB,MAAM,CAAC
,EAAE,gBAAgB,CAAC;IAE1B,wBAAwB;IACxB,KAAK,CAAC,EAAE;QACN,IAAI,EAAE,MAAM,CAAC;QACb,OAAO,EAAE,MAAM,CAAC;QAChB,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;KACnC,CAAC;CACH"}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MarkItDown document conversion types
|
|
3
|
+
* @module @dcyfr/ai-rag/ingestion
|
|
4
|
+
*/
|
|
5
|
+
/**
|
|
6
|
+
* Error types for conversion failures
|
|
7
|
+
*/
|
|
8
|
+
export var ConversionErrorType;
|
|
9
|
+
(function (ConversionErrorType) {
|
|
10
|
+
/** File not found or inaccessible */
|
|
11
|
+
ConversionErrorType["FILE_NOT_FOUND"] = "FILE_NOT_FOUND";
|
|
12
|
+
/** File exceeds maximum size limit */
|
|
13
|
+
ConversionErrorType["FILE_TOO_LARGE"] = "FILE_TOO_LARGE";
|
|
14
|
+
/** File format not supported by MarkItDown */
|
|
15
|
+
ConversionErrorType["UNSUPPORTED_FORMAT"] = "UNSUPPORTED_FORMAT";
|
|
16
|
+
/** Conversion timeout exceeded */
|
|
17
|
+
ConversionErrorType["TIMEOUT"] = "TIMEOUT";
|
|
18
|
+
/** Python subprocess failed to start or crashed */
|
|
19
|
+
ConversionErrorType["SUBPROCESS_ERROR"] = "SUBPROCESS_ERROR";
|
|
20
|
+
/** Temporary directory creation/cleanup failed */
|
|
21
|
+
ConversionErrorType["TEMP_DIR_ERROR"] = "TEMP_DIR_ERROR";
|
|
22
|
+
/** Invalid conversion options provided */
|
|
23
|
+
ConversionErrorType["INVALID_OPTIONS"] = "INVALID_OPTIONS";
|
|
24
|
+
/** Python environment or MarkItDown not installed */
|
|
25
|
+
ConversionErrorType["PYTHON_ENV_ERROR"] = "PYTHON_ENV_ERROR";
|
|
26
|
+
/** LLM API call failed (if enableLLMDescriptions=true) */
|
|
27
|
+
ConversionErrorType["LLM_API_ERROR"] = "LLM_API_ERROR";
|
|
28
|
+
})(ConversionErrorType || (ConversionErrorType = {}));
|
|
29
|
+
/**
|
|
30
|
+
* Conversion error with typed error code
|
|
31
|
+
*/
|
|
32
|
+
export class ConversionError extends Error {
|
|
33
|
+
type;
|
|
34
|
+
details;
|
|
35
|
+
constructor(type, message, details) {
|
|
36
|
+
super(message);
|
|
37
|
+
this.type = type;
|
|
38
|
+
this.details = details;
|
|
39
|
+
this.name = 'ConversionError';
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
//# sourceMappingURL=types.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/ingestion/types.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAkIH;;GAEG;AACH,MAAM,CAAN,IAAY,mBA2BX;AA3BD,WAAY,mBAAmB;IAC7B,qCAAqC;IACrC,wDAAiC,CAAA;IAEjC,sCAAsC;IACtC,wDAAiC,CAAA;IAEjC,8CAA8C;IAC9C,gEAAyC,CAAA;IAEzC,kCAAkC;IAClC,0CAAmB,CAAA;IAEnB,mDAAmD;IACnD,4DAAqC,CAAA;IAErC,kDAAkD;IAClD,wDAAiC,CAAA;IAEjC,0CAA0C;IAC1C,0DAAmC,CAAA;IAEnC,qDAAqD;IACrD,4DAAqC,CAAA;IAErC,0DAA0D;IAC1D,sDAA+B,CAAA;AACjC,CAAC,EA3BW,mBAAmB,KAAnB,mBAAmB,QA2B9B;AAED;;GAEG;AACH,MAAM,OAAO,eAAgB,SAAQ,KAAK;IAEtB;IAEA;IAHlB,YACkB,IAAyB,EACzC,OAAe,EACC,OAAiC;QAEjD,KAAK,CAAC,OAAO,CAAC,CAAC;QAJC,SAAI,GAAJ,IAAI,CAAqB;QAEzB,YAAO,GAAP,OAAO,CAA0B;QAGjD,IAAI,CAAC,IAAI,GAAG,iBAAiB,CAAC;IAChC,CAAC;CACF"}
|
|
@@ -7,21 +7,33 @@ import type { DocumentLoader, EmbeddingGenerator, VectorStore, IngestionOptions,
|
|
|
7
7
|
* Pipeline for ingesting documents into vector store
|
|
8
8
|
*/
|
|
9
9
|
export declare class IngestionPipeline {
|
|
10
|
-
private loader;
|
|
11
|
-
private embedder;
|
|
12
|
-
private store;
|
|
10
|
+
private readonly loader;
|
|
11
|
+
private readonly embedder;
|
|
12
|
+
private readonly store;
|
|
13
13
|
constructor(loader: DocumentLoader, embedder: EmbeddingGenerator, store: VectorStore);
|
|
14
14
|
/**
|
|
15
15
|
* Ingest one or more documents
|
|
16
16
|
*/
|
|
17
17
|
ingest(filePaths: string | string[], options?: IngestionOptions): Promise<IngestionResult>;
|
|
18
|
+
private loadDocumentsForPath;
|
|
19
|
+
private processAndStoreDocuments;
|
|
20
|
+
private extractErrorType;
|
|
21
|
+
private updateMemoryMetrics;
|
|
18
22
|
/**
|
|
19
23
|
* Process documents: chunk and embed
|
|
20
24
|
*/
|
|
21
25
|
private processDocuments;
|
|
22
26
|
/**
|
|
23
|
-
*
|
|
27
|
+
* Fixed-size chunking with overlap
|
|
24
28
|
*/
|
|
25
|
-
private
|
|
29
|
+
private fixedChunkDocument;
|
|
30
|
+
/**
|
|
31
|
+
* Semantic chunking preserving heading-based sections where possible
|
|
32
|
+
*/
|
|
33
|
+
private semanticChunkDocument;
|
|
34
|
+
/**
|
|
35
|
+
* Infer document type from file extension
|
|
36
|
+
*/
|
|
37
|
+
private inferDocumentType;
|
|
26
38
|
}
|
|
27
39
|
//# sourceMappingURL=pipeline.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../../../src/pipeline/ingestion/pipeline.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,KAAK,EAGV,cAAc,EACd,kBAAkB,EAClB,WAAW,EACX,gBAAgB,EAChB,eAAe,EAChB,MAAM,sBAAsB,CAAC;
|
|
1
|
+
{"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../../../src/pipeline/ingestion/pipeline.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,KAAK,EAGV,cAAc,EACd,kBAAkB,EAClB,WAAW,EACX,gBAAgB,EAChB,eAAe,EAChB,MAAM,sBAAsB,CAAC;AAG9B;;GAEG;AACH,qBAAa,iBAAiB;IAC5B,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAiB;IACxC,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAqB;IAC9C,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAc;gBAGlC,MAAM,EAAE,cAAc,EACtB,QAAQ,EAAE,kBAAkB,EAC5B,KAAK,EAAE,WAAW;IAOpB;;OAEG;IACG,MAAM,CACV,SAAS,EAAE,MAAM,GAAG,MAAM,EAAE,EAC5B,OAAO,CAAC,EAAE,gBAAgB,GACzB,OAAO,CAAC,eAAe,CAAC;YA+Fb,oBAAoB;YA0CpB,wBAAwB;IAsCtC,OAAO,CAAC,gBAAgB;IAQxB,OAAO,CAAC,mBAAmB;IAW3B;;OAEG;YACW,gBAAgB;IAmC9B;;OAEG;IACH,OAAO,CAAC,kBAAkB;IA+B1B;;OAEG;IACH,OAAO,CAAC,qBAAqB;IAsC7B;;OAEG;IACH,OAAO,CAAC,iBAAiB;CAS1B"}
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
* Document ingestion pipeline
|
|
3
3
|
* Orchestrates loading, embedding, and storage
|
|
4
4
|
*/
|
|
5
|
+
import { convertToMarkdown } from '../../ingestion/index.js';
|
|
5
6
|
/**
|
|
6
7
|
* Pipeline for ingesting documents into vector store
|
|
7
8
|
*/
|
|
@@ -20,58 +21,150 @@ export class IngestionPipeline {
|
|
|
20
21
|
async ingest(filePaths, options) {
|
|
21
22
|
const paths = Array.isArray(filePaths) ? filePaths : [filePaths];
|
|
22
23
|
const startTime = Date.now();
|
|
23
|
-
const { batchSize = 32, onProgress, loaderConfig, } = options ?? {};
|
|
24
|
+
const { batchSize = 32, onProgress, loaderConfig, enableDocumentConversion = false, conversionTimeoutMs = 30000, conversionMaxFileSize = 50 * 1024 * 1024, enableLLMDescriptions = false, chunkingStrategy = 'fixed', fixedChunkSize, fixedChunkOverlap, onConversionError, } = options ?? {};
|
|
24
25
|
let totalDocuments = 0;
|
|
25
26
|
let totalChunks = 0;
|
|
27
|
+
let successCount = 0;
|
|
28
|
+
let failureCount = 0;
|
|
26
29
|
const errors = [];
|
|
30
|
+
const conversionDurationsMs = [];
|
|
31
|
+
let peakHeapUsedBytes = process.memoryUsage().heapUsed;
|
|
32
|
+
const warnings = [];
|
|
27
33
|
for (let i = 0; i < paths.length; i++) {
|
|
28
34
|
const path = paths[i];
|
|
29
35
|
try {
|
|
30
|
-
|
|
31
|
-
|
|
36
|
+
const documents = await this.loadDocumentsForPath(path, {
|
|
37
|
+
enableDocumentConversion,
|
|
38
|
+
conversionTimeoutMs,
|
|
39
|
+
conversionMaxFileSize,
|
|
40
|
+
enableLLMDescriptions,
|
|
41
|
+
loaderConfig,
|
|
42
|
+
conversionDurationsMs,
|
|
43
|
+
});
|
|
32
44
|
totalDocuments += documents.length;
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
chunksGenerated: allChunks.length,
|
|
45
|
-
});
|
|
46
|
-
}
|
|
47
|
-
}
|
|
48
|
-
// Store chunks
|
|
49
|
-
await this.store.addDocuments(allChunks);
|
|
50
|
-
totalChunks += allChunks.length;
|
|
45
|
+
const chunkCount = await this.processAndStoreDocuments(path, documents, {
|
|
46
|
+
batchSize,
|
|
47
|
+
chunkingStrategy,
|
|
48
|
+
fixedChunkSize,
|
|
49
|
+
fixedChunkOverlap,
|
|
50
|
+
onProgress,
|
|
51
|
+
fileIndex: i,
|
|
52
|
+
totalFiles: paths.length,
|
|
53
|
+
});
|
|
54
|
+
totalChunks += chunkCount;
|
|
55
|
+
successCount++;
|
|
51
56
|
}
|
|
52
57
|
catch (error) {
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
58
|
+
failureCount++;
|
|
59
|
+
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
60
|
+
errors.push({ file: path, error: errorMessage });
|
|
61
|
+
if (enableDocumentConversion && onConversionError) {
|
|
62
|
+
onConversionError({
|
|
63
|
+
file: path,
|
|
64
|
+
error: errorMessage,
|
|
65
|
+
errorType: this.extractErrorType(error),
|
|
66
|
+
});
|
|
67
|
+
}
|
|
57
68
|
}
|
|
69
|
+
peakHeapUsedBytes = this.updateMemoryMetrics(path, peakHeapUsedBytes, warnings);
|
|
58
70
|
}
|
|
59
71
|
const endTime = Date.now();
|
|
72
|
+
const durationMs = endTime - startTime;
|
|
73
|
+
const averageConversionMs = conversionDurationsMs.length > 0
|
|
74
|
+
? conversionDurationsMs.reduce((sum, ms) => sum + ms, 0) / conversionDurationsMs.length
|
|
75
|
+
: undefined;
|
|
76
|
+
const documentsPerSecond = durationMs > 0 ? (successCount / durationMs) * 1000 : 0;
|
|
60
77
|
return {
|
|
61
78
|
documentsProcessed: totalDocuments,
|
|
79
|
+
successCount,
|
|
80
|
+
failureCount,
|
|
62
81
|
chunksGenerated: totalChunks,
|
|
63
82
|
errors,
|
|
64
|
-
|
|
83
|
+
metrics: {
|
|
84
|
+
averageConversionMs,
|
|
85
|
+
documentsPerSecond,
|
|
86
|
+
peakHeapUsedMb: peakHeapUsedBytes / (1024 * 1024),
|
|
87
|
+
warnings,
|
|
88
|
+
},
|
|
89
|
+
durationMs,
|
|
65
90
|
};
|
|
66
91
|
}
|
|
92
|
+
async loadDocumentsForPath(path, config) {
|
|
93
|
+
if (!config.enableDocumentConversion) {
|
|
94
|
+
return this.loader.load(path, config.loaderConfig);
|
|
95
|
+
}
|
|
96
|
+
const conversionStart = Date.now();
|
|
97
|
+
const converted = await convertToMarkdown(path, {
|
|
98
|
+
timeout: config.conversionTimeoutMs,
|
|
99
|
+
maxFileSize: config.conversionMaxFileSize,
|
|
100
|
+
enableLLMDescriptions: config.enableLLMDescriptions,
|
|
101
|
+
preserveMetadata: true,
|
|
102
|
+
});
|
|
103
|
+
config.conversionDurationsMs.push(Date.now() - conversionStart);
|
|
104
|
+
return [
|
|
105
|
+
{
|
|
106
|
+
id: `${path}-${Date.now()}`,
|
|
107
|
+
content: converted.markdown,
|
|
108
|
+
metadata: {
|
|
109
|
+
source: path,
|
|
110
|
+
type: this.inferDocumentType(path),
|
|
111
|
+
createdAt: new Date(),
|
|
112
|
+
conversionTimestamp: converted.metadata.convertedAt,
|
|
113
|
+
originalFileType: converted.metadata.format,
|
|
114
|
+
pageCount: converted.metadata.pageCount,
|
|
115
|
+
conversionDurationMs: converted.metadata.durationMs,
|
|
116
|
+
usedLLMDescriptions: converted.metadata.usedLLMDescriptions,
|
|
117
|
+
},
|
|
118
|
+
},
|
|
119
|
+
];
|
|
120
|
+
}
|
|
121
|
+
async processAndStoreDocuments(path, documents, config) {
|
|
122
|
+
const allChunks = [];
|
|
123
|
+
for (let j = 0; j < documents.length; j += config.batchSize) {
|
|
124
|
+
const batch = documents.slice(j, j + config.batchSize);
|
|
125
|
+
const chunks = await this.processDocuments(batch, {
|
|
126
|
+
chunkingStrategy: config.chunkingStrategy,
|
|
127
|
+
fixedChunkSize: config.fixedChunkSize,
|
|
128
|
+
fixedChunkOverlap: config.fixedChunkOverlap,
|
|
129
|
+
});
|
|
130
|
+
allChunks.push(...chunks);
|
|
131
|
+
if (config.onProgress) {
|
|
132
|
+
config.onProgress(config.fileIndex + 1, config.totalFiles, {
|
|
133
|
+
currentFile: path,
|
|
134
|
+
documentsProcessed: Math.min(j + config.batchSize, documents.length),
|
|
135
|
+
totalDocuments: documents.length,
|
|
136
|
+
chunksGenerated: allChunks.length,
|
|
137
|
+
});
|
|
138
|
+
}
|
|
139
|
+
}
|
|
140
|
+
await this.store.addDocuments(allChunks);
|
|
141
|
+
return allChunks.length;
|
|
142
|
+
}
|
|
143
|
+
extractErrorType(error) {
|
|
144
|
+
if (typeof error !== 'object' || error === null || !('type' in error)) {
|
|
145
|
+
return undefined;
|
|
146
|
+
}
|
|
147
|
+
const rawType = error.type;
|
|
148
|
+
return typeof rawType === 'string' ? rawType : undefined;
|
|
149
|
+
}
|
|
150
|
+
updateMemoryMetrics(path, currentPeak, warnings) {
|
|
151
|
+
const currentHeap = process.memoryUsage().heapUsed;
|
|
152
|
+
const nextPeak = Math.max(currentHeap, currentPeak);
|
|
153
|
+
if (currentHeap > 512 * 1024 * 1024) {
|
|
154
|
+
warnings.push(`High memory usage detected while processing ${path}: ${(currentHeap / (1024 * 1024)).toFixed(1)}MB`);
|
|
155
|
+
}
|
|
156
|
+
return nextPeak;
|
|
157
|
+
}
|
|
67
158
|
/**
|
|
68
159
|
* Process documents: chunk and embed
|
|
69
160
|
*/
|
|
70
|
-
async processDocuments(documents) {
|
|
161
|
+
async processDocuments(documents, options) {
|
|
71
162
|
const chunks = [];
|
|
72
163
|
for (const doc of documents) {
|
|
73
|
-
// Split into chunks
|
|
74
|
-
const docChunks =
|
|
164
|
+
// Split into chunks
|
|
165
|
+
const docChunks = options?.chunkingStrategy === 'semantic'
|
|
166
|
+
? this.semanticChunkDocument(doc)
|
|
167
|
+
: this.fixedChunkDocument(doc, options?.fixedChunkSize ?? 1000, options?.fixedChunkOverlap ?? 200);
|
|
75
168
|
chunks.push(...docChunks);
|
|
76
169
|
}
|
|
77
170
|
// Generate embeddings
|
|
@@ -84,11 +177,9 @@ export class IngestionPipeline {
|
|
|
84
177
|
return chunks;
|
|
85
178
|
}
|
|
86
179
|
/**
|
|
87
|
-
*
|
|
180
|
+
* Fixed-size chunking with overlap
|
|
88
181
|
*/
|
|
89
|
-
|
|
90
|
-
const chunkSize = 1000;
|
|
91
|
-
const overlap = 200;
|
|
182
|
+
fixedChunkDocument(doc, chunkSize, overlap) {
|
|
92
183
|
const chunks = [];
|
|
93
184
|
const content = doc.content;
|
|
94
185
|
for (let i = 0; i < content.length; i += chunkSize - overlap) {
|
|
@@ -114,5 +205,57 @@ export class IngestionPipeline {
|
|
|
114
205
|
}
|
|
115
206
|
return chunks;
|
|
116
207
|
}
|
|
208
|
+
/**
|
|
209
|
+
* Semantic chunking preserving heading-based sections where possible
|
|
210
|
+
*/
|
|
211
|
+
semanticChunkDocument(doc) {
|
|
212
|
+
const chunks = [];
|
|
213
|
+
const sections = doc.content
|
|
214
|
+
.split(/\n(?=#{1,6}\s)/g)
|
|
215
|
+
.filter((section) => section.trim().length > 0);
|
|
216
|
+
if (sections.length <= 1) {
|
|
217
|
+
return this.fixedChunkDocument(doc, 1000, 200);
|
|
218
|
+
}
|
|
219
|
+
let cursor = 0;
|
|
220
|
+
for (let i = 0; i < sections.length; i++) {
|
|
221
|
+
const section = sections[i];
|
|
222
|
+
const startChar = doc.content.indexOf(section, cursor);
|
|
223
|
+
const endChar = startChar + section.length;
|
|
224
|
+
const chunkId = `${doc.id}-chunk-${i}`;
|
|
225
|
+
chunks.push({
|
|
226
|
+
id: chunkId,
|
|
227
|
+
documentId: doc.id,
|
|
228
|
+
content: section,
|
|
229
|
+
index: i,
|
|
230
|
+
metadata: {
|
|
231
|
+
chunkIndex: i,
|
|
232
|
+
chunkCount: sections.length,
|
|
233
|
+
startChar,
|
|
234
|
+
endChar,
|
|
235
|
+
chunkingStrategy: 'semantic',
|
|
236
|
+
...doc.metadata,
|
|
237
|
+
},
|
|
238
|
+
});
|
|
239
|
+
cursor = endChar;
|
|
240
|
+
}
|
|
241
|
+
return chunks;
|
|
242
|
+
}
|
|
243
|
+
/**
|
|
244
|
+
* Infer document type from file extension
|
|
245
|
+
*/
|
|
246
|
+
inferDocumentType(path) {
|
|
247
|
+
const lowerPath = path.toLowerCase();
|
|
248
|
+
if (lowerPath.endsWith('.pdf'))
|
|
249
|
+
return 'pdf';
|
|
250
|
+
if (lowerPath.endsWith('.md') || lowerPath.endsWith('.markdown'))
|
|
251
|
+
return 'markdown';
|
|
252
|
+
if (lowerPath.endsWith('.html') || lowerPath.endsWith('.htm'))
|
|
253
|
+
return 'html';
|
|
254
|
+
if (lowerPath.endsWith('.json'))
|
|
255
|
+
return 'json';
|
|
256
|
+
if (lowerPath.endsWith('.txt'))
|
|
257
|
+
return 'text';
|
|
258
|
+
return 'other';
|
|
259
|
+
}
|
|
117
260
|
}
|
|
118
261
|
//# sourceMappingURL=pipeline.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"pipeline.js","sourceRoot":"","sources":["../../../src/pipeline/ingestion/pipeline.ts"],"names":[],"mappings":"AAAA;;;GAGG;
|
|
1
|
+
{"version":3,"file":"pipeline.js","sourceRoot":"","sources":["../../../src/pipeline/ingestion/pipeline.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAWH,OAAO,EAAE,iBAAiB,EAAE,MAAM,0BAA0B,CAAC;AAE7D;;GAEG;AACH,MAAM,OAAO,iBAAiB;IACX,MAAM,CAAiB;IACvB,QAAQ,CAAqB;IAC7B,KAAK,CAAc;IAEpC,YACE,MAAsB,EACtB,QAA4B,EAC5B,KAAkB;QAElB,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;QACrB,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC;QACzB,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;IACrB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,MAAM,CACV,SAA4B,EAC5B,OAA0B;QAE1B,MAAM,KAAK,GAAG,KAAK,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;QACjE,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAE7B,MAAM,EACJ,SAAS,GAAG,EAAE,EACd,UAAU,EACV,YAAY,EACZ,wBAAwB,GAAG,KAAK,EAChC,mBAAmB,GAAG,KAAK,EAC3B,qBAAqB,GAAG,EAAE,GAAG,IAAI,GAAG,IAAI,EACxC,qBAAqB,GAAG,KAAK,EAC7B,gBAAgB,GAAG,OAAO,EAC1B,cAAc,EACd,iBAAiB,EACjB,iBAAiB,GAClB,GAAG,OAAO,IAAI,EAAE,CAAC;QAElB,IAAI,cAAc,GAAG,CAAC,CAAC;QACvB,IAAI,WAAW,GAAG,CAAC,CAAC;QACpB,IAAI,YAAY,GAAG,CAAC,CAAC;QACrB,IAAI,YAAY,GAAG,CAAC,CAAC;QACrB,MAAM,MAAM,GAA2C,EAAE,CAAC;QAC1D,MAAM,qBAAqB,GAAa,EAAE,CAAC;QAC3C,IAAI,iBAAiB,GAAG,OAAO,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC;QACvD,MAAM,QAAQ,GAAa,EAAE,CAAC;QAE9B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;YAEtB,IAAI,CAAC;gBACH,MAAM,SAAS,GAAG,MAAM,IAAI,CAAC,oBAAoB,CAAC,IAAI,EAAE;oBACtD,wBAAwB;oBACxB,mBAAmB;oBACnB,qBAAqB;oBACrB,qBAAqB;oBACrB,YAAY;oBACZ,qBAAqB;iBACtB,CAAC,CAAC;gBAEH,cAAc,IAAI,SAAS,CAAC,MAAM,CAAC;gBAEnC,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,wBAAwB,CAAC,IAAI,EAAE,SAAS,EAAE;oBACtE,SAAS;oBACT,gBAAgB;oBAChB,cAAc;oBACd,iBAAiB;oBACjB,UAAU;oBACV,SAAS,EAAE,CAAC;oBACZ,UAAU,EAAE,KAAK,CAAC,MAAM;iBACzB,CAAC,CAAC;gBAEH,WAAW,IAAI,UAAU,CAAC;gBAC1B,YAAY,EAAE,CAAC;YACjB,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,YAAY,EAAE,CAAC;gBACf,MAAM,YAAY,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;gBAC5E,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,YAAY,EAAE,CAAC,CAAC;g
BAEjD,IAAI,wBAAwB,IAAI,iBAAiB,EAAE,CAAC;oBAClD,iBAAiB,CAAC;wBAChB,IAAI,EAAE,IAAI;wBACV,KAAK,EAAE,YAAY;wBACnB,SAAS,EAAE,IAAI,CAAC,gBAAgB,CAAC,KAAK,CAAC;qBACxC,CAAC,CAAC;gBACL,CAAC;YACH,CAAC;YAED,iBAAiB,GAAG,IAAI,CAAC,mBAAmB,CAAC,IAAI,EAAE,iBAAiB,EAAE,QAAQ,CAAC,CAAC;QAClF,CAAC;QAED,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAC3B,MAAM,UAAU,GAAG,OAAO,GAAG,SAAS,CAAC;QACvC,MAAM,mBAAmB,GACvB,qBAAqB,CAAC,MAAM,GAAG,CAAC;YAC9B,CAAC,CAAC,qBAAqB,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,EAAE,EAAE,EAAE,CAAC,GAAG,GAAG,EAAE,EAAE,CAAC,CAAC,GAAG,qBAAqB,CAAC,MAAM;YACvF,CAAC,CAAC,SAAS,CAAC;QAChB,MAAM,kBAAkB,GAAG,UAAU,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,YAAY,GAAG,UAAU,CAAC,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;QAEnF,OAAO;YACL,kBAAkB,EAAE,cAAc;YAClC,YAAY;YACZ,YAAY;YACZ,eAAe,EAAE,WAAW;YAC5B,MAAM;YACN,OAAO,EAAE;gBACP,mBAAmB;gBACnB,kBAAkB;gBAClB,cAAc,EAAE,iBAAiB,GAAG,CAAC,IAAI,GAAG,IAAI,CAAC;gBACjD,QAAQ;aACT;YACD,UAAU;SACX,CAAC;IACJ,CAAC;IAEO,KAAK,CAAC,oBAAoB,CAChC,IAAY,EACZ,MAOC;QAED,IAAI,CAAC,MAAM,CAAC,wBAAwB,EAAE,CAAC;YACrC,OAAO,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,EAAE,MAAM,CAAC,YAAY,CAAC,CAAC;QACrD,CAAC;QAED,MAAM,eAAe,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QACnC,MAAM,SAAS,GAAG,MAAM,iBAAiB,CAAC,IAAI,EAAE;YAC9C,OAAO,EAAE,MAAM,CAAC,mBAAmB;YACnC,WAAW,EAAE,MAAM,CAAC,qBAAqB;YACzC,qBAAqB,EAAE,MAAM,CAAC,qBAAqB;YACnD,gBAAgB,EAAE,IAAI;SACvB,CAAC,CAAC;QACH,MAAM,CAAC,qBAAqB,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,GAAG,eAAe,CAAC,CAAC;QAEhE,OAAO;YACL;gBACE,EAAE,EAAE,GAAG,IAAI,IAAI,IAAI,CAAC,GAAG,EAAE,EAAE;gBAC3B,OAAO,EAAE,SAAS,CAAC,QAAQ;gBAC3B,QAAQ,EAAE;oBACR,MAAM,EAAE,IAAI;oBACZ,IAAI,EAAE,IAAI,CAAC,iBAAiB,CAAC,IAAI,CAAC;oBAClC,SAAS,EAAE,IAAI,IAAI,EAAE;oBACrB,mBAAmB,EAAE,SAAS,CAAC,QAAQ,CAAC,WAAW;oBACnD,gBAAgB,EAAE,SAAS,CAAC,QAAQ,CAAC,MAAM;oBAC3C,SAAS,EAAE,SAAS,CAAC,QAAQ,CAAC,SAAS;oBACvC,oBAAoB,EAAE,SAAS,CAAC,QAAQ,CAAC,UAAU;oBACnD,mBAAmB,EAAE,SAAS,CAAC,QAAQ,CAAC,mBAAmB;iBAC5D;aACF;SACF,CAAC;IACJ,CAAC;IAEO,KAAK,CAAC,wBAAwB,CACpC,IAAY,EACZ,SAAqB,EACrB,MAQC;QAED,MAAM,SAAS,GAAoB,EAAE,CAAC;QAEtC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CA
AC,MAAM,EAAE,CAAC,IAAI,MAAM,CAAC,SAAS,EAAE,CAAC;YAC5D,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,SAAS,CAAC,CAAC;YACvD,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,gBAAgB,CAAC,KAAK,EAAE;gBAChD,gBAAgB,EAAE,MAAM,CAAC,gBAAgB;gBACzC,cAAc,EAAE,MAAM,CAAC,cAAc;gBACrC,iBAAiB,EAAE,MAAM,CAAC,iBAAiB;aAC5C,CAAC,CAAC;YACH,SAAS,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,CAAC;YAE1B,IAAI,MAAM,CAAC,UAAU,EAAE,CAAC;gBACtB,MAAM,CAAC,UAAU,CAAC,MAAM,CAAC,SAAS,GAAG,CAAC,EAAE,MAAM,CAAC,UAAU,EAAE;oBACzD,WAAW,EAAE,IAAI;oBACjB,kBAAkB,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,MAAM,CAAC,SAAS,EAAE,SAAS,CAAC,MAAM,CAAC;oBACpE,cAAc,EAAE,SAAS,CAAC,MAAM;oBAChC,eAAe,EAAE,SAAS,CAAC,MAAM;iBAClC,CAAC,CAAC;YACL,CAAC;QACH,CAAC;QAED,MAAM,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,SAAS,CAAC,CAAC;QACzC,OAAO,SAAS,CAAC,MAAM,CAAC;IAC1B,CAAC;IAEO,gBAAgB,CAAC,KAAc;QACrC,IAAI,OAAO,KAAK,KAAK,QAAQ,IAAI,KAAK,KAAK,IAAI,IAAI,CAAC,CAAC,MAAM,IAAI,KAAK,CAAC,EAAE,CAAC;YACtE,OAAO,SAAS,CAAC;QACnB,CAAC;QACD,MAAM,OAAO,GAAI,KAA4B,CAAC,IAAI,CAAC;QACnD,OAAO,OAAO,OAAO,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,SAAS,CAAC;IAC3D,CAAC;IAEO,mBAAmB,CAAC,IAAY,EAAE,WAAmB,EAAE,QAAkB;QAC/E,MAAM,WAAW,GAAG,OAAO,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC;QACnD,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,WAAW,EAAE,WAAW,CAAC,CAAC;QACpD,IAAI,WAAW,GAAG,GAAG,GAAG,IAAI,GAAG,IAAI,EAAE,CAAC;YACpC,QAAQ,CAAC,IAAI,CACX,+CAA+C,IAAI,KAAK,CAAC,WAAW,GAAG,CAAC,IAAI,GAAG,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CACrG,CAAC;QACJ,CAAC;QACD,OAAO,QAAQ,CAAC;IAClB,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,gBAAgB,CAC5B,SAAqB,EACrB,OAIC;QAED,MAAM,MAAM,GAAoB,EAAE,CAAC;QAEnC,KAAK,MAAM,GAAG,IAAI,SAAS,EAAE,CAAC;YAC5B,oBAAoB;YACpB,MAAM,SAAS,GACb,OAAO,EAAE,gBAAgB,KAAK,UAAU;gBACtC,CAAC,CAAC,IAAI,CAAC,qBAAqB,CAAC,GAAG,CAAC;gBACjC,CAAC,CAAC,IAAI,CAAC,kBAAkB,CACrB,GAAG,EACH,OAAO,EAAE,cAAc,IAAI,IAAI,EAC/B,OAAO,EAAE,iBAAiB,IAAI,GAAG,CAClC,CAAC;YACR,MAAM,CAAC,IAAI,CAAC,GAAG,SAAS,CAAC,CAAC;QAC5B,CAAC;QAED,sBAAsB;QACtB,MAAM,KAAK,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QACnD,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC
,QAAQ,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;QAEpD,oBAAoB;QACpB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACvC,MAAM,CAAC,CAAC,CAAC,CAAC,SAAS,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;QACtC,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACK,kBAAkB,CAAC,GAAa,EAAE,SAAiB,EAAE,OAAe;QAC1E,MAAM,MAAM,GAAoB,EAAE,CAAC;QACnC,MAAM,OAAO,GAAG,GAAG,CAAC,OAAO,CAAC;QAE5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,IAAI,SAAS,GAAG,OAAO,EAAE,CAAC;YAC7D,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,CAAC;YAC9C,MAAM,OAAO,GAAG,GAAG,GAAG,CAAC,EAAE,UAAU,MAAM,CAAC,MAAM,EAAE,CAAC;YAEnD,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE,EAAE,OAAO;gBACX,UAAU,EAAE,GAAG,CAAC,EAAE;gBAClB,OAAO,EAAE,KAAK;gBACd,KAAK,EAAE,MAAM,CAAC,MAAM;gBACpB,QAAQ,EAAE;oBACR,UAAU,EAAE,MAAM,CAAC,MAAM;oBACzB,UAAU,EAAE,CAAC,EAAE,oBAAoB;oBACnC,SAAS,EAAE,CAAC;oBACZ,OAAO,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,SAAS,EAAE,OAAO,CAAC,MAAM,CAAC;oBAChD,GAAG,GAAG,CAAC,QAAQ;iBAChB;aACF,CAAC,CAAC;QACL,CAAC;QAED,sBAAsB;QACtB,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;YAC3B,KAAK,CAAC,QAAQ,CAAC,UAAU,GAAG,MAAM,CAAC,MAAM,CAAC;QAC5C,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACK,qBAAqB,CAAC,GAAa;QACzC,MAAM,MAAM,GAAoB,EAAE,CAAC;QACnC,MAAM,QAAQ,GAAG,GAAG,CAAC,OAAO;aACzB,KAAK,CAAC,iBAAiB,CAAC;aACxB,MAAM,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QAElD,IAAI,QAAQ,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;YACzB,OAAO,IAAI,CAAC,kBAAkB,CAAC,GAAG,EAAE,IAAI,EAAE,GAAG,CAAC,CAAC;QACjD,CAAC;QAED,IAAI,MAAM,GAAG,CAAC,CAAC;QACf,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACzC,MAAM,OAAO,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC;YAC5B,MAAM,SAAS,GAAG,GAAG,CAAC,OAAO,CAAC,OAAO,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC;YACvD,MAAM,OAAO,GAAG,SAAS,GAAG,OAAO,CAAC,MAAM,CAAC;YAC3C,MAAM,OAAO,GAAG,GAAG,GAAG,CAAC,EAAE,UAAU,CAAC,EAAE,CAAC;YAEvC,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE,EAAE,OAAO;gBACX,UAAU,EAAE,GAAG,CAAC,EAAE;gBAClB,OAAO,EAAE,OAAO;gBAChB,KAAK,EAAE,CAAC;gBACR,QAAQ,EAAE;oBACR,UAAU,EAAE,CAAC;oBACb,UA
AU,EAAE,QAAQ,CAAC,MAAM;oBAC3B,SAAS;oBACT,OAAO;oBACP,gBAAgB,EAAE,UAAU;oBAC5B,GAAG,GAAG,CAAC,QAAQ;iBAChB;aACF,CAAC,CAAC;YAEH,MAAM,GAAG,OAAO,CAAC;QACnB,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACK,iBAAiB,CAAC,IAAY;QACpC,MAAM,SAAS,GAAG,IAAI,CAAC,WAAW,EAAE,CAAC;QACrC,IAAI,SAAS,CAAC,QAAQ,CAAC,MAAM,CAAC;YAAE,OAAO,KAAK,CAAC;QAC7C,IAAI,SAAS,CAAC,QAAQ,CAAC,KAAK,CAAC,IAAI,SAAS,CAAC,QAAQ,CAAC,WAAW,CAAC;YAAE,OAAO,UAAU,CAAC;QACpF,IAAI,SAAS,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,SAAS,CAAC,QAAQ,CAAC,MAAM,CAAC;YAAE,OAAO,MAAM,CAAC;QAC7E,IAAI,SAAS,CAAC,QAAQ,CAAC,OAAO,CAAC;YAAE,OAAO,MAAM,CAAC;QAC/C,IAAI,SAAS,CAAC,QAAQ,CAAC,MAAM,CAAC;YAAE,OAAO,MAAM,CAAC;QAC9C,OAAO,OAAO,CAAC;IACjB,CAAC;CACF"}
|
package/dist/types/index.d.ts
CHANGED
|
@@ -232,6 +232,26 @@ export interface IngestionOptions {
|
|
|
232
232
|
batchSize?: number;
|
|
233
233
|
/** Loader configuration */
|
|
234
234
|
loaderConfig?: LoaderConfig;
|
|
235
|
+
/** Enable MarkItDown preprocessing for non-text documents */
|
|
236
|
+
enableDocumentConversion?: boolean;
|
|
237
|
+
/** Conversion timeout in milliseconds */
|
|
238
|
+
conversionTimeoutMs?: number;
|
|
239
|
+
/** Conversion file size limit in bytes */
|
|
240
|
+
conversionMaxFileSize?: number;
|
|
241
|
+
/** Enable LLM image descriptions during conversion */
|
|
242
|
+
enableLLMDescriptions?: boolean;
|
|
243
|
+
/** Chunking strategy */
|
|
244
|
+
chunkingStrategy?: 'semantic' | 'fixed';
|
|
245
|
+
/** Fixed chunk size (characters) */
|
|
246
|
+
fixedChunkSize?: number;
|
|
247
|
+
/** Fixed chunk overlap (characters) */
|
|
248
|
+
fixedChunkOverlap?: number;
|
|
249
|
+
/** Callback for conversion errors */
|
|
250
|
+
onConversionError?: (error: {
|
|
251
|
+
file: string;
|
|
252
|
+
error: string;
|
|
253
|
+
errorType?: string;
|
|
254
|
+
}) => void;
|
|
235
255
|
/** Progress callback */
|
|
236
256
|
onProgress?: (current: number, total: number, details?: {
|
|
237
257
|
currentFile: string;
|
|
@@ -246,6 +266,10 @@ export interface IngestionOptions {
|
|
|
246
266
|
export interface IngestionResult {
|
|
247
267
|
/** Number of documents processed */
|
|
248
268
|
documentsProcessed: number;
|
|
269
|
+
/** Number of input files successfully processed */
|
|
270
|
+
successCount: number;
|
|
271
|
+
/** Number of input files that failed processing */
|
|
272
|
+
failureCount: number;
|
|
249
273
|
/** Number of chunks generated */
|
|
250
274
|
chunksGenerated: number;
|
|
251
275
|
/** Errors during ingestion */
|
|
@@ -253,6 +277,17 @@ export interface IngestionResult {
|
|
|
253
277
|
file: string;
|
|
254
278
|
error: string;
|
|
255
279
|
}>;
|
|
280
|
+
/** Performance and resource metrics */
|
|
281
|
+
metrics?: {
|
|
282
|
+
/** Average conversion duration per file in milliseconds */
|
|
283
|
+
averageConversionMs?: number;
|
|
284
|
+
/** Documents converted per second */
|
|
285
|
+
documentsPerSecond?: number;
|
|
286
|
+
/** Peak heap usage in MB during ingestion */
|
|
287
|
+
peakHeapUsedMb?: number;
|
|
288
|
+
/** Warnings generated during ingestion */
|
|
289
|
+
warnings?: string[];
|
|
290
|
+
};
|
|
256
291
|
/** Total duration in milliseconds */
|
|
257
292
|
durationMs: number;
|
|
258
293
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/types/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH;;GAEG;AACH,MAAM,WAAW,QAAQ;IACvB,wBAAwB;IACxB,EAAE,EAAE,MAAM,CAAC;IACX,uBAAuB;IACvB,OAAO,EAAE,MAAM,CAAC;IAChB,wBAAwB;IACxB,QAAQ,EAAE,gBAAgB,CAAC;IAC3B,sCAAsC;IACtC,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC;CACtB;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,8BAA8B;IAC9B,MAAM,EAAE,MAAM,CAAC;IACf,oBAAoB;IACpB,IAAI,EAAE,KAAK,GAAG,UAAU,GAAG,MAAM,GAAG,MAAM,GAAG,MAAM,GAAG,OAAO,CAAC;IAC9D,yBAAyB;IACzB,SAAS,EAAE,IAAI,CAAC;IAChB,8BAA8B;IAC9B,SAAS,CAAC,EAAE,IAAI,CAAC;IACjB,yBAAyB;IACzB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,qBAAqB;IACrB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,iCAAiC;IACjC,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC;CACxB;AAED;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,uBAAuB;IACvB,EAAE,EAAE,MAAM,CAAC;IACX,yBAAyB;IACzB,UAAU,EAAE,MAAM,CAAC;IACnB,oBAAoB;IACpB,OAAO,EAAE,MAAM,CAAC;IAChB,8BAA8B;IAC9B,KAAK,EAAE,MAAM,CAAC;IACd,qBAAqB;IACrB,QAAQ,EAAE,aAAa,CAAC;IACxB,uBAAuB;IACvB,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC;CACtB;AAED;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,kBAAkB;IAClB,UAAU,EAAE,MAAM,CAAC;IACnB,+BAA+B;IAC/B,UAAU,EAAE,MAAM,CAAC;IACnB,oDAAoD;IACpD,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,kDAAkD;IAClD,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,yBAAyB;IACzB,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,iCAAiC;IACjC,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,+CAA+C;IAC/C,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC;CACxB;AAED;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,+BAA+B;IAC/B,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,kCAAkC;IAClC,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,qCAAqC;IACrC,kBAAkB,CAAC,EAAE,OAAO,CAAC;IAC7B,6BAA6B;IAC7B,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CACpC;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,gCAAgC;IAChC,IAAI,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,YAAY,GAAG,OAAO,CAAC,QAAQ,EAAE,CAAC,CAAC;IACjE,gCAAgC;IAChC,mBAAmB,EAAE,MAAM,EAAE,CAAC;CAC/B;AAED;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,+BAA+B;IAC/B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,2BAA2B;IAC3B,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,gCAAgC;IAChC,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,mCAAmC;IACnC,KAAK,CAAC,K
AAK,EAAE,MAAM,EAAE,EAAE,MAAM,CAAC,EAAE,eAAe,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;IACtE,+BAA+B;IAC/B,aAAa,IAAI,MAAM,CAAC;CACzB;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,4BAA4B;IAC5B,cAAc,EAAE,MAAM,CAAC;IACvB,2BAA2B;IAC3B,mBAAmB,EAAE,MAAM,CAAC;IAC5B,sBAAsB;IACtB,cAAc,CAAC,EAAE,QAAQ,GAAG,WAAW,GAAG,KAAK,CAAC;IAChD,2CAA2C;IAC3C,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,iCAAiC;IACjC,YAAY,CAAC,SAAS,EAAE,aAAa,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IACxD,mCAAmC;IACnC,MAAM,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,EAAE,EAAE,KAAK,CAAC,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,cAAc,GAAG,OAAO,CAAC,YAAY,EAAE,CAAC,CAAC;IACnG,6BAA6B;IAC7B,eAAe,CAAC,GAAG,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IAC9C,sBAAsB;IACtB,cAAc,CAAC,EAAE,EAAE,MAAM,EAAE,QAAQ,EAAE,OAAO,CAAC,aAAa,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IAC5E,yBAAyB;IACzB,WAAW,CAAC,EAAE,EAAE,MAAM,GAAG,OAAO,CAAC,aAAa,GAAG,IAAI,CAAC,CAAC;IACvD,0BAA0B;IAC1B,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;CACxB;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,yBAAyB;IACzB,KAAK,EAAE,MAAM,CAAC;IACd,eAAe;IACf,QAAQ,EAAE,IAAI,GAAG,IAAI,GAAG,IAAI,GAAG,KAAK,GAAG,IAAI,GAAG,KAAK,GAAG,IAAI,GAAG,KAAK,CAAC;IACnE,uBAAuB;IACvB,KAAK,EAAE,OAAO,CAAC;CAChB;AAED;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,8BAA8B;IAC9B,QAAQ,EAAE,aAAa,CAAC;IACxB,qDAAqD;IACrD,KAAK,EAAE,MAAM,CAAC;IACd,kDAAkD;IAClD,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,SAAS;IACxB,iCAAiC;IACjC,WAAW,EAAE,iBAAiB,CAAC;IAC/B,8BAA8B;IAC9B,SAAS,CAAC,EAAE,eAAe,CAAC;IAC5B,2BAA2B;IAC3B,MAAM,CAAC,EAAE,YAAY,CAAC;IACtB,8BAA8B;IAC9B,SAAS,CAAC,EAAE,eAAe,CAAC;CAC7B;AAED;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,oCAAoC;IACpC,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,yCAAyC;IACzC,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,gCAAgC;IAChC,MAAM,CAAC,EAAE,OAAO,CAAC;IACjB,gCAAgC;IAChC,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,oCAAoC;IACpC,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,+CAA+C;IAC/C,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,uBAAuB;IACvB,MAAM,CAAC,EAAE,cAAc,CAAC;IACxB,kCAAkC;IAClC,eAAe,CAAC,EAAE,OAAO,CAAC;IAC1B,qBAAqB;IACrB,MAAM,CAAC,EAAE,OAAO,CAAC;CAClB;AA
ED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,qBAAqB;IACrB,KAAK,EAAE,MAAM,CAAC;IACd,qBAAqB;IACrB,OAAO,EAAE,YAAY,EAAE,CAAC;IACxB,wBAAwB;IACxB,OAAO,EAAE,MAAM,CAAC;IAChB,+BAA+B;IAC/B,QAAQ,EAAE;QACR,6BAA6B;QAC7B,YAAY,EAAE,MAAM,CAAC;QACrB,iCAAiC;QACjC,UAAU,EAAE,MAAM,CAAC;QACnB,8BAA8B;QAC9B,YAAY,EAAE,MAAM,CAAC;KACtB,CAAC;CACH;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,gCAAgC;IAChC,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,2BAA2B;IAC3B,YAAY,CAAC,EAAE,YAAY,CAAC;IAC5B,wBAAwB;IACxB,UAAU,CAAC,EAAE,CACX,OAAO,EAAE,MAAM,EACf,KAAK,EAAE,MAAM,EACb,OAAO,CAAC,EAAE;QACR,WAAW,EAAE,MAAM,CAAC;QACpB,kBAAkB,EAAE,MAAM,CAAC;QAC3B,cAAc,EAAE,MAAM,CAAC;QACvB,eAAe,EAAE,MAAM,CAAC;KACzB,KACE,IAAI,CAAC;CACX;AAED;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,oCAAoC;IACpC,kBAAkB,EAAE,MAAM,CAAC;IAC3B,iCAAiC;IACjC,eAAe,EAAE,MAAM,CAAC;IACxB,8BAA8B;IAC9B,MAAM,EAAE,KAAK,CAAC;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAC/C,qCAAqC;IACrC,UAAU,EAAE,MAAM,CAAC;CACpB"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/types/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH;;GAEG;AACH,MAAM,WAAW,QAAQ;IACvB,wBAAwB;IACxB,EAAE,EAAE,MAAM,CAAC;IACX,uBAAuB;IACvB,OAAO,EAAE,MAAM,CAAC;IAChB,wBAAwB;IACxB,QAAQ,EAAE,gBAAgB,CAAC;IAC3B,sCAAsC;IACtC,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC;CACtB;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,8BAA8B;IAC9B,MAAM,EAAE,MAAM,CAAC;IACf,oBAAoB;IACpB,IAAI,EAAE,KAAK,GAAG,UAAU,GAAG,MAAM,GAAG,MAAM,GAAG,MAAM,GAAG,OAAO,CAAC;IAC9D,yBAAyB;IACzB,SAAS,EAAE,IAAI,CAAC;IAChB,8BAA8B;IAC9B,SAAS,CAAC,EAAE,IAAI,CAAC;IACjB,yBAAyB;IACzB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,qBAAqB;IACrB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,iCAAiC;IACjC,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC;CACxB;AAED;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,uBAAuB;IACvB,EAAE,EAAE,MAAM,CAAC;IACX,yBAAyB;IACzB,UAAU,EAAE,MAAM,CAAC;IACnB,oBAAoB;IACpB,OAAO,EAAE,MAAM,CAAC;IAChB,8BAA8B;IAC9B,KAAK,EAAE,MAAM,CAAC;IACd,qBAAqB;IACrB,QAAQ,EAAE,aAAa,CAAC;IACxB,uBAAuB;IACvB,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC;CACtB;AAED;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,kBAAkB;IAClB,UAAU,EAAE,MAAM,CAAC;IACnB,+BAA+B;IAC/B,UAAU,EAAE,MAAM,CAAC;IACnB,oDAAoD;IACpD,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,kDAAkD;IAClD,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,yBAAyB;IACzB,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,iCAAiC;IACjC,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,+CAA+C;IAC/C,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC;CACxB;AAED;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,+BAA+B;IAC/B,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,kCAAkC;IAClC,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,qCAAqC;IACrC,kBAAkB,CAAC,EAAE,OAAO,CAAC;IAC7B,6BAA6B;IAC7B,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CACpC;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,gCAAgC;IAChC,IAAI,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,YAAY,GAAG,OAAO,CAAC,QAAQ,EAAE,CAAC,CAAC;IACjE,gCAAgC;IAChC,mBAAmB,EAAE,MAAM,EAAE,CAAC;CAC/B;AAED;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,+BAA+B;IAC/B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,2BAA2B;IAC3B,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,gCAAgC;IAChC,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,mCAAmC;IACnC,KAAK,CAAC,K
AAK,EAAE,MAAM,EAAE,EAAE,MAAM,CAAC,EAAE,eAAe,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;IACtE,+BAA+B;IAC/B,aAAa,IAAI,MAAM,CAAC;CACzB;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,4BAA4B;IAC5B,cAAc,EAAE,MAAM,CAAC;IACvB,2BAA2B;IAC3B,mBAAmB,EAAE,MAAM,CAAC;IAC5B,sBAAsB;IACtB,cAAc,CAAC,EAAE,QAAQ,GAAG,WAAW,GAAG,KAAK,CAAC;IAChD,2CAA2C;IAC3C,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,iCAAiC;IACjC,YAAY,CAAC,SAAS,EAAE,aAAa,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IACxD,mCAAmC;IACnC,MAAM,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,EAAE,EAAE,KAAK,CAAC,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,cAAc,GAAG,OAAO,CAAC,YAAY,EAAE,CAAC,CAAC;IACnG,6BAA6B;IAC7B,eAAe,CAAC,GAAG,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IAC9C,sBAAsB;IACtB,cAAc,CAAC,EAAE,EAAE,MAAM,EAAE,QAAQ,EAAE,OAAO,CAAC,aAAa,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IAC5E,yBAAyB;IACzB,WAAW,CAAC,EAAE,EAAE,MAAM,GAAG,OAAO,CAAC,aAAa,GAAG,IAAI,CAAC,CAAC;IACvD,0BAA0B;IAC1B,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;CACxB;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,yBAAyB;IACzB,KAAK,EAAE,MAAM,CAAC;IACd,eAAe;IACf,QAAQ,EAAE,IAAI,GAAG,IAAI,GAAG,IAAI,GAAG,KAAK,GAAG,IAAI,GAAG,KAAK,GAAG,IAAI,GAAG,KAAK,CAAC;IACnE,uBAAuB;IACvB,KAAK,EAAE,OAAO,CAAC;CAChB;AAED;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,8BAA8B;IAC9B,QAAQ,EAAE,aAAa,CAAC;IACxB,qDAAqD;IACrD,KAAK,EAAE,MAAM,CAAC;IACd,kDAAkD;IAClD,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,SAAS;IACxB,iCAAiC;IACjC,WAAW,EAAE,iBAAiB,CAAC;IAC/B,8BAA8B;IAC9B,SAAS,CAAC,EAAE,eAAe,CAAC;IAC5B,2BAA2B;IAC3B,MAAM,CAAC,EAAE,YAAY,CAAC;IACtB,8BAA8B;IAC9B,SAAS,CAAC,EAAE,eAAe,CAAC;CAC7B;AAED;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,oCAAoC;IACpC,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,yCAAyC;IACzC,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,gCAAgC;IAChC,MAAM,CAAC,EAAE,OAAO,CAAC;IACjB,gCAAgC;IAChC,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,oCAAoC;IACpC,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,+CAA+C;IAC/C,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,uBAAuB;IACvB,MAAM,CAAC,EAAE,cAAc,CAAC;IACxB,kCAAkC;IAClC,eAAe,CAAC,EAAE,OAAO,CAAC;IAC1B,qBAAqB;IACrB,MAAM,CAAC,EAAE,OAAO,CAAC;CAClB;AA
ED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,qBAAqB;IACrB,KAAK,EAAE,MAAM,CAAC;IACd,qBAAqB;IACrB,OAAO,EAAE,YAAY,EAAE,CAAC;IACxB,wBAAwB;IACxB,OAAO,EAAE,MAAM,CAAC;IAChB,+BAA+B;IAC/B,QAAQ,EAAE;QACR,6BAA6B;QAC7B,YAAY,EAAE,MAAM,CAAC;QACrB,iCAAiC;QACjC,UAAU,EAAE,MAAM,CAAC;QACnB,8BAA8B;QAC9B,YAAY,EAAE,MAAM,CAAC;KACtB,CAAC;CACH;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,gCAAgC;IAChC,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,2BAA2B;IAC3B,YAAY,CAAC,EAAE,YAAY,CAAC;IAC5B,6DAA6D;IAC7D,wBAAwB,CAAC,EAAE,OAAO,CAAC;IACnC,yCAAyC;IACzC,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAC7B,0CAA0C;IAC1C,qBAAqB,CAAC,EAAE,MAAM,CAAC;IAC/B,sDAAsD;IACtD,qBAAqB,CAAC,EAAE,OAAO,CAAC;IAChC,wBAAwB;IACxB,gBAAgB,CAAC,EAAE,UAAU,GAAG,OAAO,CAAC;IACxC,oCAAoC;IACpC,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,uCAAuC;IACvC,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,qCAAqC;IACrC,iBAAiB,CAAC,EAAE,CAAC,KAAK,EAAE;QAC1B,IAAI,EAAE,MAAM,CAAC;QACb,KAAK,EAAE,MAAM,CAAC;QACd,SAAS,CAAC,EAAE,MAAM,CAAC;KACpB,KAAK,IAAI,CAAC;IACX,wBAAwB;IACxB,UAAU,CAAC,EAAE,CACX,OAAO,EAAE,MAAM,EACf,KAAK,EAAE,MAAM,EACb,OAAO,CAAC,EAAE;QACR,WAAW,EAAE,MAAM,CAAC;QACpB,kBAAkB,EAAE,MAAM,CAAC;QAC3B,cAAc,EAAE,MAAM,CAAC;QACvB,eAAe,EAAE,MAAM,CAAC;KACzB,KACE,IAAI,CAAC;CACX;AAED;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,oCAAoC;IACpC,kBAAkB,EAAE,MAAM,CAAC;IAC3B,mDAAmD;IACnD,YAAY,EAAE,MAAM,CAAC;IACrB,mDAAmD;IACnD,YAAY,EAAE,MAAM,CAAC;IACrB,iCAAiC;IACjC,eAAe,EAAE,MAAM,CAAC;IACxB,8BAA8B;IAC9B,MAAM,EAAE,KAAK,CAAC;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAC/C,uCAAuC;IACvC,OAAO,CAAC,EAAE;QACR,2DAA2D;QAC3D,mBAAmB,CAAC,EAAE,MAAM,CAAC;QAC7B,qCAAqC;QACrC,kBAAkB,CAAC,EAAE,MAAM,CAAC;QAC5B,6CAA6C;QAC7C,cAAc,CAAC,EAAE,MAAM,CAAC;QACxB,0CAA0C;QAC1C,QAAQ,CAAC,EAAE,MAAM,EAAE,CAAC;KACrB,CAAC;IACF,qCAAqC;IACrC,UAAU,EAAE,MAAM,CAAC;CACpB"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@dcyfr/ai-rag",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.1.0",
|
|
4
4
|
"description": "RAG (Retrieval-Augmented Generation) system template - DCYFR AI starter",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -77,7 +77,7 @@
|
|
|
77
77
|
"zod": "^3.22.4"
|
|
78
78
|
},
|
|
79
79
|
"peerDependencies": {
|
|
80
|
-
"@dcyfr/ai": "^1.0
|
|
80
|
+
"@dcyfr/ai": "^2.1.0",
|
|
81
81
|
"chromadb": "^1.8.0"
|
|
82
82
|
},
|
|
83
83
|
"peerDependenciesMeta": {
|
|
@@ -90,14 +90,13 @@
|
|
|
90
90
|
},
|
|
91
91
|
"devDependencies": {
|
|
92
92
|
"@changesets/changelog-github": "^0.5.2",
|
|
93
|
-
"@changesets/cli": "^2.
|
|
94
|
-
"@types/node": "^
|
|
95
|
-
"@typescript-eslint/eslint-plugin": "^6.19.0",
|
|
96
|
-
"@typescript-eslint/parser": "^6.19.0",
|
|
93
|
+
"@changesets/cli": "^2.30.0",
|
|
94
|
+
"@types/node": "^22.0.0",
|
|
97
95
|
"@vitest/coverage-v8": "^4.0.18",
|
|
98
|
-
"eslint": "^
|
|
96
|
+
"eslint": "^9.39.2",
|
|
99
97
|
"tsx": "^4.7.0",
|
|
100
98
|
"typescript": "^5.3.3",
|
|
99
|
+
"typescript-eslint": "^8.56.0",
|
|
101
100
|
"vitest": "^4.0.18"
|
|
102
101
|
},
|
|
103
102
|
"engines": {
|