rag-lite-ts 1.0.2 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +606 -93
- package/dist/cli/indexer.js +192 -4
- package/dist/cli/search.js +50 -11
- package/dist/cli.js +183 -26
- package/dist/core/abstract-embedder.d.ts +125 -0
- package/dist/core/abstract-embedder.js +264 -0
- package/dist/core/actionable-error-messages.d.ts +60 -0
- package/dist/core/actionable-error-messages.js +397 -0
- package/dist/core/batch-processing-optimizer.d.ts +155 -0
- package/dist/core/batch-processing-optimizer.js +541 -0
- package/dist/core/chunker.d.ts +2 -0
- package/dist/core/cli-database-utils.d.ts +53 -0
- package/dist/core/cli-database-utils.js +239 -0
- package/dist/core/config.js +10 -3
- package/dist/core/content-errors.d.ts +111 -0
- package/dist/core/content-errors.js +362 -0
- package/dist/core/content-manager.d.ts +343 -0
- package/dist/core/content-manager.js +1504 -0
- package/dist/core/content-performance-optimizer.d.ts +150 -0
- package/dist/core/content-performance-optimizer.js +516 -0
- package/dist/core/content-resolver.d.ts +104 -0
- package/dist/core/content-resolver.js +285 -0
- package/dist/core/cross-modal-search.d.ts +164 -0
- package/dist/core/cross-modal-search.js +342 -0
- package/dist/core/database-connection-manager.d.ts +109 -0
- package/dist/core/database-connection-manager.js +304 -0
- package/dist/core/db.d.ts +141 -2
- package/dist/core/db.js +631 -89
- package/dist/core/embedder-factory.d.ts +176 -0
- package/dist/core/embedder-factory.js +338 -0
- package/dist/core/index.d.ts +3 -1
- package/dist/core/index.js +4 -1
- package/dist/core/ingestion.d.ts +85 -15
- package/dist/core/ingestion.js +510 -45
- package/dist/core/lazy-dependency-loader.d.ts +152 -0
- package/dist/core/lazy-dependency-loader.js +453 -0
- package/dist/core/mode-detection-service.d.ts +150 -0
- package/dist/core/mode-detection-service.js +565 -0
- package/dist/core/mode-model-validator.d.ts +92 -0
- package/dist/core/mode-model-validator.js +203 -0
- package/dist/core/model-registry.d.ts +120 -0
- package/dist/core/model-registry.js +415 -0
- package/dist/core/model-validator.d.ts +217 -0
- package/dist/core/model-validator.js +782 -0
- package/dist/core/polymorphic-search-factory.d.ts +154 -0
- package/dist/core/polymorphic-search-factory.js +344 -0
- package/dist/core/raglite-paths.d.ts +121 -0
- package/dist/core/raglite-paths.js +145 -0
- package/dist/core/reranking-config.d.ts +42 -0
- package/dist/core/reranking-config.js +156 -0
- package/dist/core/reranking-factory.d.ts +92 -0
- package/dist/core/reranking-factory.js +591 -0
- package/dist/core/reranking-strategies.d.ts +325 -0
- package/dist/core/reranking-strategies.js +720 -0
- package/dist/core/resource-cleanup.d.ts +163 -0
- package/dist/core/resource-cleanup.js +371 -0
- package/dist/core/resource-manager.d.ts +212 -0
- package/dist/core/resource-manager.js +564 -0
- package/dist/core/search.d.ts +28 -1
- package/dist/core/search.js +83 -5
- package/dist/core/streaming-operations.d.ts +145 -0
- package/dist/core/streaming-operations.js +409 -0
- package/dist/core/types.d.ts +3 -0
- package/dist/core/universal-embedder.d.ts +177 -0
- package/dist/core/universal-embedder.js +139 -0
- package/dist/core/validation-messages.d.ts +99 -0
- package/dist/core/validation-messages.js +334 -0
- package/dist/core/vector-index.js +7 -8
- package/dist/factories/index.d.ts +1 -1
- package/dist/factories/text-factory.d.ts +128 -34
- package/dist/factories/text-factory.js +346 -97
- package/dist/file-processor.d.ts +88 -2
- package/dist/file-processor.js +720 -17
- package/dist/index.d.ts +9 -0
- package/dist/index.js +11 -0
- package/dist/ingestion.d.ts +16 -0
- package/dist/ingestion.js +21 -0
- package/dist/mcp-server.d.ts +35 -3
- package/dist/mcp-server.js +1107 -31
- package/dist/multimodal/clip-embedder.d.ts +314 -0
- package/dist/multimodal/clip-embedder.js +945 -0
- package/dist/multimodal/index.d.ts +6 -0
- package/dist/multimodal/index.js +6 -0
- package/dist/run-error-recovery-tests.d.ts +7 -0
- package/dist/run-error-recovery-tests.js +101 -0
- package/dist/search.d.ts +26 -0
- package/dist/search.js +54 -1
- package/dist/test-utils.d.ts +8 -26
- package/dist/text/chunker.d.ts +1 -0
- package/dist/text/embedder.js +15 -8
- package/dist/text/index.d.ts +1 -0
- package/dist/text/index.js +1 -0
- package/dist/text/reranker.d.ts +1 -2
- package/dist/text/reranker.js +17 -47
- package/dist/text/sentence-transformer-embedder.d.ts +96 -0
- package/dist/text/sentence-transformer-embedder.js +340 -0
- package/dist/types.d.ts +39 -0
- package/dist/utils/vector-math.d.ts +31 -0
- package/dist/utils/vector-math.js +70 -0
- package/package.json +15 -3
- package/dist/api-errors.d.ts.map +0 -1
- package/dist/api-errors.js.map +0 -1
- package/dist/cli/indexer.d.ts.map +0 -1
- package/dist/cli/indexer.js.map +0 -1
- package/dist/cli/search.d.ts.map +0 -1
- package/dist/cli/search.js.map +0 -1
- package/dist/cli.d.ts.map +0 -1
- package/dist/cli.js.map +0 -1
- package/dist/config.d.ts.map +0 -1
- package/dist/config.js.map +0 -1
- package/dist/core/adapters.d.ts.map +0 -1
- package/dist/core/adapters.js.map +0 -1
- package/dist/core/chunker.d.ts.map +0 -1
- package/dist/core/chunker.js.map +0 -1
- package/dist/core/config.d.ts.map +0 -1
- package/dist/core/config.js.map +0 -1
- package/dist/core/db.d.ts.map +0 -1
- package/dist/core/db.js.map +0 -1
- package/dist/core/error-handler.d.ts.map +0 -1
- package/dist/core/error-handler.js.map +0 -1
- package/dist/core/index.d.ts.map +0 -1
- package/dist/core/index.js.map +0 -1
- package/dist/core/ingestion.d.ts.map +0 -1
- package/dist/core/ingestion.js.map +0 -1
- package/dist/core/interfaces.d.ts.map +0 -1
- package/dist/core/interfaces.js.map +0 -1
- package/dist/core/path-manager.d.ts.map +0 -1
- package/dist/core/path-manager.js.map +0 -1
- package/dist/core/search-example.d.ts +0 -25
- package/dist/core/search-example.d.ts.map +0 -1
- package/dist/core/search-example.js +0 -138
- package/dist/core/search-example.js.map +0 -1
- package/dist/core/search-pipeline-example.d.ts +0 -21
- package/dist/core/search-pipeline-example.d.ts.map +0 -1
- package/dist/core/search-pipeline-example.js +0 -188
- package/dist/core/search-pipeline-example.js.map +0 -1
- package/dist/core/search-pipeline.d.ts.map +0 -1
- package/dist/core/search-pipeline.js.map +0 -1
- package/dist/core/search.d.ts.map +0 -1
- package/dist/core/search.js.map +0 -1
- package/dist/core/types.d.ts.map +0 -1
- package/dist/core/types.js.map +0 -1
- package/dist/core/vector-index.d.ts.map +0 -1
- package/dist/core/vector-index.js.map +0 -1
- package/dist/dom-polyfills.d.ts.map +0 -1
- package/dist/dom-polyfills.js.map +0 -1
- package/dist/examples/clean-api-examples.d.ts +0 -44
- package/dist/examples/clean-api-examples.d.ts.map +0 -1
- package/dist/examples/clean-api-examples.js +0 -206
- package/dist/examples/clean-api-examples.js.map +0 -1
- package/dist/factories/index.d.ts.map +0 -1
- package/dist/factories/index.js.map +0 -1
- package/dist/factories/text-factory.d.ts.map +0 -1
- package/dist/factories/text-factory.js.map +0 -1
- package/dist/file-processor.d.ts.map +0 -1
- package/dist/file-processor.js.map +0 -1
- package/dist/index-manager.d.ts.map +0 -1
- package/dist/index-manager.js.map +0 -1
- package/dist/index.d.ts.map +0 -1
- package/dist/index.js.map +0 -1
- package/dist/indexer.d.ts.map +0 -1
- package/dist/indexer.js.map +0 -1
- package/dist/ingestion.d.ts.map +0 -1
- package/dist/ingestion.js.map +0 -1
- package/dist/mcp-server.d.ts.map +0 -1
- package/dist/mcp-server.js.map +0 -1
- package/dist/preprocess.d.ts.map +0 -1
- package/dist/preprocess.js.map +0 -1
- package/dist/preprocessors/index.d.ts.map +0 -1
- package/dist/preprocessors/index.js.map +0 -1
- package/dist/preprocessors/mdx.d.ts.map +0 -1
- package/dist/preprocessors/mdx.js.map +0 -1
- package/dist/preprocessors/mermaid.d.ts.map +0 -1
- package/dist/preprocessors/mermaid.js.map +0 -1
- package/dist/preprocessors/registry.d.ts.map +0 -1
- package/dist/preprocessors/registry.js.map +0 -1
- package/dist/search-standalone.d.ts.map +0 -1
- package/dist/search-standalone.js.map +0 -1
- package/dist/search.d.ts.map +0 -1
- package/dist/search.js.map +0 -1
- package/dist/test-utils.d.ts.map +0 -1
- package/dist/test-utils.js.map +0 -1
- package/dist/text/chunker.d.ts.map +0 -1
- package/dist/text/chunker.js.map +0 -1
- package/dist/text/embedder.d.ts.map +0 -1
- package/dist/text/embedder.js.map +0 -1
- package/dist/text/index.d.ts.map +0 -1
- package/dist/text/index.js.map +0 -1
- package/dist/text/preprocessors/index.d.ts.map +0 -1
- package/dist/text/preprocessors/index.js.map +0 -1
- package/dist/text/preprocessors/mdx.d.ts.map +0 -1
- package/dist/text/preprocessors/mdx.js.map +0 -1
- package/dist/text/preprocessors/mermaid.d.ts.map +0 -1
- package/dist/text/preprocessors/mermaid.js.map +0 -1
- package/dist/text/preprocessors/registry.d.ts.map +0 -1
- package/dist/text/preprocessors/registry.js.map +0 -1
- package/dist/text/reranker.d.ts.map +0 -1
- package/dist/text/reranker.js.map +0 -1
- package/dist/text/tokenizer.d.ts.map +0 -1
- package/dist/text/tokenizer.js.map +0 -1
- package/dist/types.d.ts.map +0 -1
- package/dist/types.js.map +0 -1
package/dist/core/ingestion.js
CHANGED
@@ -8,6 +8,7 @@ import { insertChunk, upsertDocument } from './db.js';
 import { config } from './config.js';
 import { DocumentPathManager } from './path-manager.js';
 import { existsSync } from 'fs';
+import { ContentManager } from './content-manager.js';
 /**
  * Main ingestion pipeline class
  * Coordinates the entire process from file discovery to vector storage
@@ -19,8 +20,10 @@ export class IngestionPipeline {
     db;
     defaultChunkConfig;
     pathManager;
+    contentManager;
     /**
      * Creates a new IngestionPipeline with explicit dependency injection
+     * Enhanced with ContentManager integration for unified content system
      *
      * DEPENDENCY INJECTION PATTERN:
      * This constructor requires all dependencies to be explicitly provided, enabling:
@@ -28,6 +31,7 @@ export class IngestionPipeline {
      * - Support for different embedding models and content types
      * - Testability through mock injection
      * - Future extensibility for multimodal content processing
+     * - Unified content management for both filesystem and memory-based ingestion
      *
      * @param embedFn - Function to embed document chunks into vectors
      * - Signature: (query: string, contentType?: string) => Promise<EmbeddingResult>
@@ -47,32 +51,33 @@ export class IngestionPipeline {
      * - Supports different content types through metadata fields
      * - Example: await openDatabase('./db.sqlite')
      *
+     * @param contentManager - Optional ContentManager for unified content system
+     * - Handles content storage routing and deduplication
+     * - If not provided, creates default instance with standard configuration
+     * - Example: new ContentManager(db, { contentDir: '.raglite/content' })
+     *
      * USAGE EXAMPLES:
      * ```typescript
-     * // Text-only ingestion pipeline
+     * // Text-only ingestion pipeline with unified content system
      * const textEmbedFn = await createTextEmbedder();
      * const indexManager = new IndexManager('./index.bin');
      * const db = await openDatabase('./db.sqlite');
+     * const contentManager = new ContentManager(db);
+     * const ingestion = new IngestionPipeline(textEmbedFn, indexManager, db, undefined, contentManager);
+     *
+     * // Simple usage (ContentManager created automatically)
      * const ingestion = new IngestionPipeline(textEmbedFn, indexManager, db);
      *
-     * // Custom embedding implementation
+     * // Custom embedding implementation with memory ingestion
      * const customEmbedFn = async (text) => ({
      *   embedding_id: generateId(),
      *   vector: await myCustomModel.embed(text)
      * });
      * const ingestion = new IngestionPipeline(customEmbedFn, indexManager, db);
-     *
-     * // Multimodal ingestion (future)
-     * const multimodalEmbedFn = async (content, contentType) => {
-     *   if (contentType === 'image') {
-     *     return { embedding_id: generateId(), vector: await clipModel.embedImage(content) };
-     *   }
-     *   return { embedding_id: generateId(), vector: await clipModel.embedText(content) };
-     * };
-     * const ingestion = new IngestionPipeline(multimodalEmbedFn, indexManager, db);
+     * await ingestion.ingestFromMemory(buffer, { displayName: 'file.txt' });
      * ```
      */
-    constructor(embedFn, indexManager, db, defaultChunkConfig) {
+    constructor(embedFn, indexManager, db, defaultChunkConfig, contentManager) {
         this.embedFn = embedFn;
         this.indexManager = indexManager;
         this.db = db;
@@ -89,6 +94,8 @@ export class IngestionPipeline {
         }
         // Initialize path manager with default configuration
         this.pathManager = new DocumentPathManager(config.path_storage_strategy, process.cwd());
+        // Initialize ContentManager (create default if not provided)
+        this.contentManager = contentManager || new ContentManager(this.db);
     }
     /**
      * Ingest documents from a directory
@@ -114,15 +121,152 @@ export class IngestionPipeline {
         }
         return this.ingestPath(filePath, options);
     }
+    /**
+     * Ingest content from memory buffer
+     * Enables MCP integration and real-time content processing
+     * @param content - Buffer containing the content to ingest
+     * @param metadata - Memory content metadata including display name and content type
+     * @param options - Optional ingestion configuration
+     * @returns Promise resolving to content ID for the ingested content
+     */
+    async ingestFromMemory(content, metadata, options = {}) {
+        const startTime = Date.now();
+        console.log(`\n=== Starting memory ingestion: ${metadata.displayName} ===`);
+        try {
+            // Phase 1: Content Storage via ContentManager
+            console.log('\n--- Phase 1: Content Storage ---');
+            const contentResult = await this.contentManager.ingestFromMemory(content, metadata);
+            if (contentResult.wasDeduped) {
+                console.log(`✓ Content deduplicated: ${metadata.displayName} (ID: ${contentResult.contentId})`);
+                return contentResult.contentId;
+            }
+            console.log(`✓ Content stored: ${metadata.displayName} (ID: ${contentResult.contentId})`);
+            // Phase 2: Document Processing
+            console.log('\n--- Phase 2: Document Processing ---');
+            // Determine content type for processing
+            const detectedContentType = metadata.contentType || 'text/plain';
+            const isImageContent = detectedContentType.startsWith('image/');
+            let document;
+            if (isImageContent) {
+                // Process image content using the existing image processing pipeline
+                console.log(`Processing image content: ${metadata.displayName} (${detectedContentType})`);
+                document = await this.processImageFromMemory(content, contentResult, metadata, options);
+            }
+            else if (detectedContentType === 'application/pdf') {
+                // Process PDF content
+                console.log(`Processing PDF content: ${metadata.displayName}`);
+                document = await this.processPDFFromMemory(content, contentResult, metadata, options);
+            }
+            else if (detectedContentType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') {
+                // Process DOCX content
+                console.log(`Processing DOCX content: ${metadata.displayName}`);
+                document = await this.processDOCXFromMemory(content, contentResult, metadata, options);
+            }
+            else {
+                // Process as text content
+                console.log(`Processing text content: ${metadata.displayName} (${detectedContentType})`);
+                document = {
+                    source: metadata.displayName,
+                    title: metadata.displayName,
+                    content: content.toString('utf8'), // Convert buffer to string for processing
+                    metadata: {
+                        contentType: detectedContentType,
+                        contentId: contentResult.contentId,
+                        storageType: contentResult.storageType,
+                        originalPath: metadata.originalPath
+                    }
+                };
+            }
+            // Phase 3: Document Chunking
+            console.log('\n--- Phase 3: Document Chunking ---');
+            const effectiveChunkConfig = options.chunkConfig || this.defaultChunkConfig || {
+                chunkSize: config.chunk_size,
+                chunkOverlap: config.chunk_overlap
+            };
+            const chunks = await chunkDocument(document, effectiveChunkConfig);
+            console.log(`✓ Created ${chunks.length} chunks from memory content`);
+            if (chunks.length === 0) {
+                console.log('No chunks created from memory content');
+                return contentResult.contentId;
+            }
+            // Phase 4: Embedding Generation
+            console.log('\n--- Phase 4: Embedding Generation ---');
+            const embeddings = [];
+            let embeddingErrors = 0;
+            for (let i = 0; i < chunks.length; i++) {
+                const chunk = chunks[i];
+                try {
+                    // Convert MIME type to simple content type for embedding function
+                    const contentTypeForEmbedding = this.getContentTypeForEmbedding(document.metadata?.contentType);
+                    const embedding = await this.embedFn(chunk.text, contentTypeForEmbedding);
+                    // Enhance embedding result with content type metadata
+                    if (!embedding.contentType) {
+                        embedding.contentType = contentTypeForEmbedding;
+                    }
+                    if (!embedding.metadata) {
+                        embedding.metadata = document.metadata;
+                    }
+                    embeddings.push(embedding);
+                }
+                catch (error) {
+                    console.warn(`Failed to embed chunk ${i + 1}:`, error instanceof Error ? error.message : String(error));
+                    embeddingErrors++;
+                }
+            }
+            console.log(`✓ Generated ${embeddings.length} embeddings for memory content`);
+            if (embeddings.length === 0) {
+                console.log('No embeddings generated from memory content');
+                return contentResult.contentId;
+            }
+            // Phase 5: Database Storage
+            console.log('\n--- Phase 5: Database Storage ---');
+            // Insert document with content_id reference
+            const documentContentType = this.getContentTypeForEmbedding(document.metadata?.contentType);
+            const documentId = await upsertDocument(this.db, document.source, document.title, documentContentType, document.metadata, contentResult.contentId);
+            // Insert chunks with embeddings
+            let chunksStored = 0;
+            for (let i = 0; i < chunks.length && i < embeddings.length; i++) {
+                const chunk = chunks[i];
+                const embedding = embeddings[i];
+                try {
+                    await insertChunk(this.db, embedding.embedding_id, documentId, chunk.text, chunk.chunkIndex, documentContentType, document.metadata);
+                    chunksStored++;
+                }
+                catch (error) {
+                    console.error(`Failed to store chunk ${i + 1}:`, error instanceof Error ? error.message : String(error));
+                }
+            }
+            console.log(`✓ Stored document and ${chunksStored} chunks in database`);
+            // Phase 6: Vector Index Updates
+            console.log('\n--- Phase 6: Vector Index Updates ---');
+            await this.updateVectorIndex(embeddings);
+            const endTime = Date.now();
+            const processingTimeMs = endTime - startTime;
+            console.log('\n=== Memory Ingestion Complete ===');
+            console.log(`Content ID: ${contentResult.contentId}`);
+            console.log(`Chunks created: ${chunks.length}`);
+            console.log(`Embeddings generated: ${embeddings.length}`);
+            console.log(`Chunks stored: ${chunksStored}`);
+            console.log(`Embedding errors: ${embeddingErrors}`);
+            console.log(`Total time: ${(processingTimeMs / 1000).toFixed(2)}s`);
+            return contentResult.contentId;
+        }
+        catch (error) {
+            console.error('\n=== Memory Ingestion Failed ===');
+            console.error(`Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
+            throw new Error(`Memory ingestion failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
+        }
+    }
     /**
      * Ingest documents from a path (file or directory)
      * Implements the complete pipeline: file processing → chunking → embedding → storage
+     * Enhanced to handle mixed content types (text and images) in multimodal mode
      */
     async ingestPath(path, options = {}) {
         const startTime = Date.now();
         console.log(`\n=== Starting ingestion from: ${path} ===`);
         try {
-            // Phase 1: File Discovery and Processing
+            // Phase 1: File Discovery and Processing with Content-Type Detection
             console.log('\n--- Phase 1: File Discovery and Processing ---');
             const fileResult = await discoverAndProcessFiles(path, options.fileOptions, this.pathManager);
             if (fileResult.documents.length === 0) {
@@ -133,16 +277,20 @@ export class IngestionPipeline {
                     embeddingsGenerated: 0,
                     documentErrors: fileResult.processingResult.errors.length,
                     embeddingErrors: 0,
-                    processingTimeMs: Date.now() - startTime
+                    processingTimeMs: Date.now() - startTime,
+                    contentIds: []
                 };
             }
-            //
+            // Content-type detection and routing
+            const contentTypeStats = this.analyzeContentTypes(fileResult.documents);
+            console.log(`📊 Content analysis: ${contentTypeStats.text} text, ${contentTypeStats.image} image, ${contentTypeStats.other} other files`);
+            // Phase 2: Document Chunking with Content-Type Awareness
             console.log('\n--- Phase 2: Document Chunking ---');
             const effectiveChunkConfig = options.chunkConfig || this.defaultChunkConfig || {
                 chunkSize: config.chunk_size,
                 chunkOverlap: config.chunk_overlap
             };
-            const chunkingResult = await this.
+            const chunkingResult = await this.chunkDocumentsWithContentTypes(fileResult.documents, effectiveChunkConfig);
             if (chunkingResult.totalChunks === 0) {
                 console.log('No chunks created from documents');
                 return {
@@ -151,15 +299,16 @@ export class IngestionPipeline {
                     embeddingsGenerated: 0,
                     documentErrors: fileResult.processingResult.errors.length,
                     embeddingErrors: 0,
-                    processingTimeMs: Date.now() - startTime
+                    processingTimeMs: Date.now() - startTime,
+                    contentIds: []
                 };
             }
-            // Phase 3: Embedding Generation
+            // Phase 3: Embedding Generation with Content-Type Support
            console.log('\n--- Phase 3: Embedding Generation ---');
-            const embeddingResult = await this.
-            // Phase 4: Database and Index Storage
+            const embeddingResult = await this.generateEmbeddingsWithContentTypes(chunkingResult.allChunks);
+            // Phase 4: Database and Index Storage with Content-Type Metadata
             console.log('\n--- Phase 4: Storage Operations ---');
-            await this.
+            const contentIds = await this.storeDocumentsAndChunksWithContentTypes(chunkingResult.documentChunks, embeddingResult.embeddings);
             // Phase 5: Vector Index Updates
             console.log('\n--- Phase 5: Vector Index Updates ---');
             await this.updateVectorIndex(embeddingResult.embeddings);
@@ -171,7 +320,8 @@ export class IngestionPipeline {
                 embeddingsGenerated: embeddingResult.embeddings.length,
                 documentErrors: fileResult.processingResult.errors.length,
                 embeddingErrors: embeddingResult.errors,
-                processingTimeMs
+                processingTimeMs,
+                contentIds
             };
             console.log('\n=== Ingestion Complete ===');
             console.log(`Documents processed: ${result.documentsProcessed}`);
@@ -189,9 +339,32 @@ export class IngestionPipeline {
         }
     }
     /**
-     *
+     * Analyze content types in the document collection
+     * @private
      */
-
+    analyzeContentTypes(documents) {
+        const stats = { text: 0, image: 0, other: 0 };
+        for (const document of documents) {
+            const contentType = document.metadata?.contentType || 'text';
+            switch (contentType) {
+                case 'text':
+                    stats.text++;
+                    break;
+                case 'image':
+                    stats.image++;
+                    break;
+                default:
+                    stats.other++;
+                    break;
+            }
+        }
+        return stats;
+    }
+    /**
+     * Chunk all documents and organize results with content-type awareness
+     * Enhanced to handle different content types appropriately
+     */
+    async chunkDocumentsWithContentTypes(documents, chunkConfig) {
         const documentChunks = [];
         const allChunks = [];
         let totalChunks = 0;
@@ -199,11 +372,36 @@ export class IngestionPipeline {
         for (let i = 0; i < documents.length; i++) {
             const document = documents[i];
             try {
-                const
+                const contentType = document.metadata?.contentType || 'text';
+                // Handle different content types appropriately
+                let chunks;
+                if (contentType === 'image') {
+                    // For images, create a single chunk with the full content (description + metadata)
+                    chunks = [{
+                            text: document.content,
+                            chunkIndex: 0,
+                            contentType: 'image',
+                            metadata: document.metadata
+                        }];
+                }
+                else {
+                    // For text documents, use normal chunking
+                    const textChunks = await chunkDocument(document, chunkConfig);
+                    chunks = textChunks.map(chunk => ({
+                        ...chunk,
+                        contentType: 'text',
+                        metadata: document.metadata
+                    }));
+                }
                 documentChunks.push({ document, chunks });
-                // Collect all
-                const
-
+                // Collect all chunks with their content type information
+                for (const chunk of chunks) {
+                    allChunks.push({
+                        text: chunk.text,
+                        contentType: chunk.contentType,
+                        metadata: chunk.metadata
+                    });
+                }
                 totalChunks += chunks.length;
                 // Progress logging - more frequent for better user experience
                 if (documents.length <= 10 || (i + 1) % Math.max(1, Math.floor(documents.length / 10)) === 0 || i === documents.length - 1) {
@@ -221,28 +419,52 @@ export class IngestionPipeline {
         return { documentChunks, allChunks, totalChunks };
     }
     /**
-     *
+     * Chunk all documents and organize results (legacy method for backward compatibility)
+     * @deprecated Use chunkDocumentsWithContentTypes for multimodal support
      */
-    async
-
+    async chunkDocuments(documents, chunkConfig) {
+        const result = await this.chunkDocumentsWithContentTypes(documents, chunkConfig);
+        // Convert to legacy format for backward compatibility
+        return {
+            documentChunks: result.documentChunks,
+            allChunks: result.allChunks.map(chunk => chunk.text),
+            totalChunks: result.totalChunks
+        };
+    }
+    /**
+     * Generate embeddings for all chunks with content-type support
+     * Enhanced to handle different content types and pass metadata to embedding function
+     */
+    async generateEmbeddingsWithContentTypes(chunks) {
+        console.log(`Generating embeddings for ${chunks.length} chunk${chunks.length === 1 ? '' : 's'}...`);
         console.log('This may take a few minutes depending on the number of chunks...');
         try {
-            // Generate embeddings using injected embed function
+            // Generate embeddings using injected embed function with content type support
             const embeddings = [];
             let errors = 0;
-            for (let i = 0; i <
+            for (let i = 0; i < chunks.length; i++) {
+                const chunk = chunks[i];
                 try {
-
+                    // Convert MIME type to simple content type for embedding function
+                    const contentTypeForEmbedding = this.getContentTypeForEmbedding(chunk.contentType);
+                    const embedding = await this.embedFn(chunk.text, contentTypeForEmbedding);
+                    // Enhance embedding result with content type metadata if not already present
+                    if (!embedding.contentType) {
+                        embedding.contentType = contentTypeForEmbedding;
+                    }
+                    if (!embedding.metadata && chunk.metadata) {
+                        embedding.metadata = chunk.metadata;
+                    }
                     embeddings.push(embedding);
                 }
                 catch (error) {
-                    console.warn(`Failed to embed chunk ${i + 1}:`, error instanceof Error ? error.message : String(error));
+                    console.warn(`Failed to embed ${chunk.contentType} chunk ${i + 1}:`, error instanceof Error ? error.message : String(error));
                     errors++;
                 }
                 // Progress logging
-                if (
-                    const percentage = Math.round(((i + 1) /
-                    console.log(`Generated ${i + 1} of ${
+                if (chunks.length > 10 && (i + 1) % Math.max(1, Math.floor(chunks.length / 10)) === 0) {
+                    const percentage = Math.round(((i + 1) / chunks.length) * 100);
+                    console.log(`Generated ${i + 1} of ${chunks.length} embeddings (${percentage}%)`);
                 }
             }
             if (errors > 0) {
@@ -257,9 +479,20 @@ export class IngestionPipeline {
             }
         }
         /**
-     *
+     * Generate embeddings for all chunks with error handling (legacy method for backward compatibility)
+     * @deprecated Use generateEmbeddingsWithContentTypes for multimodal support
      */
-    async
+    async generateEmbeddings(chunkTexts) {
+        // Convert to new format for backward compatibility
+        const chunks = chunkTexts.map(text => ({ text, contentType: 'text' }));
+        return this.generateEmbeddingsWithContentTypes(chunks);
+    }
+    /**
+     * Store documents and chunks in database with content-type support
+     * Enhanced to handle content type metadata and multimodal content
+     * @returns Array of content IDs for successfully stored documents
+     */
+    async storeDocumentsAndChunksWithContentTypes(documentChunks, embeddings) {
         console.log(`Storing ${documentChunks.length} document${documentChunks.length === 1 ? '' : 's'} and chunks in database...`);
         // Create a mapping of chunk text to embedding for efficient lookup
         const embeddingMap = new Map();
@@ -275,24 +508,51 @@ export class IngestionPipeline {
         }
         let totalChunksStored = 0;
         let documentsStored = 0;
+        const contentIds = [];
         // Process each document sequentially
         for (const { document, chunks } of documentChunks) {
             try {
-                //
-
+                // Generate content ID for filesystem content using ContentManager
+                let contentId = document.metadata?.contentId;
+                if (!contentId) {
+                    try {
+                        // Use ContentManager to create filesystem reference and get content ID
+                        const contentResult = await this.contentManager.ingestFromFilesystem(document.source);
+                        contentId = contentResult.contentId;
+                        // Update document metadata with content ID
+                        if (!document.metadata) {
+                            document.metadata = {};
+                        }
+                        document.metadata.contentId = contentId;
+                        document.metadata.storageType = contentResult.storageType;
+                    }
+                    catch (contentError) {
+                        console.warn(`Failed to create content reference for ${document.source}:`, contentError instanceof Error ? contentError.message : String(contentError));
+                        // Continue without content ID - fallback to legacy behavior
+                    }
+                }
+                // Insert or get existing document with content type support and content_id reference
+                const documentContentType = document.metadata?.contentType || 'text';
+                const documentId = await upsertDocument(this.db, document.source, document.title, documentContentType, document.metadata, contentId);
                 documentsStored++;
-                //
+                // Add content ID to results if available
+                if (contentId) {
+                    contentIds.push(contentId);
+                }
+                // Insert all chunks for this document with content type support
                 let chunksStoredForDoc = 0;
                 for (const chunk of chunks) {
                     const embedding = embeddingMap.get(chunk.text);
                     if (embedding) {
                         try {
-
+                            const chunkContentType = chunk.contentType || documentContentType;
+                            const chunkMetadata = chunk.metadata || document.metadata;
+                            await insertChunk(this.db, embedding.embedding_id, documentId, chunk.text, chunk.chunkIndex, chunkContentType, chunkMetadata);
                             chunksStoredForDoc++;
                             totalChunksStored++;
                         }
                         catch (chunkError) {
-                            console.error(`Failed to store chunk ${chunk.chunkIndex} for document ${document.source}:`, chunkError instanceof Error ? chunkError.message : String(chunkError));
+                            console.error(`Failed to store ${chunk.contentType || 'text'} chunk ${chunk.chunkIndex} for document ${document.source}:`, chunkError instanceof Error ? chunkError.message : String(chunkError));
                             // Continue with other chunks
                         }
                     }
@@ -312,6 +572,14 @@ export class IngestionPipeline {
             }
         }
         console.log(`✓ Storage complete: ${documentsStored} documents, ${totalChunksStored} chunks saved to database`);
+        return contentIds;
+    }
+    /**
+     * Store documents and chunks in database (legacy method for backward compatibility)
+     * @deprecated Use storeDocumentsAndChunksWithContentTypes for multimodal support
+     */
+    async storeDocumentsAndChunks(documentChunks, embeddings) {
+        await this.storeDocumentsAndChunksWithContentTypes(documentChunks, embeddings);
     }
     /**
      * Update vector index with new embeddings
@@ -331,11 +599,208 @@ export class IngestionPipeline {
             throw error;
         }
     }
+    /**
+     * Converts MIME type to simple content type for embedding function
+     * @param mimeType - MIME type string (e.g., 'text/plain', 'image/jpeg')
+     * @returns Simple content type ('text', 'image', etc.)
+     */
+    getContentTypeForEmbedding(mimeType) {
+        if (!mimeType) {
+            return 'text';
+        }
+        // Convert MIME types to simple content types
+        if (mimeType.startsWith('text/')) {
+            return 'text';
+        }
+        else if (mimeType.startsWith('image/')) {
+            return 'image';
+        }
+        else if (mimeType === 'application/pdf') {
+            return 'text'; // PDFs are processed as text
+        }
+        else if (mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') {
+            return 'text'; // DOCX files are processed as text
+        }
+        else {
+            return 'text'; // Default to text for unknown types
+        }
+    }
+    /**
+     * Save the vector index to disk
+     */
+    async saveIndex() {
+        await this.indexManager.saveIndex();
+    }
+    /**
+     * Process image content from memory using the existing image processing pipeline
+     * @private
+     */
+    async processImageFromMemory(content, contentResult, metadata, options) {
+        try {
+            // Import image processing functions
+            const { generateImageDescriptionForFile, extractImageMetadataForFile } = await import('../file-processor.js');
+            // Use the content path from the content manager (where the image is stored)
+            const imagePath = contentResult.contentPath;
+            // Extract image metadata
+            let imageMetadata = {};
+            try {
+                imageMetadata = await extractImageMetadataForFile(imagePath);
+            }
+            catch (error) {
+                console.warn(`Failed to extract image metadata for ${metadata.displayName}:`, error instanceof Error ? error.message : String(error));
+                // Continue with empty metadata
+            }
+            // Generate text description for the image
+            let descriptionResult = { description: 'Image content', model: 'none', confidence: 0 };
+            try {
+                const imageToTextOptions = {}; // Use default options for now
+                descriptionResult = await generateImageDescriptionForFile(imagePath, imageToTextOptions);
+                console.log(`✓ Generated image description: "${descriptionResult.description}"`);
+            }
+            catch (error) {
+                console.warn(`Failed to generate image description for ${metadata.displayName}:`, error instanceof Error ? error.message : String(error));
+                // Continue with fallback description
+            }
+            // Update metadata with description information
+            imageMetadata.description = descriptionResult.description;
+            imageMetadata.descriptionModel = descriptionResult.model;
+            imageMetadata.descriptionConfidence = descriptionResult.confidence;
+            // Create document with image description as content
+            const title = metadata.displayName;
+            // Create content that includes description and key metadata
+            const contentParts = [
+                `Image: ${title}`,
+                `Description: ${descriptionResult.description}`
+            ];
+            if (imageMetadata.dimensions) {
+                contentParts.push(`Dimensions: ${imageMetadata.dimensions.width}x${imageMetadata.dimensions.height}`);
+            }
+            if (imageMetadata.format) {
+                contentParts.push(`Format: ${imageMetadata.format}`);
+            }
+            const documentContent = contentParts.join('\n');
+            return {
+                source: metadata.displayName,
+                title,
+                content: documentContent.trim(),
+                metadata: {
+                    contentType: 'image',
+                    contentId: contentResult.contentId,
+                    storageType: contentResult.storageType,
+                    originalPath: metadata.originalPath,
+                    ...imageMetadata // Spread all image metadata fields
+                }
+            };
+        }
+        catch (error) {
+            console.warn(`Failed to process image from memory, falling back to basic processing:`, error instanceof Error ? error.message : String(error));
+            // Fallback to basic document creation
+            return {
+                source: metadata.displayName,
+                title: metadata.displayName,
+                content: `Image: ${metadata.displayName}\nPath: ${contentResult.contentPath}`,
+                metadata: {
+                    contentType: 'image',
+                    contentId: contentResult.contentId,
+                    storageType: contentResult.storageType,
+                    originalPath: metadata.originalPath,
+                    processingError: error instanceof Error ? error.message : String(error)
+                }
+            };
+        }
+    }
+    /**
+     * Process PDF content from memory using the existing PDF processing pipeline
+     * @private
+     */
+    async processPDFFromMemory(content, contentResult, metadata, options) {
+        try {
+            // Import PDF processing
+            const pdfParse = require('pdf-parse');
+            // Parse PDF content directly from buffer
+            const pdfData = await pdfParse(content);
+            console.log(`✓ Extracted ${pdfData.text.length} characters from PDF`);
+            return {
+                source: metadata.displayName,
+                title: metadata.displayName,
+                content: pdfData.text.trim(),
+                metadata: {
+                    contentType: 'application/pdf',
+                    contentId: contentResult.contentId,
+                    storageType: contentResult.storageType,
+                    originalPath: metadata.originalPath,
+                    pages: pdfData.numpages,
+                    pdfInfo: pdfData.info
+                }
+            };
+        }
+        catch (error) {
+            console.warn(`Failed to process PDF from memory, falling back to basic processing:`, error instanceof Error ? error.message : String(error));
+            // Fallback to basic document creation
+            return {
+                source: metadata.displayName,
+                title: metadata.displayName,
+                content: `PDF Document: ${metadata.displayName}\nPath: ${contentResult.contentPath}`,
+                metadata: {
+                    contentType: 'application/pdf',
+                    contentId: contentResult.contentId,
+                    storageType: contentResult.storageType,
+                    originalPath: metadata.originalPath,
+                    processingError: error instanceof Error ? error.message : String(error)
+                }
+            };
+        }
+    }
+    /**
+     * Process DOCX content from memory using the existing DOCX processing pipeline
+     * @private
+     */
+    async processDOCXFromMemory(content, contentResult, metadata, options) {
+        try {
+            // Import DOCX processing
+            const mammoth = await import('mammoth');
+            // Parse DOCX content directly from buffer
+            const docxResult = await mammoth.extractRawText({ buffer: content });
+            console.log(`✓ Extracted ${docxResult.value.length} characters from DOCX`);
+            return {
+                source: metadata.displayName,
+                title: metadata.displayName,
+                content: docxResult.value.trim(),
+                metadata: {
+                    contentType: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+                    contentId: contentResult.contentId,
+                    storageType: contentResult.storageType,
+                    originalPath: metadata.originalPath,
+                    messages: docxResult.messages
+                }
+            };
+        }
+        catch (error) {
+            console.warn(`Failed to process DOCX from memory, falling back to basic processing:`, error instanceof Error ? error.message : String(error));
+            // Fallback to basic document creation
+            return {
+                source: metadata.displayName,
+                title: metadata.displayName,
+                content: `DOCX Document: ${metadata.displayName}\nPath: ${contentResult.contentPath}`,
+                metadata: {
+                    contentType: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+                    contentId: contentResult.contentId,
+                    storageType: contentResult.storageType,
+                    originalPath: metadata.originalPath,
+                    processingError: error instanceof Error ? error.message : String(error)
+                }
+            };
+        }
+    }
     /**
      * Clean up resources - explicit cleanup method
      */
     async cleanup() {
         try {
+            // Clean up ContentManager to prevent resource leaks
+            if (this.contentManager && typeof this.contentManager.cleanup === 'function') {
+                this.contentManager.cleanup();
+            }
             await this.db.close();
             await this.indexManager.close();
         }