rag-lite-ts 1.0.2 → 2.0.1

This diff compares the published contents of the two package versions as they appear in their public registry, and is provided for informational purposes only.
Files changed (208)
  1. package/README.md +605 -93
  2. package/dist/cli/indexer.js +192 -4
  3. package/dist/cli/search.js +50 -11
  4. package/dist/cli.js +183 -26
  5. package/dist/core/abstract-embedder.d.ts +125 -0
  6. package/dist/core/abstract-embedder.js +264 -0
  7. package/dist/core/actionable-error-messages.d.ts +60 -0
  8. package/dist/core/actionable-error-messages.js +397 -0
  9. package/dist/core/batch-processing-optimizer.d.ts +155 -0
  10. package/dist/core/batch-processing-optimizer.js +541 -0
  11. package/dist/core/binary-index-format.d.ts +52 -0
  12. package/dist/core/binary-index-format.js +122 -0
  13. package/dist/core/chunker.d.ts +2 -0
  14. package/dist/core/cli-database-utils.d.ts +53 -0
  15. package/dist/core/cli-database-utils.js +239 -0
  16. package/dist/core/config.js +10 -3
  17. package/dist/core/content-errors.d.ts +111 -0
  18. package/dist/core/content-errors.js +362 -0
  19. package/dist/core/content-manager.d.ts +343 -0
  20. package/dist/core/content-manager.js +1504 -0
  21. package/dist/core/content-performance-optimizer.d.ts +150 -0
  22. package/dist/core/content-performance-optimizer.js +516 -0
  23. package/dist/core/content-resolver.d.ts +104 -0
  24. package/dist/core/content-resolver.js +285 -0
  25. package/dist/core/cross-modal-search.d.ts +164 -0
  26. package/dist/core/cross-modal-search.js +342 -0
  27. package/dist/core/database-connection-manager.d.ts +109 -0
  28. package/dist/core/database-connection-manager.js +304 -0
  29. package/dist/core/db.d.ts +141 -2
  30. package/dist/core/db.js +631 -89
  31. package/dist/core/embedder-factory.d.ts +176 -0
  32. package/dist/core/embedder-factory.js +338 -0
  33. package/dist/core/index.d.ts +3 -1
  34. package/dist/core/index.js +4 -1
  35. package/dist/core/ingestion.d.ts +85 -15
  36. package/dist/core/ingestion.js +510 -45
  37. package/dist/core/lazy-dependency-loader.d.ts +152 -0
  38. package/dist/core/lazy-dependency-loader.js +453 -0
  39. package/dist/core/mode-detection-service.d.ts +150 -0
  40. package/dist/core/mode-detection-service.js +565 -0
  41. package/dist/core/mode-model-validator.d.ts +92 -0
  42. package/dist/core/mode-model-validator.js +203 -0
  43. package/dist/core/model-registry.d.ts +120 -0
  44. package/dist/core/model-registry.js +415 -0
  45. package/dist/core/model-validator.d.ts +217 -0
  46. package/dist/core/model-validator.js +782 -0
  47. package/dist/core/polymorphic-search-factory.d.ts +154 -0
  48. package/dist/core/polymorphic-search-factory.js +344 -0
  49. package/dist/core/raglite-paths.d.ts +121 -0
  50. package/dist/core/raglite-paths.js +145 -0
  51. package/dist/core/reranking-config.d.ts +42 -0
  52. package/dist/core/reranking-config.js +156 -0
  53. package/dist/core/reranking-factory.d.ts +92 -0
  54. package/dist/core/reranking-factory.js +591 -0
  55. package/dist/core/reranking-strategies.d.ts +325 -0
  56. package/dist/core/reranking-strategies.js +720 -0
  57. package/dist/core/resource-cleanup.d.ts +163 -0
  58. package/dist/core/resource-cleanup.js +371 -0
  59. package/dist/core/resource-manager.d.ts +212 -0
  60. package/dist/core/resource-manager.js +564 -0
  61. package/dist/core/search.d.ts +28 -1
  62. package/dist/core/search.js +83 -5
  63. package/dist/core/streaming-operations.d.ts +145 -0
  64. package/dist/core/streaming-operations.js +409 -0
  65. package/dist/core/types.d.ts +3 -0
  66. package/dist/core/universal-embedder.d.ts +177 -0
  67. package/dist/core/universal-embedder.js +139 -0
  68. package/dist/core/validation-messages.d.ts +99 -0
  69. package/dist/core/validation-messages.js +334 -0
  70. package/dist/core/vector-index.d.ts +1 -1
  71. package/dist/core/vector-index.js +37 -39
  72. package/dist/factories/index.d.ts +3 -1
  73. package/dist/factories/index.js +2 -0
  74. package/dist/factories/polymorphic-factory.d.ts +50 -0
  75. package/dist/factories/polymorphic-factory.js +159 -0
  76. package/dist/factories/text-factory.d.ts +128 -34
  77. package/dist/factories/text-factory.js +346 -97
  78. package/dist/file-processor.d.ts +88 -2
  79. package/dist/file-processor.js +720 -17
  80. package/dist/index.d.ts +32 -0
  81. package/dist/index.js +29 -0
  82. package/dist/ingestion.d.ts +16 -0
  83. package/dist/ingestion.js +21 -0
  84. package/dist/mcp-server.d.ts +35 -3
  85. package/dist/mcp-server.js +1107 -31
  86. package/dist/multimodal/clip-embedder.d.ts +327 -0
  87. package/dist/multimodal/clip-embedder.js +992 -0
  88. package/dist/multimodal/index.d.ts +6 -0
  89. package/dist/multimodal/index.js +6 -0
  90. package/dist/run-error-recovery-tests.d.ts +7 -0
  91. package/dist/run-error-recovery-tests.js +101 -0
  92. package/dist/search.d.ts +60 -9
  93. package/dist/search.js +82 -11
  94. package/dist/test-utils.d.ts +8 -26
  95. package/dist/text/chunker.d.ts +1 -0
  96. package/dist/text/embedder.js +15 -8
  97. package/dist/text/index.d.ts +1 -0
  98. package/dist/text/index.js +1 -0
  99. package/dist/text/reranker.d.ts +1 -2
  100. package/dist/text/reranker.js +17 -47
  101. package/dist/text/sentence-transformer-embedder.d.ts +96 -0
  102. package/dist/text/sentence-transformer-embedder.js +340 -0
  103. package/dist/types.d.ts +39 -0
  104. package/dist/utils/vector-math.d.ts +31 -0
  105. package/dist/utils/vector-math.js +70 -0
  106. package/package.json +27 -6
  107. package/dist/api-errors.d.ts.map +0 -1
  108. package/dist/api-errors.js.map +0 -1
  109. package/dist/cli/indexer.d.ts.map +0 -1
  110. package/dist/cli/indexer.js.map +0 -1
  111. package/dist/cli/search.d.ts.map +0 -1
  112. package/dist/cli/search.js.map +0 -1
  113. package/dist/cli.d.ts.map +0 -1
  114. package/dist/cli.js.map +0 -1
  115. package/dist/config.d.ts.map +0 -1
  116. package/dist/config.js.map +0 -1
  117. package/dist/core/adapters.d.ts.map +0 -1
  118. package/dist/core/adapters.js.map +0 -1
  119. package/dist/core/chunker.d.ts.map +0 -1
  120. package/dist/core/chunker.js.map +0 -1
  121. package/dist/core/config.d.ts.map +0 -1
  122. package/dist/core/config.js.map +0 -1
  123. package/dist/core/db.d.ts.map +0 -1
  124. package/dist/core/db.js.map +0 -1
  125. package/dist/core/error-handler.d.ts.map +0 -1
  126. package/dist/core/error-handler.js.map +0 -1
  127. package/dist/core/index.d.ts.map +0 -1
  128. package/dist/core/index.js.map +0 -1
  129. package/dist/core/ingestion.d.ts.map +0 -1
  130. package/dist/core/ingestion.js.map +0 -1
  131. package/dist/core/interfaces.d.ts.map +0 -1
  132. package/dist/core/interfaces.js.map +0 -1
  133. package/dist/core/path-manager.d.ts.map +0 -1
  134. package/dist/core/path-manager.js.map +0 -1
  135. package/dist/core/search-example.d.ts +0 -25
  136. package/dist/core/search-example.d.ts.map +0 -1
  137. package/dist/core/search-example.js +0 -138
  138. package/dist/core/search-example.js.map +0 -1
  139. package/dist/core/search-pipeline-example.d.ts +0 -21
  140. package/dist/core/search-pipeline-example.d.ts.map +0 -1
  141. package/dist/core/search-pipeline-example.js +0 -188
  142. package/dist/core/search-pipeline-example.js.map +0 -1
  143. package/dist/core/search-pipeline.d.ts.map +0 -1
  144. package/dist/core/search-pipeline.js.map +0 -1
  145. package/dist/core/search.d.ts.map +0 -1
  146. package/dist/core/search.js.map +0 -1
  147. package/dist/core/types.d.ts.map +0 -1
  148. package/dist/core/types.js.map +0 -1
  149. package/dist/core/vector-index.d.ts.map +0 -1
  150. package/dist/core/vector-index.js.map +0 -1
  151. package/dist/dom-polyfills.d.ts.map +0 -1
  152. package/dist/dom-polyfills.js.map +0 -1
  153. package/dist/examples/clean-api-examples.d.ts +0 -44
  154. package/dist/examples/clean-api-examples.d.ts.map +0 -1
  155. package/dist/examples/clean-api-examples.js +0 -206
  156. package/dist/examples/clean-api-examples.js.map +0 -1
  157. package/dist/factories/index.d.ts.map +0 -1
  158. package/dist/factories/index.js.map +0 -1
  159. package/dist/factories/text-factory.d.ts.map +0 -1
  160. package/dist/factories/text-factory.js.map +0 -1
  161. package/dist/file-processor.d.ts.map +0 -1
  162. package/dist/file-processor.js.map +0 -1
  163. package/dist/index-manager.d.ts.map +0 -1
  164. package/dist/index-manager.js.map +0 -1
  165. package/dist/index.d.ts.map +0 -1
  166. package/dist/index.js.map +0 -1
  167. package/dist/indexer.d.ts.map +0 -1
  168. package/dist/indexer.js.map +0 -1
  169. package/dist/ingestion.d.ts.map +0 -1
  170. package/dist/ingestion.js.map +0 -1
  171. package/dist/mcp-server.d.ts.map +0 -1
  172. package/dist/mcp-server.js.map +0 -1
  173. package/dist/preprocess.d.ts.map +0 -1
  174. package/dist/preprocess.js.map +0 -1
  175. package/dist/preprocessors/index.d.ts.map +0 -1
  176. package/dist/preprocessors/index.js.map +0 -1
  177. package/dist/preprocessors/mdx.d.ts.map +0 -1
  178. package/dist/preprocessors/mdx.js.map +0 -1
  179. package/dist/preprocessors/mermaid.d.ts.map +0 -1
  180. package/dist/preprocessors/mermaid.js.map +0 -1
  181. package/dist/preprocessors/registry.d.ts.map +0 -1
  182. package/dist/preprocessors/registry.js.map +0 -1
  183. package/dist/search-standalone.d.ts.map +0 -1
  184. package/dist/search-standalone.js.map +0 -1
  185. package/dist/search.d.ts.map +0 -1
  186. package/dist/search.js.map +0 -1
  187. package/dist/test-utils.d.ts.map +0 -1
  188. package/dist/test-utils.js.map +0 -1
  189. package/dist/text/chunker.d.ts.map +0 -1
  190. package/dist/text/chunker.js.map +0 -1
  191. package/dist/text/embedder.d.ts.map +0 -1
  192. package/dist/text/embedder.js.map +0 -1
  193. package/dist/text/index.d.ts.map +0 -1
  194. package/dist/text/index.js.map +0 -1
  195. package/dist/text/preprocessors/index.d.ts.map +0 -1
  196. package/dist/text/preprocessors/index.js.map +0 -1
  197. package/dist/text/preprocessors/mdx.d.ts.map +0 -1
  198. package/dist/text/preprocessors/mdx.js.map +0 -1
  199. package/dist/text/preprocessors/mermaid.d.ts.map +0 -1
  200. package/dist/text/preprocessors/mermaid.js.map +0 -1
  201. package/dist/text/preprocessors/registry.d.ts.map +0 -1
  202. package/dist/text/preprocessors/registry.js.map +0 -1
  203. package/dist/text/reranker.d.ts.map +0 -1
  204. package/dist/text/reranker.js.map +0 -1
  205. package/dist/text/tokenizer.d.ts.map +0 -1
  206. package/dist/text/tokenizer.js.map +0 -1
  207. package/dist/types.d.ts.map +0 -1
  208. package/dist/types.js.map +0 -1
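
All 208 files are listed above, but the substantive 2.0 changes concentrate in the ingestion pipeline; the annotated hunks below are from package/dist/core/ingestion.js (matching that file's +510 -45 entry, item 36). As a quick orientation, here is a hedged sketch of the new 2.x wiring. The identifiers come from the JSDoc inside the hunks; the exact export names and module paths from 'rag-lite-ts' are assumptions:

```typescript
// Hedged sketch of 2.x pipeline wiring; export names/paths are assumptions.
import { IngestionPipeline, ContentManager, IndexManager, createTextEmbedder, openDatabase } from 'rag-lite-ts';

const embedFn = await createTextEmbedder();           // (text, contentType?) => Promise<EmbeddingResult>
const indexManager = new IndexManager('./index.bin'); // vector index file
const db = await openDatabase('./db.sqlite');         // SQLite-backed document store

// New in 2.0: optional fifth constructor argument for the unified content system.
const contentManager = new ContentManager(db, { contentDir: '.raglite/content' });
const pipeline = new IngestionPipeline(embedFn, indexManager, db, undefined, contentManager);
```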
@@ -8,6 +8,7 @@ import { insertChunk, upsertDocument } from './db.js';
 import { config } from './config.js';
 import { DocumentPathManager } from './path-manager.js';
 import { existsSync } from 'fs';
+import { ContentManager } from './content-manager.js';
 /**
  * Main ingestion pipeline class
  * Coordinates the entire process from file discovery to vector storage
@@ -19,8 +20,10 @@ export class IngestionPipeline {
     db;
     defaultChunkConfig;
     pathManager;
+    contentManager;
     /**
      * Creates a new IngestionPipeline with explicit dependency injection
+     * Enhanced with ContentManager integration for unified content system
      *
      * DEPENDENCY INJECTION PATTERN:
      * This constructor requires all dependencies to be explicitly provided, enabling:
@@ -28,6 +31,7 @@ export class IngestionPipeline {
      * - Support for different embedding models and content types
      * - Testability through mock injection
      * - Future extensibility for multimodal content processing
+     * - Unified content management for both filesystem and memory-based ingestion
      *
      * @param embedFn - Function to embed document chunks into vectors
      *   - Signature: (query: string, contentType?: string) => Promise<EmbeddingResult>
@@ -47,32 +51,33 @@ export class IngestionPipeline {
      *   - Supports different content types through metadata fields
      *   - Example: await openDatabase('./db.sqlite')
      *
+     * @param contentManager - Optional ContentManager for unified content system
+     *   - Handles content storage routing and deduplication
+     *   - If not provided, creates default instance with standard configuration
+     *   - Example: new ContentManager(db, { contentDir: '.raglite/content' })
+     *
      * USAGE EXAMPLES:
      * ```typescript
-     * // Text-only ingestion pipeline
+     * // Text-only ingestion pipeline with unified content system
      * const textEmbedFn = await createTextEmbedder();
      * const indexManager = new IndexManager('./index.bin');
      * const db = await openDatabase('./db.sqlite');
+     * const contentManager = new ContentManager(db);
+     * const ingestion = new IngestionPipeline(textEmbedFn, indexManager, db, undefined, contentManager);
+     *
+     * // Simple usage (ContentManager created automatically)
      * const ingestion = new IngestionPipeline(textEmbedFn, indexManager, db);
      *
-     * // Custom embedding implementation
+     * // Custom embedding implementation with memory ingestion
      * const customEmbedFn = async (text) => ({
      *   embedding_id: generateId(),
      *   vector: await myCustomModel.embed(text)
      * });
      * const ingestion = new IngestionPipeline(customEmbedFn, indexManager, db);
-     *
-     * // Multimodal ingestion (future)
-     * const multimodalEmbedFn = async (content, contentType) => {
-     *   if (contentType === 'image') {
-     *     return { embedding_id: generateId(), vector: await clipModel.embedImage(content) };
-     *   }
-     *   return { embedding_id: generateId(), vector: await clipModel.embedText(content) };
-     * };
-     * const ingestion = new IngestionPipeline(multimodalEmbedFn, indexManager, db);
+     * await ingestion.ingestFromMemory(buffer, { displayName: 'file.txt' });
      * ```
      */
-    constructor(embedFn, indexManager, db, defaultChunkConfig) {
+    constructor(embedFn, indexManager, db, defaultChunkConfig, contentManager) {
        this.embedFn = embedFn;
        this.indexManager = indexManager;
        this.db = db;
@@ -89,6 +94,8 @@ export class IngestionPipeline {
        }
        // Initialize path manager with default configuration
        this.pathManager = new DocumentPathManager(config.path_storage_strategy, process.cwd());
+        // Initialize ContentManager (create default if not provided)
+        this.contentManager = contentManager || new ContentManager(this.db);
    }
    /**
     * Ingest documents from a directory
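
Backward-compatibility note: because the new parameter is last and optional, existing four-argument call sites keep working and the pipeline falls back to `new ContentManager(this.db)`. Both call styles, as a sketch (same assumed imports as above; the chunk config values are illustrative):

```typescript
// Explicit injection, e.g. to point content storage at a custom directory
const explicit = new IngestionPipeline(embedFn, indexManager, db, undefined,
    new ContentManager(db, { contentDir: '.raglite/content' }));

// 1.x-style call: a default ContentManager(db) is created in the constructor
const implicit = new IngestionPipeline(embedFn, indexManager, db,
    { chunkSize: 512, chunkOverlap: 64 });
```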
@@ -114,15 +121,152 @@ export class IngestionPipeline {
        }
        return this.ingestPath(filePath, options);
    }
+    /**
+     * Ingest content from memory buffer
+     * Enables MCP integration and real-time content processing
+     * @param content - Buffer containing the content to ingest
+     * @param metadata - Memory content metadata including display name and content type
+     * @param options - Optional ingestion configuration
+     * @returns Promise resolving to content ID for the ingested content
+     */
+    async ingestFromMemory(content, metadata, options = {}) {
+        const startTime = Date.now();
+        console.log(`\n=== Starting memory ingestion: ${metadata.displayName} ===`);
+        try {
+            // Phase 1: Content Storage via ContentManager
+            console.log('\n--- Phase 1: Content Storage ---');
+            const contentResult = await this.contentManager.ingestFromMemory(content, metadata);
+            if (contentResult.wasDeduped) {
+                console.log(`✓ Content deduplicated: ${metadata.displayName} (ID: ${contentResult.contentId})`);
+                return contentResult.contentId;
+            }
+            console.log(`✓ Content stored: ${metadata.displayName} (ID: ${contentResult.contentId})`);
+            // Phase 2: Document Processing
+            console.log('\n--- Phase 2: Document Processing ---');
+            // Determine content type for processing
+            const detectedContentType = metadata.contentType || 'text/plain';
+            const isImageContent = detectedContentType.startsWith('image/');
+            let document;
+            if (isImageContent) {
+                // Process image content using the existing image processing pipeline
+                console.log(`Processing image content: ${metadata.displayName} (${detectedContentType})`);
+                document = await this.processImageFromMemory(content, contentResult, metadata, options);
+            }
+            else if (detectedContentType === 'application/pdf') {
+                // Process PDF content
+                console.log(`Processing PDF content: ${metadata.displayName}`);
+                document = await this.processPDFFromMemory(content, contentResult, metadata, options);
+            }
+            else if (detectedContentType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') {
+                // Process DOCX content
+                console.log(`Processing DOCX content: ${metadata.displayName}`);
+                document = await this.processDOCXFromMemory(content, contentResult, metadata, options);
+            }
+            else {
+                // Process as text content
+                console.log(`Processing text content: ${metadata.displayName} (${detectedContentType})`);
+                document = {
+                    source: metadata.displayName,
+                    title: metadata.displayName,
+                    content: content.toString('utf8'), // Convert buffer to string for processing
+                    metadata: {
+                        contentType: detectedContentType,
+                        contentId: contentResult.contentId,
+                        storageType: contentResult.storageType,
+                        originalPath: metadata.originalPath
+                    }
+                };
+            }
+            // Phase 3: Document Chunking
+            console.log('\n--- Phase 3: Document Chunking ---');
+            const effectiveChunkConfig = options.chunkConfig || this.defaultChunkConfig || {
+                chunkSize: config.chunk_size,
+                chunkOverlap: config.chunk_overlap
+            };
+            const chunks = await chunkDocument(document, effectiveChunkConfig);
+            console.log(`✓ Created ${chunks.length} chunks from memory content`);
+            if (chunks.length === 0) {
+                console.log('No chunks created from memory content');
+                return contentResult.contentId;
+            }
+            // Phase 4: Embedding Generation
+            console.log('\n--- Phase 4: Embedding Generation ---');
+            const embeddings = [];
+            let embeddingErrors = 0;
+            for (let i = 0; i < chunks.length; i++) {
+                const chunk = chunks[i];
+                try {
+                    // Convert MIME type to simple content type for embedding function
+                    const contentTypeForEmbedding = this.getContentTypeForEmbedding(document.metadata?.contentType);
+                    const embedding = await this.embedFn(chunk.text, contentTypeForEmbedding);
+                    // Enhance embedding result with content type metadata
+                    if (!embedding.contentType) {
+                        embedding.contentType = contentTypeForEmbedding;
+                    }
+                    if (!embedding.metadata) {
+                        embedding.metadata = document.metadata;
+                    }
+                    embeddings.push(embedding);
+                }
+                catch (error) {
+                    console.warn(`Failed to embed chunk ${i + 1}:`, error instanceof Error ? error.message : String(error));
+                    embeddingErrors++;
+                }
+            }
+            console.log(`✓ Generated ${embeddings.length} embeddings for memory content`);
+            if (embeddings.length === 0) {
+                console.log('No embeddings generated from memory content');
+                return contentResult.contentId;
+            }
+            // Phase 5: Database Storage
+            console.log('\n--- Phase 5: Database Storage ---');
+            // Insert document with content_id reference
+            const documentContentType = this.getContentTypeForEmbedding(document.metadata?.contentType);
+            const documentId = await upsertDocument(this.db, document.source, document.title, documentContentType, document.metadata, contentResult.contentId);
+            // Insert chunks with embeddings
+            let chunksStored = 0;
+            for (let i = 0; i < chunks.length && i < embeddings.length; i++) {
+                const chunk = chunks[i];
+                const embedding = embeddings[i];
+                try {
+                    await insertChunk(this.db, embedding.embedding_id, documentId, chunk.text, chunk.chunkIndex, documentContentType, document.metadata);
+                    chunksStored++;
+                }
+                catch (error) {
+                    console.error(`Failed to store chunk ${i + 1}:`, error instanceof Error ? error.message : String(error));
+                }
+            }
+            console.log(`✓ Stored document and ${chunksStored} chunks in database`);
+            // Phase 6: Vector Index Updates
+            console.log('\n--- Phase 6: Vector Index Updates ---');
+            await this.updateVectorIndex(embeddings);
+            const endTime = Date.now();
+            const processingTimeMs = endTime - startTime;
+            console.log('\n=== Memory Ingestion Complete ===');
+            console.log(`Content ID: ${contentResult.contentId}`);
+            console.log(`Chunks created: ${chunks.length}`);
+            console.log(`Embeddings generated: ${embeddings.length}`);
+            console.log(`Chunks stored: ${chunksStored}`);
+            console.log(`Embedding errors: ${embeddingErrors}`);
+            console.log(`Total time: ${(processingTimeMs / 1000).toFixed(2)}s`);
+            return contentResult.contentId;
+        }
+        catch (error) {
+            console.error('\n=== Memory Ingestion Failed ===');
+            console.error(`Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
+            throw new Error(`Memory ingestion failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
+        }
+    }
    /**
     * Ingest documents from a path (file or directory)
     * Implements the complete pipeline: file processing → chunking → embedding → storage
+     * Enhanced to handle mixed content types (text and images) in multimodal mode
     */
    async ingestPath(path, options = {}) {
        const startTime = Date.now();
        console.log(`\n=== Starting ingestion from: ${path} ===`);
        try {
-            // Phase 1: File Discovery and Processing
+            // Phase 1: File Discovery and Processing with Content-Type Detection
            console.log('\n--- Phase 1: File Discovery and Processing ---');
            const fileResult = await discoverAndProcessFiles(path, options.fileOptions, this.pathManager);
            if (fileResult.documents.length === 0) {
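
This `ingestFromMemory` method is the headline addition: it lets the MCP server (and any other host) index a Buffer that never touches the project tree. A hedged usage sketch; the metadata field names come straight from the hunk, and the MIME routing is as implemented above:

```typescript
import { readFile } from 'node:fs/promises';

const buffer = await readFile('./notes.md');   // any Buffer source works

const contentId = await pipeline.ingestFromMemory(buffer, {
    displayName: 'notes.md',
    contentType: 'text/markdown',              // image/*, application/pdf, and DOCX route to dedicated processors
    originalPath: '/original/location/notes.md',
});
// If the ContentManager reports the bytes as a duplicate (wasDeduped),
// the method returns the existing contentId and skips chunking/embedding.
console.log(`Indexed as content ${contentId}`);
```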
@@ -133,16 +277,20 @@ export class IngestionPipeline {
                    embeddingsGenerated: 0,
                    documentErrors: fileResult.processingResult.errors.length,
                    embeddingErrors: 0,
-                    processingTimeMs: Date.now() - startTime
+                    processingTimeMs: Date.now() - startTime,
+                    contentIds: []
                };
            }
-            // Phase 2: Document Chunking
+            // Content-type detection and routing
+            const contentTypeStats = this.analyzeContentTypes(fileResult.documents);
+            console.log(`📊 Content analysis: ${contentTypeStats.text} text, ${contentTypeStats.image} image, ${contentTypeStats.other} other files`);
+            // Phase 2: Document Chunking with Content-Type Awareness
            console.log('\n--- Phase 2: Document Chunking ---');
            const effectiveChunkConfig = options.chunkConfig || this.defaultChunkConfig || {
                chunkSize: config.chunk_size,
                chunkOverlap: config.chunk_overlap
            };
-            const chunkingResult = await this.chunkDocuments(fileResult.documents, effectiveChunkConfig);
+            const chunkingResult = await this.chunkDocumentsWithContentTypes(fileResult.documents, effectiveChunkConfig);
            if (chunkingResult.totalChunks === 0) {
                console.log('No chunks created from documents');
                return {
151
299
  embeddingsGenerated: 0,
152
300
  documentErrors: fileResult.processingResult.errors.length,
153
301
  embeddingErrors: 0,
154
- processingTimeMs: Date.now() - startTime
302
+ processingTimeMs: Date.now() - startTime,
303
+ contentIds: []
155
304
  };
156
305
  }
157
- // Phase 3: Embedding Generation
306
+ // Phase 3: Embedding Generation with Content-Type Support
158
307
  console.log('\n--- Phase 3: Embedding Generation ---');
159
- const embeddingResult = await this.generateEmbeddings(chunkingResult.allChunks);
160
- // Phase 4: Database and Index Storage
308
+ const embeddingResult = await this.generateEmbeddingsWithContentTypes(chunkingResult.allChunks);
309
+ // Phase 4: Database and Index Storage with Content-Type Metadata
161
310
  console.log('\n--- Phase 4: Storage Operations ---');
162
- await this.storeDocumentsAndChunks(chunkingResult.documentChunks, embeddingResult.embeddings);
311
+ const contentIds = await this.storeDocumentsAndChunksWithContentTypes(chunkingResult.documentChunks, embeddingResult.embeddings);
163
312
  // Phase 5: Vector Index Updates
164
313
  console.log('\n--- Phase 5: Vector Index Updates ---');
165
314
  await this.updateVectorIndex(embeddingResult.embeddings);
@@ -171,7 +320,8 @@ export class IngestionPipeline {
                embeddingsGenerated: embeddingResult.embeddings.length,
                documentErrors: fileResult.processingResult.errors.length,
                embeddingErrors: embeddingResult.errors,
-                processingTimeMs
+                processingTimeMs,
+                contentIds
            };
            console.log('\n=== Ingestion Complete ===');
            console.log(`Documents processed: ${result.documentsProcessed}`);
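
For reference, the object `ingestPath` now resolves with, reconstructed from the fields visible in these hunks. The interface name is assumed, as are the types; any fields appearing before `embeddingsGenerated` in the literal sit outside the hunk context:

```typescript
// Result shape of ingestPath, as visible in this diff (name and types assumed)
interface IngestionResult {
    documentsProcessed: number;
    embeddingsGenerated: number;
    documentErrors: number;
    embeddingErrors: number;
    processingTimeMs: number;
    contentIds: string[];   // new in 2.0: ContentManager IDs for stored documents
}
```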
@@ -189,9 +339,32 @@ export class IngestionPipeline {
        }
    }
    /**
-     * Chunk all documents and organize results
+     * Analyze content types in the document collection
+     * @private
     */
-    async chunkDocuments(documents, chunkConfig) {
+    analyzeContentTypes(documents) {
+        const stats = { text: 0, image: 0, other: 0 };
+        for (const document of documents) {
+            const contentType = document.metadata?.contentType || 'text';
+            switch (contentType) {
+                case 'text':
+                    stats.text++;
+                    break;
+                case 'image':
+                    stats.image++;
+                    break;
+                default:
+                    stats.other++;
+                    break;
+            }
+        }
+        return stats;
+    }
+    /**
+     * Chunk all documents and organize results with content-type awareness
+     * Enhanced to handle different content types appropriately
+     */
+    async chunkDocumentsWithContentTypes(documents, chunkConfig) {
        const documentChunks = [];
        const allChunks = [];
        let totalChunks = 0;
@@ -199,11 +372,36 @@ export class IngestionPipeline {
        for (let i = 0; i < documents.length; i++) {
            const document = documents[i];
            try {
-                const chunks = await chunkDocument(document, chunkConfig);
+                const contentType = document.metadata?.contentType || 'text';
+                // Handle different content types appropriately
+                let chunks;
+                if (contentType === 'image') {
+                    // For images, create a single chunk with the full content (description + metadata)
+                    chunks = [{
+                            text: document.content,
+                            chunkIndex: 0,
+                            contentType: 'image',
+                            metadata: document.metadata
+                        }];
+                }
+                else {
+                    // For text documents, use normal chunking
+                    const textChunks = await chunkDocument(document, chunkConfig);
+                    chunks = textChunks.map(chunk => ({
+                        ...chunk,
+                        contentType: 'text',
+                        metadata: document.metadata
+                    }));
+                }
                documentChunks.push({ document, chunks });
-                // Collect all chunk texts for embedding
-                const chunkTexts = chunks.map(chunk => chunk.text);
-                allChunks.push(...chunkTexts);
+                // Collect all chunks with their content type information
+                for (const chunk of chunks) {
+                    allChunks.push({
+                        text: chunk.text,
+                        contentType: chunk.contentType,
+                        metadata: chunk.metadata
+                    });
+                }
                totalChunks += chunks.length;
                // Progress logging - more frequent for better user experience
                if (documents.length <= 10 || (i + 1) % Math.max(1, Math.floor(documents.length / 10)) === 0 || i === documents.length - 1) {
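
Design note: image documents deliberately bypass the text chunker and become exactly one chunk, whose text is the generated description plus key metadata (assembled in `processImageFromMemory`, further down in this diff), so image retrieval rides on the same text embedding space. Illustratively, that chunk text looks like:

```typescript
// Illustrative chunk text for an image document; the line format matches the
// contentParts array built in processImageFromMemory (values made up here).
const imageChunkText = [
    'Image: diagram.png',
    'Description: a block diagram of an ingestion pipeline',
    'Dimensions: 1280x720',
    'Format: png',
].join('\n');
```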
@@ -221,28 +419,52 @@ export class IngestionPipeline {
        return { documentChunks, allChunks, totalChunks };
    }
    /**
-     * Generate embeddings for all chunks with error handling
+     * Chunk all documents and organize results (legacy method for backward compatibility)
+     * @deprecated Use chunkDocumentsWithContentTypes for multimodal support
     */
-    async generateEmbeddings(chunkTexts) {
-        console.log(`Generating embeddings for ${chunkTexts.length} chunk${chunkTexts.length === 1 ? '' : 's'}...`);
+    async chunkDocuments(documents, chunkConfig) {
+        const result = await this.chunkDocumentsWithContentTypes(documents, chunkConfig);
+        // Convert to legacy format for backward compatibility
+        return {
+            documentChunks: result.documentChunks,
+            allChunks: result.allChunks.map(chunk => chunk.text),
+            totalChunks: result.totalChunks
+        };
+    }
+    /**
+     * Generate embeddings for all chunks with content-type support
+     * Enhanced to handle different content types and pass metadata to embedding function
+     */
+    async generateEmbeddingsWithContentTypes(chunks) {
+        console.log(`Generating embeddings for ${chunks.length} chunk${chunks.length === 1 ? '' : 's'}...`);
        console.log('This may take a few minutes depending on the number of chunks...');
        try {
-            // Generate embeddings using injected embed function
+            // Generate embeddings using injected embed function with content type support
            const embeddings = [];
            let errors = 0;
-            for (let i = 0; i < chunkTexts.length; i++) {
+            for (let i = 0; i < chunks.length; i++) {
+                const chunk = chunks[i];
                try {
-                    const embedding = await this.embedFn(chunkTexts[i]);
+                    // Convert MIME type to simple content type for embedding function
+                    const contentTypeForEmbedding = this.getContentTypeForEmbedding(chunk.contentType);
+                    const embedding = await this.embedFn(chunk.text, contentTypeForEmbedding);
+                    // Enhance embedding result with content type metadata if not already present
+                    if (!embedding.contentType) {
+                        embedding.contentType = contentTypeForEmbedding;
+                    }
+                    if (!embedding.metadata && chunk.metadata) {
+                        embedding.metadata = chunk.metadata;
+                    }
                    embeddings.push(embedding);
                }
                catch (error) {
-                    console.warn(`Failed to embed chunk ${i + 1}:`, error instanceof Error ? error.message : String(error));
+                    console.warn(`Failed to embed ${chunk.contentType} chunk ${i + 1}:`, error instanceof Error ? error.message : String(error));
                    errors++;
                }
                // Progress logging
-                if (chunkTexts.length > 10 && (i + 1) % Math.max(1, Math.floor(chunkTexts.length / 10)) === 0) {
-                    const percentage = Math.round(((i + 1) / chunkTexts.length) * 100);
-                    console.log(`Generated ${i + 1} of ${chunkTexts.length} embeddings (${percentage}%)`);
+                if (chunks.length > 10 && (i + 1) % Math.max(1, Math.floor(chunks.length / 10)) === 0) {
+                    const percentage = Math.round(((i + 1) / chunks.length) * 100);
+                    console.log(`Generated ${i + 1} of ${chunks.length} embeddings (${percentage}%)`);
                }
            }
            if (errors > 0) {
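
Because `embedFn` now receives the simplified content type ('text' or 'image' after MIME mapping) as a second argument, a custom embedder can branch per modality. A minimal conforming sketch; `textModel`, `clipModel`, and `generateId` are hypothetical stand-ins:

```typescript
// Only the (text, contentType?) signature and the { embedding_id, vector }
// result shape come from the diff; the models here are hypothetical.
const embedFn = async (text: string, contentType?: string) => {
    const vector = contentType === 'image'
        ? await clipModel.embedText(text)   // image chunks carry a generated text description
        : await textModel.embed(text);
    return { embedding_id: generateId(), vector };
};
```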
@@ -257,9 +479,20 @@ export class IngestionPipeline {
        }
    }
    /**
-     * Store documents and chunks in database
+     * Generate embeddings for all chunks with error handling (legacy method for backward compatibility)
+     * @deprecated Use generateEmbeddingsWithContentTypes for multimodal support
     */
-    async storeDocumentsAndChunks(documentChunks, embeddings) {
+    async generateEmbeddings(chunkTexts) {
+        // Convert to new format for backward compatibility
+        const chunks = chunkTexts.map(text => ({ text, contentType: 'text' }));
+        return this.generateEmbeddingsWithContentTypes(chunks);
+    }
+    /**
+     * Store documents and chunks in database with content-type support
+     * Enhanced to handle content type metadata and multimodal content
+     * @returns Array of content IDs for successfully stored documents
+     */
+    async storeDocumentsAndChunksWithContentTypes(documentChunks, embeddings) {
        console.log(`Storing ${documentChunks.length} document${documentChunks.length === 1 ? '' : 's'} and chunks in database...`);
        // Create a mapping of chunk text to embedding for efficient lookup
        const embeddingMap = new Map();
@@ -275,24 +508,51 @@ export class IngestionPipeline {
        }
        let totalChunksStored = 0;
        let documentsStored = 0;
+        const contentIds = [];
        // Process each document sequentially
        for (const { document, chunks } of documentChunks) {
            try {
-                // Insert or get existing document
-                const documentId = await upsertDocument(this.db, document.source, document.title);
+                // Generate content ID for filesystem content using ContentManager
+                let contentId = document.metadata?.contentId;
+                if (!contentId) {
+                    try {
+                        // Use ContentManager to create filesystem reference and get content ID
+                        const contentResult = await this.contentManager.ingestFromFilesystem(document.source);
+                        contentId = contentResult.contentId;
+                        // Update document metadata with content ID
+                        if (!document.metadata) {
+                            document.metadata = {};
+                        }
+                        document.metadata.contentId = contentId;
+                        document.metadata.storageType = contentResult.storageType;
+                    }
+                    catch (contentError) {
+                        console.warn(`Failed to create content reference for ${document.source}:`, contentError instanceof Error ? contentError.message : String(contentError));
+                        // Continue without content ID - fallback to legacy behavior
+                    }
+                }
+                // Insert or get existing document with content type support and content_id reference
+                const documentContentType = document.metadata?.contentType || 'text';
+                const documentId = await upsertDocument(this.db, document.source, document.title, documentContentType, document.metadata, contentId);
                documentsStored++;
-                // Insert all chunks for this document
+                // Add content ID to results if available
+                if (contentId) {
+                    contentIds.push(contentId);
+                }
+                // Insert all chunks for this document with content type support
                let chunksStoredForDoc = 0;
                for (const chunk of chunks) {
                    const embedding = embeddingMap.get(chunk.text);
                    if (embedding) {
                        try {
-                            await insertChunk(this.db, embedding.embedding_id, documentId, chunk.text, chunk.chunkIndex);
+                            const chunkContentType = chunk.contentType || documentContentType;
+                            const chunkMetadata = chunk.metadata || document.metadata;
+                            await insertChunk(this.db, embedding.embedding_id, documentId, chunk.text, chunk.chunkIndex, chunkContentType, chunkMetadata);
                            chunksStoredForDoc++;
                            totalChunksStored++;
                        }
                        catch (chunkError) {
-                            console.error(`Failed to store chunk ${chunk.chunkIndex} for document ${document.source}:`, chunkError instanceof Error ? chunkError.message : String(chunkError));
+                            console.error(`Failed to store ${chunk.contentType || 'text'} chunk ${chunk.chunkIndex} for document ${document.source}:`, chunkError instanceof Error ? chunkError.message : String(chunkError));
                            // Continue with other chunks
                        }
                    }
@@ -312,6 +572,14 @@ export class IngestionPipeline {
            }
        }
        console.log(`✓ Storage complete: ${documentsStored} documents, ${totalChunksStored} chunks saved to database`);
+        return contentIds;
+    }
+    /**
+     * Store documents and chunks in database (legacy method for backward compatibility)
+     * @deprecated Use storeDocumentsAndChunksWithContentTypes for multimodal support
+     */
+    async storeDocumentsAndChunks(documentChunks, embeddings) {
+        await this.storeDocumentsAndChunksWithContentTypes(documentChunks, embeddings);
    }
    /**
     * Update vector index with new embeddings
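
As with chunking and embedding, the 1.x method name survives as a thin wrapper over the content-type-aware implementation, so code written against the old internals keeps working; the one behavioral difference is that the legacy form discards the returned content IDs:

```typescript
// Legacy 1.x call: still works, content IDs are not surfaced
await pipeline.storeDocumentsAndChunks(documentChunks, embeddings);

// 2.x call: returns the ContentManager IDs for the stored documents
const ids = await pipeline.storeDocumentsAndChunksWithContentTypes(documentChunks, embeddings);
```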
@@ -331,11 +599,208 @@ export class IngestionPipeline {
            throw error;
        }
    }
+    /**
+     * Converts MIME type to simple content type for embedding function
+     * @param mimeType - MIME type string (e.g., 'text/plain', 'image/jpeg')
+     * @returns Simple content type ('text', 'image', etc.)
+     */
+    getContentTypeForEmbedding(mimeType) {
+        if (!mimeType) {
+            return 'text';
+        }
+        // Convert MIME types to simple content types
+        if (mimeType.startsWith('text/')) {
+            return 'text';
+        }
+        else if (mimeType.startsWith('image/')) {
+            return 'image';
+        }
+        else if (mimeType === 'application/pdf') {
+            return 'text'; // PDFs are processed as text
+        }
+        else if (mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') {
+            return 'text'; // DOCX files are processed as text
+        }
+        else {
+            return 'text'; // Default to text for unknown types
+        }
+    }
+    /**
+     * Save the vector index to disk
+     */
+    async saveIndex() {
+        await this.indexManager.saveIndex();
+    }
+    /**
+     * Process image content from memory using the existing image processing pipeline
+     * @private
+     */
+    async processImageFromMemory(content, contentResult, metadata, options) {
+        try {
+            // Import image processing functions
+            const { generateImageDescriptionForFile, extractImageMetadataForFile } = await import('../file-processor.js');
+            // Use the content path from the content manager (where the image is stored)
+            const imagePath = contentResult.contentPath;
+            // Extract image metadata
+            let imageMetadata = {};
+            try {
+                imageMetadata = await extractImageMetadataForFile(imagePath);
+            }
+            catch (error) {
+                console.warn(`Failed to extract image metadata for ${metadata.displayName}:`, error instanceof Error ? error.message : String(error));
+                // Continue with empty metadata
+            }
+            // Generate text description for the image
+            let descriptionResult = { description: 'Image content', model: 'none', confidence: 0 };
+            try {
+                const imageToTextOptions = {}; // Use default options for now
+                descriptionResult = await generateImageDescriptionForFile(imagePath, imageToTextOptions);
+                console.log(`✓ Generated image description: "${descriptionResult.description}"`);
+            }
+            catch (error) {
+                console.warn(`Failed to generate image description for ${metadata.displayName}:`, error instanceof Error ? error.message : String(error));
+                // Continue with fallback description
+            }
+            // Update metadata with description information
+            imageMetadata.description = descriptionResult.description;
+            imageMetadata.descriptionModel = descriptionResult.model;
+            imageMetadata.descriptionConfidence = descriptionResult.confidence;
+            // Create document with image description as content
+            const title = metadata.displayName;
+            // Create content that includes description and key metadata
+            const contentParts = [
+                `Image: ${title}`,
+                `Description: ${descriptionResult.description}`
+            ];
+            if (imageMetadata.dimensions) {
+                contentParts.push(`Dimensions: ${imageMetadata.dimensions.width}x${imageMetadata.dimensions.height}`);
+            }
+            if (imageMetadata.format) {
+                contentParts.push(`Format: ${imageMetadata.format}`);
+            }
+            const documentContent = contentParts.join('\n');
+            return {
+                source: metadata.displayName,
+                title,
+                content: documentContent.trim(),
+                metadata: {
+                    contentType: 'image',
+                    contentId: contentResult.contentId,
+                    storageType: contentResult.storageType,
+                    originalPath: metadata.originalPath,
+                    ...imageMetadata // Spread all image metadata fields
+                }
+            };
+        }
+        catch (error) {
+            console.warn(`Failed to process image from memory, falling back to basic processing:`, error instanceof Error ? error.message : String(error));
+            // Fallback to basic document creation
+            return {
+                source: metadata.displayName,
+                title: metadata.displayName,
+                content: `Image: ${metadata.displayName}\nPath: ${contentResult.contentPath}`,
+                metadata: {
+                    contentType: 'image',
+                    contentId: contentResult.contentId,
+                    storageType: contentResult.storageType,
+                    originalPath: metadata.originalPath,
+                    processingError: error instanceof Error ? error.message : String(error)
+                }
+            };
+        }
+    }
+    /**
+     * Process PDF content from memory using the existing PDF processing pipeline
+     * @private
+     */
+    async processPDFFromMemory(content, contentResult, metadata, options) {
+        try {
+            // Import PDF processing
+            const pdfParse = require('pdf-parse');
+            // Parse PDF content directly from buffer
+            const pdfData = await pdfParse(content);
+            console.log(`✓ Extracted ${pdfData.text.length} characters from PDF`);
+            return {
+                source: metadata.displayName,
+                title: metadata.displayName,
+                content: pdfData.text.trim(),
+                metadata: {
+                    contentType: 'application/pdf',
+                    contentId: contentResult.contentId,
+                    storageType: contentResult.storageType,
+                    originalPath: metadata.originalPath,
+                    pages: pdfData.numpages,
+                    pdfInfo: pdfData.info
+                }
+            };
+        }
+        catch (error) {
+            console.warn(`Failed to process PDF from memory, falling back to basic processing:`, error instanceof Error ? error.message : String(error));
+            // Fallback to basic document creation
+            return {
+                source: metadata.displayName,
+                title: metadata.displayName,
+                content: `PDF Document: ${metadata.displayName}\nPath: ${contentResult.contentPath}`,
+                metadata: {
+                    contentType: 'application/pdf',
+                    contentId: contentResult.contentId,
+                    storageType: contentResult.storageType,
+                    originalPath: metadata.originalPath,
+                    processingError: error instanceof Error ? error.message : String(error)
+                }
+            };
+        }
+    }
+    /**
+     * Process DOCX content from memory using the existing DOCX processing pipeline
+     * @private
+     */
+    async processDOCXFromMemory(content, contentResult, metadata, options) {
+        try {
+            // Import DOCX processing
+            const mammoth = await import('mammoth');
+            // Parse DOCX content directly from buffer
+            const docxResult = await mammoth.extractRawText({ buffer: content });
+            console.log(`✓ Extracted ${docxResult.value.length} characters from DOCX`);
+            return {
+                source: metadata.displayName,
+                title: metadata.displayName,
+                content: docxResult.value.trim(),
+                metadata: {
+                    contentType: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+                    contentId: contentResult.contentId,
+                    storageType: contentResult.storageType,
+                    originalPath: metadata.originalPath,
+                    messages: docxResult.messages
+                }
+            };
+        }
+        catch (error) {
+            console.warn(`Failed to process DOCX from memory, falling back to basic processing:`, error instanceof Error ? error.message : String(error));
+            // Fallback to basic document creation
+            return {
+                source: metadata.displayName,
+                title: metadata.displayName,
+                content: `DOCX Document: ${metadata.displayName}\nPath: ${contentResult.contentPath}`,
+                metadata: {
+                    contentType: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+                    contentId: contentResult.contentId,
+                    storageType: contentResult.storageType,
+                    originalPath: metadata.originalPath,
+                    processingError: error instanceof Error ? error.message : String(error)
+                }
+            };
+        }
+    }
    /**
     * Clean up resources - explicit cleanup method
     */
    async cleanup() {
        try {
+            // Clean up ContentManager to prevent resource leaks
+            if (this.contentManager && typeof this.contentManager.cleanup === 'function') {
+                this.contentManager.cleanup();
+            }
            await this.db.close();
            await this.indexManager.close();
        }
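
Putting the lifecycle together — `saveIndex` and the extended `cleanup` are both in the hunks above — a full 2.x session looks roughly like this sketch:

```typescript
const pipeline = new IngestionPipeline(embedFn, indexManager, db);
try {
    const result = await pipeline.ingestPath('./docs');
    console.log(`Stored ${result.contentIds.length} content records in ${result.processingTimeMs} ms`);
    await pipeline.saveIndex();    // persist the vector index to disk
}
finally {
    await pipeline.cleanup();      // closes db/index and now also cleans up the ContentManager
}
```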