rag-lite-ts 1.0.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. package/README.md +606 -93
  2. package/dist/cli/indexer.js +192 -4
  3. package/dist/cli/search.js +50 -11
  4. package/dist/cli.js +183 -26
  5. package/dist/core/abstract-embedder.d.ts +125 -0
  6. package/dist/core/abstract-embedder.js +264 -0
  7. package/dist/core/actionable-error-messages.d.ts +60 -0
  8. package/dist/core/actionable-error-messages.js +397 -0
  9. package/dist/core/batch-processing-optimizer.d.ts +155 -0
  10. package/dist/core/batch-processing-optimizer.js +541 -0
  11. package/dist/core/chunker.d.ts +2 -0
  12. package/dist/core/cli-database-utils.d.ts +53 -0
  13. package/dist/core/cli-database-utils.js +239 -0
  14. package/dist/core/config.js +10 -3
  15. package/dist/core/content-errors.d.ts +111 -0
  16. package/dist/core/content-errors.js +362 -0
  17. package/dist/core/content-manager.d.ts +343 -0
  18. package/dist/core/content-manager.js +1504 -0
  19. package/dist/core/content-performance-optimizer.d.ts +150 -0
  20. package/dist/core/content-performance-optimizer.js +516 -0
  21. package/dist/core/content-resolver.d.ts +104 -0
  22. package/dist/core/content-resolver.js +285 -0
  23. package/dist/core/cross-modal-search.d.ts +164 -0
  24. package/dist/core/cross-modal-search.js +342 -0
  25. package/dist/core/database-connection-manager.d.ts +109 -0
  26. package/dist/core/database-connection-manager.js +304 -0
  27. package/dist/core/db.d.ts +141 -2
  28. package/dist/core/db.js +631 -89
  29. package/dist/core/embedder-factory.d.ts +176 -0
  30. package/dist/core/embedder-factory.js +338 -0
  31. package/dist/core/index.d.ts +3 -1
  32. package/dist/core/index.js +4 -1
  33. package/dist/core/ingestion.d.ts +85 -15
  34. package/dist/core/ingestion.js +510 -45
  35. package/dist/core/lazy-dependency-loader.d.ts +152 -0
  36. package/dist/core/lazy-dependency-loader.js +453 -0
  37. package/dist/core/mode-detection-service.d.ts +150 -0
  38. package/dist/core/mode-detection-service.js +565 -0
  39. package/dist/core/mode-model-validator.d.ts +92 -0
  40. package/dist/core/mode-model-validator.js +203 -0
  41. package/dist/core/model-registry.d.ts +120 -0
  42. package/dist/core/model-registry.js +415 -0
  43. package/dist/core/model-validator.d.ts +217 -0
  44. package/dist/core/model-validator.js +782 -0
  45. package/dist/core/polymorphic-search-factory.d.ts +154 -0
  46. package/dist/core/polymorphic-search-factory.js +344 -0
  47. package/dist/core/raglite-paths.d.ts +121 -0
  48. package/dist/core/raglite-paths.js +145 -0
  49. package/dist/core/reranking-config.d.ts +42 -0
  50. package/dist/core/reranking-config.js +156 -0
  51. package/dist/core/reranking-factory.d.ts +92 -0
  52. package/dist/core/reranking-factory.js +591 -0
  53. package/dist/core/reranking-strategies.d.ts +325 -0
  54. package/dist/core/reranking-strategies.js +720 -0
  55. package/dist/core/resource-cleanup.d.ts +163 -0
  56. package/dist/core/resource-cleanup.js +371 -0
  57. package/dist/core/resource-manager.d.ts +212 -0
  58. package/dist/core/resource-manager.js +564 -0
  59. package/dist/core/search.d.ts +28 -1
  60. package/dist/core/search.js +83 -5
  61. package/dist/core/streaming-operations.d.ts +145 -0
  62. package/dist/core/streaming-operations.js +409 -0
  63. package/dist/core/types.d.ts +3 -0
  64. package/dist/core/universal-embedder.d.ts +177 -0
  65. package/dist/core/universal-embedder.js +139 -0
  66. package/dist/core/validation-messages.d.ts +99 -0
  67. package/dist/core/validation-messages.js +334 -0
  68. package/dist/core/vector-index.js +7 -8
  69. package/dist/factories/index.d.ts +1 -1
  70. package/dist/factories/text-factory.d.ts +128 -34
  71. package/dist/factories/text-factory.js +346 -97
  72. package/dist/file-processor.d.ts +88 -2
  73. package/dist/file-processor.js +720 -17
  74. package/dist/index.d.ts +9 -0
  75. package/dist/index.js +11 -0
  76. package/dist/ingestion.d.ts +16 -0
  77. package/dist/ingestion.js +21 -0
  78. package/dist/mcp-server.d.ts +35 -3
  79. package/dist/mcp-server.js +1107 -31
  80. package/dist/multimodal/clip-embedder.d.ts +314 -0
  81. package/dist/multimodal/clip-embedder.js +945 -0
  82. package/dist/multimodal/index.d.ts +6 -0
  83. package/dist/multimodal/index.js +6 -0
  84. package/dist/run-error-recovery-tests.d.ts +7 -0
  85. package/dist/run-error-recovery-tests.js +101 -0
  86. package/dist/search.d.ts +26 -0
  87. package/dist/search.js +54 -1
  88. package/dist/test-utils.d.ts +8 -26
  89. package/dist/text/chunker.d.ts +1 -0
  90. package/dist/text/embedder.js +15 -8
  91. package/dist/text/index.d.ts +1 -0
  92. package/dist/text/index.js +1 -0
  93. package/dist/text/reranker.d.ts +1 -2
  94. package/dist/text/reranker.js +17 -47
  95. package/dist/text/sentence-transformer-embedder.d.ts +96 -0
  96. package/dist/text/sentence-transformer-embedder.js +340 -0
  97. package/dist/types.d.ts +39 -0
  98. package/dist/utils/vector-math.d.ts +31 -0
  99. package/dist/utils/vector-math.js +70 -0
  100. package/package.json +15 -3
  101. package/dist/api-errors.d.ts.map +0 -1
  102. package/dist/api-errors.js.map +0 -1
  103. package/dist/cli/indexer.d.ts.map +0 -1
  104. package/dist/cli/indexer.js.map +0 -1
  105. package/dist/cli/search.d.ts.map +0 -1
  106. package/dist/cli/search.js.map +0 -1
  107. package/dist/cli.d.ts.map +0 -1
  108. package/dist/cli.js.map +0 -1
  109. package/dist/config.d.ts.map +0 -1
  110. package/dist/config.js.map +0 -1
  111. package/dist/core/adapters.d.ts.map +0 -1
  112. package/dist/core/adapters.js.map +0 -1
  113. package/dist/core/chunker.d.ts.map +0 -1
  114. package/dist/core/chunker.js.map +0 -1
  115. package/dist/core/config.d.ts.map +0 -1
  116. package/dist/core/config.js.map +0 -1
  117. package/dist/core/db.d.ts.map +0 -1
  118. package/dist/core/db.js.map +0 -1
  119. package/dist/core/error-handler.d.ts.map +0 -1
  120. package/dist/core/error-handler.js.map +0 -1
  121. package/dist/core/index.d.ts.map +0 -1
  122. package/dist/core/index.js.map +0 -1
  123. package/dist/core/ingestion.d.ts.map +0 -1
  124. package/dist/core/ingestion.js.map +0 -1
  125. package/dist/core/interfaces.d.ts.map +0 -1
  126. package/dist/core/interfaces.js.map +0 -1
  127. package/dist/core/path-manager.d.ts.map +0 -1
  128. package/dist/core/path-manager.js.map +0 -1
  129. package/dist/core/search-example.d.ts +0 -25
  130. package/dist/core/search-example.d.ts.map +0 -1
  131. package/dist/core/search-example.js +0 -138
  132. package/dist/core/search-example.js.map +0 -1
  133. package/dist/core/search-pipeline-example.d.ts +0 -21
  134. package/dist/core/search-pipeline-example.d.ts.map +0 -1
  135. package/dist/core/search-pipeline-example.js +0 -188
  136. package/dist/core/search-pipeline-example.js.map +0 -1
  137. package/dist/core/search-pipeline.d.ts.map +0 -1
  138. package/dist/core/search-pipeline.js.map +0 -1
  139. package/dist/core/search.d.ts.map +0 -1
  140. package/dist/core/search.js.map +0 -1
  141. package/dist/core/types.d.ts.map +0 -1
  142. package/dist/core/types.js.map +0 -1
  143. package/dist/core/vector-index.d.ts.map +0 -1
  144. package/dist/core/vector-index.js.map +0 -1
  145. package/dist/dom-polyfills.d.ts.map +0 -1
  146. package/dist/dom-polyfills.js.map +0 -1
  147. package/dist/examples/clean-api-examples.d.ts +0 -44
  148. package/dist/examples/clean-api-examples.d.ts.map +0 -1
  149. package/dist/examples/clean-api-examples.js +0 -206
  150. package/dist/examples/clean-api-examples.js.map +0 -1
  151. package/dist/factories/index.d.ts.map +0 -1
  152. package/dist/factories/index.js.map +0 -1
  153. package/dist/factories/text-factory.d.ts.map +0 -1
  154. package/dist/factories/text-factory.js.map +0 -1
  155. package/dist/file-processor.d.ts.map +0 -1
  156. package/dist/file-processor.js.map +0 -1
  157. package/dist/index-manager.d.ts.map +0 -1
  158. package/dist/index-manager.js.map +0 -1
  159. package/dist/index.d.ts.map +0 -1
  160. package/dist/index.js.map +0 -1
  161. package/dist/indexer.d.ts.map +0 -1
  162. package/dist/indexer.js.map +0 -1
  163. package/dist/ingestion.d.ts.map +0 -1
  164. package/dist/ingestion.js.map +0 -1
  165. package/dist/mcp-server.d.ts.map +0 -1
  166. package/dist/mcp-server.js.map +0 -1
  167. package/dist/preprocess.d.ts.map +0 -1
  168. package/dist/preprocess.js.map +0 -1
  169. package/dist/preprocessors/index.d.ts.map +0 -1
  170. package/dist/preprocessors/index.js.map +0 -1
  171. package/dist/preprocessors/mdx.d.ts.map +0 -1
  172. package/dist/preprocessors/mdx.js.map +0 -1
  173. package/dist/preprocessors/mermaid.d.ts.map +0 -1
  174. package/dist/preprocessors/mermaid.js.map +0 -1
  175. package/dist/preprocessors/registry.d.ts.map +0 -1
  176. package/dist/preprocessors/registry.js.map +0 -1
  177. package/dist/search-standalone.d.ts.map +0 -1
  178. package/dist/search-standalone.js.map +0 -1
  179. package/dist/search.d.ts.map +0 -1
  180. package/dist/search.js.map +0 -1
  181. package/dist/test-utils.d.ts.map +0 -1
  182. package/dist/test-utils.js.map +0 -1
  183. package/dist/text/chunker.d.ts.map +0 -1
  184. package/dist/text/chunker.js.map +0 -1
  185. package/dist/text/embedder.d.ts.map +0 -1
  186. package/dist/text/embedder.js.map +0 -1
  187. package/dist/text/index.d.ts.map +0 -1
  188. package/dist/text/index.js.map +0 -1
  189. package/dist/text/preprocessors/index.d.ts.map +0 -1
  190. package/dist/text/preprocessors/index.js.map +0 -1
  191. package/dist/text/preprocessors/mdx.d.ts.map +0 -1
  192. package/dist/text/preprocessors/mdx.js.map +0 -1
  193. package/dist/text/preprocessors/mermaid.d.ts.map +0 -1
  194. package/dist/text/preprocessors/mermaid.js.map +0 -1
  195. package/dist/text/preprocessors/registry.d.ts.map +0 -1
  196. package/dist/text/preprocessors/registry.js.map +0 -1
  197. package/dist/text/reranker.d.ts.map +0 -1
  198. package/dist/text/reranker.js.map +0 -1
  199. package/dist/text/tokenizer.d.ts.map +0 -1
  200. package/dist/text/tokenizer.js.map +0 -1
  201. package/dist/types.d.ts.map +0 -1
  202. package/dist/types.js.map +0 -1
@@ -5,10 +5,21 @@
5
5
  * FACTORY PATTERN BENEFITS:
6
6
  * - Abstracts complex initialization (model loading, database setup, index initialization)
7
7
  * - Provides simple API for common use cases while preserving access to dependency injection
8
- * - Handles error recovery and validation
8
+ * - Clear validation and error handling without fallback mechanisms
9
9
  * - Supports different embedding models and configurations
10
10
  * - Enables clean separation between simple usage and advanced customization
11
11
  *
12
+ * MODE SELECTION GUIDE:
13
+ * - Text Mode (default): Optimized for text-only content
14
+ * - Uses sentence-transformer models (fast, accurate for text)
15
+ * - Images converted to text descriptions
16
+ * - Best for: document search, text clustering, semantic similarity
17
+ *
18
+ * - Multimodal Mode: Optimized for mixed text/image content
19
+ * - Uses CLIP models (unified embedding space)
20
+ * - True cross-modal search (text finds images, images find text)
21
+ * - Best for: image search, visual QA, multimodal retrieval
22
+ *
12
23
  * USAGE PATTERNS:
13
24
  *
14
25
  * 1. Simple Search Setup:
@@ -43,15 +54,31 @@
43
54
  * const results = await searchEngine.search('query');
44
55
  * ```
45
56
  *
46
- * 4. Error Recovery:
57
+ * 4. Clear Error Handling:
47
58
  * ```typescript
48
- * // Create with automatic fallback options
49
- * const search = await TextFactoryHelpers.createSearchWithFallback(
59
+ * // Create with clear validation and error reporting
60
+ * const search = await TextFactoryHelpers.createSearchWithValidation(
50
61
  * './index.bin',
51
62
  * './db.sqlite',
52
- * { enableReranking: true } // Will fallback to disabled if reranking fails
63
+ * { enableReranking: true } // Clear errors if issues occur
53
64
  * );
54
65
  * ```
66
+ *
67
+ * 5. Mode Selection:
68
+ * ```typescript
69
+ * // Text mode (default) - optimized for text-only content
70
+ * const textIngestion = await TextIngestionFactory.create('./db.sqlite', './index.bin', {
71
+ * mode: 'text',
72
+ * embeddingModel: 'sentence-transformers/all-MiniLM-L6-v2'
73
+ * });
74
+ *
75
+ * // Multimodal mode - enables cross-modal search
76
+ * const multimodalIngestion = await TextIngestionFactory.create('./db.sqlite', './index.bin', {
77
+ * mode: 'multimodal',
78
+ * embeddingModel: 'Xenova/clip-vit-base-patch32',
79
+ * rerankingStrategy: 'text-derived'
80
+ * });
81
+ * ```
55
82
  */
56
83
  import { SearchEngine } from '../core/search.js';
57
84
  import { IngestionPipeline } from '../core/ingestion.js';
@@ -63,16 +90,26 @@ import { config, getModelDefaults } from '../core/config.js';
63
90
  import { existsSync } from 'fs';
64
91
  import { dirname } from 'path';
65
92
  import { mkdirSync } from 'fs';
93
+ import { ContentManager } from '../core/content-manager.js';
94
+ import { validateModeModelCompatibilityOrThrow } from '../core/mode-model-validator.js';
95
+ import { createMissingFileError, createInvalidPathError, createFactoryCreationError, createModeMismatchError } from '../core/actionable-error-messages.js';
66
96
  /**
67
97
  * Factory for creating text-based SearchEngine instances
68
98
  * Handles model loading, database initialization, and index setup
69
99
  *
70
100
  * This factory abstracts the complex initialization process required for text search:
71
- * 1. Loads and validates text embedding models
72
- * 2. Optionally loads reranking models with fallback handling
73
- * 3. Establishes database connections and initializes schema
74
- * 4. Loads vector indexes with proper model compatibility checking
75
- * 5. Creates SearchEngine with proper dependency injection
101
+ * 1. Auto-detects embedding model from database configuration
102
+ * 2. Validates mode-model compatibility (no fallback mechanisms)
103
+ * 3. Loads embedding models with clear error reporting
104
+ * 4. Optionally loads reranking models based on configuration
105
+ * 5. Establishes database connections and initializes schema
106
+ * 6. Loads vector indexes with proper model compatibility checking
107
+ * 7. Creates SearchEngine with proper dependency injection
108
+ *
109
+ * Mode Support:
110
+ * - Automatically detects mode from database (text or multimodal)
111
+ * - Each mode uses its optimal implementation without fallbacks
112
+ * - Clear validation ensures mode-model compatibility
76
113
  *
77
114
  * @example
78
115
  * ```typescript
@@ -100,7 +137,7 @@ export class TextSearchFactory {
100
137
  * This method handles the complete initialization process:
101
138
  * - Validates that required files exist
102
139
  * - Loads text embedding model (with lazy initialization)
103
- * - Optionally loads reranking model (with graceful fallback)
140
+ * - Optionally loads reranking model (with clear error reporting)
104
141
  * - Opens database connection and initializes schema
105
142
  * - Loads vector index with compatibility validation
106
143
  * - Creates SearchEngine with dependency injection
@@ -135,18 +172,21 @@ export class TextSearchFactory {
135
172
  console.log('🏭 TextSearchFactory: Initializing text search engine...');
136
173
  // Validate input paths
137
174
  if (!indexPath || !dbPath) {
138
- throw new Error('Both indexPath and dbPath are required');
175
+ throw createInvalidPathError([
176
+ { name: 'indexPath', value: indexPath },
177
+ { name: 'dbPath', value: dbPath }
178
+ ], { operationContext: 'TextSearchFactory.create' });
139
179
  }
140
180
  // Check if required files exist
141
181
  if (!existsSync(indexPath)) {
142
- throw new Error(`Vector index not found at: ${indexPath}\n` +
143
- 'Run ingestion first to create the index, or check the path.\n' +
144
- 'Example: const ingestion = await IngestionFactory.create(dbPath, indexPath);');
182
+ throw createMissingFileError(indexPath, 'index', {
183
+ operationContext: 'TextSearchFactory.create'
184
+ });
145
185
  }
146
186
  if (!existsSync(dbPath)) {
147
- throw new Error(`Database not found at: ${dbPath}\n` +
148
- 'Run ingestion first to create the database, or check the path.\n' +
149
- 'Example: const ingestion = await IngestionFactory.create(dbPath, indexPath);');
187
+ throw createMissingFileError(dbPath, 'database', {
188
+ operationContext: 'TextSearchFactory.create'
189
+ });
150
190
  }
151
191
  // Step 1: Auto-detect embedding model from database
152
192
  let embeddingModel = options.embeddingModel;
@@ -180,6 +220,10 @@ export class TextSearchFactory {
180
220
  modelDimensions = modelDefaults.dimensions;
181
221
  console.log(`📊 Using specified embedding model: ${embeddingModel} (${modelDimensions} dimensions)`);
182
222
  }
223
+ // Step 1.5: Validate mode-model compatibility at creation time
224
+ console.log('🔍 Validating mode-model compatibility...');
225
+ validateModeModelCompatibilityOrThrow('text', embeddingModel);
226
+ console.log('✓ Mode-model compatibility validated');
183
227
  // Step 2: Initialize embedding function
184
228
  console.log('📊 Loading text embedding model...');
185
229
  const embedFn = createTextEmbedFunction(embeddingModel, options.batchSize);
@@ -188,17 +232,11 @@ export class TextSearchFactory {
188
232
  // Step 3: Initialize reranking function (optional)
189
233
  let rerankFn;
190
234
  if (options.enableReranking === true) { // Default to disabled for local-first, fast RAG-lite
191
- try {
192
- console.log('🔄 Loading text reranking model...');
193
- rerankFn = createTextRerankFunction(options.rerankingModel);
194
- // Test reranking function
195
- await rerankFn('test query', []);
196
- console.log('✓ Text reranking model loaded successfully');
197
- }
198
- catch (error) {
199
- console.warn(`Failed to load reranking model, continuing without reranking: ${error instanceof Error ? error.message : 'Unknown error'}`);
200
- rerankFn = undefined;
201
- }
235
+ console.log('🔄 Loading text reranking model...');
236
+ rerankFn = createTextRerankFunction(options.rerankingModel);
237
+ // Test reranking function - fail clearly if there are issues
238
+ await rerankFn('test query', []);
239
+ console.log(' Text reranking model loaded successfully');
202
240
  }
203
241
  else {
204
242
  console.log('🔄 Reranking disabled by default (local-first, fast mode)');
@@ -215,9 +253,14 @@ export class TextSearchFactory {
215
253
  const indexManager = new IndexManager(indexPath, dbPath, modelDimensions, embeddingModel);
216
254
  await indexManager.initialize();
217
255
  console.log('✓ Vector index loaded successfully');
218
- // Step 7: Create SearchEngine with dependency injection
219
- const searchEngine = new SearchEngine(embedFn, indexManager, db, rerankFn);
220
- // Step 8: Validate the setup
256
+ // Step 7: Create ContentResolver for unified content system
257
+ console.log('📁 Initializing content resolver...');
258
+ const { ContentResolver } = await import('../core/content-resolver.js');
259
+ const contentResolver = new ContentResolver(db);
260
+ console.log('✓ Content resolver ready');
261
+ // Step 8: Create SearchEngine with dependency injection
262
+ const searchEngine = new SearchEngine(embedFn, indexManager, db, rerankFn, contentResolver);
263
+ // Step 9: Validate the setup
221
264
  const stats = await searchEngine.getStats();
222
265
  console.log(`✓ Search engine ready: ${stats.totalChunks} chunks indexed, reranking ${stats.rerankingEnabled ? 'enabled' : 'disabled'}`);
223
266
  console.log('🎉 TextSearchFactory: Search engine initialized successfully');
@@ -225,7 +268,7 @@ export class TextSearchFactory {
225
268
  }
226
269
  catch (error) {
227
270
  console.error('❌ TextSearchFactory: Failed to create search engine');
228
- throw new Error(`TextSearchFactory.create failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
271
+ throw createFactoryCreationError('TextSearchFactory', error instanceof Error ? error.message : 'Unknown error', { operationContext: 'search engine creation' });
229
272
  }
230
273
  }
231
274
  /**
@@ -266,10 +309,18 @@ export class TextSearchFactory {
266
309
  *
267
310
  * This factory abstracts the complex initialization process required for text ingestion:
268
311
  * 1. Creates necessary directories if they don't exist
269
- * 2. Loads and validates text embedding models
270
- * 3. Establishes database connections and initializes schema
271
- * 4. Creates or loads vector indexes with proper configuration
272
- * 5. Creates IngestionPipeline with proper dependency injection
312
+ * 2. Validates mode-model compatibility (no fallback mechanisms)
313
+ * 3. Loads and validates embedding models with clear error reporting
314
+ * 4. Establishes database connections and initializes schema
315
+ * 5. Stores mode configuration in database for automatic detection
316
+ * 6. Creates or loads vector indexes with proper configuration
317
+ * 7. Creates IngestionPipeline with proper dependency injection
318
+ *
319
+ * Mode Configuration:
320
+ * - Text Mode (default): Uses sentence-transformer models for text-only content
321
+ * - Multimodal Mode: Uses CLIP models for mixed text/image content
322
+ * - Mode is stored in database and auto-detected during search
323
+ * - Clear validation prevents mode-model mismatches
273
324
  *
274
325
  * @example
275
326
  * ```typescript
@@ -311,20 +362,39 @@ export class TextIngestionFactory {
311
362
  * @param options.chunkSize - Override chunk size (default: from config)
312
363
  * @param options.chunkOverlap - Override chunk overlap (default: from config)
313
364
  * @param options.forceRebuild - Force rebuild of existing index (default: false)
365
+ * @param options.contentSystemConfig - Content system configuration options
366
+ * @param options.contentSystemConfig.contentDir - Content directory path (default: '.raglite/content')
367
+ * @param options.contentSystemConfig.maxFileSize - Maximum file size in bytes (default: 50MB)
368
+ * @param options.contentSystemConfig.maxContentDirSize - Maximum content directory size (default: 2GB)
369
+ * @param options.contentSystemConfig.enableDeduplication - Enable content deduplication (default: true)
370
+ * @param options.contentSystemConfig.enableStorageTracking - Enable storage tracking (default: true)
314
371
  * @returns Promise resolving to configured IngestionPipeline
315
372
  * @throws {Error} If initialization fails
316
373
  *
317
374
  * @example
318
375
  * ```typescript
319
- * // Create ingestion pipeline
376
+ * // Create ingestion pipeline with default content system
320
377
  * const ingestion = await TextIngestionFactory.create('./my-db.sqlite', './my-index.bin');
321
378
  *
379
+ * // Create with custom content system configuration
380
+ * const ingestion = await TextIngestionFactory.create('./my-db.sqlite', './my-index.bin', {
381
+ * contentSystemConfig: {
382
+ * contentDir: './custom-content',
383
+ * maxFileSize: 100 * 1024 * 1024, // 100MB
384
+ * maxContentDirSize: 5 * 1024 * 1024 * 1024, // 5GB
385
+ * enableDeduplication: true
386
+ * }
387
+ * });
388
+ *
322
389
  * // Ingest documents from directory
323
390
  * const result = await ingestion.ingestDirectory('./documents');
324
391
  * console.log(`Processed ${result.documentsProcessed} documents`);
325
392
  *
326
- * // Ingest single file
327
- * await ingestion.ingestFile('./document.pdf');
393
+ * // Ingest content from memory (MCP integration)
394
+ * const contentId = await ingestion.ingestFromMemory(buffer, {
395
+ * displayName: 'uploaded-file.pdf',
396
+ * contentType: 'application/pdf'
397
+ * });
328
398
  *
329
399
  * // Clean up when done
330
400
  * await ingestion.cleanup();
@@ -335,7 +405,10 @@ export class TextIngestionFactory {
335
405
  console.log('🏭 TextIngestionFactory: Initializing text ingestion pipeline...');
336
406
  // Validate input paths
337
407
  if (!dbPath || !indexPath) {
338
- throw new Error('Both dbPath and indexPath are required');
408
+ throw createInvalidPathError([
409
+ { name: 'dbPath', value: dbPath },
410
+ { name: 'indexPath', value: indexPath }
411
+ ], { operationContext: 'TextIngestionFactory.create' });
339
412
  }
340
413
  // Ensure directories exist
341
414
  const dbDir = dirname(dbPath);
@@ -353,12 +426,35 @@ export class TextIngestionFactory {
353
426
  const effectiveBatchSize = options.batchSize ?? modelDefaults.batch_size;
354
427
  const effectiveChunkSize = options.chunkSize ?? modelDefaults.chunk_size;
355
428
  const effectiveChunkOverlap = options.chunkOverlap ?? modelDefaults.chunk_overlap;
356
- // Step 2: Initialize embedding function
357
- console.log('📊 Loading text embedding model...');
358
- const embedFn = createTextEmbedFunction(options.embeddingModel, effectiveBatchSize);
359
- // Test embedding function to ensure it works
360
- // Embedding function created successfully (will be tested on first use)
361
- console.log('✓ Text embedding function created successfully');
429
+ // Step 1.5: Validate mode-model compatibility at creation time
430
+ const effectiveMode = options.mode || 'text';
431
+ const effectiveModel = options.embeddingModel || config.embedding_model;
432
+ console.log('🔍 Validating mode-model compatibility...');
433
+ validateModeModelCompatibilityOrThrow(effectiveMode, effectiveModel);
434
+ console.log('✓ Mode-model compatibility validated');
435
+ // Step 2: Initialize embedding function based on mode
436
+ let embedFn;
437
+ if (effectiveMode === 'multimodal') {
438
+ console.log('📊 Loading CLIP embedding model for multimodal mode...');
439
+ const { createEmbedder } = await import('../core/embedder-factory.js');
440
+ const clipEmbedder = await createEmbedder(effectiveModel);
441
+ // Wrap CLIP embedder to match EmbedFunction signature
442
+ embedFn = async (content, contentType) => {
443
+ if (contentType === 'image') {
444
+ // Use CLIP image embedding for image content
445
+ return await clipEmbedder.embedImage(content);
446
+ }
447
+ // Use CLIP text embedding for text content
448
+ return await clipEmbedder.embedText(content);
449
+ };
450
+ console.log('✓ CLIP embedder created for multimodal mode');
451
+ }
452
+ else {
453
+ // Text mode: use sentence-transformer embedder (existing behavior)
454
+ console.log('📊 Loading text embedding model...');
455
+ embedFn = createTextEmbedFunction(options.embeddingModel, effectiveBatchSize);
456
+ console.log('✓ Text embedding function created successfully');
457
+ }
362
458
  // Step 3: Initialize database connection
363
459
  console.log('💾 Opening database connection...');
364
460
  const db = await openDatabase(dbPath);
@@ -366,13 +462,17 @@ export class TextIngestionFactory {
366
462
  const { initializeSchema } = await import('../core/db.js');
367
463
  await initializeSchema(db);
368
464
  console.log('✓ Database connection established');
465
+ // Step 3.1: Handle mode storage during ingestion
466
+ await this.handleModeStorage(db, options, modelDefaults);
369
467
  // Step 4: Initialize index manager
370
468
  console.log('📇 Initializing vector index...');
371
469
  const indexManager = new IndexManager(indexPath, dbPath, modelDefaults.dimensions, options.embeddingModel || config.embedding_model);
372
470
  // Check if we need to force recreation due to model change
373
471
  let forceRecreate = false;
374
472
  if (options.forceRebuild && existsSync(indexPath) && existsSync(dbPath)) {
375
- // Check if model has changed during rebuild
473
+ // When forceRebuild is true, always force recreation to handle any model/dimension mismatches
474
+ forceRecreate = true;
475
+ // Check if model has changed during rebuild for logging purposes
376
476
  const { getStoredModelInfo } = await import('../core/db.js');
377
477
  const tempDb = await openDatabase(dbPath);
378
478
  try {
@@ -381,7 +481,9 @@ export class TextIngestionFactory {
381
481
  if (storedModel && storedModel.modelName !== currentModel) {
382
482
  console.log(`🔄 Model change detected: ${storedModel.modelName} → ${currentModel}`);
383
483
  console.log(`🔄 Dimensions change: ${storedModel.dimensions} → ${modelDefaults.dimensions}`);
384
- forceRecreate = true;
484
+ }
485
+ else if (storedModel && storedModel.dimensions !== modelDefaults.dimensions) {
486
+ console.log(`🔄 Dimension mismatch detected: ${storedModel.dimensions} → ${modelDefaults.dimensions}`);
385
487
  }
386
488
  }
387
489
  finally {
@@ -411,18 +513,30 @@ export class TextIngestionFactory {
411
513
  await indexManager.initialize();
412
514
  }
413
515
  console.log('✓ Vector index ready');
414
- // Step 4: Create IngestionPipeline with dependency injection and chunk configuration
516
+ // Step 5: Create ContentManager for unified content system
517
+ console.log('📁 Initializing content management system...');
518
+ const contentSystemConfig = await this.validateAndPrepareContentSystemConfig(options.contentSystemConfig);
519
+ const contentManager = new ContentManager(db, contentSystemConfig);
520
+ console.log('✓ Content management system ready');
521
+ // Step 6: Create IngestionPipeline with dependency injection and chunk configuration
415
522
  const chunkConfig = {
416
523
  chunkSize: effectiveChunkSize,
417
524
  chunkOverlap: effectiveChunkOverlap
418
525
  };
419
- const ingestionPipeline = new IngestionPipeline(embedFn, indexManager, db, chunkConfig);
526
+ const ingestionPipeline = new IngestionPipeline(embedFn, indexManager, db, chunkConfig, contentManager);
420
527
  console.log('🎉 TextIngestionFactory: Ingestion pipeline initialized successfully');
421
528
  return ingestionPipeline;
422
529
  }
423
530
  catch (error) {
424
531
  console.error('❌ TextIngestionFactory: Failed to create ingestion pipeline');
425
- throw new Error(`TextIngestionFactory.create failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
532
+ // Preserve custom error messages for model mismatch and mode mismatch
533
+ if (error instanceof Error && (error.message.includes('Model mismatch') ||
534
+ error.message.includes('Mode mismatch') ||
535
+ error.message.includes('--force-rebuild') ||
536
+ error.message.includes('--rebuild-if-needed'))) {
537
+ throw error; // Re-throw custom validation errors as-is
538
+ }
539
+ throw createFactoryCreationError('TextIngestionFactory', error instanceof Error ? error.message : 'Unknown error', { operationContext: 'ingestion pipeline creation' });
426
540
  }
427
541
  }
428
542
  /**
@@ -436,6 +550,164 @@ export class TextIngestionFactory {
436
550
  const indexPath = config.index_file || './index.bin';
437
551
  return this.create(dbPath, indexPath, options);
438
552
  }
553
+ /**
554
+ * Handles mode storage during ingestion
555
+ * Creates or validates system info based on the provided mode and options
556
+ * @private
557
+ */
558
+ static async handleModeStorage(db, options, modelDefaults) {
559
+ const { getSystemInfo, setSystemInfo } = await import('../core/db.js');
560
+ // Determine the effective mode and model
561
+ const effectiveMode = options.mode || 'text';
562
+ const effectiveModel = options.embeddingModel || config.embedding_model;
563
+ const effectiveRerankingStrategy = options.rerankingStrategy || 'cross-encoder';
564
+ // Determine model type based on model name
565
+ let modelType;
566
+ if (effectiveModel.includes('clip')) {
567
+ modelType = 'clip';
568
+ }
569
+ else {
570
+ modelType = 'sentence-transformer';
571
+ }
572
+ // Determine supported content types based on mode
573
+ const supportedContentTypes = effectiveMode === 'multimodal' ? ['text', 'image'] : ['text'];
574
+ try {
575
+ // Check if system info already exists
576
+ const existingSystemInfo = await getSystemInfo(db);
577
+ if (existingSystemInfo) {
578
+ // Validate mode consistency for subsequent ingestions
579
+ if (existingSystemInfo.mode !== effectiveMode) {
580
+ console.warn(`⚠️ Mode mismatch detected!`);
581
+ console.warn(` Database mode: ${existingSystemInfo.mode}`);
582
+ console.warn(` Requested mode: ${effectiveMode}`);
583
+ if (options.forceRebuild) {
584
+ console.log('🔄 Force rebuild enabled, updating mode configuration...');
585
+ await this.updateSystemInfo(db, effectiveMode, effectiveModel, modelType, modelDefaults, effectiveRerankingStrategy, supportedContentTypes);
586
+ }
587
+ else {
588
+ throw createModeMismatchError(existingSystemInfo.mode, effectiveMode, { operationContext: 'TextIngestionFactory.create' });
589
+ }
590
+ }
591
+ else if (existingSystemInfo.modelName !== effectiveModel) {
592
+ // Model change within the same mode
593
+ console.log(`🔄 Model change detected: ${existingSystemInfo.modelName} → ${effectiveModel}`);
594
+ if (options.forceRebuild) {
595
+ console.log('🔄 Force rebuild enabled, updating model configuration...');
596
+ await this.updateSystemInfo(db, effectiveMode, effectiveModel, modelType, modelDefaults, effectiveRerankingStrategy, supportedContentTypes);
597
+ }
598
+ else {
599
+ // Create a specific error message for model mismatch with rebuild suggestions
600
+ const errorMessage = [
601
+ `❌ Model mismatch: Database is configured for '${existingSystemInfo.modelName}', but '${effectiveModel}' was requested.`,
602
+ '',
603
+ '🛠️ How to fix this:',
604
+ ' 1. Use --force-rebuild to change models:',
605
+ ' raglite ingest <path> --model ' + effectiveModel + ' --force-rebuild',
606
+ '',
607
+ ' 2. Or use --rebuild-if-needed for automatic handling:',
608
+ ' raglite ingest <path> --model ' + effectiveModel + ' --rebuild-if-needed',
609
+ '',
610
+ ' 3. Or continue using the existing model:',
611
+ ' raglite ingest <path> # Uses ' + existingSystemInfo.modelName,
612
+ '',
613
+ '🔍 Model switching requires rebuilding the vector index because different models',
614
+ ' produce embeddings with different dimensions and characteristics.'
615
+ ].join('\n');
616
+ throw new Error(errorMessage);
617
+ }
618
+ }
619
+ else {
620
+ console.log(`✅ Mode consistency validated: ${effectiveMode} mode with ${effectiveModel}`);
621
+ }
622
+ }
623
+ else {
624
+ // First ingestion - create system info
625
+ console.log(`🔧 First ingestion detected, storing system configuration...`);
626
+ console.log(` Mode: ${effectiveMode}`);
627
+ console.log(` Model: ${effectiveModel} (${modelType})`);
628
+ console.log(` Dimensions: ${modelDefaults.dimensions}`);
629
+ console.log(` Reranking: ${effectiveRerankingStrategy}`);
630
+ console.log(` Content types: ${supportedContentTypes.join(', ')}`);
631
+ await this.updateSystemInfo(db, effectiveMode, effectiveModel, modelType, modelDefaults, effectiveRerankingStrategy, supportedContentTypes);
632
+ console.log('✅ System configuration stored successfully');
633
+ }
634
+ }
635
+ catch (error) {
636
+ if (error instanceof Error && (error.message.includes('Mode mismatch') || error.message.includes('Model mismatch'))) {
637
+ throw error; // Re-throw validation errors with custom messages
638
+ }
639
+ console.error('❌ Failed to handle mode storage:', error);
640
+ throw new Error(`Mode storage failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
641
+ }
642
+ }
643
+ /**
644
+ * Updates system info in the database
645
+ * @private
646
+ */
647
+ static async updateSystemInfo(db, mode, modelName, modelType, modelDefaults, rerankingStrategy, supportedContentTypes) {
648
+ const { setSystemInfo } = await import('../core/db.js');
649
+ await setSystemInfo(db, {
650
+ mode,
651
+ modelName,
652
+ modelType,
653
+ modelDimensions: modelDefaults.dimensions,
654
+ modelVersion: '1.0.0', // TODO: Get actual version from model
655
+ supportedContentTypes,
656
+ rerankingStrategy: rerankingStrategy,
657
+ rerankingModel: undefined,
658
+ rerankingConfig: undefined
659
+ });
660
+ }
661
+ /**
662
+ * Validates and prepares content system configuration
663
+ * @private
664
+ */
665
+ static async validateAndPrepareContentSystemConfig(userConfig) {
666
+ // Default configuration
667
+ const defaultConfig = {
668
+ contentDir: '.raglite/content',
669
+ maxFileSize: 50 * 1024 * 1024, // 50MB
670
+ maxContentDirSize: 2 * 1024 * 1024 * 1024, // 2GB
671
+ enableDeduplication: true,
672
+ enableStorageTracking: true
673
+ };
674
+ // Merge with user configuration
675
+ const config = { ...defaultConfig, ...userConfig };
676
+ // Validate content directory path
677
+ if (!config.contentDir || typeof config.contentDir !== 'string') {
678
+ throw new Error('Content directory path must be a non-empty string');
679
+ }
680
+ // Validate file size limits
681
+ if (config.maxFileSize && (typeof config.maxFileSize !== 'number' || config.maxFileSize <= 0)) {
682
+ throw new Error('Maximum file size must be a positive number');
683
+ }
684
+ if (config.maxContentDirSize && (typeof config.maxContentDirSize !== 'number' || config.maxContentDirSize <= 0)) {
685
+ throw new Error('Maximum content directory size must be a positive number');
686
+ }
687
+ // Validate that maxFileSize is not larger than maxContentDirSize
688
+ if (config.maxFileSize && config.maxContentDirSize && config.maxFileSize > config.maxContentDirSize) {
689
+ throw new Error('Maximum file size cannot be larger than maximum content directory size');
690
+ }
691
+ // Validate boolean options
692
+ if (config.enableDeduplication !== undefined && typeof config.enableDeduplication !== 'boolean') {
693
+ throw new Error('enableDeduplication must be a boolean value');
694
+ }
695
+ if (config.enableStorageTracking !== undefined && typeof config.enableStorageTracking !== 'boolean') {
696
+ throw new Error('enableStorageTracking must be a boolean value');
697
+ }
698
+ // Create content directory if it doesn't exist
699
+ try {
700
+ const { promises: fs } = await import('fs');
701
+ await fs.mkdir(config.contentDir, { recursive: true });
702
+ // Verify directory is writable
703
+ await fs.access(config.contentDir, (await import('fs')).constants.W_OK);
704
+ console.log(`✓ Content directory validated: ${config.contentDir}`);
705
+ }
706
+ catch (error) {
707
+ throw new Error(`Failed to create or access content directory '${config.contentDir}': ${error instanceof Error ? error.message : 'Unknown error'}. Please check permissions and path validity.`);
708
+ }
709
+ return config;
710
+ }
439
711
  }
440
712
  /**
441
713
  * Convenience factory to create both search and ingestion instances
@@ -548,9 +820,9 @@ export class TextRAGFactory {
548
820
  * const { searchOptions, ingestionOptions } = TextFactoryHelpers.getRecommendedConfig('quality');
549
821
  * const search = await TextSearchFactory.create('./index.bin', './db.sqlite', searchOptions);
550
822
  *
551
- * // Create with automatic error recovery
552
- * const search = await TextFactoryHelpers.createSearchWithFallback('./index.bin', './db.sqlite', {
553
- * enableReranking: true // Will fallback to disabled if reranking fails
823
+ * // Create with clear validation and error reporting
824
+ * const search = await TextFactoryHelpers.createSearchWithValidation('./index.bin', './db.sqlite', {
825
+ * enableReranking: true // Will fail clearly if reranking has issues
554
826
  * });
555
827
  * ```
556
828
  */
@@ -581,16 +853,14 @@ export class TextFactoryHelpers {
581
853
  */
582
854
  static validateSearchFiles(indexPath, dbPath) {
583
855
  if (!existsSync(indexPath)) {
584
- throw new Error(`Vector index not found: ${indexPath}\n` +
585
- 'Run ingestion first: raglite ingest <directory>\n' +
586
- 'Or use: const ingestion = await IngestionFactory.create(dbPath, indexPath);\n' +
587
- 'Or check if the path is correct.');
856
+ throw createMissingFileError(indexPath, 'index', {
857
+ operationContext: 'search file validation'
858
+ });
588
859
  }
589
860
  if (!existsSync(dbPath)) {
590
- throw new Error(`Database not found: ${dbPath}\n` +
591
- 'Run ingestion first: raglite ingest <directory>\n' +
592
- 'Or use: const ingestion = await IngestionFactory.create(dbPath, indexPath);\n' +
593
- 'Or check if the path is correct.');
861
+ throw createMissingFileError(dbPath, 'database', {
862
+ operationContext: 'search file validation'
863
+ });
594
864
  }
595
865
  }
596
866
  /**
@@ -664,56 +934,35 @@ export class TextFactoryHelpers {
664
934
  }
665
935
  }
666
936
  /**
667
- * Create a search engine with automatic error recovery
937
+ * Create a search engine with clear error reporting
668
938
  *
669
- * This method attempts to create a search engine with the provided options,
670
- * and if that fails, it tries again with fallback options (primarily
671
- * disabling reranking, which is a common source of initialization failures).
672
- * This provides a more robust way to create search engines in environments
673
- * where reranking models might not be available or might fail to load.
939
+ * This method creates a search engine with the provided options and fails
940
+ * clearly if there are any issues, providing actionable error messages.
674
941
  *
675
942
  * @param indexPath - Path to vector index file
676
943
  * @param dbPath - Path to database file
677
- * @param options - Initial options to try
678
- * @returns Promise resolving to SearchEngine (possibly with fallback options)
679
- * @throws {Error} If both original and fallback creation attempts fail
944
+ * @param options - Configuration options
945
+ * @returns Promise resolving to SearchEngine
946
+ * @throws {Error} If creation fails with clear error message
680
947
  *
681
948
  * @example
682
949
  * ```typescript
683
- * // Try to create with reranking, fallback to without if it fails
684
- * const search = await TextFactoryHelpers.createSearchWithFallback(
950
+ * // Create search engine with clear error handling
951
+ * const search = await TextFactoryHelpers.createSearchWithValidation(
685
952
  * './index.bin',
686
953
  * './db.sqlite',
687
954
  * { enableReranking: true, topK: 20 }
688
955
  * );
689
956
  *
690
- * // The search engine will work even if reranking model fails to load
691
957
  * const results = await search.search('query');
692
958
  * console.log(`Search created successfully with ${results.length} results`);
693
959
  * ```
694
960
  */
695
- static async createSearchWithFallback(indexPath, dbPath, options = {}) {
696
- try {
697
- // Try with original options
698
- return await TextSearchFactory.create(indexPath, dbPath, options);
699
- }
700
- catch (error) {
701
- console.warn(`Initial search creation failed, trying fallback options: ${error instanceof Error ? error.message : 'Unknown error'}`);
702
- // Try with reranking disabled as fallback
703
- const fallbackOptions = {
704
- ...options,
705
- enableReranking: false
706
- };
707
- try {
708
- return await TextSearchFactory.create(indexPath, dbPath, fallbackOptions);
709
- }
710
- catch (fallbackError) {
711
- console.error('Fallback search creation also failed');
712
- throw new Error(`Failed to create search engine with both original and fallback options:\n` +
713
- `Original error: ${error instanceof Error ? error.message : 'Unknown error'}\n` +
714
- `Fallback error: ${fallbackError instanceof Error ? fallbackError.message : 'Unknown error'}`);
715
- }
716
- }
961
+ static async createSearchWithValidation(indexPath, dbPath, options = {}) {
962
+ // Validate files first
963
+ this.validateSearchFiles(indexPath, dbPath);
964
+ // Create with clear error reporting
965
+ return await TextSearchFactory.create(indexPath, dbPath, options);
717
966
  }
718
967
  }
719
968
  //# sourceMappingURL=text-factory.js.map