rag-lite-ts 2.0.3 → 2.0.4

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (45)
  1. package/dist/cli/indexer.js +4 -4
  2. package/dist/cli/search.js +3 -3
  3. package/dist/cli.js +31 -4
  4. package/dist/core/actionable-error-messages.js +3 -3
  5. package/dist/core/content-manager.d.ts +0 -8
  6. package/dist/core/content-manager.js +2 -30
  7. package/dist/core/database-connection-manager.js +10 -0
  8. package/dist/core/db.d.ts +0 -32
  9. package/dist/core/db.js +11 -68
  10. package/dist/core/embedder-factory.d.ts +0 -22
  11. package/dist/core/embedder-factory.js +8 -35
  12. package/dist/core/index.d.ts +3 -3
  13. package/dist/core/index.js +3 -3
  14. package/dist/core/ingestion.d.ts +1 -16
  15. package/dist/core/ingestion.js +1 -30
  16. package/dist/core/interfaces.d.ts +1 -1
  17. package/dist/core/interfaces.js +1 -1
  18. package/dist/core/model-registry.d.ts +0 -4
  19. package/dist/core/model-registry.js +5 -9
  20. package/dist/core/search.d.ts +2 -2
  21. package/dist/core/search.js +2 -2
  22. package/dist/factories/index.d.ts +11 -29
  23. package/dist/factories/index.js +12 -29
  24. package/dist/factories/ingestion-factory.d.ts +200 -0
  25. package/dist/factories/ingestion-factory.js +475 -0
  26. package/dist/{core/polymorphic-search-factory.d.ts → factories/search-factory.d.ts} +7 -7
  27. package/dist/{core/polymorphic-search-factory.js → factories/search-factory.js} +22 -22
  28. package/dist/index-manager.js +25 -14
  29. package/dist/index.d.ts +5 -30
  30. package/dist/index.js +9 -24
  31. package/dist/ingestion.d.ts +2 -4
  32. package/dist/ingestion.js +2 -2
  33. package/dist/mcp-server.js +15 -16
  34. package/dist/search.js +2 -2
  35. package/dist/text/embedder.d.ts +0 -11
  36. package/dist/text/embedder.js +11 -22
  37. package/dist/text/index.d.ts +2 -2
  38. package/dist/text/index.js +2 -2
  39. package/dist/text/reranker.d.ts +0 -10
  40. package/dist/text/reranker.js +10 -33
  41. package/package.json +7 -3
  42. package/dist/factories/polymorphic-factory.d.ts +0 -50
  43. package/dist/factories/polymorphic-factory.js +0 -159
  44. package/dist/factories/text-factory.d.ts +0 -560
  45. package/dist/factories/text-factory.js +0 -982
@@ -1,982 +0,0 @@
1
- /**
2
- * Factory functions for creating text-specific search and ingestion instances
3
- * Handles complex initialization logic while providing clean API for common use cases
4
- *
5
- * FACTORY PATTERN BENEFITS:
6
- * - Abstracts complex initialization (model loading, database setup, index initialization)
7
- * - Provides simple API for common use cases while preserving access to dependency injection
8
- * - Clear validation and error handling without fallback mechanisms
9
- * - Supports different embedding models and configurations
10
- * - Enables clean separation between simple usage and advanced customization
11
- *
12
- * MODE SELECTION GUIDE:
13
- * - Text Mode (default): Optimized for text-only content
14
- * - Uses sentence-transformer models (fast, accurate for text)
15
- * - Images converted to text descriptions
16
- * - Best for: document search, text clustering, semantic similarity
17
- *
18
- * - Multimodal Mode: Optimized for mixed text/image content
19
- * - Uses CLIP models (unified embedding space)
20
- * - True cross-modal search (text finds images, images find text)
21
- * - Best for: image search, visual QA, multimodal retrieval
22
- *
23
- * USAGE PATTERNS:
24
- *
25
- * 1. Simple Search Setup:
26
- * ```typescript
27
- * // Create search engine with defaults
28
- * const search = await TextSearchFactory.create('./index.bin', './db.sqlite');
29
- * const results = await search.search('query');
30
- * ```
31
- *
32
- * 2. Custom Configuration:
33
- * ```typescript
34
- * // Create with custom options
35
- * const search = await TextSearchFactory.create('./index.bin', './db.sqlite', {
36
- * embeddingModel: 'all-MiniLM-L6-v2',
37
- * enableReranking: true,
38
- * topK: 20
39
- * });
40
- * ```
41
- *
42
- * 3. Complete RAG System:
43
- * ```typescript
44
- * // Create both ingestion and search
45
- * const { searchEngine, ingestionPipeline } = await TextRAGFactory.createBoth(
46
- * './index.bin',
47
- * './db.sqlite'
48
- * );
49
- *
50
- * // Ingest documents
51
- * await ingestionPipeline.ingestDirectory('./docs');
52
- *
53
- * // Search documents
54
- * const results = await searchEngine.search('query');
55
- * ```
56
- *
57
- * 4. Clear Error Handling:
58
- * ```typescript
59
- * // Create with clear validation and error reporting
60
- * const search = await TextFactoryHelpers.createSearchWithValidation(
61
- * './index.bin',
62
- * './db.sqlite',
63
- * { enableReranking: true } // Clear errors if issues occur
64
- * );
65
- * ```
66
- *
67
- * 5. Mode Selection:
68
- * ```typescript
69
- * // Text mode (default) - optimized for text-only content
70
- * const textIngestion = await TextIngestionFactory.create('./db.sqlite', './index.bin', {
71
- * mode: 'text',
72
- * embeddingModel: 'sentence-transformers/all-MiniLM-L6-v2'
73
- * });
74
- *
75
- * // Multimodal mode - enables cross-modal search
76
- * const multimodalIngestion = await TextIngestionFactory.create('./db.sqlite', './index.bin', {
77
- * mode: 'multimodal',
78
- * embeddingModel: 'Xenova/clip-vit-base-patch32',
79
- * rerankingStrategy: 'text-derived'
80
- * });
81
- * ```
82
- */
83
- import { SearchEngine } from '../core/search.js';
84
- import { IngestionPipeline } from '../core/ingestion.js';
85
- import { IndexManager } from '../index-manager.js';
86
- import { openDatabase } from '../core/db.js';
87
- import { createTextEmbedFunction } from '../text/embedder.js';
88
- import { createTextRerankFunction } from '../text/reranker.js';
89
- import { config, getModelDefaults } from '../core/config.js';
90
- import { existsSync } from 'fs';
91
- import { dirname } from 'path';
92
- import { mkdirSync } from 'fs';
93
- import { ContentManager } from '../core/content-manager.js';
94
- import { validateModeModelCompatibilityOrThrow } from '../core/mode-model-validator.js';
95
- import { createMissingFileError, createInvalidPathError, createFactoryCreationError, createModeMismatchError } from '../core/actionable-error-messages.js';
96
- /**
97
- * Factory for creating text-based SearchEngine instances
98
- * Handles model loading, database initialization, and index setup
99
- *
100
- * This factory abstracts the complex initialization process required for text search:
101
- * 1. Auto-detects embedding model from database configuration
102
- * 2. Validates mode-model compatibility (no fallback mechanisms)
103
- * 3. Loads embedding models with clear error reporting
104
- * 4. Optionally loads reranking models based on configuration
105
- * 5. Establishes database connections and initializes schema
106
- * 6. Loads vector indexes with proper model compatibility checking
107
- * 7. Creates SearchEngine with proper dependency injection
108
- *
109
- * Mode Support:
110
- * - Automatically detects mode from database (text or multimodal)
111
- * - Each mode uses its optimal implementation without fallbacks
112
- * - Clear validation ensures mode-model compatibility
113
- *
114
- * @example
115
- * ```typescript
116
- * // Basic usage
117
- * const search = await TextSearchFactory.create('./index.bin', './db.sqlite');
118
- * const results = await search.search('What is machine learning?');
119
- *
120
- * // With custom configuration
121
- * const search = await TextSearchFactory.create('./index.bin', './db.sqlite', {
122
- * embeddingModel: 'all-MiniLM-L6-v2',
123
- * enableReranking: true,
124
- * topK: 15
125
- * });
126
- *
127
- * // With defaults (uses config file paths)
128
- * const search = await TextSearchFactory.createWithDefaults({
129
- * enableReranking: false // Faster search
130
- * });
131
- * ```
132
- */
133
- export class TextSearchFactory {
134
- /**
135
- * Create a SearchEngine configured for text search
136
- *
137
- * This method handles the complete initialization process:
138
- * - Validates that required files exist
139
- * - Loads text embedding model (with lazy initialization)
140
- * - Optionally loads reranking model (with clear error reporting)
141
- * - Opens database connection and initializes schema
142
- * - Loads vector index with compatibility validation
143
- * - Creates SearchEngine with dependency injection
144
- * - Validates the complete setup
145
- *
146
- * @param indexPath - Path to the vector index file (must exist)
147
- * @param dbPath - Path to the SQLite database file (must exist)
148
- * @param options - Optional configuration overrides
149
- * @param options.embeddingModel - Override embedding model (default: from config)
150
- * @param options.batchSize - Override embedding batch size (default: from config)
151
- * @param options.rerankingModel - Override reranking model (default: from config)
152
- * @param options.enableReranking - Enable/disable reranking (default: true)
153
- * @param options.topK - Number of results to return (default: from config)
154
- * @returns Promise resolving to configured SearchEngine
155
- * @throws {Error} If required files don't exist or initialization fails
156
- *
157
- * @example
158
- * ```typescript
159
- * // Create search engine for existing index
160
- * const search = await TextSearchFactory.create('./my-index.bin', './my-db.sqlite');
161
- *
162
- * // Search with the created engine
163
- * const results = await search.search('artificial intelligence');
164
- * console.log(`Found ${results.length} results`);
165
- *
166
- * // Clean up when done
167
- * await search.cleanup();
168
- * ```
169
- */
170
- static async create(indexPath, dbPath, options = {}) {
171
- try {
172
- console.log('🏭 TextSearchFactory: Initializing text search engine...');
173
- // Validate input paths
174
- if (!indexPath || !dbPath) {
175
- throw createInvalidPathError([
176
- { name: 'indexPath', value: indexPath },
177
- { name: 'dbPath', value: dbPath }
178
- ], { operationContext: 'TextSearchFactory.create' });
179
- }
180
- // Check if required files exist
181
- if (!existsSync(indexPath)) {
182
- throw createMissingFileError(indexPath, 'index', {
183
- operationContext: 'TextSearchFactory.create'
184
- });
185
- }
186
- if (!existsSync(dbPath)) {
187
- throw createMissingFileError(dbPath, 'database', {
188
- operationContext: 'TextSearchFactory.create'
189
- });
190
- }
191
- // Step 1: Auto-detect embedding model from database
192
- let embeddingModel = options.embeddingModel;
193
- let modelDimensions;
194
- if (!embeddingModel) {
195
- // Auto-detect model from database
196
- const { openDatabase, getStoredModelInfo } = await import('../core/db.js');
197
- const db = await openDatabase(dbPath);
198
- try {
199
- const storedModelInfo = await getStoredModelInfo(db);
200
- if (storedModelInfo) {
201
- embeddingModel = storedModelInfo.modelName;
202
- modelDimensions = storedModelInfo.dimensions;
203
- console.log(`📊 Auto-detected embedding model: ${embeddingModel} (${modelDimensions} dimensions)`);
204
- }
205
- else {
206
- // Fallback to config default
207
- embeddingModel = config.embedding_model;
208
- const modelDefaults = getModelDefaults(embeddingModel);
209
- modelDimensions = modelDefaults.dimensions;
210
- console.log(`📊 Using default embedding model: ${embeddingModel} (${modelDimensions} dimensions)`);
211
- }
212
- }
213
- finally {
214
- await db.close();
215
- }
216
- }
217
- else {
218
- // Use provided model
219
- const modelDefaults = getModelDefaults(embeddingModel);
220
- modelDimensions = modelDefaults.dimensions;
221
- console.log(`📊 Using specified embedding model: ${embeddingModel} (${modelDimensions} dimensions)`);
222
- }
223
- // Step 1.5: Validate mode-model compatibility at creation time
224
- console.log('🔍 Validating mode-model compatibility...');
225
- validateModeModelCompatibilityOrThrow('text', embeddingModel);
226
- console.log('✓ Mode-model compatibility validated');
227
- // Step 2: Initialize embedding function
228
- console.log('📊 Loading text embedding model...');
229
- const embedFn = createTextEmbedFunction(embeddingModel, options.batchSize);
230
- // Embedding function created successfully (will be tested on first use)
231
- console.log('✓ Text embedding function created successfully');
232
- // Step 3: Initialize reranking function (optional)
233
- let rerankFn;
234
- if (options.enableReranking === true) { // Default to disabled for local-first, fast RAG-lite
235
- console.log('🔄 Loading text reranking model...');
236
- rerankFn = createTextRerankFunction(options.rerankingModel);
237
- // Test reranking function - fail clearly if there are issues
238
- await rerankFn('test query', []);
239
- console.log('✓ Text reranking model loaded successfully');
240
- }
241
- else {
242
- console.log('🔄 Reranking disabled by default (local-first, fast mode)');
243
- }
244
- // Step 5: Initialize database connection
245
- console.log('💾 Opening database connection...');
246
- const db = await openDatabase(dbPath);
247
- // Initialize database schema if needed
248
- const { initializeSchema } = await import('../core/db.js');
249
- await initializeSchema(db);
250
- console.log('✓ Database connection established');
251
- // Step 6: Initialize index manager
252
- console.log('📇 Loading vector index...');
253
- const indexManager = new IndexManager(indexPath, dbPath, modelDimensions, embeddingModel);
254
- await indexManager.initialize();
255
- console.log('✓ Vector index loaded successfully');
256
- // Step 7: Create ContentResolver for unified content system
257
- console.log('📁 Initializing content resolver...');
258
- const { ContentResolver } = await import('../core/content-resolver.js');
259
- const contentResolver = new ContentResolver(db);
260
- console.log('✓ Content resolver ready');
261
- // Step 8: Create SearchEngine with dependency injection
262
- const searchEngine = new SearchEngine(embedFn, indexManager, db, rerankFn, contentResolver);
263
- // Step 9: Validate the setup
264
- const stats = await searchEngine.getStats();
265
- console.log(`✓ Search engine ready: ${stats.totalChunks} chunks indexed, reranking ${stats.rerankingEnabled ? 'enabled' : 'disabled'}`);
266
- console.log('🎉 TextSearchFactory: Search engine initialized successfully');
267
- return searchEngine;
268
- }
269
- catch (error) {
270
- console.error('❌ TextSearchFactory: Failed to create search engine');
271
- throw createFactoryCreationError('TextSearchFactory', error instanceof Error ? error.message : 'Unknown error', { operationContext: 'search engine creation' });
272
- }
273
- }
274
- /**
275
- * Create a SearchEngine with automatic path resolution
276
- * Uses default paths from configuration (config.index_file, config.db_file)
277
- *
278
- * This is a convenience method that uses the default file paths from the configuration,
279
- * making it easy to create a search engine without specifying paths explicitly.
280
- *
281
- * @param options - Optional configuration overrides
282
- * @param options.embeddingModel - Override embedding model
283
- * @param options.enableReranking - Enable/disable reranking
284
- * @param options.topK - Number of results to return
285
- * @returns Promise resolving to configured SearchEngine
286
- * @throws {Error} If default files don't exist or initialization fails
287
- *
288
- * @example
289
- * ```typescript
290
- * // Use default paths from config
291
- * const search = await TextSearchFactory.createWithDefaults();
292
- *
293
- * // Use defaults with custom options
294
- * const search = await TextSearchFactory.createWithDefaults({
295
- * enableReranking: false,
296
- * topK: 5
297
- * });
298
- * ```
299
- */
300
- static async createWithDefaults(options = {}) {
301
- const indexPath = config.index_file || './index.bin';
302
- const dbPath = config.db_file || './database.sqlite';
303
- return this.create(indexPath, dbPath, options);
304
- }
305
- }
306
- /**
307
- * Factory for creating text-based IngestionPipeline instances
308
- * Handles model loading, database initialization, and index setup
309
- *
310
- * This factory abstracts the complex initialization process required for text ingestion:
311
- * 1. Creates necessary directories if they don't exist
312
- * 2. Validates mode-model compatibility (no fallback mechanisms)
313
- * 3. Loads and validates embedding models with clear error reporting
314
- * 4. Establishes database connections and initializes schema
315
- * 5. Stores mode configuration in database for automatic detection
316
- * 6. Creates or loads vector indexes with proper configuration
317
- * 7. Creates IngestionPipeline with proper dependency injection
318
- *
319
- * Mode Configuration:
320
- * - Text Mode (default): Uses sentence-transformer models for text-only content
321
- * - Multimodal Mode: Uses CLIP models for mixed text/image content
322
- * - Mode is stored in database and auto-detected during search
323
- * - Clear validation prevents mode-model mismatches
324
- *
325
- * @example
326
- * ```typescript
327
- * // Basic usage
328
- * const ingestion = await TextIngestionFactory.create('./db.sqlite', './index.bin');
329
- * await ingestion.ingestDirectory('./documents');
330
- *
331
- * // With custom configuration
332
- * const ingestion = await TextIngestionFactory.create('./db.sqlite', './index.bin', {
333
- * embeddingModel: 'all-MiniLM-L6-v2',
334
- * chunkSize: 512,
335
- * chunkOverlap: 50,
336
- * forceRebuild: true
337
- * });
338
- *
339
- * // With defaults
340
- * const ingestion = await TextIngestionFactory.createWithDefaults({
341
- * batchSize: 32 // Faster processing
342
- * });
343
- * ```
344
- */
345
- export class TextIngestionFactory {
346
- /**
347
- * Create an IngestionPipeline configured for text ingestion
348
- *
349
- * This method handles the complete initialization process:
350
- * - Creates necessary directories if they don't exist
351
- * - Loads text embedding model (with lazy initialization)
352
- * - Opens database connection and initializes schema
353
- * - Creates or loads vector index (with force rebuild option)
354
- * - Creates IngestionPipeline with dependency injection
355
- * - Validates the complete setup
356
- *
357
- * @param dbPath - Path to the SQLite database file (will be created if doesn't exist)
358
- * @param indexPath - Path to the vector index file (will be created if doesn't exist)
359
- * @param options - Optional configuration overrides
360
- * @param options.embeddingModel - Override embedding model (default: from config)
361
- * @param options.batchSize - Override embedding batch size (default: from config)
362
- * @param options.chunkSize - Override chunk size (default: from config)
363
- * @param options.chunkOverlap - Override chunk overlap (default: from config)
364
- * @param options.forceRebuild - Force rebuild of existing index (default: false)
365
- * @param options.contentSystemConfig - Content system configuration options
366
- * @param options.contentSystemConfig.contentDir - Content directory path (default: '.raglite/content')
367
- * @param options.contentSystemConfig.maxFileSize - Maximum file size in bytes (default: 50MB)
368
- * @param options.contentSystemConfig.maxContentDirSize - Maximum content directory size (default: 2GB)
369
- * @param options.contentSystemConfig.enableDeduplication - Enable content deduplication (default: true)
370
- * @param options.contentSystemConfig.enableStorageTracking - Enable storage tracking (default: true)
371
- * @returns Promise resolving to configured IngestionPipeline
372
- * @throws {Error} If initialization fails
373
- *
374
- * @example
375
- * ```typescript
376
- * // Create ingestion pipeline with default content system
377
- * const ingestion = await TextIngestionFactory.create('./my-db.sqlite', './my-index.bin');
378
- *
379
- * // Create with custom content system configuration
380
- * const ingestion = await TextIngestionFactory.create('./my-db.sqlite', './my-index.bin', {
381
- * contentSystemConfig: {
382
- * contentDir: './custom-content',
383
- * maxFileSize: 100 * 1024 * 1024, // 100MB
384
- * maxContentDirSize: 5 * 1024 * 1024 * 1024, // 5GB
385
- * enableDeduplication: true
386
- * }
387
- * });
388
- *
389
- * // Ingest documents from directory
390
- * const result = await ingestion.ingestDirectory('./documents');
391
- * console.log(`Processed ${result.documentsProcessed} documents`);
392
- *
393
- * // Ingest content from memory (MCP integration)
394
- * const contentId = await ingestion.ingestFromMemory(buffer, {
395
- * displayName: 'uploaded-file.pdf',
396
- * contentType: 'application/pdf'
397
- * });
398
- *
399
- * // Clean up when done
400
- * await ingestion.cleanup();
401
- * ```
402
- */
403
- static async create(dbPath, indexPath, options = {}) {
404
- try {
405
- console.log('🏭 TextIngestionFactory: Initializing text ingestion pipeline...');
406
- // Validate input paths
407
- if (!dbPath || !indexPath) {
408
- throw createInvalidPathError([
409
- { name: 'dbPath', value: dbPath },
410
- { name: 'indexPath', value: indexPath }
411
- ], { operationContext: 'TextIngestionFactory.create' });
412
- }
413
- // Ensure directories exist
414
- const dbDir = dirname(dbPath);
415
- const indexDir = dirname(indexPath);
416
- if (!existsSync(dbDir)) {
417
- console.log(`📁 Creating database directory: ${dbDir}`);
418
- mkdirSync(dbDir, { recursive: true });
419
- }
420
- if (!existsSync(indexDir)) {
421
- console.log(`📁 Creating index directory: ${indexDir}`);
422
- mkdirSync(indexDir, { recursive: true });
423
- }
424
- // Step 1: Determine effective mode and select appropriate default model
425
- const effectiveMode = options.mode || 'text';
426
- // Step 1.5: Select model based on mode if not explicitly provided
427
- let effectiveModel;
428
- if (options.embeddingModel) {
429
- // Use explicitly provided model
430
- effectiveModel = options.embeddingModel;
431
- }
432
- else {
433
- // Select default model based on mode
434
- if (effectiveMode === 'multimodal') {
435
- const { DEFAULT_MODELS } = await import('../core/model-registry.js');
436
- effectiveModel = DEFAULT_MODELS['clip'];
437
- console.log(`📊 No model specified for multimodal mode, using default: ${effectiveModel}`);
438
- }
439
- else {
440
- effectiveModel = config.embedding_model;
441
- }
442
- }
443
- // Step 2: Get model-specific defaults and merge with options
444
- const modelDefaults = getModelDefaults(effectiveModel);
445
- const effectiveBatchSize = options.batchSize ?? modelDefaults.batch_size;
446
- const effectiveChunkSize = options.chunkSize ?? modelDefaults.chunk_size;
447
- const effectiveChunkOverlap = options.chunkOverlap ?? modelDefaults.chunk_overlap;
448
- // Step 3: Validate mode-model compatibility at creation time
449
- console.log('🔍 Validating mode-model compatibility...');
450
- validateModeModelCompatibilityOrThrow(effectiveMode, effectiveModel);
451
- console.log('✓ Mode-model compatibility validated');
452
- // Step 4: Initialize embedding function based on mode
453
- let embedFn;
454
- if (effectiveMode === 'multimodal') {
455
- console.log('📊 Loading CLIP embedding model for multimodal mode...');
456
- const { createEmbedder } = await import('../core/embedder-factory.js');
457
- const clipEmbedder = await createEmbedder(effectiveModel);
458
- // Wrap CLIP embedder to match EmbedFunction signature
459
- embedFn = async (content, contentType) => {
460
- if (contentType === 'image') {
461
- // Use CLIP image embedding for image content
462
- return await clipEmbedder.embedImage(content);
463
- }
464
- // Use CLIP text embedding for text content
465
- return await clipEmbedder.embedText(content);
466
- };
467
- console.log('✓ CLIP embedder created for multimodal mode');
468
- }
469
- else {
470
- // Text mode: use sentence-transformer embedder (existing behavior)
471
- console.log('📊 Loading text embedding model...');
472
- embedFn = createTextEmbedFunction(options.embeddingModel, effectiveBatchSize);
473
- console.log('✓ Text embedding function created successfully');
474
- }
475
- // Step 3: Initialize database connection
476
- console.log('💾 Opening database connection...');
477
- const db = await openDatabase(dbPath);
478
- // Initialize database schema if needed
479
- const { initializeSchema } = await import('../core/db.js');
480
- await initializeSchema(db);
481
- console.log('✓ Database connection established');
482
- // Step 3.1: Handle mode storage during ingestion
483
- await this.handleModeStorage(db, options, modelDefaults, effectiveModel);
484
- // Step 5: Initialize index manager
485
- console.log('📇 Initializing vector index...');
486
- const indexManager = new IndexManager(indexPath, dbPath, modelDefaults.dimensions, effectiveModel);
487
- // Check if we need to force recreation due to model change
488
- let forceRecreate = false;
489
- if (options.forceRebuild && existsSync(indexPath) && existsSync(dbPath)) {
490
- // When forceRebuild is true, always force recreation to handle any model/dimension mismatches
491
- forceRecreate = true;
492
- // Check if model has changed during rebuild for logging purposes
493
- const { getStoredModelInfo } = await import('../core/db.js');
494
- const tempDb = await openDatabase(dbPath);
495
- try {
496
- const storedModel = await getStoredModelInfo(tempDb);
497
- if (storedModel && storedModel.modelName !== effectiveModel) {
498
- console.log(`🔄 Model change detected: ${storedModel.modelName} → ${effectiveModel}`);
499
- console.log(`🔄 Dimensions change: ${storedModel.dimensions} → ${modelDefaults.dimensions}`);
500
- }
501
- else if (storedModel && storedModel.dimensions !== modelDefaults.dimensions) {
502
- console.log(`🔄 Dimension mismatch detected: ${storedModel.dimensions} → ${modelDefaults.dimensions}`);
503
- }
504
- }
505
- finally {
506
- await tempDb.close();
507
- }
508
- }
509
- // Handle force rebuild or create new index
510
- if (options.forceRebuild || !existsSync(indexPath)) {
511
- if (options.forceRebuild && existsSync(indexPath)) {
512
- console.log('🔄 Force rebuild requested, recreating index...');
513
- }
514
- else {
515
- console.log('📇 Creating new vector index...');
516
- }
517
- // Initialize with skipModelCheck and forceRecreate for rebuilds
518
- await indexManager.initialize(options.forceRebuild, forceRecreate);
519
- // Update stored model info when rebuilding or creating new index
520
- if (options.forceRebuild || forceRecreate) {
521
- const { setStoredModelInfo } = await import('../core/db.js');
522
- await setStoredModelInfo(db, effectiveModel, modelDefaults.dimensions);
523
- console.log(`✓ Updated stored model info: ${effectiveModel} (${modelDefaults.dimensions} dimensions)`);
524
- }
525
- }
526
- else {
527
- // Load existing index
528
- await indexManager.initialize();
529
- }
530
- console.log('✓ Vector index ready');
531
- // Step 5: Create ContentManager for unified content system
532
- console.log('📁 Initializing content management system...');
533
- const contentSystemConfig = await this.validateAndPrepareContentSystemConfig(options.contentSystemConfig);
534
- const contentManager = new ContentManager(db, contentSystemConfig);
535
- console.log('✓ Content management system ready');
536
- // Step 6: Create IngestionPipeline with dependency injection and chunk configuration
537
- const chunkConfig = {
538
- chunkSize: effectiveChunkSize,
539
- chunkOverlap: effectiveChunkOverlap
540
- };
541
- const ingestionPipeline = new IngestionPipeline(embedFn, indexManager, db, chunkConfig, contentManager);
542
- console.log('🎉 TextIngestionFactory: Ingestion pipeline initialized successfully');
543
- return ingestionPipeline;
544
- }
545
- catch (error) {
546
- console.error('❌ TextIngestionFactory: Failed to create ingestion pipeline');
547
- // Preserve custom error messages for model mismatch and mode mismatch
548
- if (error instanceof Error && (error.message.includes('Model mismatch') ||
549
- error.message.includes('Mode mismatch') ||
550
- error.message.includes('--force-rebuild') ||
551
- error.message.includes('--rebuild-if-needed'))) {
552
- throw error; // Re-throw custom validation errors as-is
553
- }
554
- throw createFactoryCreationError('TextIngestionFactory', error instanceof Error ? error.message : 'Unknown error', { operationContext: 'ingestion pipeline creation' });
555
- }
556
- }
557
- /**
558
- * Create an IngestionPipeline with automatic path resolution
559
- * Uses default paths based on current working directory
560
- * @param options - Optional configuration overrides
561
- * @returns Promise resolving to configured IngestionPipeline
562
- */
563
- static async createWithDefaults(options = {}) {
564
- const dbPath = config.db_file || './database.sqlite';
565
- const indexPath = config.index_file || './index.bin';
566
- return this.create(dbPath, indexPath, options);
567
- }
568
- /**
569
- * Handles mode storage during ingestion
570
- * Creates or validates system info based on the provided mode and options
571
- * @private
572
- */
573
- static async handleModeStorage(db, options, modelDefaults, effectiveModel) {
574
- const { getSystemInfo, setSystemInfo } = await import('../core/db.js');
575
- // Determine the effective mode and reranking strategy
576
- const effectiveMode = options.mode || 'text';
577
- const effectiveRerankingStrategy = options.rerankingStrategy || 'cross-encoder';
578
- // Determine model type based on model name
579
- let modelType;
580
- if (effectiveModel.includes('clip')) {
581
- modelType = 'clip';
582
- }
583
- else {
584
- modelType = 'sentence-transformer';
585
- }
586
- // Determine supported content types based on mode
587
- const supportedContentTypes = effectiveMode === 'multimodal' ? ['text', 'image'] : ['text'];
588
- try {
589
- // Check if system info already exists
590
- const existingSystemInfo = await getSystemInfo(db);
591
- if (existingSystemInfo) {
592
- // Validate mode consistency for subsequent ingestions
593
- if (existingSystemInfo.mode !== effectiveMode) {
594
- console.warn(`⚠️ Mode mismatch detected!`);
595
- console.warn(` Database mode: ${existingSystemInfo.mode}`);
596
- console.warn(` Requested mode: ${effectiveMode}`);
597
- if (options.forceRebuild) {
598
- console.log('🔄 Force rebuild enabled, updating mode configuration...');
599
- await this.updateSystemInfo(db, effectiveMode, effectiveModel, modelType, modelDefaults, effectiveRerankingStrategy, supportedContentTypes);
600
- }
601
- else {
602
- throw createModeMismatchError(existingSystemInfo.mode, effectiveMode, { operationContext: 'TextIngestionFactory.create' });
603
- }
604
- }
605
- else if (existingSystemInfo.modelName !== effectiveModel) {
606
- // Model change within the same mode
607
- console.log(`🔄 Model change detected: ${existingSystemInfo.modelName} → ${effectiveModel}`);
608
- if (options.forceRebuild) {
609
- console.log('🔄 Force rebuild enabled, updating model configuration...');
610
- await this.updateSystemInfo(db, effectiveMode, effectiveModel, modelType, modelDefaults, effectiveRerankingStrategy, supportedContentTypes);
611
- }
612
- else {
613
- // Create a specific error message for model mismatch with rebuild suggestions
614
- const errorMessage = [
615
- `❌ Model mismatch: Database is configured for '${existingSystemInfo.modelName}', but '${effectiveModel}' was requested.`,
616
- '',
617
- '🛠️ How to fix this:',
618
- ' 1. Use --force-rebuild to change models:',
619
- ' raglite ingest <path> --model ' + effectiveModel + ' --force-rebuild',
620
- '',
621
- ' 2. Or use --rebuild-if-needed for automatic handling:',
622
- ' raglite ingest <path> --model ' + effectiveModel + ' --rebuild-if-needed',
623
- '',
624
- ' 3. Or continue using the existing model:',
625
- ' raglite ingest <path> # Uses ' + existingSystemInfo.modelName,
626
- '',
627
- '🔍 Model switching requires rebuilding the vector index because different models',
628
- ' produce embeddings with different dimensions and characteristics.'
629
- ].join('\n');
630
- throw new Error(errorMessage);
631
- }
632
- }
633
- else {
634
- console.log(`✅ Mode consistency validated: ${effectiveMode} mode with ${effectiveModel}`);
635
- }
636
- }
637
- else {
638
- // First ingestion - create system info
639
- console.log(`🔧 First ingestion detected, storing system configuration...`);
640
- console.log(` Mode: ${effectiveMode}`);
641
- console.log(` Model: ${effectiveModel} (${modelType})`);
642
- console.log(` Dimensions: ${modelDefaults.dimensions}`);
643
- console.log(` Reranking: ${effectiveRerankingStrategy}`);
644
- console.log(` Content types: ${supportedContentTypes.join(', ')}`);
645
- await this.updateSystemInfo(db, effectiveMode, effectiveModel, modelType, modelDefaults, effectiveRerankingStrategy, supportedContentTypes);
646
- console.log('✅ System configuration stored successfully');
647
- }
648
- }
649
- catch (error) {
650
- if (error instanceof Error && (error.message.includes('Mode mismatch') || error.message.includes('Model mismatch'))) {
651
- throw error; // Re-throw validation errors with custom messages
652
- }
653
- console.error('❌ Failed to handle mode storage:', error);
654
- throw new Error(`Mode storage failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
655
- }
656
- }
657
- /**
658
- * Updates system info in the database
659
- * @private
660
- */
661
- static async updateSystemInfo(db, mode, modelName, modelType, modelDefaults, rerankingStrategy, supportedContentTypes) {
662
- const { setSystemInfo } = await import('../core/db.js');
663
- await setSystemInfo(db, {
664
- mode,
665
- modelName,
666
- modelType,
667
- modelDimensions: modelDefaults.dimensions,
668
- modelVersion: '1.0.0', // TODO: Get actual version from model
669
- supportedContentTypes,
670
- rerankingStrategy: rerankingStrategy,
671
- rerankingModel: undefined,
672
- rerankingConfig: undefined
673
- });
674
- }
675
- /**
676
- * Validates and prepares content system configuration
677
- * @private
678
- */
679
- static async validateAndPrepareContentSystemConfig(userConfig) {
680
- // Default configuration
681
- const defaultConfig = {
682
- contentDir: '.raglite/content',
683
- maxFileSize: 50 * 1024 * 1024, // 50MB
684
- maxContentDirSize: 2 * 1024 * 1024 * 1024, // 2GB
685
- enableDeduplication: true,
686
- enableStorageTracking: true
687
- };
688
- // Merge with user configuration
689
- const config = { ...defaultConfig, ...userConfig };
690
- // Validate content directory path
691
- if (!config.contentDir || typeof config.contentDir !== 'string') {
692
- throw new Error('Content directory path must be a non-empty string');
693
- }
694
- // Validate file size limits
695
- if (config.maxFileSize && (typeof config.maxFileSize !== 'number' || config.maxFileSize <= 0)) {
696
- throw new Error('Maximum file size must be a positive number');
697
- }
698
- if (config.maxContentDirSize && (typeof config.maxContentDirSize !== 'number' || config.maxContentDirSize <= 0)) {
699
- throw new Error('Maximum content directory size must be a positive number');
700
- }
701
- // Validate that maxFileSize is not larger than maxContentDirSize
702
- if (config.maxFileSize && config.maxContentDirSize && config.maxFileSize > config.maxContentDirSize) {
703
- throw new Error('Maximum file size cannot be larger than maximum content directory size');
704
- }
705
- // Validate boolean options
706
- if (config.enableDeduplication !== undefined && typeof config.enableDeduplication !== 'boolean') {
707
- throw new Error('enableDeduplication must be a boolean value');
708
- }
709
- if (config.enableStorageTracking !== undefined && typeof config.enableStorageTracking !== 'boolean') {
710
- throw new Error('enableStorageTracking must be a boolean value');
711
- }
712
- // Create content directory if it doesn't exist
713
- try {
714
- const { promises: fs } = await import('fs');
715
- await fs.mkdir(config.contentDir, { recursive: true });
716
- // Verify directory is writable
717
- await fs.access(config.contentDir, (await import('fs')).constants.W_OK);
718
- console.log(`✓ Content directory validated: ${config.contentDir}`);
719
- }
720
- catch (error) {
721
- throw new Error(`Failed to create or access content directory '${config.contentDir}': ${error instanceof Error ? error.message : 'Unknown error'}. Please check permissions and path validity.`);
722
- }
723
- return config;
724
- }
725
- }
726
- /**
727
- * Convenience factory to create both search and ingestion instances
728
- * Useful for applications that need both capabilities with shared configuration
729
- *
730
- * This factory creates a complete RAG (Retrieval-Augmented Generation) system
731
- * by initializing both ingestion and search capabilities with shared resources.
732
- * The ingestion pipeline is created first to handle directory creation and
733
- * initial setup, then the search engine is created to use the same resources.
734
- *
735
- * @example
736
- * ```typescript
737
- * // Create complete RAG system
738
- * const { searchEngine, ingestionPipeline } = await TextRAGFactory.createBoth(
739
- * './index.bin',
740
- * './db.sqlite'
741
- * );
742
- *
743
- * // First, ingest some documents
744
- * await ingestionPipeline.ingestDirectory('./knowledge-base');
745
- *
746
- * // Then search the ingested content
747
- * const results = await searchEngine.search('What is the main topic?');
748
- *
749
- * // Clean up both instances
750
- * await Promise.all([
751
- * searchEngine.cleanup(),
752
- * ingestionPipeline.cleanup()
753
- * ]);
754
- * ```
755
- */
756
- export class TextRAGFactory {
757
- /**
758
- * Create both SearchEngine and IngestionPipeline instances
759
- *
760
- * This method creates a complete RAG system by:
761
- * 1. Creating an ingestion pipeline (handles directory creation)
762
- * 2. Creating a search engine (uses the same database and index)
763
- * 3. Ensuring both instances use compatible configurations
764
- *
765
- * The ingestion pipeline is created first because it handles directory
766
- * creation and initial setup, while the search engine requires existing
767
- * files to validate the setup.
768
- *
769
- * @param indexPath - Path to the vector index file
770
- * @param dbPath - Path to the SQLite database file
771
- * @param searchOptions - Optional search configuration
772
- * @param searchOptions.enableReranking - Enable reranking for better results
773
- * @param searchOptions.topK - Number of search results to return
774
- * @param ingestionOptions - Optional ingestion configuration
775
- * @param ingestionOptions.chunkSize - Size of text chunks for processing
776
- * @param ingestionOptions.forceRebuild - Force rebuild of existing index
777
- * @returns Promise resolving to both configured instances
778
- * @throws {Error} If initialization of either component fails
779
- *
780
- * @example
781
- * ```typescript
782
- * // Create with custom options for both components
783
- * const { searchEngine, ingestionPipeline } = await TextRAGFactory.createBoth(
784
- * './index.bin',
785
- * './db.sqlite',
786
- * { enableReranking: true, topK: 15 }, // Search options
787
- * { chunkSize: 512, forceRebuild: true } // Ingestion options
788
- * );
789
- *
790
- * // Use the complete system
791
- * await ingestionPipeline.ingestDirectory('./docs');
792
- * const results = await searchEngine.search('machine learning');
793
- * ```
794
- */
795
- static async createBoth(indexPath, dbPath, searchOptions = {}, ingestionOptions = {}) {
796
- console.log('🏭 TextRAGFactory: Creating complete RAG system...');
797
- // Create ingestion pipeline first (handles directory creation)
798
- const ingestionPipeline = await TextIngestionFactory.create(dbPath, indexPath, ingestionOptions);
799
- // Create search engine (requires existing files)
800
- const searchEngine = await TextSearchFactory.create(indexPath, dbPath, searchOptions);
801
- console.log('🎉 TextRAGFactory: Complete RAG system ready');
802
- return { searchEngine, ingestionPipeline };
803
- }
804
- /**
805
- * Create both instances with default paths
806
- * @param searchOptions - Optional search configuration
807
- * @param ingestionOptions - Optional ingestion configuration
808
- * @returns Promise resolving to both instances
809
- */
810
- static async createBothWithDefaults(searchOptions = {}, ingestionOptions = {}) {
811
- const indexPath = config.index_file || './index.bin';
812
- const dbPath = config.db_file || './database.sqlite';
813
- return this.createBoth(indexPath, dbPath, searchOptions, ingestionOptions);
814
- }
815
- }
816
- /**
817
- * Helper functions for common factory patterns and error recovery
818
- *
819
- * This class provides utility functions that support the main factory classes
820
- * with validation, configuration recommendations, and error recovery patterns.
821
- * These helpers enable more robust factory usage and better error handling.
822
- *
823
- * @example
824
- * ```typescript
825
- * // Validate files before creating search engine
826
- * try {
827
- * TextFactoryHelpers.validateSearchFiles('./index.bin', './db.sqlite');
828
- * const search = await TextSearchFactory.create('./index.bin', './db.sqlite');
829
- * } catch (error) {
830
- * console.error('Files not ready for search:', error.message);
831
- * }
832
- *
833
- * // Get recommended configuration for different use cases
834
- * const { searchOptions, ingestionOptions } = TextFactoryHelpers.getRecommendedConfig('quality');
835
- * const search = await TextSearchFactory.create('./index.bin', './db.sqlite', searchOptions);
836
- *
837
- * // Create with clear validation and error reporting
838
- * const search = await TextFactoryHelpers.createSearchWithValidation('./index.bin', './db.sqlite', {
839
- * enableReranking: true // Will fail clearly if reranking has issues
840
- * });
841
- * ```
842
- */
843
- export class TextFactoryHelpers {
844
- /**
845
- * Validate that required files exist for search operations
846
- *
847
- * This method checks that both the vector index and database files exist
848
- * and provides helpful error messages with suggestions for resolution.
849
- * Use this before attempting to create a search engine to get better
850
- * error messages than the generic file not found errors.
851
- *
852
- * @param indexPath - Path to vector index file
853
- * @param dbPath - Path to database file
854
- * @throws {Error} If either file doesn't exist, with helpful resolution steps
855
- *
856
- * @example
857
- * ```typescript
858
- * // Validate before creating search engine
859
- * try {
860
- * TextFactoryHelpers.validateSearchFiles('./index.bin', './db.sqlite');
861
- * console.log('Files are ready for search');
862
- * } catch (error) {
863
- * console.error('Search files not ready:', error.message);
864
- * // Error message includes suggestions like "Run ingestion first"
865
- * }
866
- * ```
867
- */
868
- static validateSearchFiles(indexPath, dbPath) {
869
- if (!existsSync(indexPath)) {
870
- throw createMissingFileError(indexPath, 'index', {
871
- operationContext: 'search file validation'
872
- });
873
- }
874
- if (!existsSync(dbPath)) {
875
- throw createMissingFileError(dbPath, 'database', {
876
- operationContext: 'search file validation'
877
- });
878
- }
879
- }
880
- /**
881
- * Get recommended configuration for different use cases
882
- *
883
- * This method provides pre-configured options optimized for different
884
- * performance vs quality trade-offs. Use these as starting points
885
- * and adjust based on your specific requirements.
886
- *
887
- * @param useCase - The intended use case scenario
888
- * @param useCase.fast - Optimized for speed (no reranking, smaller chunks)
889
- * @param useCase.balanced - Good balance of speed and quality (default)
890
- * @param useCase.quality - Optimized for best results (reranking enabled, larger chunks)
891
- * @returns Recommended configuration for both search and ingestion
892
- *
893
- * @example
894
- * ```typescript
895
- * // Get configuration for quality-focused use case
896
- * const { searchOptions, ingestionOptions } = TextFactoryHelpers.getRecommendedConfig('quality');
897
- *
898
- * // Create instances with recommended settings
899
- * const ingestion = await TextIngestionFactory.create('./db.sqlite', './index.bin', ingestionOptions);
900
- * const search = await TextSearchFactory.create('./index.bin', './db.sqlite', searchOptions);
901
- *
902
- * // Or use with RAG factory
903
- * const { searchEngine, ingestionPipeline } = await TextRAGFactory.createBoth(
904
- * './index.bin',
905
- * './db.sqlite',
906
- * searchOptions,
907
- * ingestionOptions
908
- * );
909
- * ```
910
- */
911
- static getRecommendedConfig(useCase) {
912
- switch (useCase) {
913
- case 'fast':
914
- return {
915
- searchOptions: {
916
- enableReranking: false,
917
- topK: 5
918
- },
919
- ingestionOptions: {
920
- batchSize: 32,
921
- chunkSize: 512
922
- }
923
- };
924
- case 'balanced':
925
- return {
926
- searchOptions: {
927
- enableReranking: true,
928
- topK: 10
929
- },
930
- ingestionOptions: {
931
- batchSize: 16,
932
- chunkSize: 1024
933
- }
934
- };
935
- case 'quality':
936
- return {
937
- searchOptions: {
938
- enableReranking: true,
939
- topK: 20
940
- },
941
- ingestionOptions: {
942
- batchSize: 8,
943
- chunkSize: 2048
944
- }
945
- };
946
- default:
947
- return this.getRecommendedConfig('balanced');
948
- }
949
- }
950
- /**
951
- * Create a search engine with clear error reporting
952
- *
953
- * This method creates a search engine with the provided options and fails
954
- * clearly if there are any issues, providing actionable error messages.
955
- *
956
- * @param indexPath - Path to vector index file
957
- * @param dbPath - Path to database file
958
- * @param options - Configuration options
959
- * @returns Promise resolving to SearchEngine
960
- * @throws {Error} If creation fails with clear error message
961
- *
962
- * @example
963
- * ```typescript
964
- * // Create search engine with clear error handling
965
- * const search = await TextFactoryHelpers.createSearchWithValidation(
966
- * './index.bin',
967
- * './db.sqlite',
968
- * { enableReranking: true, topK: 20 }
969
- * );
970
- *
971
- * const results = await search.search('query');
972
- * console.log(`Search created successfully with ${results.length} results`);
973
- * ```
974
- */
975
- static async createSearchWithValidation(indexPath, dbPath, options = {}) {
976
- // Validate files first
977
- this.validateSearchFiles(indexPath, dbPath);
978
- // Create with clear error reporting
979
- return await TextSearchFactory.create(indexPath, dbPath, options);
980
- }
981
- }
982
- //# sourceMappingURL=text-factory.js.map