rag-lite-ts 2.0.5 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -106,18 +106,41 @@ export class SearchEngine {
106
106
  return [];
107
107
  }
108
108
  const startTime = performance.now();
109
- const topK = options.top_k || config.top_k || 10;
110
- const shouldRerank = options.rerank !== undefined ? options.rerank : (this.rerankFn !== undefined);
111
109
  try {
112
110
  // Step 1: Build query embedding using injected embed function
113
111
  const embeddingStartTime = performance.now();
114
112
  const queryEmbedding = await this.embedFn(query);
115
113
  const embeddingTime = performance.now() - embeddingStartTime;
116
- // Step 2: Search using IndexManager (which handles hash mapping properly)
114
+ // Step 2: Search with the vector
115
+ const results = await this.searchWithVector(queryEmbedding.vector, options, query, embeddingTime);
116
+ return results;
117
+ }
118
+ catch (error) {
119
+ throw new Error(`Search failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
120
+ }
121
+ }
122
+ /**
123
+ * Perform semantic search using a pre-computed embedding vector
124
+ * Useful for image-based search or when embedding is computed externally
125
+ * @param queryVector - Pre-computed query embedding vector
126
+ * @param options - Search options including top_k and rerank settings
127
+ * @param originalQuery - Optional original query for reranking (text or image path)
128
+ * @param embeddingTime - Optional embedding time for logging
129
+ * @returns Promise resolving to array of search results
130
+ */
131
+ async searchWithVector(queryVector, options = {}, originalQuery, embeddingTime) {
132
+ const startTime = performance.now();
133
+ const topK = options.top_k || config.top_k || 10;
134
+ // Phase 1: Disable reranking by default for better performance
135
+ // Users must explicitly opt-in with --rerank flag
136
+ const shouldRerank = options.rerank === true;
137
+ try {
138
+ // Step 1: Search using IndexManager (which handles hash mapping properly)
117
139
  const searchStartTime = performance.now();
118
140
  let searchResult;
119
141
  try {
120
- searchResult = this.indexManager.search(queryEmbedding.vector, topK);
142
+ const contentType = options.contentType;
143
+ searchResult = this.indexManager.search(queryVector, topK, contentType);
121
144
  }
122
145
  catch (error) {
123
146
  if (error instanceof Error && error.message.includes('No embedding ID found for hash')) {
@@ -133,18 +156,18 @@ export class SearchEngine {
133
156
  console.log(`No similar documents found (${totalTime.toFixed(2)}ms total)`);
134
157
  return [];
135
158
  }
136
- // Step 3: Retrieve chunks from database using embedding IDs
159
+ // Step 2: Retrieve chunks from database using embedding IDs
137
160
  const retrievalStartTime = performance.now();
138
161
  const chunks = await getChunksByEmbeddingIds(this.db, searchResult.embeddingIds);
139
162
  const retrievalTime = performance.now() - retrievalStartTime;
140
- // Step 4: Format results as JSON with text, score, and document metadata
163
+ // Step 3: Format results as JSON with text, score, and document metadata
141
164
  let results = this.formatSearchResults(chunks, searchResult.distances, searchResult.embeddingIds);
142
- // Step 5: Optional reranking with injected rerank function
165
+ // Step 4: Optional reranking with injected rerank function
143
166
  let rerankTime = 0;
144
- if (shouldRerank && this.rerankFn && results.length > 1) {
167
+ if (shouldRerank && this.rerankFn && results.length > 1 && originalQuery) {
145
168
  try {
146
169
  const rerankStartTime = performance.now();
147
- results = await this.rerankFn(query, results);
170
+ results = await this.rerankFn(originalQuery, results);
148
171
  rerankTime = performance.now() - rerankStartTime;
149
172
  }
150
173
  catch (error) {
@@ -154,13 +177,14 @@ export class SearchEngine {
154
177
  }
155
178
  const totalTime = performance.now() - startTime;
156
179
  // Measure latency without premature optimization - just log for monitoring
180
+ const embedTimeStr = embeddingTime !== undefined ? `embed: ${embeddingTime.toFixed(2)}ms, ` : '';
157
181
  console.log(`Search completed: ${results.length} results in ${totalTime.toFixed(2)}ms ` +
158
- `(embed: ${embeddingTime.toFixed(2)}ms, vector: ${vectorSearchTime.toFixed(2)}ms, ` +
182
+ `(${embedTimeStr}vector: ${vectorSearchTime.toFixed(2)}ms, ` +
159
183
  `retrieval: ${retrievalTime.toFixed(2)}ms${rerankTime > 0 ? `, rerank: ${rerankTime.toFixed(2)}ms` : ''})`);
160
184
  return results;
161
185
  }
162
186
  catch (error) {
163
- throw new Error(`Search failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
187
+ throw new Error(`Vector search failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
164
188
  }
165
189
  }
166
190
  /**
@@ -49,7 +49,7 @@ export interface RerankingInterface {
49
49
  export interface SearchOptions {
50
50
  top_k?: number;
51
51
  rerank?: boolean;
52
- contentType?: string;
52
+ contentType?: 'text' | 'image' | 'combined';
53
53
  }
54
54
  export interface Chunk {
55
55
  text: string;
@@ -64,5 +64,9 @@ export declare class VectorIndex {
64
64
  * Resize index to accommodate more vectors
65
65
  */
66
66
  resizeIndex(newMaxElements: number): void;
67
+ /**
68
+ * Get index options (for external access to configuration). The implementation returns a shallow copy of the options object, so reassigning properties on the result does not change the index's own options.
69
+ */
70
+ getOptions(): VectorIndexOptions;
67
71
  }
68
72
  //# sourceMappingURL=vector-index.d.ts.map
@@ -321,5 +321,11 @@ export class VectorIndex {
321
321
  throw new Error(`Failed to resize index: ${error}`);
322
322
  }
323
323
  }
324
+ /**
325
+ * Get index options (for external access to configuration). Returns a shallow copy of this.options, so reassigning properties on the result does not change the index's own options.
326
+ */
327
+ getOptions() {
328
+ return { ...this.options };
329
+ }
324
330
  }
325
331
  //# sourceMappingURL=vector-index.js.map
@@ -323,7 +323,9 @@ export class IngestionFactory {
323
323
  const { getSystemInfo, setSystemInfo } = await import('../core/db.js');
324
324
  // Determine the effective mode and reranking strategy
325
325
  const effectiveMode = options.mode || 'text';
326
- const effectiveRerankingStrategy = options.rerankingStrategy || 'cross-encoder';
326
+ // Phase 1: Fix mode-specific reranking strategy defaults
327
+ const effectiveRerankingStrategy = options.rerankingStrategy ||
328
+ (effectiveMode === 'multimodal' ? 'text-derived' : 'cross-encoder');
327
329
  // Determine model type based on model name
328
330
  let modelType;
329
331
  if (effectiveModel.includes('clip')) {
@@ -8,6 +8,8 @@ export interface FileProcessorOptions {
8
8
  recursive?: boolean;
9
9
  /** Maximum file size in bytes (default: 10MB) */
10
10
  maxFileSize?: number;
11
+ /** Processing mode to filter compatible files */
12
+ mode?: 'text' | 'multimodal';
11
13
  }
12
14
  /**
13
15
  * Default options for file processing
@@ -188,6 +188,15 @@ async function discoverFilesRecursive(dirPath, options) {
188
188
  // Check file size based on content type
189
189
  const stats = await fs.stat(fullPath);
190
190
  const contentType = getContentType(fullPath);
191
+ // Filter by mode: skip incompatible content types
192
+ const mode = options.mode || 'text';
193
+ if (mode === 'text' && contentType === 'image') {
194
+ result.skipped.push({
195
+ path: fullPath,
196
+ reason: `Image files not supported in text mode. Use --mode multimodal for image processing.`
197
+ });
198
+ continue;
199
+ }
191
200
  // Different size limits for different content types
192
201
  const maxSize = contentType === 'image'
193
202
  ? 50 * 1024 * 1024 // 50MB for images
@@ -250,6 +259,17 @@ export async function discoverFiles(path, options = DEFAULT_FILE_PROCESSOR_OPTIO
250
259
  };
251
260
  }
252
261
  const contentType = getContentType(resolvedPath);
262
+ // Filter by mode: skip incompatible content types
263
+ const mode = options.mode || 'text';
264
+ if (mode === 'text' && contentType === 'image') {
265
+ return {
266
+ files: [],
267
+ skipped: [{
268
+ path: resolvedPath,
269
+ reason: `Image files not supported in text mode. Use --mode multimodal for image processing.`
270
+ }]
271
+ };
272
+ }
253
273
  // Check file size based on content type
254
274
  const maxSize = contentType === 'image'
255
275
  ? 50 * 1024 * 1024 // 50MB for images
@@ -7,12 +7,16 @@ export interface IndexStats {
7
7
  export declare class IndexManager {
8
8
  private modelName?;
9
9
  private vectorIndex;
10
+ private textIndex?;
11
+ private imageIndex?;
10
12
  private db;
11
13
  private indexPath;
12
14
  private dbPath;
13
15
  private isInitialized;
14
16
  private hashToEmbeddingId;
15
17
  private embeddingIdToHash;
18
+ private groupedEmbeddings?;
19
+ private vectorIndexOptions;
16
20
  constructor(indexPath: string, dbPath: string, dimensions: number, modelName?: string | undefined);
17
21
  /**
18
22
  * Initialize the index manager and load existing index if available
@@ -30,6 +34,10 @@ export declare class IndexManager {
30
34
  * Requirements: 5.3 - When new documents are added THEN system SHALL append new chunks and vectors without rebuilding existing index
31
35
  */
32
36
  addVectors(embeddings: EmbeddingResult[]): Promise<void>;
37
+ /**
38
+ * Add embeddings that are already grouped by content type (for new grouped format). The implementation records the text/image grouping for the next saveIndex() call and adds every embedding to the index through addVectors().
39
+ */
40
+ addGroupedEmbeddings(textEmbeddings: EmbeddingResult[], imageEmbeddings: EmbeddingResult[]): Promise<void>;
33
41
  /**
34
42
  * Rebuild the entire index from scratch
35
43
  * Requirements: 5.2, 5.4 - Create full index rebuild functionality for model changes or document deletions
@@ -68,10 +76,18 @@ export declare class IndexManager {
68
76
  * Save the vector index to disk
69
77
  */
70
78
  saveIndex(): Promise<void>;
79
+ /**
80
+ * Create specialized text-only and image-only indexes when the saved index holds grouped data with both text and image vectors
81
+ */
82
+ private createSpecializedIndexes;
83
+ /**
84
+ * Save index with content type grouping (for new grouped format): writes the given text and image embeddings to the index path in grouped form
85
+ */
86
+ saveGroupedIndex(textEmbeddings: EmbeddingResult[], imageEmbeddings: EmbeddingResult[]): Promise<void>;
71
87
  /**
72
88
  * Search for similar vectors
73
89
  */
74
- search(queryVector: Float32Array, k?: number): {
90
+ search(queryVector: Float32Array, k?: number, contentType?: 'text' | 'image' | 'combined'): {
75
91
  embeddingIds: string[];
76
92
  distances: number[];
77
93
  };
@@ -1,26 +1,33 @@
1
1
  import { VectorIndex } from './core/vector-index.js';
2
+ import { BinaryIndexFormat } from './core/binary-index-format.js';
2
3
  import { openDatabase, getSystemInfo, setSystemInfo } from './core/db.js';
3
4
  import { config, getModelDefaults } from './core/config.js';
4
5
  export class IndexManager {
5
6
  modelName;
6
7
  vectorIndex;
8
+ textIndex;
9
+ imageIndex;
7
10
  db = null;
8
11
  indexPath;
9
12
  dbPath;
10
13
  isInitialized = false;
11
14
  hashToEmbeddingId = new Map();
12
15
  embeddingIdToHash = new Map();
16
+ groupedEmbeddings;
17
+ vectorIndexOptions;
13
18
  constructor(indexPath, dbPath, dimensions, modelName) {
14
19
  this.modelName = modelName;
15
20
  this.indexPath = indexPath;
16
21
  this.dbPath = dbPath;
17
- // Initialize with provided dimensions from config
18
- this.vectorIndex = new VectorIndex(indexPath, {
22
+ // Store options for creating specialized indexes
23
+ this.vectorIndexOptions = {
19
24
  dimensions: dimensions,
20
25
  maxElements: 100000, // Start with 100k capacity
21
26
  efConstruction: 200,
22
27
  M: 16
23
- });
28
+ };
29
+ // Initialize with provided dimensions from config
30
+ this.vectorIndex = new VectorIndex(indexPath, this.vectorIndexOptions);
24
31
  }
25
32
  /**
26
33
  * Initialize the index manager and load existing index if available
@@ -47,6 +54,8 @@ export class IndexManager {
47
54
  // Only try to load existing index if not forcing recreation
48
55
  console.log('Loading existing vector index...');
49
56
  await this.vectorIndex.loadIndex();
57
+ // Check if the loaded index has grouped data and create specialized indexes
58
+ await this.createSpecializedIndexes();
50
59
  }
51
60
  // Always populate the embedding ID mapping from existing database entries
52
61
  // This is needed both for new and existing indexes
@@ -55,7 +64,8 @@ export class IndexManager {
55
64
  this.hashEmbeddingId(chunk.embedding_id); // This will populate the mapping
56
65
  }
57
66
  this.isInitialized = true;
58
- console.log(`Index manager initialized with ${this.vectorIndex.getCurrentCount()} vectors`);
67
+ const vectorCount = this.vectorIndex.getCurrentCount();
68
+ console.log(`Index manager initialized with ${vectorCount} vectors${this.textIndex && this.imageIndex ? ' (multi-graph mode)' : ''}`);
59
69
  }
60
70
  catch (error) {
61
71
  throw new Error(`Failed to initialize index manager: ${error}`);
@@ -153,6 +163,31 @@ export class IndexManager {
153
163
  throw new Error(`Failed to add vectors to index: ${error instanceof Error ? error.message : 'Unknown error'}`);
154
164
  }
155
165
  }
166
+ /**
167
+ * Add embeddings that are already grouped by content type (for new grouped format): records the text/image grouping for the next saveIndex() call, then adds every embedding to the index through addVectors(). Throws if the manager is not initialized; returns without doing anything when both lists are empty.
168
+ */
169
+ async addGroupedEmbeddings(textEmbeddings, imageEmbeddings) {
170
+ if (!this.isInitialized) {
171
+ throw new Error('Index manager not initialized');
172
+ }
173
+ console.log(`addGroupedEmbeddings: text=${textEmbeddings.length}, image=${imageEmbeddings.length}`);
174
+ const allEmbeddings = [...textEmbeddings, ...imageEmbeddings];
175
+ if (allEmbeddings.length === 0) {
176
+ return;
177
+ }
178
+ try {
179
+ // Store grouped information for later saving. NOTE(review): this assignment replaces any grouping stored by an earlier call that has not been saved yet - confirm callers save between calls
180
+ this.groupedEmbeddings = { text: textEmbeddings, image: imageEmbeddings };
181
+ console.log('addGroupedEmbeddings: stored grouped embeddings');
182
+ // Add all embeddings to the index through addVectors (maintains current behavior)
183
+ await this.addVectors(allEmbeddings);
184
+ console.log('addGroupedEmbeddings: addVectors completed');
185
+ // saveIndex() checks this.groupedEmbeddings and, when it is set, saves in grouped format and then clears it
186
+ }
187
+ catch (error) {
188
+ throw new Error(`Failed to add grouped embeddings to index: ${error instanceof Error ? error.message : 'Unknown error'}`);
189
+ }
190
+ }
156
191
  /**
157
192
  * Rebuild the entire index from scratch
158
193
  * Requirements: 5.2, 5.4 - Create full index rebuild functionality for model changes or document deletions
@@ -349,16 +384,122 @@ export class IndexManager {
349
384
  if (!this.isInitialized) {
350
385
  throw new Error('Index manager not initialized');
351
386
  }
352
- await this.vectorIndex.saveIndex();
387
+ // If we have grouped embeddings, save in grouped format
388
+ if (this.groupedEmbeddings) {
389
+ console.log('IndexManager: Saving in grouped format');
390
+ await this.saveGroupedIndex(this.groupedEmbeddings.text, this.groupedEmbeddings.image);
391
+ // Clear grouped data after saving
392
+ this.groupedEmbeddings = undefined;
393
+ }
394
+ else {
395
+ console.log('IndexManager: Saving in standard format');
396
+ await this.vectorIndex.saveIndex();
397
+ }
398
+ }
399
+ /**
400
+ * Create specialized text-only and image-only indexes when the index file at indexPath holds grouped data with both text and image vectors. Any failure is logged with console.warn and swallowed, so the caller continues with the combined index.
401
+ */
402
+ async createSpecializedIndexes() {
403
+ try {
404
+ // Load the index data from indexPath to check if it has grouped information
405
+ const indexData = await BinaryIndexFormat.load(this.indexPath);
406
+ if (indexData.hasContentTypeGroups && indexData.textVectors && indexData.imageVectors) {
407
+ // Only create specialized indexes if we have both text and image vectors
408
+ // In text-only mode, textVectors would be populated but imageVectors empty
409
+ // In multimodal mode, both would be populated
410
+ const hasTextVectors = indexData.textVectors.length > 0;
411
+ const hasImageVectors = indexData.imageVectors.length > 0;
412
+ if (hasTextVectors && hasImageVectors) {
413
+ console.log('Creating specialized indexes for content type filtering...');
414
+ // Create text-only index from the stored text vectors. NOTE(review): addVectors() is not awaited here or for the image index - confirm VectorIndex.addVectors is synchronous
415
+ this.textIndex = new VectorIndex(`${this.indexPath}.text`, this.vectorIndexOptions);
416
+ await this.textIndex.initialize();
417
+ this.textIndex.addVectors(indexData.textVectors);
418
+ console.log(`✓ Text index created with ${indexData.textVectors.length} vectors`);
419
+ // Create image-only index from the stored image vectors
420
+ this.imageIndex = new VectorIndex(`${this.indexPath}.image`, this.vectorIndexOptions);
421
+ await this.imageIndex.initialize();
422
+ this.imageIndex.addVectors(indexData.imageVectors);
423
+ console.log(`✓ Image index created with ${indexData.imageVectors.length} vectors`);
424
+ console.log('✓ Specialized indexes ready for content type filtering');
425
+ }
426
+ else if (hasTextVectors) {
427
+ console.log('Text-only index detected - using combined index for all searches');
428
+ // In text-only mode, we don't need specialized indexes
429
+ // The combined index (vectorIndex) already contains all text vectors
430
+ }
431
+ }
432
+ }
433
+ catch (error) {
434
+ console.warn('Failed to create specialized indexes, falling back to combined index:', error);
435
+ // Continue without specialized indexes - search() only uses them when both textIndex and imageIndex are set, otherwise it uses the combined index
436
+ }
437
+ }
438
+ /**
439
+ * Save index with content type grouping (for new grouped format): writes the given text and image embeddings to indexPath through BinaryIndexFormat.saveGrouped. Throws if the manager is not initialized.
440
+ */
441
+ async saveGroupedIndex(textEmbeddings, imageEmbeddings) {
442
+ if (!this.isInitialized) {
443
+ throw new Error('Index manager not initialized');
444
+ }
445
+ console.log(`saveGroupedIndex: text=${textEmbeddings.length}, image=${imageEmbeddings.length}`);
446
+ // Group vectors by content type, converting each embedding_id with hashEmbeddingId
447
+ const textVectors = textEmbeddings.map((embedding) => ({
448
+ id: this.hashEmbeddingId(embedding.embedding_id),
449
+ vector: embedding.vector
450
+ }));
451
+ const imageVectors = imageEmbeddings.map((embedding) => ({
452
+ id: this.hashEmbeddingId(embedding.embedding_id),
453
+ vector: embedding.vector
454
+ }));
455
+ // Get index parameters from the combined index (vectorIndex)
456
+ const options = this.vectorIndex.getOptions();
457
+ const allVectors = [...textVectors, ...imageVectors];
458
+ console.log(`saveGroupedIndex: dimensions=${options.dimensions}, totalVectors=${allVectors.length}`);
459
+ const indexData = {
460
+ dimensions: options.dimensions,
461
+ maxElements: options.maxElements,
462
+ M: options.M || 16,
463
+ efConstruction: options.efConstruction || 200,
464
+ seed: options.seed || 100,
465
+ currentSize: textVectors.length + imageVectors.length,
466
+ vectors: allVectors, // Required for backward compatibility
467
+ hasContentTypeGroups: true,
468
+ textVectors,
469
+ imageVectors
470
+ };
471
+ console.log('saveGroupedIndex: Calling BinaryIndexFormat.saveGrouped');
472
+ // Save using grouped format. NOTE(review): the saved data holds only the embeddings passed to this call - confirm vectors already in the combined index do not also need to be written
473
+ await BinaryIndexFormat.saveGrouped(this.indexPath, indexData);
474
+ console.log(`✓ Saved grouped index with ${textVectors.length} text and ${imageVectors.length} image vectors`);
353
475
  }
354
476
  /**
355
477
  * Search for similar vectors
356
478
  */
357
- search(queryVector, k = 5) {
479
+ search(queryVector, k = 5, contentType) {
358
480
  if (!this.isInitialized) {
359
481
  throw new Error('Index manager not initialized');
360
482
  }
361
- const results = this.vectorIndex.search(queryVector, k);
483
+ // Select the appropriate index based on content type
484
+ let targetIndex;
485
+ // If we have specialized indexes (multimodal mode), use them for filtering
486
+ if (this.textIndex && this.imageIndex) {
487
+ if (contentType === 'text') {
488
+ targetIndex = this.textIndex;
489
+ }
490
+ else if (contentType === 'image') {
491
+ targetIndex = this.imageIndex;
492
+ }
493
+ else {
494
+ // 'combined' or undefined
495
+ targetIndex = this.vectorIndex;
496
+ }
497
+ }
498
+ else {
499
+ // No specialized indexes (text-only mode) - ignore contentType and use combined index
500
+ targetIndex = this.vectorIndex;
501
+ }
502
+ const results = targetIndex.search(queryVector, k);
362
503
  // Convert numeric IDs back to embedding IDs
363
504
  const embeddingIds = results.neighbors.map(id => this.unhashEmbeddingId(id));
364
505
  return {