rag-lite-ts 2.1.0 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -198,7 +198,7 @@ export async function runIngest(path, options = {}) {
198
198
  showProgress: true,
199
199
  maxWaitMs: 15000 // Longer timeout for ingestion
200
200
  });
201
- const result = await pipeline.ingestPath(resolvedPath);
201
+ const result = await pipeline.ingestPath(resolvedPath, { mode: factoryOptions.mode });
202
202
  // Display final results
203
203
  console.log('\n' + '='.repeat(50));
204
204
  console.log('INGESTION SUMMARY');
@@ -137,6 +137,11 @@ export async function runSearch(query, options = {}) {
137
137
  if (options['top-k'] !== undefined) {
138
138
  searchOptions.top_k = options['top-k'];
139
139
  }
140
+ // Set content type filter for search-level filtering
141
+ const contentTypeFilter = options['content-type'];
142
+ if (contentTypeFilter && contentTypeFilter !== 'all') {
143
+ searchOptions.contentType = contentTypeFilter;
144
+ }
140
145
  // Phase 2: Disable reranking for image-to-image searches to preserve visual similarity
141
146
  let rerankingForciblyDisabled = false;
142
147
  if (isImage && embedder) {
@@ -174,16 +179,6 @@ export async function runSearch(query, options = {}) {
174
179
  results = await searchEngine.search(query, searchOptions);
175
180
  }
176
181
  const searchTime = Date.now() - startTime;
177
- // Apply content type filter if specified
178
- const contentTypeFilter = options['content-type'];
179
- if (contentTypeFilter && contentTypeFilter !== 'all') {
180
- const originalCount = results.length;
181
- results = results.filter(r => r.contentType === contentTypeFilter);
182
- if (results.length < originalCount) {
183
- console.log(`Filtered to ${results.length} ${contentTypeFilter} result${results.length === 1 ? '' : 's'} (from ${originalCount} total)`);
184
- console.log('');
185
- }
186
- }
187
182
  // Display results
188
183
  if (results.length === 0) {
189
184
  console.log('No results found.');
@@ -25,10 +25,19 @@ export interface BinaryIndexData {
25
25
  id: number;
26
26
  vector: Float32Array;
27
27
  }>;
28
+ hasContentTypeGroups?: boolean;
29
+ textVectors?: Array<{
30
+ id: number;
31
+ vector: Float32Array;
32
+ }>;
33
+ imageVectors?: Array<{
34
+ id: number;
35
+ vector: Float32Array;
36
+ }>;
28
37
  }
29
38
  export declare class BinaryIndexFormat {
30
39
  /**
31
- * Save index data to binary format
40
+ * Save index data to binary format (original format for backward compatibility)
32
41
  *
33
42
  * File structure:
34
43
  * - Header (24 bytes): dimensions, maxElements, M, efConstruction, seed, currentSize
@@ -39,7 +48,24 @@ export declare class BinaryIndexFormat {
39
48
  */
40
49
  static save(indexPath: string, data: BinaryIndexData): Promise<void>;
41
50
  /**
42
- * Load index data from binary format
51
+ * Save index data to grouped binary format
52
+ *
53
+ * File structure:
54
+ * - Extended Header (44 bytes):
55
+ * - Original 6 fields (24 bytes)
56
+ * - hasGroups flag (4 bytes)
57
+ * - textOffset (4 bytes)
58
+ * - textCount (4 bytes)
59
+ * - imageOffset (4 bytes)
60
+ * - imageCount (4 bytes)
61
+ * - Data section: [text vectors...][image vectors...]
62
+ *
63
+ * @param indexPath Path to save the binary index file
64
+ * @param data Index data to serialize
65
+ */
66
+ static saveGrouped(indexPath: string, data: BinaryIndexData): Promise<void>;
67
+ /**
68
+ * Load index data from binary format (supports both original and grouped formats)
43
69
  *
44
70
  * Uses zero-copy Float32Array views for efficient loading.
45
71
  * Copies the views to ensure data persistence after buffer lifecycle.
@@ -17,7 +17,7 @@
17
17
  import { readFileSync, writeFileSync } from 'fs';
18
18
  export class BinaryIndexFormat {
19
19
  /**
20
- * Save index data to binary format
20
+ * Save index data to binary format (original format for backward compatibility)
21
21
  *
22
22
  * File structure:
23
23
  * - Header (24 bytes): dimensions, maxElements, M, efConstruction, seed, currentSize
@@ -66,7 +66,115 @@ export class BinaryIndexFormat {
66
66
  writeFileSync(indexPath, Buffer.from(buffer));
67
67
  }
68
68
  /**
69
- * Load index data from binary format
69
+ * Save index data to grouped binary format
70
+ *
71
+ * File structure:
72
+ * - Extended Header (44 bytes):
73
+ * - Original 6 fields (24 bytes)
74
+ * - hasGroups flag (4 bytes)
75
+ * - textOffset (4 bytes)
76
+ * - textCount (4 bytes)
77
+ * - imageOffset (4 bytes)
78
+ * - imageCount (4 bytes)
79
+ * - Data section: [text vectors...][image vectors...]
80
+ *
81
+ * @param indexPath Path to save the binary index file
82
+ * @param data Index data to serialize
83
+ */
84
+ static async saveGrouped(indexPath, data) {
85
+ if (!data.hasContentTypeGroups || !data.textVectors || !data.imageVectors) {
86
+ // Fallback to original format
87
+ return this.save(indexPath, data);
88
+ }
89
+ const headerSize = 44; // Extended header: 24 + 20 bytes (hasGroups + textOffset + textCount + imageOffset + imageCount)
90
+ const vectorSize = 4 + (data.dimensions * 4); // id + vector
91
+ // Calculate offsets and total size
92
+ const textOffset = headerSize;
93
+ const imageOffset = textOffset + (data.textVectors.length * vectorSize);
94
+ const totalSize = imageOffset + (data.imageVectors.length * vectorSize);
95
+ const buffer = new ArrayBuffer(totalSize);
96
+ const view = new DataView(buffer);
97
+ let offset = 0;
98
+ // Write extended header (40 bytes, all little-endian)
99
+ if (offset + 40 > buffer.byteLength) {
100
+ throw new Error(`Header write would exceed buffer bounds: offset=${offset}, headerSize=40, bufferSize=${buffer.byteLength}`);
101
+ }
102
+ view.setUint32(offset, data.dimensions, true);
103
+ offset += 4;
104
+ view.setUint32(offset, data.maxElements, true);
105
+ offset += 4;
106
+ view.setUint32(offset, data.M, true);
107
+ offset += 4;
108
+ view.setUint32(offset, data.efConstruction, true);
109
+ offset += 4;
110
+ view.setUint32(offset, data.seed, true);
111
+ offset += 4;
112
+ view.setUint32(offset, data.currentSize, true);
113
+ offset += 4;
114
+ // Extended fields
115
+ view.setUint32(offset, 1, true);
116
+ offset += 4; // hasGroups = 1
117
+ view.setUint32(offset, textOffset, true);
118
+ offset += 4;
119
+ view.setUint32(offset, data.textVectors.length, true);
120
+ offset += 4;
121
+ view.setUint32(offset, imageOffset, true);
122
+ offset += 4;
123
+ view.setUint32(offset, data.imageVectors.length, true);
124
+ offset += 4;
125
+ // Write text vectors
126
+ for (const item of data.textVectors) {
127
+ // Ensure 4-byte alignment
128
+ if (offset % 4 !== 0) {
129
+ throw new Error(`Offset ${offset} is not 4-byte aligned`);
130
+ }
131
+ // Check bounds before writing
132
+ if (offset + 4 > buffer.byteLength) {
133
+ throw new Error(`ID write would exceed buffer bounds: offset=${offset}, bufferSize=${buffer.byteLength}`);
134
+ }
135
+ // Write vector ID
136
+ view.setUint32(offset, item.id, true);
137
+ offset += 4;
138
+ // Check bounds for vector data
139
+ const vectorDataSize = item.vector.length * 4;
140
+ if (offset + vectorDataSize > buffer.byteLength) {
141
+ throw new Error(`Vector data write would exceed buffer bounds: offset=${offset}, dataSize=${vectorDataSize}, bufferSize=${buffer.byteLength}`);
142
+ }
143
+ // Write vector data
144
+ for (let i = 0; i < item.vector.length; i++) {
145
+ view.setFloat32(offset, item.vector[i], true);
146
+ offset += 4;
147
+ }
148
+ }
149
+ // Write image vectors
150
+ for (const item of data.imageVectors) {
151
+ // Ensure 4-byte alignment
152
+ if (offset % 4 !== 0) {
153
+ throw new Error(`Offset ${offset} is not 4-byte aligned`);
154
+ }
155
+ // Check bounds before writing
156
+ if (offset + 4 > buffer.byteLength) {
157
+ throw new Error(`ID write would exceed buffer bounds: offset=${offset}, bufferSize=${buffer.byteLength}`);
158
+ }
159
+ // Write vector ID
160
+ view.setUint32(offset, item.id, true);
161
+ offset += 4;
162
+ // Check bounds for vector data
163
+ const vectorDataSize = item.vector.length * 4;
164
+ if (offset + vectorDataSize > buffer.byteLength) {
165
+ throw new Error(`Vector data write would exceed buffer bounds: offset=${offset}, dataSize=${vectorDataSize}, bufferSize=${buffer.byteLength}`);
166
+ }
167
+ // Write vector data
168
+ for (let i = 0; i < item.vector.length; i++) {
169
+ view.setFloat32(offset, item.vector[i], true);
170
+ offset += 4;
171
+ }
172
+ }
173
+ // Write to file
174
+ writeFileSync(indexPath, Buffer.from(buffer));
175
+ }
176
+ /**
177
+ * Load index data from binary format (supports both original and grouped formats)
70
178
  *
71
179
  * Uses zero-copy Float32Array views for efficient loading.
72
180
  * Copies the views to ensure data persistence after buffer lifecycle.
@@ -78,7 +186,7 @@ export class BinaryIndexFormat {
78
186
  const buffer = readFileSync(indexPath);
79
187
  const view = new DataView(buffer.buffer, buffer.byteOffset, buffer.byteLength);
80
188
  let offset = 0;
81
- // Read header (24 bytes, all little-endian)
189
+ // Read basic header (24 bytes, all little-endian)
82
190
  const dimensions = view.getUint32(offset, true);
83
191
  offset += 4;
84
192
  const maxElements = view.getUint32(offset, true);
@@ -91,32 +199,93 @@ export class BinaryIndexFormat {
91
199
  offset += 4;
92
200
  const currentSize = view.getUint32(offset, true);
93
201
  offset += 4;
94
- // Read vectors
95
- const vectors = [];
96
- for (let i = 0; i < currentSize; i++) {
97
- // Ensure 4-byte alignment (should always be true with our format)
98
- if (offset % 4 !== 0) {
99
- throw new Error(`Offset ${offset} is not 4-byte aligned`);
202
+ // Check if this is the extended grouped format (40+ bytes header)
203
+ const hasGroups = buffer.byteLength >= 40 ? view.getUint32(offset, true) : 0;
204
+ if (hasGroups === 1 && buffer.byteLength >= 40) {
205
+ // Load grouped format
206
+ const textOffset = view.getUint32(offset + 4, true);
207
+ const textCount = view.getUint32(offset + 8, true);
208
+ const imageOffset = view.getUint32(offset + 12, true);
209
+ const imageCount = view.getUint32(offset + 16, true);
210
+ // Load text vectors
211
+ const textVectors = [];
212
+ offset = textOffset;
213
+ for (let i = 0; i < textCount; i++) {
214
+ // Ensure 4-byte alignment
215
+ if (offset % 4 !== 0) {
216
+ throw new Error(`Offset ${offset} is not 4-byte aligned`);
217
+ }
218
+ // Read vector ID
219
+ const id = view.getUint32(offset, true);
220
+ offset += 4;
221
+ // Zero-copy Float32Array view
222
+ const vectorView = new Float32Array(buffer.buffer, buffer.byteOffset + offset, dimensions);
223
+ // Copy to avoid buffer lifecycle issues
224
+ const vector = new Float32Array(vectorView);
225
+ offset += dimensions * 4;
226
+ textVectors.push({ id, vector });
100
227
  }
101
- // Read vector ID
102
- const id = view.getUint32(offset, true);
103
- offset += 4;
104
- // Zero-copy Float32Array view (fast!)
105
- const vectorView = new Float32Array(buffer.buffer, buffer.byteOffset + offset, dimensions);
106
- // Copy to avoid buffer lifecycle issues
107
- const vector = new Float32Array(vectorView);
108
- offset += dimensions * 4;
109
- vectors.push({ id, vector });
228
+ // Load image vectors
229
+ const imageVectors = [];
230
+ offset = imageOffset;
231
+ for (let i = 0; i < imageCount; i++) {
232
+ // Ensure 4-byte alignment
233
+ if (offset % 4 !== 0) {
234
+ throw new Error(`Offset ${offset} is not 4-byte aligned`);
235
+ }
236
+ // Read vector ID
237
+ const id = view.getUint32(offset, true);
238
+ offset += 4;
239
+ // Zero-copy Float32Array view
240
+ const vectorView = new Float32Array(buffer.buffer, buffer.byteOffset + offset, dimensions);
241
+ // Copy to avoid buffer lifecycle issues
242
+ const vector = new Float32Array(vectorView);
243
+ offset += dimensions * 4;
244
+ imageVectors.push({ id, vector });
245
+ }
246
+ // Combine all vectors for backward compatibility
247
+ const allVectors = [...textVectors, ...imageVectors];
248
+ return {
249
+ dimensions,
250
+ maxElements,
251
+ M,
252
+ efConstruction,
253
+ seed,
254
+ currentSize,
255
+ vectors: allVectors,
256
+ hasContentTypeGroups: true,
257
+ textVectors,
258
+ imageVectors
259
+ };
260
+ }
261
+ else {
262
+ // Load original format
263
+ const vectors = [];
264
+ for (let i = 0; i < currentSize; i++) {
265
+ // Ensure 4-byte alignment (should always be true with our format)
266
+ if (offset % 4 !== 0) {
267
+ throw new Error(`Offset ${offset} is not 4-byte aligned`);
268
+ }
269
+ // Read vector ID
270
+ const id = view.getUint32(offset, true);
271
+ offset += 4;
272
+ // Zero-copy Float32Array view (fast!)
273
+ const vectorView = new Float32Array(buffer.buffer, buffer.byteOffset + offset, dimensions);
274
+ // Copy to avoid buffer lifecycle issues
275
+ const vector = new Float32Array(vectorView);
276
+ offset += dimensions * 4;
277
+ vectors.push({ id, vector });
278
+ }
279
+ return {
280
+ dimensions,
281
+ maxElements,
282
+ M,
283
+ efConstruction,
284
+ seed,
285
+ currentSize,
286
+ vectors
287
+ };
110
288
  }
111
- return {
112
- dimensions,
113
- maxElements,
114
- M,
115
- efConstruction,
116
- seed,
117
- currentSize,
118
- vectors
119
- };
120
289
  }
121
290
  }
122
291
  //# sourceMappingURL=binary-index-format.js.map
@@ -162,9 +162,13 @@ export declare class IngestionPipeline {
162
162
  */
163
163
  private storeDocumentsAndChunksWithContentTypes;
164
164
  /**
165
- * Update vector index with new embeddings
165
+ * Update vector index with new embeddings (supports grouped content type storage)
166
166
  */
167
167
  private updateVectorIndex;
168
+ /**
169
+ * Filter documents based on ingestion mode to avoid processing incompatible content types
170
+ */
171
+ private filterDocumentsByMode;
168
172
  /**
169
173
  * Converts MIME type to simple content type for embedding function
170
174
  * @param mimeType - MIME type string (e.g., 'text/plain', 'image/jpeg')
@@ -287,21 +287,30 @@ export class IngestionPipeline {
287
287
  try {
288
288
  // Phase 1: File Discovery and Processing with Content-Type Detection
289
289
  console.log('\n--- Phase 1: File Discovery and Processing ---');
290
- const fileResult = await discoverAndProcessFiles(path, options.fileOptions, this.pathManager);
291
- if (fileResult.documents.length === 0) {
290
+ const mode = options.mode || 'text';
291
+ const fileOptions = {
292
+ recursive: true,
293
+ maxFileSize: 10 * 1024 * 1024, // 10MB
294
+ ...options.fileOptions,
295
+ mode
296
+ };
297
+ const fileResult = await discoverAndProcessFiles(path, fileOptions, this.pathManager);
298
+ // Additional filtering as fallback (should be minimal with mode-aware discovery)
299
+ const filteredResult = this.filterDocumentsByMode(fileResult, mode);
300
+ if (filteredResult.documents.length === 0) {
292
301
  console.log('No documents found to process');
293
302
  return {
294
303
  documentsProcessed: 0,
295
304
  chunksCreated: 0,
296
305
  embeddingsGenerated: 0,
297
- documentErrors: fileResult.processingResult.errors.length,
306
+ documentErrors: filteredResult.processingResult.errors.length,
298
307
  embeddingErrors: 0,
299
308
  processingTimeMs: Date.now() - startTime,
300
309
  contentIds: []
301
310
  };
302
311
  }
303
312
  // Content-type detection and routing
304
- const contentTypeStats = this.analyzeContentTypes(fileResult.documents);
313
+ const contentTypeStats = this.analyzeContentTypes(filteredResult.documents);
305
314
  console.log(`📊 Content analysis: ${contentTypeStats.text} text, ${contentTypeStats.image} image, ${contentTypeStats.other} other files`);
306
315
  // Phase 2: Document Chunking with Content-Type Awareness
307
316
  console.log('\n--- Phase 2: Document Chunking ---');
@@ -309,7 +318,7 @@ export class IngestionPipeline {
309
318
  chunkSize: config.chunk_size,
310
319
  chunkOverlap: config.chunk_overlap
311
320
  };
312
- const chunkingResult = await this.chunkDocumentsWithContentTypes(fileResult.documents, effectiveChunkConfig, options.mode);
321
+ const chunkingResult = await this.chunkDocumentsWithContentTypes(filteredResult.documents, effectiveChunkConfig, options.mode);
313
322
  if (chunkingResult.totalChunks === 0) {
314
323
  console.log('No chunks created from documents');
315
324
  return {
@@ -334,10 +343,10 @@ export class IngestionPipeline {
334
343
  const endTime = Date.now();
335
344
  const processingTimeMs = endTime - startTime;
336
345
  const result = {
337
- documentsProcessed: fileResult.documents.length,
346
+ documentsProcessed: filteredResult.documents.length,
338
347
  chunksCreated: chunkingResult.totalChunks,
339
348
  embeddingsGenerated: embeddingResult.embeddings.length,
340
- documentErrors: fileResult.processingResult.errors.length,
349
+ documentErrors: filteredResult.processingResult.errors.length,
341
350
  embeddingErrors: embeddingResult.errors,
342
351
  processingTimeMs,
343
352
  contentIds
@@ -595,16 +604,35 @@ export class IngestionPipeline {
595
604
  return contentIds;
596
605
  }
597
606
  /**
598
- * Update vector index with new embeddings
607
+ * Update vector index with new embeddings (supports grouped content type storage)
599
608
  */
600
609
  async updateVectorIndex(embeddings) {
610
+ console.log('updateVectorIndex called with', embeddings.length, 'embeddings');
601
611
  if (embeddings.length === 0) {
602
612
  console.log('No embeddings to add to vector index');
603
613
  return;
604
614
  }
605
615
  console.log(`Adding ${embeddings.length} vector${embeddings.length === 1 ? '' : 's'} to search index...`);
606
616
  try {
607
- await this.indexManager.addVectors(embeddings);
617
+ // Group embeddings by content type for optimized storage
618
+ const groupedEmbeddings = embeddings.reduce((groups, embedding) => {
619
+ const contentType = embedding.contentType || 'text';
620
+ if (!groups[contentType]) {
621
+ groups[contentType] = [];
622
+ }
623
+ groups[contentType].push(embedding);
624
+ return groups;
625
+ }, {});
626
+ const textEmbeddings = groupedEmbeddings.text || [];
627
+ const imageEmbeddings = groupedEmbeddings.image || [];
628
+ console.log(`Grouped: ${textEmbeddings.length} text, ${imageEmbeddings.length} image vectors`);
629
+ // Use grouped storage method if available, fallback to regular method
630
+ if (this.indexManager.addGroupedEmbeddings) {
631
+ await this.indexManager.addGroupedEmbeddings(textEmbeddings, imageEmbeddings);
632
+ }
633
+ else {
634
+ await this.indexManager.addVectors(embeddings);
635
+ }
608
636
  console.log(`✓ Vector index updated successfully with ${embeddings.length} new vectors`);
609
637
  }
610
638
  catch (error) {
@@ -612,6 +640,45 @@ export class IngestionPipeline {
612
640
  throw error;
613
641
  }
614
642
  }
643
+ /**
644
+ * Filter documents based on ingestion mode to avoid processing incompatible content types
645
+ */
646
+ filterDocumentsByMode(fileResult, mode) {
647
+ if (mode === 'multimodal') {
648
+ // In multimodal mode, keep all documents
649
+ return fileResult;
650
+ }
651
+ // In text mode, filter out image documents
652
+ const filteredDocuments = fileResult.documents.filter(doc => {
653
+ const contentType = doc.metadata?.contentType || 'text';
654
+ const isCompatible = contentType === 'text' ||
655
+ contentType.startsWith('text/') ||
656
+ contentType === 'application/pdf' ||
657
+ contentType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document';
658
+ if (!isCompatible) {
659
+ console.log(`⚠️ Skipping ${doc.source} (${contentType}) - not compatible with text mode`);
660
+ }
661
+ return isCompatible;
662
+ });
663
+ // Update processing result to reflect filtering
664
+ const filteredProcessingResult = {
665
+ ...fileResult.processingResult,
666
+ skippedFiles: [
667
+ ...(fileResult.processingResult.skippedFiles || []),
668
+ ...fileResult.documents
669
+ .filter(doc => !filteredDocuments.includes(doc))
670
+ .map(doc => ({
671
+ path: doc.source,
672
+ reason: `Content type not compatible with ${mode} mode`
673
+ }))
674
+ ]
675
+ };
676
+ return {
677
+ documents: filteredDocuments,
678
+ discoveryResult: fileResult.discoveryResult,
679
+ processingResult: filteredProcessingResult
680
+ };
681
+ }
615
682
  /**
616
683
  * Converts MIME type to simple content type for embedding function
617
684
  * @param mimeType - MIME type string (e.g., 'text/plain', 'image/jpeg')
@@ -194,7 +194,7 @@ export class TextDerivedRerankingStrategy {
194
194
  catch (error) {
195
195
  console.warn(`Failed to generate description for image ${imagePath}: ${error instanceof Error ? error.message : 'Unknown error'}`);
196
196
  // Fallback to filename-based description
197
- const filename = imagePath.split('/').pop() || imagePath.split('\\').pop() || imagePath;
197
+ const filename = imagePath.split('/').pop() || imagePath;
198
198
  return `Image file: ${filename}`;
199
199
  }
200
200
  }
@@ -211,17 +211,16 @@ export class TextDerivedRerankingStrategy {
211
211
  // Step 1: Convert images to text descriptions
212
212
  const processedResults = await Promise.all(results.map(async (result) => {
213
213
  if (result.contentType === 'image') {
214
- // Generate text description for image using the file path from document.source
215
- const description = await this.generateImageDescription(result.document.source);
214
+ // Generate text description for image
215
+ const description = await this.generateImageDescription(result.content);
216
216
  return {
217
217
  ...result,
218
218
  content: description,
219
- contentType: 'text', // Change to 'text' so cross-encoder will process it
220
219
  originalContent: result.content,
221
220
  originalContentType: result.contentType,
222
221
  metadata: {
223
222
  ...result.metadata,
224
- originalImagePath: result.document.source,
223
+ originalImagePath: result.content,
225
224
  generatedDescription: description
226
225
  }
227
226
  };
@@ -139,7 +139,8 @@ export class SearchEngine {
139
139
  const searchStartTime = performance.now();
140
140
  let searchResult;
141
141
  try {
142
- searchResult = this.indexManager.search(queryVector, topK);
142
+ const contentType = options.contentType;
143
+ searchResult = this.indexManager.search(queryVector, topK, contentType);
143
144
  }
144
145
  catch (error) {
145
146
  if (error instanceof Error && error.message.includes('No embedding ID found for hash')) {
@@ -49,7 +49,7 @@ export interface RerankingInterface {
49
49
  export interface SearchOptions {
50
50
  top_k?: number;
51
51
  rerank?: boolean;
52
- contentType?: string;
52
+ contentType?: 'text' | 'image' | 'combined';
53
53
  }
54
54
  export interface Chunk {
55
55
  text: string;
@@ -64,5 +64,9 @@ export declare class VectorIndex {
64
64
  * Resize index to accommodate more vectors
65
65
  */
66
66
  resizeIndex(newMaxElements: number): void;
67
+ /**
68
+ * Get index options (for external access to configuration)
69
+ */
70
+ getOptions(): VectorIndexOptions;
67
71
  }
68
72
  //# sourceMappingURL=vector-index.d.ts.map
@@ -321,5 +321,11 @@ export class VectorIndex {
321
321
  throw new Error(`Failed to resize index: ${error}`);
322
322
  }
323
323
  }
324
+ /**
325
+ * Get index options (for external access to configuration)
326
+ */
327
+ getOptions() {
328
+ return { ...this.options };
329
+ }
324
330
  }
325
331
  //# sourceMappingURL=vector-index.js.map
@@ -8,6 +8,8 @@ export interface FileProcessorOptions {
8
8
  recursive?: boolean;
9
9
  /** Maximum file size in bytes (default: 10MB) */
10
10
  maxFileSize?: number;
11
+ /** Processing mode to filter compatible files */
12
+ mode?: 'text' | 'multimodal';
11
13
  }
12
14
  /**
13
15
  * Default options for file processing
@@ -188,6 +188,15 @@ async function discoverFilesRecursive(dirPath, options) {
188
188
  // Check file size based on content type
189
189
  const stats = await fs.stat(fullPath);
190
190
  const contentType = getContentType(fullPath);
191
+ // Filter by mode: skip incompatible content types
192
+ const mode = options.mode || 'text';
193
+ if (mode === 'text' && contentType === 'image') {
194
+ result.skipped.push({
195
+ path: fullPath,
196
+ reason: `Image files not supported in text mode. Use --mode multimodal for image processing.`
197
+ });
198
+ continue;
199
+ }
191
200
  // Different size limits for different content types
192
201
  const maxSize = contentType === 'image'
193
202
  ? 50 * 1024 * 1024 // 50MB for images
@@ -250,6 +259,17 @@ export async function discoverFiles(path, options = DEFAULT_FILE_PROCESSOR_OPTIO
250
259
  };
251
260
  }
252
261
  const contentType = getContentType(resolvedPath);
262
+ // Filter by mode: skip incompatible content types
263
+ const mode = options.mode || 'text';
264
+ if (mode === 'text' && contentType === 'image') {
265
+ return {
266
+ files: [],
267
+ skipped: [{
268
+ path: resolvedPath,
269
+ reason: `Image files not supported in text mode. Use --mode multimodal for image processing.`
270
+ }]
271
+ };
272
+ }
253
273
  // Check file size based on content type
254
274
  const maxSize = contentType === 'image'
255
275
  ? 50 * 1024 * 1024 // 50MB for images
@@ -7,12 +7,16 @@ export interface IndexStats {
7
7
  export declare class IndexManager {
8
8
  private modelName?;
9
9
  private vectorIndex;
10
+ private textIndex?;
11
+ private imageIndex?;
10
12
  private db;
11
13
  private indexPath;
12
14
  private dbPath;
13
15
  private isInitialized;
14
16
  private hashToEmbeddingId;
15
17
  private embeddingIdToHash;
18
+ private groupedEmbeddings?;
19
+ private vectorIndexOptions;
16
20
  constructor(indexPath: string, dbPath: string, dimensions: number, modelName?: string | undefined);
17
21
  /**
18
22
  * Initialize the index manager and load existing index if available
@@ -30,6 +34,10 @@ export declare class IndexManager {
30
34
  * Requirements: 5.3 - When new documents are added THEN system SHALL append new chunks and vectors without rebuilding existing index
31
35
  */
32
36
  addVectors(embeddings: EmbeddingResult[]): Promise<void>;
37
+ /**
38
+ * Add grouped embeddings by content type (for new grouped format)
39
+ */
40
+ addGroupedEmbeddings(textEmbeddings: EmbeddingResult[], imageEmbeddings: EmbeddingResult[]): Promise<void>;
33
41
  /**
34
42
  * Rebuild the entire index from scratch
35
43
  * Requirements: 5.2, 5.4 - Create full index rebuild functionality for model changes or document deletions
@@ -68,10 +76,18 @@ export declare class IndexManager {
68
76
  * Save the vector index to disk
69
77
  */
70
78
  saveIndex(): Promise<void>;
79
+ /**
80
+ * Create specialized indexes for text and image content when grouped data is available
81
+ */
82
+ private createSpecializedIndexes;
83
+ /**
84
+ * Save index with content type grouping (for new grouped format)
85
+ */
86
+ saveGroupedIndex(textEmbeddings: EmbeddingResult[], imageEmbeddings: EmbeddingResult[]): Promise<void>;
71
87
  /**
72
88
  * Search for similar vectors
73
89
  */
74
- search(queryVector: Float32Array, k?: number): {
90
+ search(queryVector: Float32Array, k?: number, contentType?: 'text' | 'image' | 'combined'): {
75
91
  embeddingIds: string[];
76
92
  distances: number[];
77
93
  };
@@ -1,26 +1,33 @@
1
1
  import { VectorIndex } from './core/vector-index.js';
2
+ import { BinaryIndexFormat } from './core/binary-index-format.js';
2
3
  import { openDatabase, getSystemInfo, setSystemInfo } from './core/db.js';
3
4
  import { config, getModelDefaults } from './core/config.js';
4
5
  export class IndexManager {
5
6
  modelName;
6
7
  vectorIndex;
8
+ textIndex;
9
+ imageIndex;
7
10
  db = null;
8
11
  indexPath;
9
12
  dbPath;
10
13
  isInitialized = false;
11
14
  hashToEmbeddingId = new Map();
12
15
  embeddingIdToHash = new Map();
16
+ groupedEmbeddings;
17
+ vectorIndexOptions;
13
18
  constructor(indexPath, dbPath, dimensions, modelName) {
14
19
  this.modelName = modelName;
15
20
  this.indexPath = indexPath;
16
21
  this.dbPath = dbPath;
17
- // Initialize with provided dimensions from config
18
- this.vectorIndex = new VectorIndex(indexPath, {
22
+ // Store options for creating specialized indexes
23
+ this.vectorIndexOptions = {
19
24
  dimensions: dimensions,
20
25
  maxElements: 100000, // Start with 100k capacity
21
26
  efConstruction: 200,
22
27
  M: 16
23
- });
28
+ };
29
+ // Initialize with provided dimensions from config
30
+ this.vectorIndex = new VectorIndex(indexPath, this.vectorIndexOptions);
24
31
  }
25
32
  /**
26
33
  * Initialize the index manager and load existing index if available
@@ -47,6 +54,8 @@ export class IndexManager {
47
54
  // Only try to load existing index if not forcing recreation
48
55
  console.log('Loading existing vector index...');
49
56
  await this.vectorIndex.loadIndex();
57
+ // Check if the loaded index has grouped data and create specialized indexes
58
+ await this.createSpecializedIndexes();
50
59
  }
51
60
  // Always populate the embedding ID mapping from existing database entries
52
61
  // This is needed both for new and existing indexes
@@ -55,7 +64,8 @@ export class IndexManager {
55
64
  this.hashEmbeddingId(chunk.embedding_id); // This will populate the mapping
56
65
  }
57
66
  this.isInitialized = true;
58
- console.log(`Index manager initialized with ${this.vectorIndex.getCurrentCount()} vectors`);
67
+ const vectorCount = this.vectorIndex.getCurrentCount();
68
+ console.log(`Index manager initialized with ${vectorCount} vectors${this.textIndex && this.imageIndex ? ' (multi-graph mode)' : ''}`);
59
69
  }
60
70
  catch (error) {
61
71
  throw new Error(`Failed to initialize index manager: ${error}`);
@@ -153,6 +163,31 @@ export class IndexManager {
153
163
  throw new Error(`Failed to add vectors to index: ${error instanceof Error ? error.message : 'Unknown error'}`);
154
164
  }
155
165
  }
166
+ /**
167
+ * Add grouped embeddings by content type (for new grouped format)
168
+ */
169
+ async addGroupedEmbeddings(textEmbeddings, imageEmbeddings) {
170
+ if (!this.isInitialized) {
171
+ throw new Error('Index manager not initialized');
172
+ }
173
+ console.log(`addGroupedEmbeddings: text=${textEmbeddings.length}, image=${imageEmbeddings.length}`);
174
+ const allEmbeddings = [...textEmbeddings, ...imageEmbeddings];
175
+ if (allEmbeddings.length === 0) {
176
+ return;
177
+ }
178
+ try {
179
+ // Store grouped information for later saving
180
+ this.groupedEmbeddings = { text: textEmbeddings, image: imageEmbeddings };
181
+ console.log('addGroupedEmbeddings: stored grouped embeddings');
182
+ // Add all embeddings to the index (maintains current behavior)
183
+ await this.addVectors(allEmbeddings);
184
+ console.log('addGroupedEmbeddings: addVectors completed');
185
+ // The saveIndex method will now use grouped format if groupedEmbeddings exists
186
+ }
187
+ catch (error) {
188
+ throw new Error(`Failed to add grouped embeddings to index: ${error instanceof Error ? error.message : 'Unknown error'}`);
189
+ }
190
+ }
156
191
  /**
157
192
  * Rebuild the entire index from scratch
158
193
  * Requirements: 5.2, 5.4 - Create full index rebuild functionality for model changes or document deletions
@@ -349,16 +384,122 @@ export class IndexManager {
349
384
  if (!this.isInitialized) {
350
385
  throw new Error('Index manager not initialized');
351
386
  }
352
- await this.vectorIndex.saveIndex();
387
+ // If we have grouped embeddings, save in grouped format
388
+ if (this.groupedEmbeddings) {
389
+ console.log('IndexManager: Saving in grouped format');
390
+ await this.saveGroupedIndex(this.groupedEmbeddings.text, this.groupedEmbeddings.image);
391
+ // Clear grouped data after saving
392
+ this.groupedEmbeddings = undefined;
393
+ }
394
+ else {
395
+ console.log('IndexManager: Saving in standard format');
396
+ await this.vectorIndex.saveIndex();
397
+ }
398
+ }
399
+ /**
400
+ * Create specialized indexes for text and image content when grouped data is available
401
+ */
402
+ async createSpecializedIndexes() {
403
+ try {
404
+ // Load the index data to check if it has grouped information
405
+ const indexData = await BinaryIndexFormat.load(this.indexPath);
406
+ if (indexData.hasContentTypeGroups && indexData.textVectors && indexData.imageVectors) {
407
+ // Only create specialized indexes if we have both text and image vectors
408
+ // In text-only mode, textVectors would be populated but imageVectors empty
409
+ // In multimodal mode, both would be populated
410
+ const hasTextVectors = indexData.textVectors.length > 0;
411
+ const hasImageVectors = indexData.imageVectors.length > 0;
412
+ if (hasTextVectors && hasImageVectors) {
413
+ console.log('Creating specialized indexes for content type filtering...');
414
+ // Create text-only index
415
+ this.textIndex = new VectorIndex(`${this.indexPath}.text`, this.vectorIndexOptions);
416
+ await this.textIndex.initialize();
417
+ this.textIndex.addVectors(indexData.textVectors);
418
+ console.log(`✓ Text index created with ${indexData.textVectors.length} vectors`);
419
+ // Create image-only index
420
+ this.imageIndex = new VectorIndex(`${this.indexPath}.image`, this.vectorIndexOptions);
421
+ await this.imageIndex.initialize();
422
+ this.imageIndex.addVectors(indexData.imageVectors);
423
+ console.log(`✓ Image index created with ${indexData.imageVectors.length} vectors`);
424
+ console.log('✓ Specialized indexes ready for content type filtering');
425
+ }
426
+ else if (hasTextVectors) {
427
+ console.log('Text-only index detected - using combined index for all searches');
428
+ // In text-only mode, we don't need specialized indexes
429
+ // The combined index (vectorIndex) already contains all text vectors
430
+ }
431
+ }
432
+ }
433
+ catch (error) {
434
+ console.warn('Failed to create specialized indexes, falling back to combined index:', error);
435
+ // Continue without specialized indexes - search will still work with combined index
436
+ }
437
+ }
438
+ /**
439
+ * Save index with content type grouping (for new grouped format)
440
+ */
441
+ async saveGroupedIndex(textEmbeddings, imageEmbeddings) {
442
+ if (!this.isInitialized) {
443
+ throw new Error('Index manager not initialized');
444
+ }
445
+ console.log(`saveGroupedIndex: text=${textEmbeddings.length}, image=${imageEmbeddings.length}`);
446
+ // Group vectors by content type
447
+ const textVectors = textEmbeddings.map((embedding) => ({
448
+ id: this.hashEmbeddingId(embedding.embedding_id),
449
+ vector: embedding.vector
450
+ }));
451
+ const imageVectors = imageEmbeddings.map((embedding) => ({
452
+ id: this.hashEmbeddingId(embedding.embedding_id),
453
+ vector: embedding.vector
454
+ }));
455
+ // Get index parameters
456
+ const options = this.vectorIndex.getOptions();
457
+ const allVectors = [...textVectors, ...imageVectors];
458
+ console.log(`saveGroupedIndex: dimensions=${options.dimensions}, totalVectors=${allVectors.length}`);
459
+ const indexData = {
460
+ dimensions: options.dimensions,
461
+ maxElements: options.maxElements,
462
+ M: options.M || 16,
463
+ efConstruction: options.efConstruction || 200,
464
+ seed: options.seed || 100,
465
+ currentSize: textVectors.length + imageVectors.length,
466
+ vectors: allVectors, // Required for backward compatibility
467
+ hasContentTypeGroups: true,
468
+ textVectors,
469
+ imageVectors
470
+ };
471
+ console.log('saveGroupedIndex: Calling BinaryIndexFormat.saveGrouped');
472
+ // Save using grouped format
473
+ await BinaryIndexFormat.saveGrouped(this.indexPath, indexData);
474
+ console.log(`✓ Saved grouped index with ${textVectors.length} text and ${imageVectors.length} image vectors`);
353
475
  }
354
476
  /**
355
477
  * Search for similar vectors
356
478
  */
357
- search(queryVector, k = 5) {
479
+ search(queryVector, k = 5, contentType) {
358
480
  if (!this.isInitialized) {
359
481
  throw new Error('Index manager not initialized');
360
482
  }
361
- const results = this.vectorIndex.search(queryVector, k);
483
+ // Select the appropriate index based on content type
484
+ let targetIndex;
485
+ // If we have specialized indexes (multimodal mode), use them for filtering
486
+ if (this.textIndex && this.imageIndex) {
487
+ if (contentType === 'text') {
488
+ targetIndex = this.textIndex;
489
+ }
490
+ else if (contentType === 'image') {
491
+ targetIndex = this.imageIndex;
492
+ }
493
+ else {
494
+ // 'combined' or undefined
495
+ targetIndex = this.vectorIndex;
496
+ }
497
+ }
498
+ else {
499
+ // No specialized indexes (text-only mode) - ignore contentType and use combined index
500
+ targetIndex = this.vectorIndex;
501
+ }
502
+ const results = targetIndex.search(queryVector, k);
362
503
  // Convert numeric IDs back to embedding IDs
363
504
  const embeddingIds = results.neighbors.map(id => this.unhashEmbeddingId(id));
364
505
  return {
@@ -338,73 +338,78 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
338
338
  if (!this.textModel || !this.tokenizer) {
339
339
  throw new Error('CLIP text model or tokenizer not initialized');
340
340
  }
341
- // Use the validated CLIPTextModelWithProjection approach (no pixel_values errors)
342
- // Tokenize text with CLIP's requirements
343
- // The tokenizer handles truncation at 77 TOKENS (not characters)
344
- const tokens = await this.tokenizer(processedText, {
345
- padding: true,
346
- truncation: true,
347
- max_length: 77, // CLIP's text sequence length limit (77 tokens)
348
- return_tensors: 'pt'
349
- });
350
- // Log token information for debugging (only in development)
351
- if (process.env.NODE_ENV === 'development') {
352
- const tokenIds = tokens.input_ids?.data || [];
353
- const actualTokenCount = Array.from(tokenIds).filter((id) => id !== 0).length;
354
- if (actualTokenCount >= 77) {
355
- console.warn(`Text truncated by tokenizer: "${processedText.substring(0, 50)}..." (truncated to 77 tokens)`);
356
- }
357
- }
358
- // Generate text embedding using CLIPTextModelWithProjection
359
- const output = await this.textModel(tokens);
360
- // Extract embedding from text_embeds (no pixel_values dependency)
361
- const embedding = new Float32Array(output.text_embeds.data);
362
- // Validate embedding dimensions and values
363
- if (embedding.length !== this.dimensions) {
364
- throw new Error(`CLIP embedding dimension mismatch: expected ${this.dimensions}, got ${embedding.length}`);
365
- }
366
- // Validate that all values are finite numbers
367
- const invalidValues = Array.from(embedding).filter(val => !isFinite(val) || isNaN(val));
368
- if (invalidValues.length > 0) {
369
- throw new Error(`CLIP embedding contains ${invalidValues.length} invalid values`);
370
- }
371
- // Validate embedding quality - should not be all zeros
372
- const nonZeroValues = Array.from(embedding).filter(val => Math.abs(val) > 1e-8);
373
- if (nonZeroValues.length === 0) {
374
- throw new Error('CLIP embedding is all zeros');
375
- }
376
- // Calculate embedding magnitude before normalization for quality assessment
377
- const magnitudeBeforeNorm = Math.sqrt(Array.from(embedding).reduce((sum, val) => sum + val * val, 0));
378
- if (magnitudeBeforeNorm < 1e-6) {
379
- throw new Error(`CLIP embedding has critically low magnitude: ${magnitudeBeforeNorm.toExponential(3)}`);
380
- }
381
- // Apply L2-normalization (CLIP models are trained with normalized embeddings)
382
- this.normalizeEmbedding(embedding);
383
- // Verify normalization was successful
384
- const magnitudeAfterNorm = Math.sqrt(Array.from(embedding).reduce((sum, val) => sum + val * val, 0));
385
- if (Math.abs(magnitudeAfterNorm - 1.0) > 0.01) {
386
- console.warn(`Warning: Embedding normalization may be imprecise (magnitude: ${magnitudeAfterNorm.toFixed(6)})`);
387
- }
388
- // Log text embedding generation
389
- console.log(`[CLIP] Generated text embedding for: "${processedText.substring(0, 30)}${processedText.length > 30 ? '...' : ''}"`);
390
- // Generate unique embedding ID
391
- const embeddingId = this.generateEmbeddingId(processedText, 'text');
392
- return {
393
- embedding_id: embeddingId,
394
- vector: embedding,
395
- contentType: 'text',
396
- metadata: {
397
- originalText: text,
398
- processedText: processedText,
399
- textLength: processedText.length,
400
- embeddingMagnitudeBeforeNorm: magnitudeBeforeNorm,
401
- embeddingMagnitudeAfterNorm: magnitudeAfterNorm,
402
- normalized: true,
403
- modelName: this.modelName,
404
- modelType: this.modelType,
405
- dimensions: this.dimensions
341
+ try {
342
+ // Use the validated CLIPTextModelWithProjection approach (no pixel_values errors)
343
+ // Tokenize text with CLIP's requirements
344
+ // The tokenizer handles truncation at 77 TOKENS (not characters)
345
+ const tokens = await this.tokenizer(processedText, {
346
+ padding: true,
347
+ truncation: true,
348
+ max_length: 77, // CLIP's text sequence length limit (77 tokens)
349
+ return_tensors: 'pt'
350
+ });
351
+ // Log token information for debugging (only in development)
352
+ if (process.env.NODE_ENV === 'development') {
353
+ const tokenIds = tokens.input_ids?.data || [];
354
+ const actualTokenCount = Array.from(tokenIds).filter((id) => id !== 0).length;
355
+ if (actualTokenCount >= 77) {
356
+ console.warn(`Text truncated by tokenizer: "${processedText.substring(0, 50)}..." (truncated to 77 tokens)`);
357
+ }
406
358
  }
407
- };
359
+ // Generate text embedding using CLIPTextModelWithProjection
360
+ const output = await this.textModel(tokens);
361
+ // Extract embedding from text_embeds (no pixel_values dependency)
362
+ const embedding = new Float32Array(output.text_embeds.data);
363
+ // Validate embedding dimensions and values
364
+ if (embedding.length !== this.dimensions) {
365
+ throw new Error(`CLIP embedding dimension mismatch: expected ${this.dimensions}, got ${embedding.length}`);
366
+ }
367
+ // Validate that all values are finite numbers
368
+ const invalidValues = Array.from(embedding).filter(val => !isFinite(val) || isNaN(val));
369
+ if (invalidValues.length > 0) {
370
+ throw new Error(`CLIP embedding contains ${invalidValues.length} invalid values`);
371
+ }
372
+ // Validate embedding quality - should not be all zeros
373
+ const nonZeroValues = Array.from(embedding).filter(val => Math.abs(val) > 1e-8);
374
+ if (nonZeroValues.length === 0) {
375
+ throw new Error('CLIP embedding is all zeros');
376
+ }
377
+ // Calculate embedding magnitude before normalization for quality assessment
378
+ const magnitudeBeforeNorm = Math.sqrt(Array.from(embedding).reduce((sum, val) => sum + val * val, 0));
379
+ if (magnitudeBeforeNorm < 1e-6) {
380
+ throw new Error(`CLIP embedding has critically low magnitude: ${magnitudeBeforeNorm.toExponential(3)}`);
381
+ }
382
+ // Apply L2-normalization (CLIP models are trained with normalized embeddings)
383
+ this.normalizeEmbedding(embedding);
384
+ // Verify normalization was successful
385
+ const magnitudeAfterNorm = Math.sqrt(Array.from(embedding).reduce((sum, val) => sum + val * val, 0));
386
+ if (Math.abs(magnitudeAfterNorm - 1.0) > 0.01) {
387
+ console.warn(`Warning: Embedding normalization may be imprecise (magnitude: ${magnitudeAfterNorm.toFixed(6)})`);
388
+ }
389
+ // Log text embedding generation
390
+ console.log(`[CLIP] Generated text embedding for: "${processedText.substring(0, 30)}${processedText.length > 30 ? '...' : ''}"`);
391
+ // Generate unique embedding ID
392
+ const embeddingId = this.generateEmbeddingId(processedText, 'text');
393
+ return {
394
+ embedding_id: embeddingId,
395
+ vector: embedding,
396
+ contentType: 'text',
397
+ metadata: {
398
+ originalText: text,
399
+ processedText: processedText,
400
+ textLength: processedText.length,
401
+ embeddingMagnitudeBeforeNorm: magnitudeBeforeNorm,
402
+ embeddingMagnitudeAfterNorm: magnitudeAfterNorm,
403
+ normalized: true,
404
+ modelName: this.modelName,
405
+ modelType: this.modelType,
406
+ dimensions: this.dimensions
407
+ }
408
+ };
409
+ }
410
+ catch (error) {
411
+ throw error;
412
+ }
408
413
  }
409
414
  // =============================================================================
410
415
  // IMAGE EMBEDDING METHODS
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "rag-lite-ts",
3
- "version": "2.1.0",
3
+ "version": "2.1.1",
4
4
  "description": "Local-first TypeScript retrieval engine with Chameleon Multimodal Architecture for semantic search over text and image content",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",