rag-lite-ts 2.1.0 → 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/indexer.js +1 -1
- package/dist/cli/search.js +5 -10
- package/dist/core/binary-index-format.d.ts +28 -2
- package/dist/core/binary-index-format.js +196 -27
- package/dist/core/ingestion.d.ts +5 -1
- package/dist/core/ingestion.js +76 -9
- package/dist/core/reranking-strategies.js +4 -5
- package/dist/core/search.js +2 -1
- package/dist/core/types.d.ts +1 -1
- package/dist/core/vector-index.d.ts +4 -0
- package/dist/core/vector-index.js +6 -0
- package/dist/file-processor.d.ts +2 -0
- package/dist/file-processor.js +20 -0
- package/dist/index-manager.d.ts +17 -1
- package/dist/index-manager.js +148 -7
- package/dist/multimodal/clip-embedder.js +71 -66
- package/package.json +1 -1
package/dist/cli/indexer.js
CHANGED

@@ -198,7 +198,7 @@ export async function runIngest(path, options = {}) {
         showProgress: true,
         maxWaitMs: 15000 // Longer timeout for ingestion
     });
-    const result = await pipeline.ingestPath(resolvedPath);
+    const result = await pipeline.ingestPath(resolvedPath, { mode: factoryOptions.mode });
     // Display final results
     console.log('\n' + '='.repeat(50));
     console.log('INGESTION SUMMARY');
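The only functional change in the CLI indexer is that the ingestion mode is now forwarded into `ingestPath`. A minimal sketch of the new call shape (not from the package; the pipeline type is narrowed to the single method this hunk touches, and the option name comes straight from the diff):

```ts
// Sketch only: assumes a pipeline object exposing ingestPath(path, { mode }).
interface IngestOptions {
  mode?: 'text' | 'multimodal';
}

interface PipelineLike {
  ingestPath(path: string, options?: IngestOptions): Promise<unknown>;
}

async function runIngestWithMode(
  pipeline: PipelineLike,
  resolvedPath: string,
  mode: 'text' | 'multimodal'
) {
  // 2.1.0 called ingestPath(resolvedPath); 2.1.1 forwards the resolved mode as well,
  // so discovery and chunking can skip content types the mode cannot handle.
  return pipeline.ingestPath(resolvedPath, { mode });
}
```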
package/dist/cli/search.js
CHANGED

@@ -137,6 +137,11 @@ export async function runSearch(query, options = {}) {
     if (options['top-k'] !== undefined) {
         searchOptions.top_k = options['top-k'];
     }
+    // Set content type filter for search-level filtering
+    const contentTypeFilter = options['content-type'];
+    if (contentTypeFilter && contentTypeFilter !== 'all') {
+        searchOptions.contentType = contentTypeFilter;
+    }
     // Phase 2: Disable reranking for image-to-image searches to preserve visual similarity
     let rerankingForciblyDisabled = false;
     if (isImage && embedder) {
@@ -174,16 +179,6 @@ export async function runSearch(query, options = {}) {
         results = await searchEngine.search(query, searchOptions);
     }
     const searchTime = Date.now() - startTime;
-    // Apply content type filter if specified
-    const contentTypeFilter = options['content-type'];
-    if (contentTypeFilter && contentTypeFilter !== 'all') {
-        const originalCount = results.length;
-        results = results.filter(r => r.contentType === contentTypeFilter);
-        if (results.length < originalCount) {
-            console.log(`Filtered to ${results.length} ${contentTypeFilter} result${results.length === 1 ? '' : 's'} (from ${originalCount} total)`);
-            console.log('');
-        }
-    }
     // Display results
     if (results.length === 0) {
         console.log('No results found.');
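The search CLI change moves content-type filtering from a post-hoc filter over returned results to a `contentType` field on the search options, so the engine can pick the matching index and still return a full `top_k` of results. A rough sketch of the two behaviours side by side (types simplified; not the package's actual exports):

```ts
// Sketch only: simplified result/option shapes for illustration.
interface ResultLike {
  contentType: 'text' | 'image';
}

interface SearchOptionsLike {
  top_k?: number;
  contentType?: 'text' | 'image';
}

// 2.1.0 behaviour: search everything, then drop non-matching results afterwards,
// which could leave fewer than top_k results after filtering.
function postFilter(results: ResultLike[], filter?: string): ResultLike[] {
  if (!filter || filter === 'all') return results;
  return results.filter(r => r.contentType === filter);
}

// 2.1.1 behaviour: push the filter into the options before searching.
function buildSearchOptions(topK: number, filter?: 'text' | 'image' | 'all'): SearchOptionsLike {
  const options: SearchOptionsLike = { top_k: topK };
  if (filter && filter !== 'all') {
    options.contentType = filter;
  }
  return options;
}
```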
package/dist/core/binary-index-format.d.ts
CHANGED

@@ -25,10 +25,19 @@ export interface BinaryIndexData {
         id: number;
         vector: Float32Array;
     }>;
+    hasContentTypeGroups?: boolean;
+    textVectors?: Array<{
+        id: number;
+        vector: Float32Array;
+    }>;
+    imageVectors?: Array<{
+        id: number;
+        vector: Float32Array;
+    }>;
 }
 export declare class BinaryIndexFormat {
     /**
-     * Save index data to binary format
+     * Save index data to binary format (original format for backward compatibility)
      *
      * File structure:
      * - Header (24 bytes): dimensions, maxElements, M, efConstruction, seed, currentSize
@@ -39,7 +48,24 @@ export declare class BinaryIndexFormat {
      */
     static save(indexPath: string, data: BinaryIndexData): Promise<void>;
     /**
-     *
+     * Save index data to grouped binary format
+     *
+     * File structure:
+     * - Extended Header (40 bytes):
+     *   - Original 6 fields (24 bytes)
+     *   - hasGroups flag (4 bytes)
+     *   - textOffset (4 bytes)
+     *   - textCount (4 bytes)
+     *   - imageOffset (4 bytes)
+     *   - imageCount (4 bytes)
+     * - Data section: [text vectors...][image vectors...]
+     *
+     * @param indexPath Path to save the binary index file
+     * @param data Index data to serialize
+     */
+    static saveGrouped(indexPath: string, data: BinaryIndexData): Promise<void>;
+    /**
+     * Load index data from binary format (supports both original and grouped formats)
      *
      * Uses zero-copy Float32Array views for efficient loading.
      * Copies the views to ensure data persistence after buffer lifecycle.
package/dist/core/binary-index-format.js
CHANGED

@@ -17,7 +17,7 @@
 import { readFileSync, writeFileSync } from 'fs';
 export class BinaryIndexFormat {
     /**
-     * Save index data to binary format
+     * Save index data to binary format (original format for backward compatibility)
      *
      * File structure:
      * - Header (24 bytes): dimensions, maxElements, M, efConstruction, seed, currentSize
@@ -66,7 +66,115 @@ export class BinaryIndexFormat {
         writeFileSync(indexPath, Buffer.from(buffer));
     }
     /**
-     *
+     * Save index data to grouped binary format
+     *
+     * File structure:
+     * - Extended Header (40 bytes):
+     *   - Original 6 fields (24 bytes)
+     *   - hasGroups flag (4 bytes)
+     *   - textOffset (4 bytes)
+     *   - textCount (4 bytes)
+     *   - imageOffset (4 bytes)
+     *   - imageCount (4 bytes)
+     * - Data section: [text vectors...][image vectors...]
+     *
+     * @param indexPath Path to save the binary index file
+     * @param data Index data to serialize
+     */
+    static async saveGrouped(indexPath, data) {
+        if (!data.hasContentTypeGroups || !data.textVectors || !data.imageVectors) {
+            // Fallback to original format
+            return this.save(indexPath, data);
+        }
+        const headerSize = 44; // Extended header: 24 + 20 bytes (hasGroups + textOffset + textCount + imageOffset + imageCount)
+        const vectorSize = 4 + (data.dimensions * 4); // id + vector
+        // Calculate offsets and total size
+        const textOffset = headerSize;
+        const imageOffset = textOffset + (data.textVectors.length * vectorSize);
+        const totalSize = imageOffset + (data.imageVectors.length * vectorSize);
+        const buffer = new ArrayBuffer(totalSize);
+        const view = new DataView(buffer);
+        let offset = 0;
+        // Write extended header (40 bytes, all little-endian)
+        if (offset + 40 > buffer.byteLength) {
+            throw new Error(`Header write would exceed buffer bounds: offset=${offset}, headerSize=40, bufferSize=${buffer.byteLength}`);
+        }
+        view.setUint32(offset, data.dimensions, true);
+        offset += 4;
+        view.setUint32(offset, data.maxElements, true);
+        offset += 4;
+        view.setUint32(offset, data.M, true);
+        offset += 4;
+        view.setUint32(offset, data.efConstruction, true);
+        offset += 4;
+        view.setUint32(offset, data.seed, true);
+        offset += 4;
+        view.setUint32(offset, data.currentSize, true);
+        offset += 4;
+        // Extended fields
+        view.setUint32(offset, 1, true);
+        offset += 4; // hasGroups = 1
+        view.setUint32(offset, textOffset, true);
+        offset += 4;
+        view.setUint32(offset, data.textVectors.length, true);
+        offset += 4;
+        view.setUint32(offset, imageOffset, true);
+        offset += 4;
+        view.setUint32(offset, data.imageVectors.length, true);
+        offset += 4;
+        // Write text vectors
+        for (const item of data.textVectors) {
+            // Ensure 4-byte alignment
+            if (offset % 4 !== 0) {
+                throw new Error(`Offset ${offset} is not 4-byte aligned`);
+            }
+            // Check bounds before writing
+            if (offset + 4 > buffer.byteLength) {
+                throw new Error(`ID write would exceed buffer bounds: offset=${offset}, bufferSize=${buffer.byteLength}`);
+            }
+            // Write vector ID
+            view.setUint32(offset, item.id, true);
+            offset += 4;
+            // Check bounds for vector data
+            const vectorDataSize = item.vector.length * 4;
+            if (offset + vectorDataSize > buffer.byteLength) {
+                throw new Error(`Vector data write would exceed buffer bounds: offset=${offset}, dataSize=${vectorDataSize}, bufferSize=${buffer.byteLength}`);
+            }
+            // Write vector data
+            for (let i = 0; i < item.vector.length; i++) {
+                view.setFloat32(offset, item.vector[i], true);
+                offset += 4;
+            }
+        }
+        // Write image vectors
+        for (const item of data.imageVectors) {
+            // Ensure 4-byte alignment
+            if (offset % 4 !== 0) {
+                throw new Error(`Offset ${offset} is not 4-byte aligned`);
+            }
+            // Check bounds before writing
+            if (offset + 4 > buffer.byteLength) {
+                throw new Error(`ID write would exceed buffer bounds: offset=${offset}, bufferSize=${buffer.byteLength}`);
+            }
+            // Write vector ID
+            view.setUint32(offset, item.id, true);
+            offset += 4;
+            // Check bounds for vector data
+            const vectorDataSize = item.vector.length * 4;
+            if (offset + vectorDataSize > buffer.byteLength) {
+                throw new Error(`Vector data write would exceed buffer bounds: offset=${offset}, dataSize=${vectorDataSize}, bufferSize=${buffer.byteLength}`);
+            }
+            // Write vector data
+            for (let i = 0; i < item.vector.length; i++) {
+                view.setFloat32(offset, item.vector[i], true);
+                offset += 4;
+            }
+        }
+        // Write to file
+        writeFileSync(indexPath, Buffer.from(buffer));
+    }
+    /**
+     * Load index data from binary format (supports both original and grouped formats)
      *
      * Uses zero-copy Float32Array views for efficient loading.
      * Copies the views to ensure data persistence after buffer lifecycle.
@@ -78,7 +186,7 @@ export class BinaryIndexFormat {
         const buffer = readFileSync(indexPath);
         const view = new DataView(buffer.buffer, buffer.byteOffset, buffer.byteLength);
         let offset = 0;
-        // Read header (24 bytes, all little-endian)
+        // Read basic header (24 bytes, all little-endian)
         const dimensions = view.getUint32(offset, true);
         offset += 4;
         const maxElements = view.getUint32(offset, true);
@@ -91,32 +199,93 @@ export class BinaryIndexFormat {
         offset += 4;
         const currentSize = view.getUint32(offset, true);
         offset += 4;
-        //
-        const
-
-        //
-
-
+        // Check if this is the extended grouped format (40+ bytes header)
+        const hasGroups = buffer.byteLength >= 40 ? view.getUint32(offset, true) : 0;
+        if (hasGroups === 1 && buffer.byteLength >= 40) {
+            // Load grouped format
+            const textOffset = view.getUint32(offset + 4, true);
+            const textCount = view.getUint32(offset + 8, true);
+            const imageOffset = view.getUint32(offset + 12, true);
+            const imageCount = view.getUint32(offset + 16, true);
+            // Load text vectors
+            const textVectors = [];
+            offset = textOffset;
+            for (let i = 0; i < textCount; i++) {
+                // Ensure 4-byte alignment
+                if (offset % 4 !== 0) {
+                    throw new Error(`Offset ${offset} is not 4-byte aligned`);
+                }
+                // Read vector ID
+                const id = view.getUint32(offset, true);
+                offset += 4;
+                // Zero-copy Float32Array view
+                const vectorView = new Float32Array(buffer.buffer, buffer.byteOffset + offset, dimensions);
+                // Copy to avoid buffer lifecycle issues
+                const vector = new Float32Array(vectorView);
+                offset += dimensions * 4;
+                textVectors.push({ id, vector });
             }
-        //
-        const
-        offset
-
-
-
-
-
-
+            // Load image vectors
+            const imageVectors = [];
+            offset = imageOffset;
+            for (let i = 0; i < imageCount; i++) {
+                // Ensure 4-byte alignment
+                if (offset % 4 !== 0) {
+                    throw new Error(`Offset ${offset} is not 4-byte aligned`);
+                }
+                // Read vector ID
+                const id = view.getUint32(offset, true);
+                offset += 4;
+                // Zero-copy Float32Array view
+                const vectorView = new Float32Array(buffer.buffer, buffer.byteOffset + offset, dimensions);
+                // Copy to avoid buffer lifecycle issues
+                const vector = new Float32Array(vectorView);
+                offset += dimensions * 4;
+                imageVectors.push({ id, vector });
+            }
+            // Combine all vectors for backward compatibility
+            const allVectors = [...textVectors, ...imageVectors];
+            return {
+                dimensions,
+                maxElements,
+                M,
+                efConstruction,
+                seed,
+                currentSize,
+                vectors: allVectors,
+                hasContentTypeGroups: true,
+                textVectors,
+                imageVectors
+            };
+        }
+        else {
+            // Load original format
+            const vectors = [];
+            for (let i = 0; i < currentSize; i++) {
+                // Ensure 4-byte alignment (should always be true with our format)
+                if (offset % 4 !== 0) {
+                    throw new Error(`Offset ${offset} is not 4-byte aligned`);
+                }
+                // Read vector ID
+                const id = view.getUint32(offset, true);
+                offset += 4;
+                // Zero-copy Float32Array view (fast!)
+                const vectorView = new Float32Array(buffer.buffer, buffer.byteOffset + offset, dimensions);
+                // Copy to avoid buffer lifecycle issues
+                const vector = new Float32Array(vectorView);
+                offset += dimensions * 4;
+                vectors.push({ id, vector });
+            }
+            return {
+                dimensions,
+                maxElements,
+                M,
+                efConstruction,
+                seed,
+                currentSize,
+                vectors
+            };
         }
-        return {
-            dimensions,
-            maxElements,
-            M,
-            efConstruction,
-            seed,
-            currentSize,
-            vectors
-        };
     }
 }
 //# sourceMappingURL=binary-index-format.js.map
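The size and offset arithmetic used by `saveGrouped` can be summarised in a few lines. This is a sketch for reading the hunk above, not code from the package; it uses the same constants as the implementation (44-byte extended header, 4-byte little-endian id plus `dimensions` float32 values per vector):

```ts
// Sketch: layout of the grouped binary index file as written by saveGrouped.
function groupedLayout(dimensions: number, textCount: number, imageCount: number) {
  const headerSize = 44;                 // 24-byte base header + 5 extended uint32 fields
  const vectorSize = 4 + dimensions * 4; // uint32 id + dimensions * float32
  const textOffset = headerSize;         // text vectors start immediately after the header
  const imageOffset = textOffset + textCount * vectorSize;
  const totalSize = imageOffset + imageCount * vectorSize;
  return { headerSize, vectorSize, textOffset, imageOffset, totalSize };
}

// Example: 512-dimensional vectors, 1000 text + 200 image entries
// -> vectorSize = 2052 bytes, imageOffset = 44 + 1000 * 2052 = 2,052,044,
//    totalSize = 2,052,044 + 200 * 2052 = 2,462,444 bytes (~2.3 MB).
```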
package/dist/core/ingestion.d.ts
CHANGED

@@ -162,9 +162,13 @@ export declare class IngestionPipeline {
      */
     private storeDocumentsAndChunksWithContentTypes;
     /**
-     * Update vector index with new embeddings
+     * Update vector index with new embeddings (supports grouped content type storage)
      */
     private updateVectorIndex;
+    /**
+     * Filter documents based on ingestion mode to avoid processing incompatible content types
+     */
+    private filterDocumentsByMode;
     /**
      * Converts MIME type to simple content type for embedding function
      * @param mimeType - MIME type string (e.g., 'text/plain', 'image/jpeg')
package/dist/core/ingestion.js
CHANGED

@@ -287,21 +287,30 @@ export class IngestionPipeline {
         try {
             // Phase 1: File Discovery and Processing with Content-Type Detection
             console.log('\n--- Phase 1: File Discovery and Processing ---');
-            const
-
+            const mode = options.mode || 'text';
+            const fileOptions = {
+                recursive: true,
+                maxFileSize: 10 * 1024 * 1024, // 10MB
+                ...options.fileOptions,
+                mode
+            };
+            const fileResult = await discoverAndProcessFiles(path, fileOptions, this.pathManager);
+            // Additional filtering as fallback (should be minimal with mode-aware discovery)
+            const filteredResult = this.filterDocumentsByMode(fileResult, mode);
+            if (filteredResult.documents.length === 0) {
                 console.log('No documents found to process');
                 return {
                     documentsProcessed: 0,
                     chunksCreated: 0,
                     embeddingsGenerated: 0,
-                    documentErrors:
+                    documentErrors: filteredResult.processingResult.errors.length,
                     embeddingErrors: 0,
                     processingTimeMs: Date.now() - startTime,
                     contentIds: []
                 };
             }
             // Content-type detection and routing
-            const contentTypeStats = this.analyzeContentTypes(
+            const contentTypeStats = this.analyzeContentTypes(filteredResult.documents);
             console.log(`📊 Content analysis: ${contentTypeStats.text} text, ${contentTypeStats.image} image, ${contentTypeStats.other} other files`);
             // Phase 2: Document Chunking with Content-Type Awareness
             console.log('\n--- Phase 2: Document Chunking ---');
@@ -309,7 +318,7 @@ export class IngestionPipeline {
                 chunkSize: config.chunk_size,
                 chunkOverlap: config.chunk_overlap
             };
-            const chunkingResult = await this.chunkDocumentsWithContentTypes(
+            const chunkingResult = await this.chunkDocumentsWithContentTypes(filteredResult.documents, effectiveChunkConfig, options.mode);
             if (chunkingResult.totalChunks === 0) {
                 console.log('No chunks created from documents');
                 return {
@@ -334,10 +343,10 @@ export class IngestionPipeline {
             const endTime = Date.now();
             const processingTimeMs = endTime - startTime;
             const result = {
-                documentsProcessed:
+                documentsProcessed: filteredResult.documents.length,
                 chunksCreated: chunkingResult.totalChunks,
                 embeddingsGenerated: embeddingResult.embeddings.length,
-                documentErrors:
+                documentErrors: filteredResult.processingResult.errors.length,
                 embeddingErrors: embeddingResult.errors,
                 processingTimeMs,
                 contentIds
@@ -595,16 +604,35 @@ export class IngestionPipeline {
         return contentIds;
     }
     /**
-     * Update vector index with new embeddings
+     * Update vector index with new embeddings (supports grouped content type storage)
      */
     async updateVectorIndex(embeddings) {
+        console.log('updateVectorIndex called with', embeddings.length, 'embeddings');
         if (embeddings.length === 0) {
            console.log('No embeddings to add to vector index');
            return;
         }
         console.log(`Adding ${embeddings.length} vector${embeddings.length === 1 ? '' : 's'} to search index...`);
         try {
-
+            // Group embeddings by content type for optimized storage
+            const groupedEmbeddings = embeddings.reduce((groups, embedding) => {
+                const contentType = embedding.contentType || 'text';
+                if (!groups[contentType]) {
+                    groups[contentType] = [];
+                }
+                groups[contentType].push(embedding);
+                return groups;
+            }, {});
+            const textEmbeddings = groupedEmbeddings.text || [];
+            const imageEmbeddings = groupedEmbeddings.image || [];
+            console.log(`Grouped: ${textEmbeddings.length} text, ${imageEmbeddings.length} image vectors`);
+            // Use grouped storage method if available, fallback to regular method
+            if (this.indexManager.addGroupedEmbeddings) {
+                await this.indexManager.addGroupedEmbeddings(textEmbeddings, imageEmbeddings);
+            }
+            else {
+                await this.indexManager.addVectors(embeddings);
+            }
             console.log(`✓ Vector index updated successfully with ${embeddings.length} new vectors`);
         }
         catch (error) {
@@ -612,6 +640,45 @@ export class IngestionPipeline {
             throw error;
         }
     }
+    /**
+     * Filter documents based on ingestion mode to avoid processing incompatible content types
+     */
+    filterDocumentsByMode(fileResult, mode) {
+        if (mode === 'multimodal') {
+            // In multimodal mode, keep all documents
+            return fileResult;
+        }
+        // In text mode, filter out image documents
+        const filteredDocuments = fileResult.documents.filter(doc => {
+            const contentType = doc.metadata?.contentType || 'text';
+            const isCompatible = contentType === 'text' ||
+                contentType.startsWith('text/') ||
+                contentType === 'application/pdf' ||
+                contentType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document';
+            if (!isCompatible) {
+                console.log(`⚠️ Skipping ${doc.source} (${contentType}) - not compatible with text mode`);
+            }
+            return isCompatible;
+        });
+        // Update processing result to reflect filtering
+        const filteredProcessingResult = {
+            ...fileResult.processingResult,
+            skippedFiles: [
+                ...(fileResult.processingResult.skippedFiles || []),
+                ...fileResult.documents
+                    .filter(doc => !filteredDocuments.includes(doc))
+                    .map(doc => ({
+                    path: doc.source,
+                    reason: `Content type not compatible with ${mode} mode`
+                }))
+            ]
+        };
+        return {
+            documents: filteredDocuments,
+            discoveryResult: fileResult.discoveryResult,
+            processingResult: filteredProcessingResult
+        };
+    }
     /**
      * Converts MIME type to simple content type for embedding function
      * @param mimeType - MIME type string (e.g., 'text/plain', 'image/jpeg')
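The ingestion changes hang together as one flow: embeddings are bucketed by content type, handed to the index manager's grouped method when it exists, and the index is later persisted in the grouped binary format. The grouping step itself, written as a standalone helper with a minimal embedding type (the real `EmbeddingResult` carries more fields), looks roughly like this:

```ts
// Sketch only: minimal shape of an embedding for the grouping step.
interface EmbeddingLike {
  embedding_id: string;
  vector: Float32Array;
  contentType?: 'text' | 'image';
}

function groupByContentType(embeddings: EmbeddingLike[]) {
  const text: EmbeddingLike[] = [];
  const image: EmbeddingLike[] = [];
  for (const embedding of embeddings) {
    // Embeddings without an explicit contentType are treated as text, matching the diff.
    (embedding.contentType === 'image' ? image : text).push(embedding);
  }
  return { text, image };
}
```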
package/dist/core/reranking-strategies.js
CHANGED

@@ -194,7 +194,7 @@ export class TextDerivedRerankingStrategy {
         catch (error) {
             console.warn(`Failed to generate description for image ${imagePath}: ${error instanceof Error ? error.message : 'Unknown error'}`);
             // Fallback to filename-based description
-            const filename = imagePath.split('/').pop() || imagePath
+            const filename = imagePath.split('/').pop() || imagePath;
             return `Image file: ${filename}`;
         }
     }
@@ -211,17 +211,16 @@ export class TextDerivedRerankingStrategy {
         // Step 1: Convert images to text descriptions
         const processedResults = await Promise.all(results.map(async (result) => {
             if (result.contentType === 'image') {
-                // Generate text description for image
-                const description = await this.generateImageDescription(result.
+                // Generate text description for image
+                const description = await this.generateImageDescription(result.content);
                 return {
                     ...result,
                     content: description,
-                    contentType: 'text', // Change to 'text' so cross-encoder will process it
                     originalContent: result.content,
                     originalContentType: result.contentType,
                     metadata: {
                         ...result.metadata,
-                        originalImagePath: result.
+                        originalImagePath: result.content,
                         generatedDescription: description
                     }
                 };
package/dist/core/search.js
CHANGED

@@ -139,7 +139,8 @@ export class SearchEngine {
         const searchStartTime = performance.now();
         let searchResult;
         try {
-
+            const contentType = options.contentType;
+            searchResult = this.indexManager.search(queryVector, topK, contentType);
         }
         catch (error) {
             if (error instanceof Error && error.message.includes('No embedding ID found for hash')) {
package/dist/core/types.d.ts
CHANGED
package/dist/core/vector-index.d.ts
CHANGED

@@ -64,5 +64,9 @@ export declare class VectorIndex {
      * Resize index to accommodate more vectors
      */
     resizeIndex(newMaxElements: number): void;
+    /**
+     * Get index options (for external access to configuration)
+     */
+    getOptions(): VectorIndexOptions;
 }
 //# sourceMappingURL=vector-index.d.ts.map
package/dist/core/vector-index.js
CHANGED

@@ -321,5 +321,11 @@ export class VectorIndex {
             throw new Error(`Failed to resize index: ${error}`);
         }
     }
+    /**
+     * Get index options (for external access to configuration)
+     */
+    getOptions() {
+        return { ...this.options };
+    }
 }
 //# sourceMappingURL=vector-index.js.map
package/dist/file-processor.d.ts
CHANGED

@@ -8,6 +8,8 @@ export interface FileProcessorOptions {
     recursive?: boolean;
     /** Maximum file size in bytes (default: 10MB) */
     maxFileSize?: number;
+    /** Processing mode to filter compatible files */
+    mode?: 'text' | 'multimodal';
 }
 /**
  * Default options for file processing
package/dist/file-processor.js
CHANGED

@@ -188,6 +188,15 @@ async function discoverFilesRecursive(dirPath, options) {
             // Check file size based on content type
             const stats = await fs.stat(fullPath);
             const contentType = getContentType(fullPath);
+            // Filter by mode: skip incompatible content types
+            const mode = options.mode || 'text';
+            if (mode === 'text' && contentType === 'image') {
+                result.skipped.push({
+                    path: fullPath,
+                    reason: `Image files not supported in text mode. Use --mode multimodal for image processing.`
+                });
+                continue;
+            }
             // Different size limits for different content types
             const maxSize = contentType === 'image'
                 ? 50 * 1024 * 1024 // 50MB for images
@@ -250,6 +259,17 @@ export async function discoverFiles(path, options = DEFAULT_FILE_PROCESSOR_OPTIO
         };
     }
     const contentType = getContentType(resolvedPath);
+    // Filter by mode: skip incompatible content types
+    const mode = options.mode || 'text';
+    if (mode === 'text' && contentType === 'image') {
+        return {
+            files: [],
+            skipped: [{
+                    path: resolvedPath,
+                    reason: `Image files not supported in text mode. Use --mode multimodal for image processing.`
+                }]
+        };
+    }
     // Check file size based on content type
     const maxSize = contentType === 'image'
         ? 50 * 1024 * 1024 // 50MB for images
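For discovery, the new `mode` field on `FileProcessorOptions` decides whether image files are processed or reported as skipped. A small usage sketch (option values taken from the defaults visible in the hunks above; the variable names are illustrative only):

```ts
// Sketch: mode-aware discovery options. In text mode, image files are returned under
// `skipped` with the reason string shown above; in multimodal mode they pass through
// (subject to the 50MB image size limit).
const textModeOptions = {
  recursive: true,
  maxFileSize: 10 * 1024 * 1024, // 10MB default for text content
  mode: 'text' as const,
};

const multimodalOptions = {
  ...textModeOptions,
  mode: 'multimodal' as const,
};
```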
package/dist/index-manager.d.ts
CHANGED

@@ -7,12 +7,16 @@ export interface IndexStats {
 export declare class IndexManager {
     private modelName?;
     private vectorIndex;
+    private textIndex?;
+    private imageIndex?;
     private db;
     private indexPath;
     private dbPath;
     private isInitialized;
     private hashToEmbeddingId;
     private embeddingIdToHash;
+    private groupedEmbeddings?;
+    private vectorIndexOptions;
     constructor(indexPath: string, dbPath: string, dimensions: number, modelName?: string | undefined);
     /**
      * Initialize the index manager and load existing index if available
@@ -30,6 +34,10 @@ export declare class IndexManager {
      * Requirements: 5.3 - When new documents are added THEN system SHALL append new chunks and vectors without rebuilding existing index
      */
     addVectors(embeddings: EmbeddingResult[]): Promise<void>;
+    /**
+     * Add grouped embeddings by content type (for new grouped format)
+     */
+    addGroupedEmbeddings(textEmbeddings: EmbeddingResult[], imageEmbeddings: EmbeddingResult[]): Promise<void>;
     /**
      * Rebuild the entire index from scratch
      * Requirements: 5.2, 5.4 - Create full index rebuild functionality for model changes or document deletions
@@ -68,10 +76,18 @@ export declare class IndexManager {
      * Save the vector index to disk
      */
     saveIndex(): Promise<void>;
+    /**
+     * Create specialized indexes for text and image content when grouped data is available
+     */
+    private createSpecializedIndexes;
+    /**
+     * Save index with content type grouping (for new grouped format)
+     */
+    saveGroupedIndex(textEmbeddings: EmbeddingResult[], imageEmbeddings: EmbeddingResult[]): Promise<void>;
     /**
      * Search for similar vectors
      */
-    search(queryVector: Float32Array, k?: number): {
+    search(queryVector: Float32Array, k?: number, contentType?: 'text' | 'image' | 'combined'): {
         embeddingIds: string[];
         distances: number[];
     };
package/dist/index-manager.js
CHANGED

@@ -1,26 +1,33 @@
 import { VectorIndex } from './core/vector-index.js';
+import { BinaryIndexFormat } from './core/binary-index-format.js';
 import { openDatabase, getSystemInfo, setSystemInfo } from './core/db.js';
 import { config, getModelDefaults } from './core/config.js';
 export class IndexManager {
     modelName;
     vectorIndex;
+    textIndex;
+    imageIndex;
     db = null;
     indexPath;
     dbPath;
     isInitialized = false;
     hashToEmbeddingId = new Map();
     embeddingIdToHash = new Map();
+    groupedEmbeddings;
+    vectorIndexOptions;
     constructor(indexPath, dbPath, dimensions, modelName) {
         this.modelName = modelName;
         this.indexPath = indexPath;
         this.dbPath = dbPath;
-        //
-        this.
+        // Store options for creating specialized indexes
+        this.vectorIndexOptions = {
             dimensions: dimensions,
             maxElements: 100000, // Start with 100k capacity
             efConstruction: 200,
             M: 16
-        }
+        };
+        // Initialize with provided dimensions from config
+        this.vectorIndex = new VectorIndex(indexPath, this.vectorIndexOptions);
     }
     /**
      * Initialize the index manager and load existing index if available
@@ -47,6 +54,8 @@ export class IndexManager {
             // Only try to load existing index if not forcing recreation
             console.log('Loading existing vector index...');
             await this.vectorIndex.loadIndex();
+            // Check if the loaded index has grouped data and create specialized indexes
+            await this.createSpecializedIndexes();
         }
         // Always populate the embedding ID mapping from existing database entries
         // This is needed both for new and existing indexes
@@ -55,7 +64,8 @@ export class IndexManager {
             this.hashEmbeddingId(chunk.embedding_id); // This will populate the mapping
         }
         this.isInitialized = true;
-
+        const vectorCount = this.vectorIndex.getCurrentCount();
+        console.log(`Index manager initialized with ${vectorCount} vectors${this.textIndex && this.imageIndex ? ' (multi-graph mode)' : ''}`);
         }
         catch (error) {
             throw new Error(`Failed to initialize index manager: ${error}`);
@@ -153,6 +163,31 @@ export class IndexManager {
             throw new Error(`Failed to add vectors to index: ${error instanceof Error ? error.message : 'Unknown error'}`);
         }
     }
+    /**
+     * Add grouped embeddings by content type (for new grouped format)
+     */
+    async addGroupedEmbeddings(textEmbeddings, imageEmbeddings) {
+        if (!this.isInitialized) {
+            throw new Error('Index manager not initialized');
+        }
+        console.log(`addGroupedEmbeddings: text=${textEmbeddings.length}, image=${imageEmbeddings.length}`);
+        const allEmbeddings = [...textEmbeddings, ...imageEmbeddings];
+        if (allEmbeddings.length === 0) {
+            return;
+        }
+        try {
+            // Store grouped information for later saving
+            this.groupedEmbeddings = { text: textEmbeddings, image: imageEmbeddings };
+            console.log('addGroupedEmbeddings: stored grouped embeddings');
+            // Add all embeddings to the index (maintains current behavior)
+            await this.addVectors(allEmbeddings);
+            console.log('addGroupedEmbeddings: addVectors completed');
+            // The saveIndex method will now use grouped format if groupedEmbeddings exists
+        }
+        catch (error) {
+            throw new Error(`Failed to add grouped embeddings to index: ${error instanceof Error ? error.message : 'Unknown error'}`);
+        }
+    }
     /**
      * Rebuild the entire index from scratch
      * Requirements: 5.2, 5.4 - Create full index rebuild functionality for model changes or document deletions
@@ -349,16 +384,122 @@ export class IndexManager {
         if (!this.isInitialized) {
             throw new Error('Index manager not initialized');
         }
-
+        // If we have grouped embeddings, save in grouped format
+        if (this.groupedEmbeddings) {
+            console.log('IndexManager: Saving in grouped format');
+            await this.saveGroupedIndex(this.groupedEmbeddings.text, this.groupedEmbeddings.image);
+            // Clear grouped data after saving
+            this.groupedEmbeddings = undefined;
+        }
+        else {
+            console.log('IndexManager: Saving in standard format');
+            await this.vectorIndex.saveIndex();
+        }
+    }
+    /**
+     * Create specialized indexes for text and image content when grouped data is available
+     */
+    async createSpecializedIndexes() {
+        try {
+            // Load the index data to check if it has grouped information
+            const indexData = await BinaryIndexFormat.load(this.indexPath);
+            if (indexData.hasContentTypeGroups && indexData.textVectors && indexData.imageVectors) {
+                // Only create specialized indexes if we have both text and image vectors
+                // In text-only mode, textVectors would be populated but imageVectors empty
+                // In multimodal mode, both would be populated
+                const hasTextVectors = indexData.textVectors.length > 0;
+                const hasImageVectors = indexData.imageVectors.length > 0;
+                if (hasTextVectors && hasImageVectors) {
+                    console.log('Creating specialized indexes for content type filtering...');
+                    // Create text-only index
+                    this.textIndex = new VectorIndex(`${this.indexPath}.text`, this.vectorIndexOptions);
+                    await this.textIndex.initialize();
+                    this.textIndex.addVectors(indexData.textVectors);
+                    console.log(`✓ Text index created with ${indexData.textVectors.length} vectors`);
+                    // Create image-only index
+                    this.imageIndex = new VectorIndex(`${this.indexPath}.image`, this.vectorIndexOptions);
+                    await this.imageIndex.initialize();
+                    this.imageIndex.addVectors(indexData.imageVectors);
+                    console.log(`✓ Image index created with ${indexData.imageVectors.length} vectors`);
+                    console.log('✓ Specialized indexes ready for content type filtering');
+                }
+                else if (hasTextVectors) {
+                    console.log('Text-only index detected - using combined index for all searches');
+                    // In text-only mode, we don't need specialized indexes
+                    // The combined index (vectorIndex) already contains all text vectors
+                }
+            }
+        }
+        catch (error) {
+            console.warn('Failed to create specialized indexes, falling back to combined index:', error);
+            // Continue without specialized indexes - search will still work with combined index
+        }
+    }
+    /**
+     * Save index with content type grouping (for new grouped format)
+     */
+    async saveGroupedIndex(textEmbeddings, imageEmbeddings) {
+        if (!this.isInitialized) {
+            throw new Error('Index manager not initialized');
+        }
+        console.log(`saveGroupedIndex: text=${textEmbeddings.length}, image=${imageEmbeddings.length}`);
+        // Group vectors by content type
+        const textVectors = textEmbeddings.map((embedding) => ({
+            id: this.hashEmbeddingId(embedding.embedding_id),
+            vector: embedding.vector
+        }));
+        const imageVectors = imageEmbeddings.map((embedding) => ({
+            id: this.hashEmbeddingId(embedding.embedding_id),
+            vector: embedding.vector
+        }));
+        // Get index parameters
+        const options = this.vectorIndex.getOptions();
+        const allVectors = [...textVectors, ...imageVectors];
+        console.log(`saveGroupedIndex: dimensions=${options.dimensions}, totalVectors=${allVectors.length}`);
+        const indexData = {
+            dimensions: options.dimensions,
+            maxElements: options.maxElements,
+            M: options.M || 16,
+            efConstruction: options.efConstruction || 200,
+            seed: options.seed || 100,
+            currentSize: textVectors.length + imageVectors.length,
+            vectors: allVectors, // Required for backward compatibility
+            hasContentTypeGroups: true,
+            textVectors,
+            imageVectors
+        };
+        console.log('saveGroupedIndex: Calling BinaryIndexFormat.saveGrouped');
+        // Save using grouped format
+        await BinaryIndexFormat.saveGrouped(this.indexPath, indexData);
+        console.log(`✓ Saved grouped index with ${textVectors.length} text and ${imageVectors.length} image vectors`);
     }
     /**
      * Search for similar vectors
      */
-    search(queryVector, k = 5) {
+    search(queryVector, k = 5, contentType) {
         if (!this.isInitialized) {
             throw new Error('Index manager not initialized');
         }
-
+        // Select the appropriate index based on content type
+        let targetIndex;
+        // If we have specialized indexes (multimodal mode), use them for filtering
+        if (this.textIndex && this.imageIndex) {
+            if (contentType === 'text') {
+                targetIndex = this.textIndex;
+            }
+            else if (contentType === 'image') {
+                targetIndex = this.imageIndex;
+            }
+            else {
+                // 'combined' or undefined
+                targetIndex = this.vectorIndex;
+            }
+        }
+        else {
+            // No specialized indexes (text-only mode) - ignore contentType and use combined index
+            targetIndex = this.vectorIndex;
+        }
+        const results = targetIndex.search(queryVector, k);
         // Convert numeric IDs back to embedding IDs
         const embeddingIds = results.neighbors.map(id => this.unhashEmbeddingId(id));
         return {
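With both specialized graphs loaded, the extended `search` signature routes a query to the text-only, image-only, or combined index; in a text-only index the extra argument is simply ignored. A sketch of a caller using the new signature (the interface below mirrors only the declaration added in index-manager.d.ts):

```ts
// Sketch only: narrowed interface matching the new search signature.
interface IndexManagerLike {
  search(
    queryVector: Float32Array,
    k?: number,
    contentType?: 'text' | 'image' | 'combined'
  ): { embeddingIds: string[]; distances: number[] };
}

function searchImagesOnly(manager: IndexManagerLike, queryVector: Float32Array) {
  // Hits the image-only graph when specialized indexes exist; otherwise falls back
  // to the combined graph and the contentType argument has no effect.
  return manager.search(queryVector, 5, 'image');
}
```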
package/dist/multimodal/clip-embedder.js
CHANGED

@@ -338,73 +338,78 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
         if (!this.textModel || !this.tokenizer) {
             throw new Error('CLIP text model or tokenizer not initialized');
         }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        // Generate text embedding using CLIPTextModelWithProjection
-        const output = await this.textModel(tokens);
-        // Extract embedding from text_embeds (no pixel_values dependency)
-        const embedding = new Float32Array(output.text_embeds.data);
-        // Validate embedding dimensions and values
-        if (embedding.length !== this.dimensions) {
-            throw new Error(`CLIP embedding dimension mismatch: expected ${this.dimensions}, got ${embedding.length}`);
-        }
-        // Validate that all values are finite numbers
-        const invalidValues = Array.from(embedding).filter(val => !isFinite(val) || isNaN(val));
-        if (invalidValues.length > 0) {
-            throw new Error(`CLIP embedding contains ${invalidValues.length} invalid values`);
-        }
-        // Validate embedding quality - should not be all zeros
-        const nonZeroValues = Array.from(embedding).filter(val => Math.abs(val) > 1e-8);
-        if (nonZeroValues.length === 0) {
-            throw new Error('CLIP embedding is all zeros');
-        }
-        // Calculate embedding magnitude before normalization for quality assessment
-        const magnitudeBeforeNorm = Math.sqrt(Array.from(embedding).reduce((sum, val) => sum + val * val, 0));
-        if (magnitudeBeforeNorm < 1e-6) {
-            throw new Error(`CLIP embedding has critically low magnitude: ${magnitudeBeforeNorm.toExponential(3)}`);
-        }
-        // Apply L2-normalization (CLIP models are trained with normalized embeddings)
-        this.normalizeEmbedding(embedding);
-        // Verify normalization was successful
-        const magnitudeAfterNorm = Math.sqrt(Array.from(embedding).reduce((sum, val) => sum + val * val, 0));
-        if (Math.abs(magnitudeAfterNorm - 1.0) > 0.01) {
-            console.warn(`Warning: Embedding normalization may be imprecise (magnitude: ${magnitudeAfterNorm.toFixed(6)})`);
-        }
-        // Log text embedding generation
-        console.log(`[CLIP] Generated text embedding for: "${processedText.substring(0, 30)}${processedText.length > 30 ? '...' : ''}"`);
-        // Generate unique embedding ID
-        const embeddingId = this.generateEmbeddingId(processedText, 'text');
-        return {
-            embedding_id: embeddingId,
-            vector: embedding,
-            contentType: 'text',
-            metadata: {
-                originalText: text,
-                processedText: processedText,
-                textLength: processedText.length,
-                embeddingMagnitudeBeforeNorm: magnitudeBeforeNorm,
-                embeddingMagnitudeAfterNorm: magnitudeAfterNorm,
-                normalized: true,
-                modelName: this.modelName,
-                modelType: this.modelType,
-                dimensions: this.dimensions
+        try {
+            // Use the validated CLIPTextModelWithProjection approach (no pixel_values errors)
+            // Tokenize text with CLIP's requirements
+            // The tokenizer handles truncation at 77 TOKENS (not characters)
+            const tokens = await this.tokenizer(processedText, {
+                padding: true,
+                truncation: true,
+                max_length: 77, // CLIP's text sequence length limit (77 tokens)
+                return_tensors: 'pt'
+            });
+            // Log token information for debugging (only in development)
+            if (process.env.NODE_ENV === 'development') {
+                const tokenIds = tokens.input_ids?.data || [];
+                const actualTokenCount = Array.from(tokenIds).filter((id) => id !== 0).length;
+                if (actualTokenCount >= 77) {
+                    console.warn(`Text truncated by tokenizer: "${processedText.substring(0, 50)}..." (truncated to 77 tokens)`);
+                }
             }
-
+            // Generate text embedding using CLIPTextModelWithProjection
+            const output = await this.textModel(tokens);
+            // Extract embedding from text_embeds (no pixel_values dependency)
+            const embedding = new Float32Array(output.text_embeds.data);
+            // Validate embedding dimensions and values
+            if (embedding.length !== this.dimensions) {
+                throw new Error(`CLIP embedding dimension mismatch: expected ${this.dimensions}, got ${embedding.length}`);
+            }
+            // Validate that all values are finite numbers
+            const invalidValues = Array.from(embedding).filter(val => !isFinite(val) || isNaN(val));
+            if (invalidValues.length > 0) {
+                throw new Error(`CLIP embedding contains ${invalidValues.length} invalid values`);
+            }
+            // Validate embedding quality - should not be all zeros
+            const nonZeroValues = Array.from(embedding).filter(val => Math.abs(val) > 1e-8);
+            if (nonZeroValues.length === 0) {
+                throw new Error('CLIP embedding is all zeros');
+            }
+            // Calculate embedding magnitude before normalization for quality assessment
+            const magnitudeBeforeNorm = Math.sqrt(Array.from(embedding).reduce((sum, val) => sum + val * val, 0));
+            if (magnitudeBeforeNorm < 1e-6) {
+                throw new Error(`CLIP embedding has critically low magnitude: ${magnitudeBeforeNorm.toExponential(3)}`);
+            }
+            // Apply L2-normalization (CLIP models are trained with normalized embeddings)
+            this.normalizeEmbedding(embedding);
+            // Verify normalization was successful
+            const magnitudeAfterNorm = Math.sqrt(Array.from(embedding).reduce((sum, val) => sum + val * val, 0));
+            if (Math.abs(magnitudeAfterNorm - 1.0) > 0.01) {
+                console.warn(`Warning: Embedding normalization may be imprecise (magnitude: ${magnitudeAfterNorm.toFixed(6)})`);
+            }
+            // Log text embedding generation
+            console.log(`[CLIP] Generated text embedding for: "${processedText.substring(0, 30)}${processedText.length > 30 ? '...' : ''}"`);
+            // Generate unique embedding ID
+            const embeddingId = this.generateEmbeddingId(processedText, 'text');
+            return {
+                embedding_id: embeddingId,
+                vector: embedding,
+                contentType: 'text',
+                metadata: {
+                    originalText: text,
+                    processedText: processedText,
+                    textLength: processedText.length,
+                    embeddingMagnitudeBeforeNorm: magnitudeBeforeNorm,
+                    embeddingMagnitudeAfterNorm: magnitudeAfterNorm,
+                    normalized: true,
+                    modelName: this.modelName,
+                    modelType: this.modelType,
+                    dimensions: this.dimensions
+                }
+            };
+        }
+        catch (error) {
+            throw error;
+        }
     }
     // =============================================================================
     // IMAGE EMBEDDING METHODS
package/package.json
CHANGED