rag-lite-ts 2.0.4 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,15 +17,13 @@ export const DEFAULT_MULTIMODAL_RERANKING_CONFIG = {
17
17
  semantic: 0.7,
18
18
  metadata: 0.3
19
19
  },
20
- fallback: 'metadata'
20
+ fallback: 'disabled'
21
21
  };
22
22
  // Strategy validation without complex interface patterns
23
23
  export function validateRerankingStrategy(strategy) {
24
24
  const validStrategies = [
25
25
  'cross-encoder',
26
26
  'text-derived',
27
- 'metadata',
28
- 'hybrid',
29
27
  'disabled'
30
28
  ];
31
29
  return validStrategies.includes(strategy);
@@ -36,7 +34,7 @@ export function validateRerankingConfig(config) {
36
34
  throw new Error('Reranking strategy is required');
37
35
  }
38
36
  if (!validateRerankingStrategy(config.strategy)) {
39
- const validStrategies = ['cross-encoder', 'text-derived', 'metadata', 'hybrid', 'disabled'];
37
+ const validStrategies = ['cross-encoder', 'text-derived', 'disabled'];
40
38
  throw new Error(`Invalid reranking strategy '${config.strategy}'. ` +
41
39
  `Valid strategies: ${validStrategies.join(', ')}`);
42
40
  }
@@ -52,23 +50,16 @@ export function validateRerankingConfig(config) {
52
50
  if (visual !== undefined && (visual < 0 || visual > 1)) {
53
51
  throw new Error('Visual weight must be between 0 and 1');
54
52
  }
55
- // Ensure weights sum to reasonable value for hybrid strategy
56
- if (config.strategy === 'hybrid') {
57
- const totalWeight = (semantic || 0) + (metadata || 0) + (visual || 0);
58
- if (totalWeight === 0) {
59
- throw new Error('Hybrid strategy requires at least one weight to be greater than 0');
60
- }
61
- }
62
53
  }
63
54
  // Validate fallback strategy if provided
64
55
  if (config.fallback && !validateRerankingStrategy(config.fallback)) {
65
- const validStrategies = ['cross-encoder', 'text-derived', 'metadata', 'hybrid', 'disabled'];
56
+ const validStrategies = ['cross-encoder', 'text-derived', 'disabled'];
66
57
  throw new Error(`Invalid fallback strategy '${config.fallback}'. ` +
67
58
  `Valid strategies: ${validStrategies.join(', ')}`);
68
59
  }
69
60
  return {
70
61
  strategy: config.strategy,
71
- enabled: config.enabled ?? true,
62
+ enabled: config.strategy === 'disabled' ? false : (config.enabled ?? true),
72
63
  model: config.model,
73
64
  weights: config.weights,
74
65
  fallback: config.fallback || 'disabled'
@@ -91,7 +82,7 @@ export function isStrategySupported(strategy, mode) {
91
82
  case 'text':
92
83
  return strategy === 'cross-encoder' || strategy === 'disabled';
93
84
  case 'multimodal':
94
- return ['text-derived', 'metadata', 'hybrid', 'disabled'].includes(strategy);
85
+ return ['text-derived', 'disabled'].includes(strategy);
95
86
  default:
96
87
  return false;
97
88
  }
@@ -102,7 +93,7 @@ export function getSupportedStrategies(mode) {
102
93
  case 'text':
103
94
  return ['cross-encoder', 'disabled'];
104
95
  case 'multimodal':
105
- return ['text-derived', 'metadata', 'hybrid', 'disabled'];
96
+ return ['text-derived', 'disabled'];
106
97
  default:
107
98
  return ['disabled'];
108
99
  }
@@ -145,7 +136,7 @@ export class RerankingConfigBuilder {
145
136
  .strategy('text-derived')
146
137
  .enabled(true)
147
138
  .weights({ semantic: 0.7, metadata: 0.3 })
148
- .fallback('metadata');
139
+ .fallback('disabled');
149
140
  }
150
141
  static disabled() {
151
142
  return new RerankingConfigBuilder()
@@ -6,7 +6,7 @@
6
6
  * principle of using simple functions over complex factory patterns.
7
7
  */
8
8
  import { getDefaultRerankingConfig, isStrategySupported, getSupportedStrategies, validateRerankingConfig } from './reranking-config.js';
9
- import { createCrossEncoderRerankFunction, createTextDerivedRerankFunction, createMetadataRerankFunction } from './reranking-strategies.js';
9
+ import { createCrossEncoderRerankFunction, createTextDerivedRerankFunction } from './reranking-strategies.js';
10
10
  /**
11
11
  * Simple reranking creation function with conditional logic
12
12
  *
@@ -102,23 +102,6 @@ function createRerankingFunction(mode, strategy, config) {
102
102
  undefined // Use default cross-encoder model
103
103
  );
104
104
  break;
105
- case 'metadata':
106
- console.log(`Creating metadata reranker for ${mode} mode`);
107
- reranker = createMetadataRerankFunction({
108
- weights: config.weights ? {
109
- filename: config.weights.metadata || 0.4,
110
- contentType: 0.3,
111
- metadata: config.weights.metadata || 0.3
112
- } : undefined
113
- });
114
- break;
115
- case 'hybrid':
116
- if (mode !== 'multimodal') {
117
- throw new RerankingStrategyError(strategy, mode, 'Hybrid strategy only supported in multimodal mode', 'UNSUPPORTED_MODE');
118
- }
119
- console.log('Creating hybrid reranker for multimodal mode');
120
- reranker = createHybridRerankFunction(config);
121
- break;
122
105
  case 'disabled':
123
106
  console.log('Reranking explicitly disabled');
124
107
  return undefined;
@@ -241,172 +224,10 @@ function wrapRerankFunctionWithErrorRecovery(reranker, strategy, mode) {
241
224
  };
242
225
  }
243
226
  /**
244
- * Create hybrid reranking function that combines multiple strategies with enhanced error recovery
227
+ * Hybrid reranking strategy removed in Phase 3 - throwing error for backward compatibility
245
228
  */
246
229
  function createHybridRerankFunction(config) {
247
- // Default weights if not specified
248
- const weights = config.weights || {
249
- semantic: 0.6,
250
- metadata: 0.4,
251
- visual: 0.0 // Not implemented yet
252
- };
253
- // Track which strategies are available
254
- const availableStrategies = {};
255
- // Initialize strategies with error handling
256
- try {
257
- if (weights.semantic && weights.semantic > 0) {
258
- availableStrategies.textDerived = createTextDerivedRerankFunction();
259
- console.log('✅ Text-derived strategy initialized for hybrid reranking');
260
- }
261
- }
262
- catch (error) {
263
- console.warn(`⚠️ Text-derived strategy initialization failed for hybrid reranking: ${error instanceof Error ? error.message : 'Unknown error'}`);
264
- }
265
- try {
266
- if (weights.metadata && weights.metadata > 0) {
267
- availableStrategies.metadata = createMetadataRerankFunction();
268
- console.log('✅ Metadata strategy initialized for hybrid reranking');
269
- }
270
- }
271
- catch (error) {
272
- console.warn(`⚠️ Metadata strategy initialization failed for hybrid reranking: ${error instanceof Error ? error.message : 'Unknown error'}`);
273
- }
274
- // Check if any strategies are available
275
- const hasAvailableStrategies = Object.keys(availableStrategies).length > 0;
276
- if (!hasAvailableStrategies) {
277
- throw new RerankingStrategyError('hybrid', 'multimodal', 'No hybrid reranking strategies could be initialized', 'NO_STRATEGIES_AVAILABLE');
278
- }
279
- console.log(`Hybrid reranking initialized with ${Object.keys(availableStrategies).length} available strategies`);
280
- return async (query, results, contentType) => {
281
- const startTime = Date.now();
282
- const strategyResults = {};
283
- try {
284
- console.log(`🔄 Running hybrid reranking with ${Object.keys(availableStrategies).length} strategies`);
285
- // Start with original results
286
- let hybridResults = [...results];
287
- let successfulStrategies = 0;
288
- // Apply text-derived reranking if available and enabled
289
- if (availableStrategies.textDerived && weights.semantic && weights.semantic > 0) {
290
- const strategyStartTime = Date.now();
291
- try {
292
- console.log(`🔧 Applying text-derived reranking (weight: ${weights.semantic})`);
293
- const textDerivedResults = await availableStrategies.textDerived(query, hybridResults, contentType);
294
- // Combine scores with semantic weight
295
- hybridResults = hybridResults.map((result, index) => {
296
- const textDerivedScore = textDerivedResults[index]?.score || result.score;
297
- const combinedScore = result.score * (1 - weights.semantic) + textDerivedScore * weights.semantic;
298
- return {
299
- ...result,
300
- score: combinedScore,
301
- metadata: {
302
- ...result.metadata,
303
- hybridScores: {
304
- ...(result.metadata?.hybridScores || {}),
305
- textDerived: textDerivedScore,
306
- semantic: combinedScore
307
- }
308
- }
309
- };
310
- });
311
- const strategyDuration = Date.now() - strategyStartTime;
312
- strategyResults.textDerived = { success: true, duration: strategyDuration };
313
- successfulStrategies++;
314
- console.log(`✅ Text-derived reranking completed (${strategyDuration}ms)`);
315
- }
316
- catch (error) {
317
- const strategyDuration = Date.now() - strategyStartTime;
318
- const errorMessage = error instanceof Error ? error.message : 'Unknown error';
319
- strategyResults.textDerived = { success: false, error: errorMessage, duration: strategyDuration };
320
- console.warn(`❌ Text-derived reranking failed in hybrid mode (${strategyDuration}ms): ${errorMessage}`);
321
- }
322
- }
323
- // Apply metadata reranking if available and enabled
324
- if (availableStrategies.metadata && weights.metadata && weights.metadata > 0) {
325
- const strategyStartTime = Date.now();
326
- try {
327
- console.log(`🔧 Applying metadata reranking (weight: ${weights.metadata})`);
328
- const metadataResults = await availableStrategies.metadata(query, hybridResults, contentType);
329
- // Combine scores with metadata weight
330
- hybridResults = hybridResults.map((result, index) => {
331
- const metadataScore = metadataResults[index]?.score || result.score;
332
- const currentScore = result.score;
333
- const combinedScore = currentScore * (1 - weights.metadata) + metadataScore * weights.metadata;
334
- return {
335
- ...result,
336
- score: combinedScore,
337
- metadata: {
338
- ...result.metadata,
339
- hybridScores: {
340
- ...(result.metadata?.hybridScores || {}),
341
- metadata: metadataScore,
342
- combined: combinedScore
343
- }
344
- }
345
- };
346
- });
347
- const strategyDuration = Date.now() - strategyStartTime;
348
- strategyResults.metadata = { success: true, duration: strategyDuration };
349
- successfulStrategies++;
350
- console.log(`✅ Metadata reranking completed (${strategyDuration}ms)`);
351
- }
352
- catch (error) {
353
- const strategyDuration = Date.now() - strategyStartTime;
354
- const errorMessage = error instanceof Error ? error.message : 'Unknown error';
355
- strategyResults.metadata = { success: false, error: errorMessage, duration: strategyDuration };
356
- console.warn(`❌ Metadata reranking failed in hybrid mode (${strategyDuration}ms): ${errorMessage}`);
357
- }
358
- }
359
- // Sort by final combined scores
360
- hybridResults.sort((a, b) => b.score - a.score);
361
- const totalDuration = Date.now() - startTime;
362
- // Add hybrid reranking metadata to results
363
- hybridResults = hybridResults.map(result => ({
364
- ...result,
365
- metadata: {
366
- ...result.metadata,
367
- hybridRerankingInfo: {
368
- totalDuration,
369
- successfulStrategies,
370
- strategyResults,
371
- weights
372
- }
373
- }
374
- }));
375
- if (successfulStrategies > 0) {
376
- console.log(`✅ Hybrid reranking completed successfully (${totalDuration}ms, ${successfulStrategies}/${Object.keys(availableStrategies).length} strategies succeeded)`);
377
- }
378
- else {
379
- console.warn(`⚠️ Hybrid reranking completed with no successful strategies (${totalDuration}ms), returning original results`);
380
- return results; // Return original results if no strategies succeeded
381
- }
382
- return hybridResults;
383
- }
384
- catch (error) {
385
- const totalDuration = Date.now() - startTime;
386
- const errorMessage = error instanceof Error ? error.message : 'Unknown error';
387
- console.warn(`❌ Hybrid reranking failed (${totalDuration}ms): ${errorMessage}. ` +
388
- `Returning original results.`);
389
- // Log detailed error information
390
- console.error('Hybrid reranking error details:', {
391
- query: query.substring(0, 100) + (query.length > 100 ? '...' : ''),
392
- resultCount: results.length,
393
- contentType,
394
- availableStrategies: Object.keys(availableStrategies),
395
- weights,
396
- strategyResults,
397
- error: errorMessage
398
- });
399
- return results.map(result => ({
400
- ...result,
401
- metadata: {
402
- ...result.metadata,
403
- hybridRerankingFailed: true,
404
- hybridRerankingError: errorMessage,
405
- fallbackToVectorSimilarity: true
406
- }
407
- }));
408
- }
409
- };
230
+ throw new RerankingStrategyError('hybrid', 'multimodal', 'Hybrid reranking strategy has been removed in this version. Use text-derived instead.', 'STRATEGY_REMOVED');
410
231
  }
411
232
  /**
412
233
  * Create reranker with automatic mode detection
@@ -582,8 +403,6 @@ export function getRerankingStats() {
582
403
  strategiesUsed: {
583
404
  'cross-encoder': 0,
584
405
  'text-derived': 0,
585
- 'metadata': 0,
586
- 'hybrid': 0,
587
406
  'disabled': 0
588
407
  }
589
408
  };
@@ -194,7 +194,7 @@ export class TextDerivedRerankingStrategy {
194
194
  catch (error) {
195
195
  console.warn(`Failed to generate description for image ${imagePath}: ${error instanceof Error ? error.message : 'Unknown error'}`);
196
196
  // Fallback to filename-based description
197
- const filename = imagePath.split('/').pop() || imagePath;
197
+ const filename = imagePath.split('/').pop() || imagePath.split('\\').pop() || imagePath;
198
198
  return `Image file: ${filename}`;
199
199
  }
200
200
  }
@@ -211,16 +211,17 @@ export class TextDerivedRerankingStrategy {
211
211
  // Step 1: Convert images to text descriptions
212
212
  const processedResults = await Promise.all(results.map(async (result) => {
213
213
  if (result.contentType === 'image') {
214
- // Generate text description for image
215
- const description = await this.generateImageDescription(result.content);
214
+ // Generate text description for image using the file path from document.source
215
+ const description = await this.generateImageDescription(result.document.source);
216
216
  return {
217
217
  ...result,
218
218
  content: description,
219
+ contentType: 'text', // Change to 'text' so cross-encoder will process it
219
220
  originalContent: result.content,
220
221
  originalContentType: result.contentType,
221
222
  metadata: {
222
223
  ...result.metadata,
223
- originalImagePath: result.content,
224
+ originalImagePath: result.document.source,
224
225
  generatedDescription: description
225
226
  }
226
227
  };
@@ -80,6 +80,16 @@ export declare class SearchEngine {
80
80
  * @returns Promise resolving to array of search results
81
81
  */
82
82
  search(query: string, options?: SearchOptions): Promise<SearchResult[]>;
83
+ /**
84
+ * Perform semantic search using a pre-computed embedding vector
85
+ * Useful for image-based search or when embedding is computed externally
86
+ * @param queryVector - Pre-computed query embedding vector
87
+ * @param options - Search options including top_k and rerank settings
88
+ * @param originalQuery - Optional original query for reranking (text or image path)
89
+ * @param embeddingTime - Optional embedding time for logging
90
+ * @returns Promise resolving to array of search results
91
+ */
92
+ searchWithVector(queryVector: Float32Array, options?: SearchOptions, originalQuery?: string, embeddingTime?: number): Promise<SearchResult[]>;
83
93
  /**
84
94
  * Format search results with proper structure
85
95
  * @param chunks - Database chunks with metadata
@@ -106,18 +106,40 @@ export class SearchEngine {
106
106
  return [];
107
107
  }
108
108
  const startTime = performance.now();
109
- const topK = options.top_k || config.top_k || 10;
110
- const shouldRerank = options.rerank !== undefined ? options.rerank : (this.rerankFn !== undefined);
111
109
  try {
112
110
  // Step 1: Build query embedding using injected embed function
113
111
  const embeddingStartTime = performance.now();
114
112
  const queryEmbedding = await this.embedFn(query);
115
113
  const embeddingTime = performance.now() - embeddingStartTime;
116
- // Step 2: Search using IndexManager (which handles hash mapping properly)
114
+ // Step 2: Search with the vector
115
+ const results = await this.searchWithVector(queryEmbedding.vector, options, query, embeddingTime);
116
+ return results;
117
+ }
118
+ catch (error) {
119
+ throw new Error(`Search failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
120
+ }
121
+ }
122
+ /**
123
+ * Perform semantic search using a pre-computed embedding vector
124
+ * Useful for image-based search or when embedding is computed externally
125
+ * @param queryVector - Pre-computed query embedding vector
126
+ * @param options - Search options including top_k and rerank settings
127
+ * @param originalQuery - Optional original query for reranking (text or image path)
128
+ * @param embeddingTime - Optional embedding time for logging
129
+ * @returns Promise resolving to array of search results
130
+ */
131
+ async searchWithVector(queryVector, options = {}, originalQuery, embeddingTime) {
132
+ const startTime = performance.now();
133
+ const topK = options.top_k || config.top_k || 10;
134
+ // Phase 1: Disable reranking by default for better performance
135
+ // Users must explicitly opt-in with --rerank flag
136
+ const shouldRerank = options.rerank === true;
137
+ try {
138
+ // Step 1: Search using IndexManager (which handles hash mapping properly)
117
139
  const searchStartTime = performance.now();
118
140
  let searchResult;
119
141
  try {
120
- searchResult = this.indexManager.search(queryEmbedding.vector, topK);
142
+ searchResult = this.indexManager.search(queryVector, topK);
121
143
  }
122
144
  catch (error) {
123
145
  if (error instanceof Error && error.message.includes('No embedding ID found for hash')) {
@@ -133,18 +155,18 @@ export class SearchEngine {
133
155
  console.log(`No similar documents found (${totalTime.toFixed(2)}ms total)`);
134
156
  return [];
135
157
  }
136
- // Step 3: Retrieve chunks from database using embedding IDs
158
+ // Step 2: Retrieve chunks from database using embedding IDs
137
159
  const retrievalStartTime = performance.now();
138
160
  const chunks = await getChunksByEmbeddingIds(this.db, searchResult.embeddingIds);
139
161
  const retrievalTime = performance.now() - retrievalStartTime;
140
- // Step 4: Format results as JSON with text, score, and document metadata
162
+ // Step 3: Format results as JSON with text, score, and document metadata
141
163
  let results = this.formatSearchResults(chunks, searchResult.distances, searchResult.embeddingIds);
142
- // Step 5: Optional reranking with injected rerank function
164
+ // Step 4: Optional reranking with injected rerank function
143
165
  let rerankTime = 0;
144
- if (shouldRerank && this.rerankFn && results.length > 1) {
166
+ if (shouldRerank && this.rerankFn && results.length > 1 && originalQuery) {
145
167
  try {
146
168
  const rerankStartTime = performance.now();
147
- results = await this.rerankFn(query, results);
169
+ results = await this.rerankFn(originalQuery, results);
148
170
  rerankTime = performance.now() - rerankStartTime;
149
171
  }
150
172
  catch (error) {
@@ -154,13 +176,14 @@ export class SearchEngine {
154
176
  }
155
177
  const totalTime = performance.now() - startTime;
156
178
  // Measure latency without premature optimization - just log for monitoring
179
+ const embedTimeStr = embeddingTime !== undefined ? `embed: ${embeddingTime.toFixed(2)}ms, ` : '';
157
180
  console.log(`Search completed: ${results.length} results in ${totalTime.toFixed(2)}ms ` +
158
- `(embed: ${embeddingTime.toFixed(2)}ms, vector: ${vectorSearchTime.toFixed(2)}ms, ` +
181
+ `(${embedTimeStr}vector: ${vectorSearchTime.toFixed(2)}ms, ` +
159
182
  `retrieval: ${retrievalTime.toFixed(2)}ms${rerankTime > 0 ? `, rerank: ${rerankTime.toFixed(2)}ms` : ''})`);
160
183
  return results;
161
184
  }
162
185
  catch (error) {
163
- throw new Error(`Search failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
186
+ throw new Error(`Vector search failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
164
187
  }
165
188
  }
166
189
  /**
@@ -323,7 +323,9 @@ export class IngestionFactory {
323
323
  const { getSystemInfo, setSystemInfo } = await import('../core/db.js');
324
324
  // Determine the effective mode and reranking strategy
325
325
  const effectiveMode = options.mode || 'text';
326
- const effectiveRerankingStrategy = options.rerankingStrategy || 'cross-encoder';
326
+ // Phase 1: Fix mode-specific reranking strategy defaults
327
+ const effectiveRerankingStrategy = options.rerankingStrategy ||
328
+ (effectiveMode === 'multimodal' ? 'text-derived' : 'cross-encoder');
327
329
  // Determine model type based on model name
328
330
  let modelType;
329
331
  if (effectiveModel.includes('clip')) {