rag-lite-ts 2.0.5 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -201,7 +201,23 @@ export class IngestionPipeline {
201
201
  try {
202
202
  // Convert MIME type to simple content type for embedding function
203
203
  const contentTypeForEmbedding = this.getContentTypeForEmbedding(document.metadata?.contentType);
204
- const embedding = await this.embedFn(chunk.text, contentTypeForEmbedding);
204
+ // For images, use the image path from metadata instead of text description
205
+ let contentForEmbedding = chunk.text;
206
+ if (contentTypeForEmbedding === 'image' && document.metadata) {
207
+ // Try to get image path from metadata (contentPath, originalPath, or source)
208
+ // contentPath is where the image is stored (from contentResult)
209
+ const imagePath = document.metadata.contentPath ||
210
+ document.metadata.originalPath ||
211
+ document.metadata.source;
212
+ if (imagePath) {
213
+ contentForEmbedding = imagePath;
214
+ }
215
+ else {
216
+ // No image path found in metadata: warn and keep the chunk text as the embedding input
217
+ console.warn(`Image chunk ${i + 1} missing image path in metadata, using text content as fallback`);
218
+ }
219
+ }
220
+ const embedding = await this.embedFn(contentForEmbedding, contentTypeForEmbedding);
205
221
  // Enhance embedding result with content type metadata
206
222
  if (!embedding.contentType) {
207
223
  embedding.contentType = contentTypeForEmbedding;
@@ -271,21 +287,30 @@ export class IngestionPipeline {
271
287
  try {
272
288
  // Phase 1: File Discovery and Processing with Content-Type Detection
273
289
  console.log('\n--- Phase 1: File Discovery and Processing ---');
274
- const fileResult = await discoverAndProcessFiles(path, options.fileOptions, this.pathManager);
275
- if (fileResult.documents.length === 0) {
290
+ const mode = options.mode || 'text';
291
+ const fileOptions = {
292
+ recursive: true,
293
+ maxFileSize: 10 * 1024 * 1024, // 10MB
294
+ ...options.fileOptions,
295
+ mode
296
+ };
297
+ const fileResult = await discoverAndProcessFiles(path, fileOptions, this.pathManager);
298
+ // Additional filtering as fallback (should be minimal with mode-aware discovery)
299
+ const filteredResult = this.filterDocumentsByMode(fileResult, mode);
300
+ if (filteredResult.documents.length === 0) {
276
301
  console.log('No documents found to process');
277
302
  return {
278
303
  documentsProcessed: 0,
279
304
  chunksCreated: 0,
280
305
  embeddingsGenerated: 0,
281
- documentErrors: fileResult.processingResult.errors.length,
306
+ documentErrors: filteredResult.processingResult.errors.length,
282
307
  embeddingErrors: 0,
283
308
  processingTimeMs: Date.now() - startTime,
284
309
  contentIds: []
285
310
  };
286
311
  }
287
312
  // Content-type detection and routing
288
- const contentTypeStats = this.analyzeContentTypes(fileResult.documents);
313
+ const contentTypeStats = this.analyzeContentTypes(filteredResult.documents);
289
314
  console.log(`📊 Content analysis: ${contentTypeStats.text} text, ${contentTypeStats.image} image, ${contentTypeStats.other} other files`);
290
315
  // Phase 2: Document Chunking with Content-Type Awareness
291
316
  console.log('\n--- Phase 2: Document Chunking ---');
@@ -293,7 +318,7 @@ export class IngestionPipeline {
293
318
  chunkSize: config.chunk_size,
294
319
  chunkOverlap: config.chunk_overlap
295
320
  };
296
- const chunkingResult = await this.chunkDocumentsWithContentTypes(fileResult.documents, effectiveChunkConfig, options.mode);
321
+ const chunkingResult = await this.chunkDocumentsWithContentTypes(filteredResult.documents, effectiveChunkConfig, options.mode);
297
322
  if (chunkingResult.totalChunks === 0) {
298
323
  console.log('No chunks created from documents');
299
324
  return {
@@ -318,10 +343,10 @@ export class IngestionPipeline {
318
343
  const endTime = Date.now();
319
344
  const processingTimeMs = endTime - startTime;
320
345
  const result = {
321
- documentsProcessed: fileResult.documents.length,
346
+ documentsProcessed: filteredResult.documents.length,
322
347
  chunksCreated: chunkingResult.totalChunks,
323
348
  embeddingsGenerated: embeddingResult.embeddings.length,
324
- documentErrors: fileResult.processingResult.errors.length,
349
+ documentErrors: filteredResult.processingResult.errors.length,
325
350
  embeddingErrors: embeddingResult.errors,
326
351
  processingTimeMs,
327
352
  contentIds
@@ -447,7 +472,20 @@ export class IngestionPipeline {
447
472
  try {
448
473
  // Convert MIME type to simple content type for embedding function
449
474
  const contentTypeForEmbedding = this.getContentTypeForEmbedding(chunk.contentType);
450
- const embedding = await this.embedFn(chunk.text, contentTypeForEmbedding);
475
+ // For images, use the image path from metadata instead of text description
476
+ let contentForEmbedding = chunk.text;
477
+ if (contentTypeForEmbedding === 'image' && chunk.metadata) {
478
+ // Try to get image path from metadata (originalPath, contentPath, or source)
479
+ const imagePath = chunk.metadata.originalPath || chunk.metadata.contentPath || chunk.metadata.source;
480
+ if (imagePath) {
481
+ contentForEmbedding = imagePath;
482
+ }
483
+ else {
484
+ // No image path found in metadata: warn and keep the chunk text as the embedding input
485
+ console.warn(`Image chunk ${i + 1} missing image path in metadata, using text content as fallback`);
486
+ }
487
+ }
488
+ const embedding = await this.embedFn(contentForEmbedding, contentTypeForEmbedding);
451
489
  // Enhance embedding result with content type metadata if not already present
452
490
  if (!embedding.contentType) {
453
491
  embedding.contentType = contentTypeForEmbedding;
@@ -566,16 +604,35 @@ export class IngestionPipeline {
566
604
  return contentIds;
567
605
  }
568
606
  /**
569
- * Update vector index with new embeddings
607
+ * Update vector index with new embeddings (supports grouped content type storage)
570
608
  */
571
609
  async updateVectorIndex(embeddings) {
610
+ console.log('updateVectorIndex called with', embeddings.length, 'embeddings');
572
611
  if (embeddings.length === 0) {
573
612
  console.log('No embeddings to add to vector index');
574
613
  return;
575
614
  }
576
615
  console.log(`Adding ${embeddings.length} vector${embeddings.length === 1 ? '' : 's'} to search index...`);
577
616
  try {
578
- await this.indexManager.addVectors(embeddings);
617
+ // Group embeddings by content type for optimized storage
618
+ const groupedEmbeddings = embeddings.reduce((groups, embedding) => {
619
+ const contentType = embedding.contentType || 'text';
620
+ if (!groups[contentType]) {
621
+ groups[contentType] = [];
622
+ }
623
+ groups[contentType].push(embedding);
624
+ return groups;
625
+ }, {});
626
+ const textEmbeddings = groupedEmbeddings.text || [];
627
+ const imageEmbeddings = groupedEmbeddings.image || [];
628
+ console.log(`Grouped: ${textEmbeddings.length} text, ${imageEmbeddings.length} image vectors`);
629
+ // Use grouped storage method if available, fallback to regular method
630
+ if (this.indexManager.addGroupedEmbeddings) {
631
+ await this.indexManager.addGroupedEmbeddings(textEmbeddings, imageEmbeddings);
632
+ }
633
+ else {
634
+ await this.indexManager.addVectors(embeddings);
635
+ }
579
636
  console.log(`✓ Vector index updated successfully with ${embeddings.length} new vectors`);
580
637
  }
581
638
  catch (error) {
@@ -583,26 +640,72 @@ export class IngestionPipeline {
583
640
  throw error;
584
641
  }
585
642
  }
643
+ /**
644
+ * Filter documents based on ingestion mode to avoid processing incompatible content types
645
+ */
646
+ filterDocumentsByMode(fileResult, mode) {
647
+ if (mode === 'multimodal') {
648
+ // In multimodal mode, keep all documents
649
+ return fileResult;
650
+ }
651
+ // In text mode, filter out image documents
652
+ const filteredDocuments = fileResult.documents.filter(doc => {
653
+ const contentType = doc.metadata?.contentType || 'text';
654
+ const isCompatible = contentType === 'text' ||
655
+ contentType.startsWith('text/') ||
656
+ contentType === 'application/pdf' ||
657
+ contentType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document';
658
+ if (!isCompatible) {
659
+ console.log(`⚠️ Skipping ${doc.source} (${contentType}) - not compatible with text mode`);
660
+ }
661
+ return isCompatible;
662
+ });
663
+ // Update processing result to reflect filtering
664
+ const filteredProcessingResult = {
665
+ ...fileResult.processingResult,
666
+ skippedFiles: [
667
+ ...(fileResult.processingResult.skippedFiles || []),
668
+ ...fileResult.documents
669
+ .filter(doc => !filteredDocuments.includes(doc))
670
+ .map(doc => ({
671
+ path: doc.source,
672
+ reason: `Content type not compatible with ${mode} mode`
673
+ }))
674
+ ]
675
+ };
676
+ return {
677
+ documents: filteredDocuments,
678
+ discoveryResult: fileResult.discoveryResult,
679
+ processingResult: filteredProcessingResult
680
+ };
681
+ }
586
682
  /**
587
683
  * Converts MIME type to simple content type for embedding function
588
684
  * @param contentType - Simple content type ('text', 'image') or MIME type string (e.g., 'text/plain', 'image/jpeg')
589
685
  * @returns Simple content type ('text', 'image', etc.)
590
686
  */
591
- getContentTypeForEmbedding(mimeType) {
592
- if (!mimeType) {
687
+ getContentTypeForEmbedding(contentType) {
688
+ if (!contentType) {
689
+ return 'text';
690
+ }
691
+ // Handle simple content type strings (used by chunking)
692
+ if (contentType === 'image') {
693
+ return 'image';
694
+ }
695
+ else if (contentType === 'text') {
593
696
  return 'text';
594
697
  }
595
- // Convert MIME types to simple content types
596
- if (mimeType.startsWith('text/')) {
698
+ // Convert MIME types to simple content types (legacy support)
699
+ if (contentType.startsWith('text/')) {
597
700
  return 'text';
598
701
  }
599
- else if (mimeType.startsWith('image/')) {
702
+ else if (contentType.startsWith('image/')) {
600
703
  return 'image';
601
704
  }
602
- else if (mimeType === 'application/pdf') {
705
+ else if (contentType === 'application/pdf') {
603
706
  return 'text'; // PDFs are processed as text
604
707
  }
605
- else if (mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') {
708
+ else if (contentType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') {
606
709
  return 'text'; // DOCX files are processed as text
607
710
  }
608
711
  else {
@@ -671,6 +774,7 @@ export class IngestionPipeline {
671
774
  contentType: 'image',
672
775
  contentId: contentResult.contentId,
673
776
  storageType: contentResult.storageType,
777
+ contentPath: contentResult.contentPath, // Store contentPath for embedding
674
778
  originalPath: metadata.originalPath,
675
779
  ...imageMetadata // Spread all image metadata fields
676
780
  }
@@ -687,6 +791,7 @@ export class IngestionPipeline {
687
791
  contentType: 'image',
688
792
  contentId: contentResult.contentId,
689
793
  storageType: contentResult.storageType,
794
+ contentPath: contentResult.contentPath, // Store contentPath for embedding
690
795
  originalPath: metadata.originalPath,
691
796
  processingError: error instanceof Error ? error.message : String(error)
692
797
  }
@@ -59,15 +59,10 @@ export declare class LazyRerankerLoader {
59
59
  */
60
60
  static loadTextDerivedReranker(): Promise<RerankFunction>;
61
61
  /**
62
- * Lazily load metadata-based reranker for multimodal mode
63
- * Only imports when specifically needed
62
+ * Lazily load CLIP AutoProcessor for consistent image preprocessing
63
+ * Shares processor instances across embedder instances to ensure identical preprocessing
64
64
  */
65
- static loadMetadataReranker(): Promise<RerankFunction>;
66
- /**
67
- * Lazily load hybrid reranker for multimodal mode
68
- * Combines multiple reranking strategies (uses text-derived for now)
69
- */
70
- static loadHybridReranker(): Promise<RerankFunction>;
65
+ static loadCLIPAutoProcessor(modelName: string): Promise<any>;
71
66
  /**
72
67
  * Check if a reranker is already loaded in cache
73
68
  */
@@ -198,32 +198,18 @@ export class LazyRerankerLoader {
198
198
  });
199
199
  }
200
200
  /**
201
- * Lazily load metadata-based reranker for multimodal mode
202
- * Only imports when specifically needed
201
+ * Lazily load CLIP AutoProcessor for consistent image preprocessing
202
+ * Shares processor instances across embedder instances to ensure identical preprocessing
203
203
  */
204
- static async loadMetadataReranker() {
205
- const cacheKey = 'reranker:metadata';
204
+ static async loadCLIPAutoProcessor(modelName) {
205
+ const cacheKey = `processor:clip:${modelName}`;
206
206
  return this.cache.getOrLoad(cacheKey, async () => {
207
- console.log('🔄 Lazy loading metadata reranker (multimodal)');
208
- // Dynamic import - only loaded when multimodal mode uses metadata reranking
209
- const { MetadataRerankingStrategy } = await import('./reranking-strategies.js');
210
- const reranker = new MetadataRerankingStrategy();
211
- console.log('✅ Metadata reranker loaded');
212
- return reranker.rerank.bind(reranker);
213
- });
214
- }
215
- /**
216
- * Lazily load hybrid reranker for multimodal mode
217
- * Combines multiple reranking strategies (uses text-derived for now)
218
- */
219
- static async loadHybridReranker() {
220
- const cacheKey = 'reranker:hybrid';
221
- return this.cache.getOrLoad(cacheKey, async () => {
222
- console.log('🔄 Lazy loading hybrid reranker (multimodal)');
223
- // For now, hybrid reranking uses text-derived
224
- // TODO: Implement proper hybrid reranking in future tasks
225
- console.log('🔄 Hybrid reranking not yet implemented, using text-derived');
226
- return this.loadTextDerivedReranker();
207
+ console.log(`🔄 Lazy loading CLIP AutoProcessor: ${modelName}`);
208
+ // Dynamic import - only loaded when CLIP models are used
209
+ const { AutoProcessor } = await import('@huggingface/transformers');
210
+ const processor = await AutoProcessor.from_pretrained(modelName);
211
+ console.log(`✅ CLIP AutoProcessor loaded: ${modelName}`);
212
+ return processor;
227
213
  });
228
214
  }
229
215
  /**
@@ -371,12 +357,8 @@ export class LazyDependencyManager {
371
357
  return LazyRerankerLoader.loadTextReranker();
372
358
  case 'text-derived':
373
359
  return LazyRerankerLoader.loadTextDerivedReranker();
374
- case 'metadata':
375
- return LazyRerankerLoader.loadMetadataReranker();
376
- case 'hybrid':
377
- return LazyRerankerLoader.loadHybridReranker();
378
360
  default:
379
- throw new Error(`Unknown reranking strategy '${strategy}'. Supported strategies: cross-encoder, text-derived, metadata, hybrid, disabled`);
361
+ throw new Error(`Unknown reranking strategy '${strategy}'. Supported strategies: cross-encoder, text-derived, disabled`);
380
362
  }
381
363
  }
382
364
  /**
@@ -526,7 +526,7 @@ export class ModeDetectionService {
526
526
  * @private
527
527
  */
528
528
  validateRerankingStrategy(strategy) {
529
- const validStrategies = ['cross-encoder', 'text-derived', 'metadata', 'hybrid', 'disabled'];
529
+ const validStrategies = ['cross-encoder', 'text-derived', 'disabled'];
530
530
  if (!validStrategies.includes(strategy)) {
531
531
  throw createError.validation(`Invalid reranking strategy '${strategy}'. Must be one of: ${validStrategies.join(', ')}`);
532
532
  }
@@ -4,7 +4,7 @@
4
4
  * Provides straightforward configuration types and validation for different
5
5
  * reranking strategies without complex interface patterns.
6
6
  */
7
- export type RerankingStrategyType = 'cross-encoder' | 'text-derived' | 'metadata' | 'hybrid' | 'disabled';
7
+ export type RerankingStrategyType = 'cross-encoder' | 'text-derived' | 'disabled';
8
8
  export interface RerankingConfig {
9
9
  strategy: RerankingStrategyType;
10
10
  model?: string;
@@ -17,15 +17,13 @@ export const DEFAULT_MULTIMODAL_RERANKING_CONFIG = {
17
17
  semantic: 0.7,
18
18
  metadata: 0.3
19
19
  },
20
- fallback: 'metadata'
20
+ fallback: 'disabled'
21
21
  };
22
22
  // Strategy validation without complex interface patterns
23
23
  export function validateRerankingStrategy(strategy) {
24
24
  const validStrategies = [
25
25
  'cross-encoder',
26
26
  'text-derived',
27
- 'metadata',
28
- 'hybrid',
29
27
  'disabled'
30
28
  ];
31
29
  return validStrategies.includes(strategy);
@@ -36,7 +34,7 @@ export function validateRerankingConfig(config) {
36
34
  throw new Error('Reranking strategy is required');
37
35
  }
38
36
  if (!validateRerankingStrategy(config.strategy)) {
39
- const validStrategies = ['cross-encoder', 'text-derived', 'metadata', 'hybrid', 'disabled'];
37
+ const validStrategies = ['cross-encoder', 'text-derived', 'disabled'];
40
38
  throw new Error(`Invalid reranking strategy '${config.strategy}'. ` +
41
39
  `Valid strategies: ${validStrategies.join(', ')}`);
42
40
  }
@@ -52,23 +50,16 @@ export function validateRerankingConfig(config) {
52
50
  if (visual !== undefined && (visual < 0 || visual > 1)) {
53
51
  throw new Error('Visual weight must be between 0 and 1');
54
52
  }
55
- // Ensure weights sum to reasonable value for hybrid strategy
56
- if (config.strategy === 'hybrid') {
57
- const totalWeight = (semantic || 0) + (metadata || 0) + (visual || 0);
58
- if (totalWeight === 0) {
59
- throw new Error('Hybrid strategy requires at least one weight to be greater than 0');
60
- }
61
- }
62
53
  }
63
54
  // Validate fallback strategy if provided
64
55
  if (config.fallback && !validateRerankingStrategy(config.fallback)) {
65
- const validStrategies = ['cross-encoder', 'text-derived', 'metadata', 'hybrid', 'disabled'];
56
+ const validStrategies = ['cross-encoder', 'text-derived', 'disabled'];
66
57
  throw new Error(`Invalid fallback strategy '${config.fallback}'. ` +
67
58
  `Valid strategies: ${validStrategies.join(', ')}`);
68
59
  }
69
60
  return {
70
61
  strategy: config.strategy,
71
- enabled: config.enabled ?? true,
62
+ enabled: config.strategy === 'disabled' ? false : (config.enabled ?? true),
72
63
  model: config.model,
73
64
  weights: config.weights,
74
65
  fallback: config.fallback || 'disabled'
@@ -91,7 +82,7 @@ export function isStrategySupported(strategy, mode) {
91
82
  case 'text':
92
83
  return strategy === 'cross-encoder' || strategy === 'disabled';
93
84
  case 'multimodal':
94
- return ['text-derived', 'metadata', 'hybrid', 'disabled'].includes(strategy);
85
+ return ['text-derived', 'disabled'].includes(strategy);
95
86
  default:
96
87
  return false;
97
88
  }
@@ -102,7 +93,7 @@ export function getSupportedStrategies(mode) {
102
93
  case 'text':
103
94
  return ['cross-encoder', 'disabled'];
104
95
  case 'multimodal':
105
- return ['text-derived', 'metadata', 'hybrid', 'disabled'];
96
+ return ['text-derived', 'disabled'];
106
97
  default:
107
98
  return ['disabled'];
108
99
  }
@@ -145,7 +136,7 @@ export class RerankingConfigBuilder {
145
136
  .strategy('text-derived')
146
137
  .enabled(true)
147
138
  .weights({ semantic: 0.7, metadata: 0.3 })
148
- .fallback('metadata');
139
+ .fallback('disabled');
149
140
  }
150
141
  static disabled() {
151
142
  return new RerankingConfigBuilder()
@@ -6,7 +6,7 @@
6
6
  * principle of using simple functions over complex factory patterns.
7
7
  */
8
8
  import { getDefaultRerankingConfig, isStrategySupported, getSupportedStrategies, validateRerankingConfig } from './reranking-config.js';
9
- import { createCrossEncoderRerankFunction, createTextDerivedRerankFunction, createMetadataRerankFunction } from './reranking-strategies.js';
9
+ import { createCrossEncoderRerankFunction, createTextDerivedRerankFunction } from './reranking-strategies.js';
10
10
  /**
11
11
  * Simple reranking creation function with conditional logic
12
12
  *
@@ -102,23 +102,6 @@ function createRerankingFunction(mode, strategy, config) {
102
102
  undefined // Use default cross-encoder model
103
103
  );
104
104
  break;
105
- case 'metadata':
106
- console.log(`Creating metadata reranker for ${mode} mode`);
107
- reranker = createMetadataRerankFunction({
108
- weights: config.weights ? {
109
- filename: config.weights.metadata || 0.4,
110
- contentType: 0.3,
111
- metadata: config.weights.metadata || 0.3
112
- } : undefined
113
- });
114
- break;
115
- case 'hybrid':
116
- if (mode !== 'multimodal') {
117
- throw new RerankingStrategyError(strategy, mode, 'Hybrid strategy only supported in multimodal mode', 'UNSUPPORTED_MODE');
118
- }
119
- console.log('Creating hybrid reranker for multimodal mode');
120
- reranker = createHybridRerankFunction(config);
121
- break;
122
105
  case 'disabled':
123
106
  console.log('Reranking explicitly disabled');
124
107
  return undefined;
@@ -241,172 +224,10 @@ function wrapRerankFunctionWithErrorRecovery(reranker, strategy, mode) {
241
224
  };
242
225
  }
243
226
  /**
244
- * Create hybrid reranking function that combines multiple strategies with enhanced error recovery
227
+ * Hybrid reranking strategy was removed in Phase 3. This stub always throws a RerankingStrategyError ('STRATEGY_REMOVED') so stale callers fail with a clear message.
245
228
  */
246
229
  function createHybridRerankFunction(config) {
247
- // Default weights if not specified
248
- const weights = config.weights || {
249
- semantic: 0.6,
250
- metadata: 0.4,
251
- visual: 0.0 // Not implemented yet
252
- };
253
- // Track which strategies are available
254
- const availableStrategies = {};
255
- // Initialize strategies with error handling
256
- try {
257
- if (weights.semantic && weights.semantic > 0) {
258
- availableStrategies.textDerived = createTextDerivedRerankFunction();
259
- console.log('✅ Text-derived strategy initialized for hybrid reranking');
260
- }
261
- }
262
- catch (error) {
263
- console.warn(`⚠️ Text-derived strategy initialization failed for hybrid reranking: ${error instanceof Error ? error.message : 'Unknown error'}`);
264
- }
265
- try {
266
- if (weights.metadata && weights.metadata > 0) {
267
- availableStrategies.metadata = createMetadataRerankFunction();
268
- console.log('✅ Metadata strategy initialized for hybrid reranking');
269
- }
270
- }
271
- catch (error) {
272
- console.warn(`⚠️ Metadata strategy initialization failed for hybrid reranking: ${error instanceof Error ? error.message : 'Unknown error'}`);
273
- }
274
- // Check if any strategies are available
275
- const hasAvailableStrategies = Object.keys(availableStrategies).length > 0;
276
- if (!hasAvailableStrategies) {
277
- throw new RerankingStrategyError('hybrid', 'multimodal', 'No hybrid reranking strategies could be initialized', 'NO_STRATEGIES_AVAILABLE');
278
- }
279
- console.log(`Hybrid reranking initialized with ${Object.keys(availableStrategies).length} available strategies`);
280
- return async (query, results, contentType) => {
281
- const startTime = Date.now();
282
- const strategyResults = {};
283
- try {
284
- console.log(`🔄 Running hybrid reranking with ${Object.keys(availableStrategies).length} strategies`);
285
- // Start with original results
286
- let hybridResults = [...results];
287
- let successfulStrategies = 0;
288
- // Apply text-derived reranking if available and enabled
289
- if (availableStrategies.textDerived && weights.semantic && weights.semantic > 0) {
290
- const strategyStartTime = Date.now();
291
- try {
292
- console.log(`🔧 Applying text-derived reranking (weight: ${weights.semantic})`);
293
- const textDerivedResults = await availableStrategies.textDerived(query, hybridResults, contentType);
294
- // Combine scores with semantic weight
295
- hybridResults = hybridResults.map((result, index) => {
296
- const textDerivedScore = textDerivedResults[index]?.score || result.score;
297
- const combinedScore = result.score * (1 - weights.semantic) + textDerivedScore * weights.semantic;
298
- return {
299
- ...result,
300
- score: combinedScore,
301
- metadata: {
302
- ...result.metadata,
303
- hybridScores: {
304
- ...(result.metadata?.hybridScores || {}),
305
- textDerived: textDerivedScore,
306
- semantic: combinedScore
307
- }
308
- }
309
- };
310
- });
311
- const strategyDuration = Date.now() - strategyStartTime;
312
- strategyResults.textDerived = { success: true, duration: strategyDuration };
313
- successfulStrategies++;
314
- console.log(`✅ Text-derived reranking completed (${strategyDuration}ms)`);
315
- }
316
- catch (error) {
317
- const strategyDuration = Date.now() - strategyStartTime;
318
- const errorMessage = error instanceof Error ? error.message : 'Unknown error';
319
- strategyResults.textDerived = { success: false, error: errorMessage, duration: strategyDuration };
320
- console.warn(`❌ Text-derived reranking failed in hybrid mode (${strategyDuration}ms): ${errorMessage}`);
321
- }
322
- }
323
- // Apply metadata reranking if available and enabled
324
- if (availableStrategies.metadata && weights.metadata && weights.metadata > 0) {
325
- const strategyStartTime = Date.now();
326
- try {
327
- console.log(`🔧 Applying metadata reranking (weight: ${weights.metadata})`);
328
- const metadataResults = await availableStrategies.metadata(query, hybridResults, contentType);
329
- // Combine scores with metadata weight
330
- hybridResults = hybridResults.map((result, index) => {
331
- const metadataScore = metadataResults[index]?.score || result.score;
332
- const currentScore = result.score;
333
- const combinedScore = currentScore * (1 - weights.metadata) + metadataScore * weights.metadata;
334
- return {
335
- ...result,
336
- score: combinedScore,
337
- metadata: {
338
- ...result.metadata,
339
- hybridScores: {
340
- ...(result.metadata?.hybridScores || {}),
341
- metadata: metadataScore,
342
- combined: combinedScore
343
- }
344
- }
345
- };
346
- });
347
- const strategyDuration = Date.now() - strategyStartTime;
348
- strategyResults.metadata = { success: true, duration: strategyDuration };
349
- successfulStrategies++;
350
- console.log(`✅ Metadata reranking completed (${strategyDuration}ms)`);
351
- }
352
- catch (error) {
353
- const strategyDuration = Date.now() - strategyStartTime;
354
- const errorMessage = error instanceof Error ? error.message : 'Unknown error';
355
- strategyResults.metadata = { success: false, error: errorMessage, duration: strategyDuration };
356
- console.warn(`❌ Metadata reranking failed in hybrid mode (${strategyDuration}ms): ${errorMessage}`);
357
- }
358
- }
359
- // Sort by final combined scores
360
- hybridResults.sort((a, b) => b.score - a.score);
361
- const totalDuration = Date.now() - startTime;
362
- // Add hybrid reranking metadata to results
363
- hybridResults = hybridResults.map(result => ({
364
- ...result,
365
- metadata: {
366
- ...result.metadata,
367
- hybridRerankingInfo: {
368
- totalDuration,
369
- successfulStrategies,
370
- strategyResults,
371
- weights
372
- }
373
- }
374
- }));
375
- if (successfulStrategies > 0) {
376
- console.log(`✅ Hybrid reranking completed successfully (${totalDuration}ms, ${successfulStrategies}/${Object.keys(availableStrategies).length} strategies succeeded)`);
377
- }
378
- else {
379
- console.warn(`⚠️ Hybrid reranking completed with no successful strategies (${totalDuration}ms), returning original results`);
380
- return results; // Return original results if no strategies succeeded
381
- }
382
- return hybridResults;
383
- }
384
- catch (error) {
385
- const totalDuration = Date.now() - startTime;
386
- const errorMessage = error instanceof Error ? error.message : 'Unknown error';
387
- console.warn(`❌ Hybrid reranking failed (${totalDuration}ms): ${errorMessage}. ` +
388
- `Returning original results.`);
389
- // Log detailed error information
390
- console.error('Hybrid reranking error details:', {
391
- query: query.substring(0, 100) + (query.length > 100 ? '...' : ''),
392
- resultCount: results.length,
393
- contentType,
394
- availableStrategies: Object.keys(availableStrategies),
395
- weights,
396
- strategyResults,
397
- error: errorMessage
398
- });
399
- return results.map(result => ({
400
- ...result,
401
- metadata: {
402
- ...result.metadata,
403
- hybridRerankingFailed: true,
404
- hybridRerankingError: errorMessage,
405
- fallbackToVectorSimilarity: true
406
- }
407
- }));
408
- }
409
- };
230
+ throw new RerankingStrategyError('hybrid', 'multimodal', 'Hybrid reranking strategy has been removed in this version. Use text-derived instead.', 'STRATEGY_REMOVED');
410
231
  }
411
232
  /**
412
233
  * Create reranker with automatic mode detection
@@ -582,8 +403,6 @@ export function getRerankingStats() {
582
403
  strategiesUsed: {
583
404
  'cross-encoder': 0,
584
405
  'text-derived': 0,
585
- 'metadata': 0,
586
- 'hybrid': 0,
587
406
  'disabled': 0
588
407
  }
589
408
  };
@@ -80,6 +80,16 @@ export declare class SearchEngine {
80
80
  * @returns Promise resolving to array of search results
81
81
  */
82
82
  search(query: string, options?: SearchOptions): Promise<SearchResult[]>;
83
+ /**
84
+ * Perform semantic search using a pre-computed embedding vector
85
+ * Useful for image-based search or when embedding is computed externally
86
+ * @param queryVector - Pre-computed query embedding vector
87
+ * @param options - Search options including top_k and rerank settings
88
+ * @param originalQuery - Optional original query for reranking (text or image path)
89
+ * @param embeddingTime - Optional embedding time for logging
90
+ * @returns Promise resolving to array of search results
91
+ */
92
+ searchWithVector(queryVector: Float32Array, options?: SearchOptions, originalQuery?: string, embeddingTime?: number): Promise<SearchResult[]>;
83
93
  /**
84
94
  * Format search results with proper structure
85
95
  * @param chunks - Database chunks with metadata