rag-lite-ts 2.0.5 → 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +815 -808
- package/dist/cli/indexer.js +3 -39
- package/dist/cli/search.d.ts +1 -1
- package/dist/cli/search.js +123 -19
- package/dist/cli.js +77 -94
- package/dist/core/binary-index-format.d.ts +28 -2
- package/dist/core/binary-index-format.js +196 -27
- package/dist/core/db.js +173 -173
- package/dist/core/ingestion.d.ts +5 -1
- package/dist/core/ingestion.js +123 -18
- package/dist/core/lazy-dependency-loader.d.ts +3 -8
- package/dist/core/lazy-dependency-loader.js +11 -29
- package/dist/core/mode-detection-service.js +1 -1
- package/dist/core/reranking-config.d.ts +1 -1
- package/dist/core/reranking-config.js +7 -16
- package/dist/core/reranking-factory.js +3 -184
- package/dist/core/search.d.ts +10 -0
- package/dist/core/search.js +35 -11
- package/dist/core/types.d.ts +1 -1
- package/dist/core/vector-index.d.ts +4 -0
- package/dist/core/vector-index.js +6 -0
- package/dist/factories/ingestion-factory.js +3 -1
- package/dist/file-processor.d.ts +2 -0
- package/dist/file-processor.js +20 -0
- package/dist/index-manager.d.ts +17 -1
- package/dist/index-manager.js +148 -7
- package/dist/mcp-server.js +127 -105
- package/dist/multimodal/clip-embedder.js +6 -2
- package/package.json +1 -1
package/dist/core/ingestion.js
CHANGED
|
@@ -201,7 +201,23 @@ export class IngestionPipeline {
|
|
|
201
201
|
try {
|
|
202
202
|
// Convert MIME type to simple content type for embedding function
|
|
203
203
|
const contentTypeForEmbedding = this.getContentTypeForEmbedding(document.metadata?.contentType);
|
|
204
|
-
|
|
204
|
+
// For images, use the image path from metadata instead of text description
|
|
205
|
+
let contentForEmbedding = chunk.text;
|
|
206
|
+
if (contentTypeForEmbedding === 'image' && document.metadata) {
|
|
207
|
+
// Try to get image path from metadata (contentPath, originalPath, or source)
|
|
208
|
+
// contentPath is where the image is stored (from contentResult)
|
|
209
|
+
const imagePath = document.metadata.contentPath ||
|
|
210
|
+
document.metadata.originalPath ||
|
|
211
|
+
document.metadata.source;
|
|
212
|
+
if (imagePath) {
|
|
213
|
+
contentForEmbedding = imagePath;
|
|
214
|
+
}
|
|
215
|
+
else {
|
|
216
|
+
// Fallback: try to extract path from source if available
|
|
217
|
+
console.warn(`Image chunk ${i + 1} missing image path in metadata, using text content as fallback`);
|
|
218
|
+
}
|
|
219
|
+
}
|
|
220
|
+
const embedding = await this.embedFn(contentForEmbedding, contentTypeForEmbedding);
|
|
205
221
|
// Enhance embedding result with content type metadata
|
|
206
222
|
if (!embedding.contentType) {
|
|
207
223
|
embedding.contentType = contentTypeForEmbedding;
|
|
@@ -271,21 +287,30 @@ export class IngestionPipeline {
|
|
|
271
287
|
try {
|
|
272
288
|
// Phase 1: File Discovery and Processing with Content-Type Detection
|
|
273
289
|
console.log('\n--- Phase 1: File Discovery and Processing ---');
|
|
274
|
-
const
|
|
275
|
-
|
|
290
|
+
const mode = options.mode || 'text';
|
|
291
|
+
const fileOptions = {
|
|
292
|
+
recursive: true,
|
|
293
|
+
maxFileSize: 10 * 1024 * 1024, // 10MB
|
|
294
|
+
...options.fileOptions,
|
|
295
|
+
mode
|
|
296
|
+
};
|
|
297
|
+
const fileResult = await discoverAndProcessFiles(path, fileOptions, this.pathManager);
|
|
298
|
+
// Additional filtering as fallback (should be minimal with mode-aware discovery)
|
|
299
|
+
const filteredResult = this.filterDocumentsByMode(fileResult, mode);
|
|
300
|
+
if (filteredResult.documents.length === 0) {
|
|
276
301
|
console.log('No documents found to process');
|
|
277
302
|
return {
|
|
278
303
|
documentsProcessed: 0,
|
|
279
304
|
chunksCreated: 0,
|
|
280
305
|
embeddingsGenerated: 0,
|
|
281
|
-
documentErrors:
|
|
306
|
+
documentErrors: filteredResult.processingResult.errors.length,
|
|
282
307
|
embeddingErrors: 0,
|
|
283
308
|
processingTimeMs: Date.now() - startTime,
|
|
284
309
|
contentIds: []
|
|
285
310
|
};
|
|
286
311
|
}
|
|
287
312
|
// Content-type detection and routing
|
|
288
|
-
const contentTypeStats = this.analyzeContentTypes(
|
|
313
|
+
const contentTypeStats = this.analyzeContentTypes(filteredResult.documents);
|
|
289
314
|
console.log(`📊 Content analysis: ${contentTypeStats.text} text, ${contentTypeStats.image} image, ${contentTypeStats.other} other files`);
|
|
290
315
|
// Phase 2: Document Chunking with Content-Type Awareness
|
|
291
316
|
console.log('\n--- Phase 2: Document Chunking ---');
|
|
@@ -293,7 +318,7 @@ export class IngestionPipeline {
|
|
|
293
318
|
chunkSize: config.chunk_size,
|
|
294
319
|
chunkOverlap: config.chunk_overlap
|
|
295
320
|
};
|
|
296
|
-
const chunkingResult = await this.chunkDocumentsWithContentTypes(
|
|
321
|
+
const chunkingResult = await this.chunkDocumentsWithContentTypes(filteredResult.documents, effectiveChunkConfig, options.mode);
|
|
297
322
|
if (chunkingResult.totalChunks === 0) {
|
|
298
323
|
console.log('No chunks created from documents');
|
|
299
324
|
return {
|
|
@@ -318,10 +343,10 @@ export class IngestionPipeline {
|
|
|
318
343
|
const endTime = Date.now();
|
|
319
344
|
const processingTimeMs = endTime - startTime;
|
|
320
345
|
const result = {
|
|
321
|
-
documentsProcessed:
|
|
346
|
+
documentsProcessed: filteredResult.documents.length,
|
|
322
347
|
chunksCreated: chunkingResult.totalChunks,
|
|
323
348
|
embeddingsGenerated: embeddingResult.embeddings.length,
|
|
324
|
-
documentErrors:
|
|
349
|
+
documentErrors: filteredResult.processingResult.errors.length,
|
|
325
350
|
embeddingErrors: embeddingResult.errors,
|
|
326
351
|
processingTimeMs,
|
|
327
352
|
contentIds
|
|
@@ -447,7 +472,20 @@ export class IngestionPipeline {
|
|
|
447
472
|
try {
|
|
448
473
|
// Convert MIME type to simple content type for embedding function
|
|
449
474
|
const contentTypeForEmbedding = this.getContentTypeForEmbedding(chunk.contentType);
|
|
450
|
-
|
|
475
|
+
// For images, use the image path from metadata instead of text description
|
|
476
|
+
let contentForEmbedding = chunk.text;
|
|
477
|
+
if (contentTypeForEmbedding === 'image' && chunk.metadata) {
|
|
478
|
+
// Try to get image path from metadata (originalPath or contentPath)
|
|
479
|
+
const imagePath = chunk.metadata.originalPath || chunk.metadata.contentPath || chunk.metadata.source;
|
|
480
|
+
if (imagePath) {
|
|
481
|
+
contentForEmbedding = imagePath;
|
|
482
|
+
}
|
|
483
|
+
else {
|
|
484
|
+
// Fallback: try to extract path from source if available
|
|
485
|
+
console.warn(`Image chunk ${i + 1} missing image path in metadata, using text content as fallback`);
|
|
486
|
+
}
|
|
487
|
+
}
|
|
488
|
+
const embedding = await this.embedFn(contentForEmbedding, contentTypeForEmbedding);
|
|
451
489
|
// Enhance embedding result with content type metadata if not already present
|
|
452
490
|
if (!embedding.contentType) {
|
|
453
491
|
embedding.contentType = contentTypeForEmbedding;
|
|
@@ -566,16 +604,35 @@ export class IngestionPipeline {
|
|
|
566
604
|
return contentIds;
|
|
567
605
|
}
|
|
568
606
|
/**
|
|
569
|
-
* Update vector index with new embeddings
|
|
607
|
+
* Update vector index with new embeddings (supports grouped content type storage)
|
|
570
608
|
*/
|
|
571
609
|
async updateVectorIndex(embeddings) {
|
|
610
|
+
console.log('updateVectorIndex called with', embeddings.length, 'embeddings');
|
|
572
611
|
if (embeddings.length === 0) {
|
|
573
612
|
console.log('No embeddings to add to vector index');
|
|
574
613
|
return;
|
|
575
614
|
}
|
|
576
615
|
console.log(`Adding ${embeddings.length} vector${embeddings.length === 1 ? '' : 's'} to search index...`);
|
|
577
616
|
try {
|
|
578
|
-
|
|
617
|
+
// Group embeddings by content type for optimized storage
|
|
618
|
+
const groupedEmbeddings = embeddings.reduce((groups, embedding) => {
|
|
619
|
+
const contentType = embedding.contentType || 'text';
|
|
620
|
+
if (!groups[contentType]) {
|
|
621
|
+
groups[contentType] = [];
|
|
622
|
+
}
|
|
623
|
+
groups[contentType].push(embedding);
|
|
624
|
+
return groups;
|
|
625
|
+
}, {});
|
|
626
|
+
const textEmbeddings = groupedEmbeddings.text || [];
|
|
627
|
+
const imageEmbeddings = groupedEmbeddings.image || [];
|
|
628
|
+
console.log(`Grouped: ${textEmbeddings.length} text, ${imageEmbeddings.length} image vectors`);
|
|
629
|
+
// Use grouped storage method if available, fallback to regular method
|
|
630
|
+
if (this.indexManager.addGroupedEmbeddings) {
|
|
631
|
+
await this.indexManager.addGroupedEmbeddings(textEmbeddings, imageEmbeddings);
|
|
632
|
+
}
|
|
633
|
+
else {
|
|
634
|
+
await this.indexManager.addVectors(embeddings);
|
|
635
|
+
}
|
|
579
636
|
console.log(`✓ Vector index updated successfully with ${embeddings.length} new vectors`);
|
|
580
637
|
}
|
|
581
638
|
catch (error) {
|
|
@@ -583,26 +640,72 @@ export class IngestionPipeline {
|
|
|
583
640
|
throw error;
|
|
584
641
|
}
|
|
585
642
|
}
|
|
643
|
+
/**
|
|
644
|
+
* Filter documents based on ingestion mode to avoid processing incompatible content types
|
|
645
|
+
*/
|
|
646
|
+
filterDocumentsByMode(fileResult, mode) {
|
|
647
|
+
if (mode === 'multimodal') {
|
|
648
|
+
// In multimodal mode, keep all documents
|
|
649
|
+
return fileResult;
|
|
650
|
+
}
|
|
651
|
+
// In text mode, filter out image documents
|
|
652
|
+
const filteredDocuments = fileResult.documents.filter(doc => {
|
|
653
|
+
const contentType = doc.metadata?.contentType || 'text';
|
|
654
|
+
const isCompatible = contentType === 'text' ||
|
|
655
|
+
contentType.startsWith('text/') ||
|
|
656
|
+
contentType === 'application/pdf' ||
|
|
657
|
+
contentType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document';
|
|
658
|
+
if (!isCompatible) {
|
|
659
|
+
console.log(`⚠️ Skipping ${doc.source} (${contentType}) - not compatible with text mode`);
|
|
660
|
+
}
|
|
661
|
+
return isCompatible;
|
|
662
|
+
});
|
|
663
|
+
// Update processing result to reflect filtering
|
|
664
|
+
const filteredProcessingResult = {
|
|
665
|
+
...fileResult.processingResult,
|
|
666
|
+
skippedFiles: [
|
|
667
|
+
...(fileResult.processingResult.skippedFiles || []),
|
|
668
|
+
...fileResult.documents
|
|
669
|
+
.filter(doc => !filteredDocuments.includes(doc))
|
|
670
|
+
.map(doc => ({
|
|
671
|
+
path: doc.source,
|
|
672
|
+
reason: `Content type not compatible with ${mode} mode`
|
|
673
|
+
}))
|
|
674
|
+
]
|
|
675
|
+
};
|
|
676
|
+
return {
|
|
677
|
+
documents: filteredDocuments,
|
|
678
|
+
discoveryResult: fileResult.discoveryResult,
|
|
679
|
+
processingResult: filteredProcessingResult
|
|
680
|
+
};
|
|
681
|
+
}
|
|
586
682
|
/**
|
|
587
683
|
* Converts MIME type to simple content type for embedding function
|
|
588
684
|
* @param mimeType - MIME type string (e.g., 'text/plain', 'image/jpeg')
|
|
589
685
|
* @returns Simple content type ('text', 'image', etc.)
|
|
590
686
|
*/
|
|
591
|
-
getContentTypeForEmbedding(
|
|
592
|
-
if (!
|
|
687
|
+
getContentTypeForEmbedding(contentType) {
|
|
688
|
+
if (!contentType) {
|
|
689
|
+
return 'text';
|
|
690
|
+
}
|
|
691
|
+
// Handle simple content type strings (used by chunking)
|
|
692
|
+
if (contentType === 'image') {
|
|
693
|
+
return 'image';
|
|
694
|
+
}
|
|
695
|
+
else if (contentType === 'text') {
|
|
593
696
|
return 'text';
|
|
594
697
|
}
|
|
595
|
-
// Convert MIME types to simple content types
|
|
596
|
-
if (
|
|
698
|
+
// Convert MIME types to simple content types (legacy support)
|
|
699
|
+
if (contentType.startsWith('text/')) {
|
|
597
700
|
return 'text';
|
|
598
701
|
}
|
|
599
|
-
else if (
|
|
702
|
+
else if (contentType.startsWith('image/')) {
|
|
600
703
|
return 'image';
|
|
601
704
|
}
|
|
602
|
-
else if (
|
|
705
|
+
else if (contentType === 'application/pdf') {
|
|
603
706
|
return 'text'; // PDFs are processed as text
|
|
604
707
|
}
|
|
605
|
-
else if (
|
|
708
|
+
else if (contentType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') {
|
|
606
709
|
return 'text'; // DOCX files are processed as text
|
|
607
710
|
}
|
|
608
711
|
else {
|
|
@@ -671,6 +774,7 @@ export class IngestionPipeline {
|
|
|
671
774
|
contentType: 'image',
|
|
672
775
|
contentId: contentResult.contentId,
|
|
673
776
|
storageType: contentResult.storageType,
|
|
777
|
+
contentPath: contentResult.contentPath, // Store contentPath for embedding
|
|
674
778
|
originalPath: metadata.originalPath,
|
|
675
779
|
...imageMetadata // Spread all image metadata fields
|
|
676
780
|
}
|
|
@@ -687,6 +791,7 @@ export class IngestionPipeline {
|
|
|
687
791
|
contentType: 'image',
|
|
688
792
|
contentId: contentResult.contentId,
|
|
689
793
|
storageType: contentResult.storageType,
|
|
794
|
+
contentPath: contentResult.contentPath, // Store contentPath for embedding
|
|
690
795
|
originalPath: metadata.originalPath,
|
|
691
796
|
processingError: error instanceof Error ? error.message : String(error)
|
|
692
797
|
}
|
|
@@ -59,15 +59,10 @@ export declare class LazyRerankerLoader {
|
|
|
59
59
|
*/
|
|
60
60
|
static loadTextDerivedReranker(): Promise<RerankFunction>;
|
|
61
61
|
/**
|
|
62
|
-
* Lazily load
|
|
63
|
-
*
|
|
62
|
+
* Lazily load CLIP AutoProcessor for consistent image preprocessing
|
|
63
|
+
* Shares processor instances across embedder instances to ensure identical preprocessing
|
|
64
64
|
*/
|
|
65
|
-
static
|
|
66
|
-
/**
|
|
67
|
-
* Lazily load hybrid reranker for multimodal mode
|
|
68
|
-
* Combines multiple reranking strategies (uses text-derived for now)
|
|
69
|
-
*/
|
|
70
|
-
static loadHybridReranker(): Promise<RerankFunction>;
|
|
65
|
+
static loadCLIPAutoProcessor(modelName: string): Promise<any>;
|
|
71
66
|
/**
|
|
72
67
|
* Check if a reranker is already loaded in cache
|
|
73
68
|
*/
|
|
@@ -198,32 +198,18 @@ export class LazyRerankerLoader {
|
|
|
198
198
|
});
|
|
199
199
|
}
|
|
200
200
|
/**
|
|
201
|
-
* Lazily load
|
|
202
|
-
*
|
|
201
|
+
* Lazily load CLIP AutoProcessor for consistent image preprocessing
|
|
202
|
+
* Shares processor instances across embedder instances to ensure identical preprocessing
|
|
203
203
|
*/
|
|
204
|
-
static async
|
|
205
|
-
const cacheKey =
|
|
204
|
+
static async loadCLIPAutoProcessor(modelName) {
|
|
205
|
+
const cacheKey = `processor:clip:${modelName}`;
|
|
206
206
|
return this.cache.getOrLoad(cacheKey, async () => {
|
|
207
|
-
console.log(
|
|
208
|
-
// Dynamic import - only loaded when
|
|
209
|
-
const {
|
|
210
|
-
const
|
|
211
|
-
console.log(
|
|
212
|
-
return
|
|
213
|
-
});
|
|
214
|
-
}
|
|
215
|
-
/**
|
|
216
|
-
* Lazily load hybrid reranker for multimodal mode
|
|
217
|
-
* Combines multiple reranking strategies (uses text-derived for now)
|
|
218
|
-
*/
|
|
219
|
-
static async loadHybridReranker() {
|
|
220
|
-
const cacheKey = 'reranker:hybrid';
|
|
221
|
-
return this.cache.getOrLoad(cacheKey, async () => {
|
|
222
|
-
console.log('🔄 Lazy loading hybrid reranker (multimodal)');
|
|
223
|
-
// For now, hybrid reranking uses text-derived
|
|
224
|
-
// TODO: Implement proper hybrid reranking in future tasks
|
|
225
|
-
console.log('🔄 Hybrid reranking not yet implemented, using text-derived');
|
|
226
|
-
return this.loadTextDerivedReranker();
|
|
207
|
+
console.log(`🔄 Lazy loading CLIP AutoProcessor: ${modelName}`);
|
|
208
|
+
// Dynamic import - only loaded when CLIP models are used
|
|
209
|
+
const { AutoProcessor } = await import('@huggingface/transformers');
|
|
210
|
+
const processor = await AutoProcessor.from_pretrained(modelName);
|
|
211
|
+
console.log(`✅ CLIP AutoProcessor loaded: ${modelName}`);
|
|
212
|
+
return processor;
|
|
227
213
|
});
|
|
228
214
|
}
|
|
229
215
|
/**
|
|
@@ -371,12 +357,8 @@ export class LazyDependencyManager {
|
|
|
371
357
|
return LazyRerankerLoader.loadTextReranker();
|
|
372
358
|
case 'text-derived':
|
|
373
359
|
return LazyRerankerLoader.loadTextDerivedReranker();
|
|
374
|
-
case 'metadata':
|
|
375
|
-
return LazyRerankerLoader.loadMetadataReranker();
|
|
376
|
-
case 'hybrid':
|
|
377
|
-
return LazyRerankerLoader.loadHybridReranker();
|
|
378
360
|
default:
|
|
379
|
-
throw new Error(`Unknown reranking strategy '${strategy}'. Supported strategies: cross-encoder, text-derived,
|
|
361
|
+
throw new Error(`Unknown reranking strategy '${strategy}'. Supported strategies: cross-encoder, text-derived, disabled`);
|
|
380
362
|
}
|
|
381
363
|
}
|
|
382
364
|
/**
|
|
@@ -526,7 +526,7 @@ export class ModeDetectionService {
|
|
|
526
526
|
* @private
|
|
527
527
|
*/
|
|
528
528
|
validateRerankingStrategy(strategy) {
|
|
529
|
-
const validStrategies = ['cross-encoder', 'text-derived', '
|
|
529
|
+
const validStrategies = ['cross-encoder', 'text-derived', 'disabled'];
|
|
530
530
|
if (!validStrategies.includes(strategy)) {
|
|
531
531
|
throw createError.validation(`Invalid reranking strategy '${strategy}'. Must be one of: ${validStrategies.join(', ')}`);
|
|
532
532
|
}
|
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
* Provides straightforward configuration types and validation for different
|
|
5
5
|
* reranking strategies without complex interface patterns.
|
|
6
6
|
*/
|
|
7
|
-
export type RerankingStrategyType = 'cross-encoder' | 'text-derived' | '
|
|
7
|
+
export type RerankingStrategyType = 'cross-encoder' | 'text-derived' | 'disabled';
|
|
8
8
|
export interface RerankingConfig {
|
|
9
9
|
strategy: RerankingStrategyType;
|
|
10
10
|
model?: string;
|
|
@@ -17,15 +17,13 @@ export const DEFAULT_MULTIMODAL_RERANKING_CONFIG = {
|
|
|
17
17
|
semantic: 0.7,
|
|
18
18
|
metadata: 0.3
|
|
19
19
|
},
|
|
20
|
-
fallback: '
|
|
20
|
+
fallback: 'disabled'
|
|
21
21
|
};
|
|
22
22
|
// Strategy validation without complex interface patterns
|
|
23
23
|
export function validateRerankingStrategy(strategy) {
|
|
24
24
|
const validStrategies = [
|
|
25
25
|
'cross-encoder',
|
|
26
26
|
'text-derived',
|
|
27
|
-
'metadata',
|
|
28
|
-
'hybrid',
|
|
29
27
|
'disabled'
|
|
30
28
|
];
|
|
31
29
|
return validStrategies.includes(strategy);
|
|
@@ -36,7 +34,7 @@ export function validateRerankingConfig(config) {
|
|
|
36
34
|
throw new Error('Reranking strategy is required');
|
|
37
35
|
}
|
|
38
36
|
if (!validateRerankingStrategy(config.strategy)) {
|
|
39
|
-
const validStrategies = ['cross-encoder', 'text-derived', '
|
|
37
|
+
const validStrategies = ['cross-encoder', 'text-derived', 'disabled'];
|
|
40
38
|
throw new Error(`Invalid reranking strategy '${config.strategy}'. ` +
|
|
41
39
|
`Valid strategies: ${validStrategies.join(', ')}`);
|
|
42
40
|
}
|
|
@@ -52,23 +50,16 @@ export function validateRerankingConfig(config) {
|
|
|
52
50
|
if (visual !== undefined && (visual < 0 || visual > 1)) {
|
|
53
51
|
throw new Error('Visual weight must be between 0 and 1');
|
|
54
52
|
}
|
|
55
|
-
// Ensure weights sum to reasonable value for hybrid strategy
|
|
56
|
-
if (config.strategy === 'hybrid') {
|
|
57
|
-
const totalWeight = (semantic || 0) + (metadata || 0) + (visual || 0);
|
|
58
|
-
if (totalWeight === 0) {
|
|
59
|
-
throw new Error('Hybrid strategy requires at least one weight to be greater than 0');
|
|
60
|
-
}
|
|
61
|
-
}
|
|
62
53
|
}
|
|
63
54
|
// Validate fallback strategy if provided
|
|
64
55
|
if (config.fallback && !validateRerankingStrategy(config.fallback)) {
|
|
65
|
-
const validStrategies = ['cross-encoder', 'text-derived', '
|
|
56
|
+
const validStrategies = ['cross-encoder', 'text-derived', 'disabled'];
|
|
66
57
|
throw new Error(`Invalid fallback strategy '${config.fallback}'. ` +
|
|
67
58
|
`Valid strategies: ${validStrategies.join(', ')}`);
|
|
68
59
|
}
|
|
69
60
|
return {
|
|
70
61
|
strategy: config.strategy,
|
|
71
|
-
enabled: config.enabled ?? true,
|
|
62
|
+
enabled: config.strategy === 'disabled' ? false : (config.enabled ?? true),
|
|
72
63
|
model: config.model,
|
|
73
64
|
weights: config.weights,
|
|
74
65
|
fallback: config.fallback || 'disabled'
|
|
@@ -91,7 +82,7 @@ export function isStrategySupported(strategy, mode) {
|
|
|
91
82
|
case 'text':
|
|
92
83
|
return strategy === 'cross-encoder' || strategy === 'disabled';
|
|
93
84
|
case 'multimodal':
|
|
94
|
-
return ['text-derived', '
|
|
85
|
+
return ['text-derived', 'disabled'].includes(strategy);
|
|
95
86
|
default:
|
|
96
87
|
return false;
|
|
97
88
|
}
|
|
@@ -102,7 +93,7 @@ export function getSupportedStrategies(mode) {
|
|
|
102
93
|
case 'text':
|
|
103
94
|
return ['cross-encoder', 'disabled'];
|
|
104
95
|
case 'multimodal':
|
|
105
|
-
return ['text-derived', '
|
|
96
|
+
return ['text-derived', 'disabled'];
|
|
106
97
|
default:
|
|
107
98
|
return ['disabled'];
|
|
108
99
|
}
|
|
@@ -145,7 +136,7 @@ export class RerankingConfigBuilder {
|
|
|
145
136
|
.strategy('text-derived')
|
|
146
137
|
.enabled(true)
|
|
147
138
|
.weights({ semantic: 0.7, metadata: 0.3 })
|
|
148
|
-
.fallback('
|
|
139
|
+
.fallback('disabled');
|
|
149
140
|
}
|
|
150
141
|
static disabled() {
|
|
151
142
|
return new RerankingConfigBuilder()
|
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
* principle of using simple functions over complex factory patterns.
|
|
7
7
|
*/
|
|
8
8
|
import { getDefaultRerankingConfig, isStrategySupported, getSupportedStrategies, validateRerankingConfig } from './reranking-config.js';
|
|
9
|
-
import { createCrossEncoderRerankFunction, createTextDerivedRerankFunction
|
|
9
|
+
import { createCrossEncoderRerankFunction, createTextDerivedRerankFunction } from './reranking-strategies.js';
|
|
10
10
|
/**
|
|
11
11
|
* Simple reranking creation function with conditional logic
|
|
12
12
|
*
|
|
@@ -102,23 +102,6 @@ function createRerankingFunction(mode, strategy, config) {
|
|
|
102
102
|
undefined // Use default cross-encoder model
|
|
103
103
|
);
|
|
104
104
|
break;
|
|
105
|
-
case 'metadata':
|
|
106
|
-
console.log(`Creating metadata reranker for ${mode} mode`);
|
|
107
|
-
reranker = createMetadataRerankFunction({
|
|
108
|
-
weights: config.weights ? {
|
|
109
|
-
filename: config.weights.metadata || 0.4,
|
|
110
|
-
contentType: 0.3,
|
|
111
|
-
metadata: config.weights.metadata || 0.3
|
|
112
|
-
} : undefined
|
|
113
|
-
});
|
|
114
|
-
break;
|
|
115
|
-
case 'hybrid':
|
|
116
|
-
if (mode !== 'multimodal') {
|
|
117
|
-
throw new RerankingStrategyError(strategy, mode, 'Hybrid strategy only supported in multimodal mode', 'UNSUPPORTED_MODE');
|
|
118
|
-
}
|
|
119
|
-
console.log('Creating hybrid reranker for multimodal mode');
|
|
120
|
-
reranker = createHybridRerankFunction(config);
|
|
121
|
-
break;
|
|
122
105
|
case 'disabled':
|
|
123
106
|
console.log('Reranking explicitly disabled');
|
|
124
107
|
return undefined;
|
|
@@ -241,172 +224,10 @@ function wrapRerankFunctionWithErrorRecovery(reranker, strategy, mode) {
|
|
|
241
224
|
};
|
|
242
225
|
}
|
|
243
226
|
/**
|
|
244
|
-
*
|
|
227
|
+
* Hybrid reranking strategy removed in Phase 3 - throwing error for backward compatibility
|
|
245
228
|
*/
|
|
246
229
|
function createHybridRerankFunction(config) {
|
|
247
|
-
|
|
248
|
-
const weights = config.weights || {
|
|
249
|
-
semantic: 0.6,
|
|
250
|
-
metadata: 0.4,
|
|
251
|
-
visual: 0.0 // Not implemented yet
|
|
252
|
-
};
|
|
253
|
-
// Track which strategies are available
|
|
254
|
-
const availableStrategies = {};
|
|
255
|
-
// Initialize strategies with error handling
|
|
256
|
-
try {
|
|
257
|
-
if (weights.semantic && weights.semantic > 0) {
|
|
258
|
-
availableStrategies.textDerived = createTextDerivedRerankFunction();
|
|
259
|
-
console.log('✅ Text-derived strategy initialized for hybrid reranking');
|
|
260
|
-
}
|
|
261
|
-
}
|
|
262
|
-
catch (error) {
|
|
263
|
-
console.warn(`⚠️ Text-derived strategy initialization failed for hybrid reranking: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
|
264
|
-
}
|
|
265
|
-
try {
|
|
266
|
-
if (weights.metadata && weights.metadata > 0) {
|
|
267
|
-
availableStrategies.metadata = createMetadataRerankFunction();
|
|
268
|
-
console.log('✅ Metadata strategy initialized for hybrid reranking');
|
|
269
|
-
}
|
|
270
|
-
}
|
|
271
|
-
catch (error) {
|
|
272
|
-
console.warn(`⚠️ Metadata strategy initialization failed for hybrid reranking: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
|
273
|
-
}
|
|
274
|
-
// Check if any strategies are available
|
|
275
|
-
const hasAvailableStrategies = Object.keys(availableStrategies).length > 0;
|
|
276
|
-
if (!hasAvailableStrategies) {
|
|
277
|
-
throw new RerankingStrategyError('hybrid', 'multimodal', 'No hybrid reranking strategies could be initialized', 'NO_STRATEGIES_AVAILABLE');
|
|
278
|
-
}
|
|
279
|
-
console.log(`Hybrid reranking initialized with ${Object.keys(availableStrategies).length} available strategies`);
|
|
280
|
-
return async (query, results, contentType) => {
|
|
281
|
-
const startTime = Date.now();
|
|
282
|
-
const strategyResults = {};
|
|
283
|
-
try {
|
|
284
|
-
console.log(`🔄 Running hybrid reranking with ${Object.keys(availableStrategies).length} strategies`);
|
|
285
|
-
// Start with original results
|
|
286
|
-
let hybridResults = [...results];
|
|
287
|
-
let successfulStrategies = 0;
|
|
288
|
-
// Apply text-derived reranking if available and enabled
|
|
289
|
-
if (availableStrategies.textDerived && weights.semantic && weights.semantic > 0) {
|
|
290
|
-
const strategyStartTime = Date.now();
|
|
291
|
-
try {
|
|
292
|
-
console.log(`🔧 Applying text-derived reranking (weight: ${weights.semantic})`);
|
|
293
|
-
const textDerivedResults = await availableStrategies.textDerived(query, hybridResults, contentType);
|
|
294
|
-
// Combine scores with semantic weight
|
|
295
|
-
hybridResults = hybridResults.map((result, index) => {
|
|
296
|
-
const textDerivedScore = textDerivedResults[index]?.score || result.score;
|
|
297
|
-
const combinedScore = result.score * (1 - weights.semantic) + textDerivedScore * weights.semantic;
|
|
298
|
-
return {
|
|
299
|
-
...result,
|
|
300
|
-
score: combinedScore,
|
|
301
|
-
metadata: {
|
|
302
|
-
...result.metadata,
|
|
303
|
-
hybridScores: {
|
|
304
|
-
...(result.metadata?.hybridScores || {}),
|
|
305
|
-
textDerived: textDerivedScore,
|
|
306
|
-
semantic: combinedScore
|
|
307
|
-
}
|
|
308
|
-
}
|
|
309
|
-
};
|
|
310
|
-
});
|
|
311
|
-
const strategyDuration = Date.now() - strategyStartTime;
|
|
312
|
-
strategyResults.textDerived = { success: true, duration: strategyDuration };
|
|
313
|
-
successfulStrategies++;
|
|
314
|
-
console.log(`✅ Text-derived reranking completed (${strategyDuration}ms)`);
|
|
315
|
-
}
|
|
316
|
-
catch (error) {
|
|
317
|
-
const strategyDuration = Date.now() - strategyStartTime;
|
|
318
|
-
const errorMessage = error instanceof Error ? error.message : 'Unknown error';
|
|
319
|
-
strategyResults.textDerived = { success: false, error: errorMessage, duration: strategyDuration };
|
|
320
|
-
console.warn(`❌ Text-derived reranking failed in hybrid mode (${strategyDuration}ms): ${errorMessage}`);
|
|
321
|
-
}
|
|
322
|
-
}
|
|
323
|
-
// Apply metadata reranking if available and enabled
|
|
324
|
-
if (availableStrategies.metadata && weights.metadata && weights.metadata > 0) {
|
|
325
|
-
const strategyStartTime = Date.now();
|
|
326
|
-
try {
|
|
327
|
-
console.log(`🔧 Applying metadata reranking (weight: ${weights.metadata})`);
|
|
328
|
-
const metadataResults = await availableStrategies.metadata(query, hybridResults, contentType);
|
|
329
|
-
// Combine scores with metadata weight
|
|
330
|
-
hybridResults = hybridResults.map((result, index) => {
|
|
331
|
-
const metadataScore = metadataResults[index]?.score || result.score;
|
|
332
|
-
const currentScore = result.score;
|
|
333
|
-
const combinedScore = currentScore * (1 - weights.metadata) + metadataScore * weights.metadata;
|
|
334
|
-
return {
|
|
335
|
-
...result,
|
|
336
|
-
score: combinedScore,
|
|
337
|
-
metadata: {
|
|
338
|
-
...result.metadata,
|
|
339
|
-
hybridScores: {
|
|
340
|
-
...(result.metadata?.hybridScores || {}),
|
|
341
|
-
metadata: metadataScore,
|
|
342
|
-
combined: combinedScore
|
|
343
|
-
}
|
|
344
|
-
}
|
|
345
|
-
};
|
|
346
|
-
});
|
|
347
|
-
const strategyDuration = Date.now() - strategyStartTime;
|
|
348
|
-
strategyResults.metadata = { success: true, duration: strategyDuration };
|
|
349
|
-
successfulStrategies++;
|
|
350
|
-
console.log(`✅ Metadata reranking completed (${strategyDuration}ms)`);
|
|
351
|
-
}
|
|
352
|
-
catch (error) {
|
|
353
|
-
const strategyDuration = Date.now() - strategyStartTime;
|
|
354
|
-
const errorMessage = error instanceof Error ? error.message : 'Unknown error';
|
|
355
|
-
strategyResults.metadata = { success: false, error: errorMessage, duration: strategyDuration };
|
|
356
|
-
console.warn(`❌ Metadata reranking failed in hybrid mode (${strategyDuration}ms): ${errorMessage}`);
|
|
357
|
-
}
|
|
358
|
-
}
|
|
359
|
-
// Sort by final combined scores
|
|
360
|
-
hybridResults.sort((a, b) => b.score - a.score);
|
|
361
|
-
const totalDuration = Date.now() - startTime;
|
|
362
|
-
// Add hybrid reranking metadata to results
|
|
363
|
-
hybridResults = hybridResults.map(result => ({
|
|
364
|
-
...result,
|
|
365
|
-
metadata: {
|
|
366
|
-
...result.metadata,
|
|
367
|
-
hybridRerankingInfo: {
|
|
368
|
-
totalDuration,
|
|
369
|
-
successfulStrategies,
|
|
370
|
-
strategyResults,
|
|
371
|
-
weights
|
|
372
|
-
}
|
|
373
|
-
}
|
|
374
|
-
}));
|
|
375
|
-
if (successfulStrategies > 0) {
|
|
376
|
-
console.log(`✅ Hybrid reranking completed successfully (${totalDuration}ms, ${successfulStrategies}/${Object.keys(availableStrategies).length} strategies succeeded)`);
|
|
377
|
-
}
|
|
378
|
-
else {
|
|
379
|
-
console.warn(`⚠️ Hybrid reranking completed with no successful strategies (${totalDuration}ms), returning original results`);
|
|
380
|
-
return results; // Return original results if no strategies succeeded
|
|
381
|
-
}
|
|
382
|
-
return hybridResults;
|
|
383
|
-
}
|
|
384
|
-
catch (error) {
|
|
385
|
-
const totalDuration = Date.now() - startTime;
|
|
386
|
-
const errorMessage = error instanceof Error ? error.message : 'Unknown error';
|
|
387
|
-
console.warn(`❌ Hybrid reranking failed (${totalDuration}ms): ${errorMessage}. ` +
|
|
388
|
-
`Returning original results.`);
|
|
389
|
-
// Log detailed error information
|
|
390
|
-
console.error('Hybrid reranking error details:', {
|
|
391
|
-
query: query.substring(0, 100) + (query.length > 100 ? '...' : ''),
|
|
392
|
-
resultCount: results.length,
|
|
393
|
-
contentType,
|
|
394
|
-
availableStrategies: Object.keys(availableStrategies),
|
|
395
|
-
weights,
|
|
396
|
-
strategyResults,
|
|
397
|
-
error: errorMessage
|
|
398
|
-
});
|
|
399
|
-
return results.map(result => ({
|
|
400
|
-
...result,
|
|
401
|
-
metadata: {
|
|
402
|
-
...result.metadata,
|
|
403
|
-
hybridRerankingFailed: true,
|
|
404
|
-
hybridRerankingError: errorMessage,
|
|
405
|
-
fallbackToVectorSimilarity: true
|
|
406
|
-
}
|
|
407
|
-
}));
|
|
408
|
-
}
|
|
409
|
-
};
|
|
230
|
+
throw new RerankingStrategyError('hybrid', 'multimodal', 'Hybrid reranking strategy has been removed in this version. Use text-derived instead.', 'STRATEGY_REMOVED');
|
|
410
231
|
}
|
|
411
232
|
/**
|
|
412
233
|
* Create reranker with automatic mode detection
|
|
@@ -582,8 +403,6 @@ export function getRerankingStats() {
|
|
|
582
403
|
strategiesUsed: {
|
|
583
404
|
'cross-encoder': 0,
|
|
584
405
|
'text-derived': 0,
|
|
585
|
-
'metadata': 0,
|
|
586
|
-
'hybrid': 0,
|
|
587
406
|
'disabled': 0
|
|
588
407
|
}
|
|
589
408
|
};
|
package/dist/core/search.d.ts
CHANGED
|
@@ -80,6 +80,16 @@ export declare class SearchEngine {
|
|
|
80
80
|
* @returns Promise resolving to array of search results
|
|
81
81
|
*/
|
|
82
82
|
search(query: string, options?: SearchOptions): Promise<SearchResult[]>;
|
|
83
|
+
/**
|
|
84
|
+
* Perform semantic search using a pre-computed embedding vector
|
|
85
|
+
* Useful for image-based search or when embedding is computed externally
|
|
86
|
+
* @param queryVector - Pre-computed query embedding vector
|
|
87
|
+
* @param options - Search options including top_k and rerank settings
|
|
88
|
+
* @param originalQuery - Optional original query for reranking (text or image path)
|
|
89
|
+
* @param embeddingTime - Optional embedding time for logging
|
|
90
|
+
* @returns Promise resolving to array of search results
|
|
91
|
+
*/
|
|
92
|
+
searchWithVector(queryVector: Float32Array, options?: SearchOptions, originalQuery?: string, embeddingTime?: number): Promise<SearchResult[]>;
|
|
83
93
|
/**
|
|
84
94
|
* Format search results with proper structure
|
|
85
95
|
* @param chunks - Database chunks with metadata
|