npm - rag-lite-ts - Versions diffs - 2.0.5 → 2.1.1 - Mend

rag-lite-ts 2.0.5 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29)

package/README.md +815 -808
package/dist/cli/indexer.js +3 -39
package/dist/cli/search.d.ts +1 -1
package/dist/cli/search.js +123 -19
package/dist/cli.js +77 -94
package/dist/core/binary-index-format.d.ts +28 -2
package/dist/core/binary-index-format.js +196 -27
package/dist/core/db.js +173 -173
package/dist/core/ingestion.d.ts +5 -1
package/dist/core/ingestion.js +123 -18
package/dist/core/lazy-dependency-loader.d.ts +3 -8
package/dist/core/lazy-dependency-loader.js +11 -29
package/dist/core/mode-detection-service.js +1 -1
package/dist/core/reranking-config.d.ts +1 -1
package/dist/core/reranking-config.js +7 -16
package/dist/core/reranking-factory.js +3 -184
package/dist/core/search.d.ts +10 -0
package/dist/core/search.js +35 -11
package/dist/core/types.d.ts +1 -1
package/dist/core/vector-index.d.ts +4 -0
package/dist/core/vector-index.js +6 -0
package/dist/factories/ingestion-factory.js +3 -1
package/dist/file-processor.d.ts +2 -0
package/dist/file-processor.js +20 -0
package/dist/index-manager.d.ts +17 -1
package/dist/index-manager.js +148 -7
package/dist/mcp-server.js +127 -105
package/dist/multimodal/clip-embedder.js +6 -2
package/package.json +1 -1

package/dist/core/search.js CHANGED Viewed

@@ -106,18 +106,41 @@ export class SearchEngine {
             return [];
         }
         const startTime = performance.now();
-        const topK = options.top_k || config.top_k || 10;
-        const shouldRerank = options.rerank !== undefined ? options.rerank : (this.rerankFn !== undefined);
         try {
             // Step 1: Build query embedding using injected embed function
             const embeddingStartTime = performance.now();
             const queryEmbedding = await this.embedFn(query);
             const embeddingTime = performance.now() - embeddingStartTime;
-            // Step 2: Search using IndexManager (which handles hash mapping properly)
+            // Step 2: Search with the vector
+            const results = await this.searchWithVector(queryEmbedding.vector, options, query, embeddingTime);
+            return results;
+        }
+        catch (error) {
+            throw new Error(`Search failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
+        }
+    }
+    /**
+     * Perform semantic search using a pre-computed embedding vector
+     * Useful for image-based search or when embedding is computed externally
+     * @param queryVector - Pre-computed query embedding vector
+     * @param options - Search options including top_k and rerank settings
+     * @param originalQuery - Optional original query for reranking (text or image path)
+     * @param embeddingTime - Optional embedding time for logging
+     * @returns Promise resolving to array of search results
+     */
+    async searchWithVector(queryVector, options = {}, originalQuery, embeddingTime) {
+        const startTime = performance.now();
+        const topK = options.top_k || config.top_k || 10;
+        // Phase 1: Disable reranking by default for better performance
+        // Users must explicitly opt-in with --rerank flag
+        const shouldRerank = options.rerank === true;
+        try {
+            // Step 1: Search using IndexManager (which handles hash mapping properly)
             const searchStartTime = performance.now();
             let searchResult;
             try {
-                searchResult = this.indexManager.search(queryEmbedding.vector, topK);
+                const contentType = options.contentType;
+                searchResult = this.indexManager.search(queryVector, topK, contentType);
             }
             catch (error) {
                 if (error instanceof Error && error.message.includes('No embedding ID found for hash')) {
@@ -133,18 +156,18 @@ export class SearchEngine {
                 console.log(`No similar documents found (${totalTime.toFixed(2)}ms total)`);
                 return [];
             }
-            // Step 3: Retrieve chunks from database using embedding IDs
+            // Step 2: Retrieve chunks from database using embedding IDs
             const retrievalStartTime = performance.now();
             const chunks = await getChunksByEmbeddingIds(this.db, searchResult.embeddingIds);
             const retrievalTime = performance.now() - retrievalStartTime;
-            // Step 4: Format results as JSON with text, score, and document metadata
+            // Step 3: Format results as JSON with text, score, and document metadata
             let results = this.formatSearchResults(chunks, searchResult.distances, searchResult.embeddingIds);
-            // Step 5: Optional reranking with injected rerank function
+            // Step 4: Optional reranking with injected rerank function
             let rerankTime = 0;
-            if (shouldRerank && this.rerankFn && results.length > 1) {
+            if (shouldRerank && this.rerankFn && results.length > 1 && originalQuery) {
                 try {
                     const rerankStartTime = performance.now();
-                    results = await this.rerankFn(query, results);
+                    results = await this.rerankFn(originalQuery, results);
                     rerankTime = performance.now() - rerankStartTime;
                 }
                 catch (error) {
@@ -154,13 +177,14 @@ export class SearchEngine {
             }
             const totalTime = performance.now() - startTime;
             // Measure latency without premature optimization - just log for monitoring
+            const embedTimeStr = embeddingTime !== undefined ? `embed: ${embeddingTime.toFixed(2)}ms, ` : '';
             console.log(`Search completed: ${results.length} results in ${totalTime.toFixed(2)}ms ` +
-                `(embed: ${embeddingTime.toFixed(2)}ms, vector: ${vectorSearchTime.toFixed(2)}ms, ` +
+                `(${embedTimeStr}vector: ${vectorSearchTime.toFixed(2)}ms, ` +
                 `retrieval: ${retrievalTime.toFixed(2)}ms${rerankTime > 0 ? `, rerank: ${rerankTime.toFixed(2)}ms` : ''})`);
             return results;
         }
         catch (error) {
-            throw new Error(`Search failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
+            throw new Error(`Vector search failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
         }
     }
     /**

package/dist/core/types.d.ts CHANGED Viewed

@@ -49,7 +49,7 @@ export interface RerankingInterface {
 export interface SearchOptions {
     top_k?: number;
     rerank?: boolean;
-    contentType?: string;
+    contentType?: 'text' | 'image' | 'combined';
 }
 export interface Chunk {
     text: string;

package/dist/core/vector-index.d.ts CHANGED Viewed

@@ -64,5 +64,9 @@ export declare class VectorIndex {
      * Resize index to accommodate more vectors
      */
     resizeIndex(newMaxElements: number): void;
+    /**
+     * Get index options (for external access to configuration)
+     */
+    getOptions(): VectorIndexOptions;
 }
 //# sourceMappingURL=vector-index.d.ts.map

package/dist/core/vector-index.js CHANGED Viewed

@@ -321,5 +321,11 @@ export class VectorIndex {
             throw new Error(`Failed to resize index: ${error}`);
         }
     }
+    /**
+     * Get index options (for external access to configuration)
+     */
+    getOptions() {
+        return { ...this.options };
+    }
 }
 //# sourceMappingURL=vector-index.js.map

package/dist/factories/ingestion-factory.js CHANGED Viewed

@@ -323,7 +323,9 @@ export class IngestionFactory {
         const { getSystemInfo, setSystemInfo } = await import('../core/db.js');
         // Determine the effective mode and reranking strategy
         const effectiveMode = options.mode || 'text';
-        const effectiveRerankingStrategy = options.rerankingStrategy || 'cross-encoder';
+        // Phase 1: Fix mode-specific reranking strategy defaults
+        const effectiveRerankingStrategy = options.rerankingStrategy ||
+            (effectiveMode === 'multimodal' ? 'text-derived' : 'cross-encoder');
         // Determine model type based on model name
         let modelType;
         if (effectiveModel.includes('clip')) {

package/dist/file-processor.d.ts CHANGED Viewed

@@ -8,6 +8,8 @@ export interface FileProcessorOptions {
     recursive?: boolean;
     /** Maximum file size in bytes (default: 10MB) */
     maxFileSize?: number;
+    /** Processing mode to filter compatible files */
+    mode?: 'text' | 'multimodal';
 }
 /**
  * Default options for file processing

package/dist/file-processor.js CHANGED Viewed

@@ -188,6 +188,15 @@ async function discoverFilesRecursive(dirPath, options) {
                         // Check file size based on content type
                         const stats = await fs.stat(fullPath);
                         const contentType = getContentType(fullPath);
+                        // Filter by mode: skip incompatible content types
+                        const mode = options.mode || 'text';
+                        if (mode === 'text' && contentType === 'image') {
+                            result.skipped.push({
+                                path: fullPath,
+                                reason: `Image files not supported in text mode. Use --mode multimodal for image processing.`
+                            });
+                            continue;
+                        }
                         // Different size limits for different content types
                         const maxSize = contentType === 'image'
                             ? 50 * 1024 * 1024 // 50MB for images
@@ -250,6 +259,17 @@ export async function discoverFiles(path, options = DEFAULT_FILE_PROCESSOR_OPTIO
                 };
             }
             const contentType = getContentType(resolvedPath);
+            // Filter by mode: skip incompatible content types
+            const mode = options.mode || 'text';
+            if (mode === 'text' && contentType === 'image') {
+                return {
+                    files: [],
+                    skipped: [{
+                            path: resolvedPath,
+                            reason: `Image files not supported in text mode. Use --mode multimodal for image processing.`
+                        }]
+                };
+            }
             // Check file size based on content type
             const maxSize = contentType === 'image'
                 ? 50 * 1024 * 1024 // 50MB for images

package/dist/index-manager.d.ts CHANGED Viewed

@@ -7,12 +7,16 @@ export interface IndexStats {
 export declare class IndexManager {
     private modelName?;
     private vectorIndex;
+    private textIndex?;
+    private imageIndex?;
     private db;
     private indexPath;
     private dbPath;
     private isInitialized;
     private hashToEmbeddingId;
     private embeddingIdToHash;
+    private groupedEmbeddings?;
+    private vectorIndexOptions;
     constructor(indexPath: string, dbPath: string, dimensions: number, modelName?: string | undefined);
     /**
      * Initialize the index manager and load existing index if available
@@ -30,6 +34,10 @@ export declare class IndexManager {
      * Requirements: 5.3 - When new documents are added THEN system SHALL append new chunks and vectors without rebuilding existing index
      */
     addVectors(embeddings: EmbeddingResult[]): Promise<void>;
+    /**
+     * Add grouped embeddings by content type (for new grouped format)
+     */
+    addGroupedEmbeddings(textEmbeddings: EmbeddingResult[], imageEmbeddings: EmbeddingResult[]): Promise<void>;
     /**
      * Rebuild the entire index from scratch
      * Requirements: 5.2, 5.4 - Create full index rebuild functionality for model changes or document deletions
@@ -68,10 +76,18 @@ export declare class IndexManager {
      * Save the vector index to disk
      */
     saveIndex(): Promise<void>;
+    /**
+     * Create specialized indexes for text and image content when grouped data is available
+     */
+    private createSpecializedIndexes;
+    /**
+     * Save index with content type grouping (for new grouped format)
+     */
+    saveGroupedIndex(textEmbeddings: EmbeddingResult[], imageEmbeddings: EmbeddingResult[]): Promise<void>;
     /**
      * Search for similar vectors
      */
-    search(queryVector: Float32Array, k?: number): {
+    search(queryVector: Float32Array, k?: number, contentType?: 'text' | 'image' | 'combined'): {
         embeddingIds: string[];
         distances: number[];
     };

package/dist/index-manager.js CHANGED Viewed

@@ -1,26 +1,33 @@
 import { VectorIndex } from './core/vector-index.js';
+import { BinaryIndexFormat } from './core/binary-index-format.js';
 import { openDatabase, getSystemInfo, setSystemInfo } from './core/db.js';
 import { config, getModelDefaults } from './core/config.js';
 export class IndexManager {
     modelName;
     vectorIndex;
+    textIndex;
+    imageIndex;
     db = null;
     indexPath;
     dbPath;
     isInitialized = false;
     hashToEmbeddingId = new Map();
     embeddingIdToHash = new Map();
+    groupedEmbeddings;
+    vectorIndexOptions;
     constructor(indexPath, dbPath, dimensions, modelName) {
         this.modelName = modelName;
         this.indexPath = indexPath;
         this.dbPath = dbPath;
-        // Initialize with provided dimensions from config
-        this.vectorIndex = new VectorIndex(indexPath, {
+        // Store options for creating specialized indexes
+        this.vectorIndexOptions = {
             dimensions: dimensions,
             maxElements: 100000, // Start with 100k capacity
             efConstruction: 200,
             M: 16
-        });
+        };
+        // Initialize with provided dimensions from config
+        this.vectorIndex = new VectorIndex(indexPath, this.vectorIndexOptions);
     }
     /**
      * Initialize the index manager and load existing index if available
@@ -47,6 +54,8 @@ export class IndexManager {
                 // Only try to load existing index if not forcing recreation
                 console.log('Loading existing vector index...');
                 await this.vectorIndex.loadIndex();
+                // Check if the loaded index has grouped data and create specialized indexes
+                await this.createSpecializedIndexes();
             }
             // Always populate the embedding ID mapping from existing database entries
             // This is needed both for new and existing indexes
@@ -55,7 +64,8 @@ export class IndexManager {
                 this.hashEmbeddingId(chunk.embedding_id); // This will populate the mapping
             }
             this.isInitialized = true;
-            console.log(`Index manager initialized with ${this.vectorIndex.getCurrentCount()} vectors`);
+            const vectorCount = this.vectorIndex.getCurrentCount();
+            console.log(`Index manager initialized with ${vectorCount} vectors${this.textIndex && this.imageIndex ? ' (multi-graph mode)' : ''}`);
         }
         catch (error) {
             throw new Error(`Failed to initialize index manager: ${error}`);
@@ -153,6 +163,31 @@ export class IndexManager {
             throw new Error(`Failed to add vectors to index: ${error instanceof Error ? error.message : 'Unknown error'}`);
         }
     }
+    /**
+     * Add grouped embeddings by content type (for new grouped format)
+     */
+    async addGroupedEmbeddings(textEmbeddings, imageEmbeddings) {
+        if (!this.isInitialized) {
+            throw new Error('Index manager not initialized');
+        }
+        console.log(`addGroupedEmbeddings: text=${textEmbeddings.length}, image=${imageEmbeddings.length}`);
+        const allEmbeddings = [...textEmbeddings, ...imageEmbeddings];
+        if (allEmbeddings.length === 0) {
+            return;
+        }
+        try {
+            // Store grouped information for later saving
+            this.groupedEmbeddings = { text: textEmbeddings, image: imageEmbeddings };
+            console.log('addGroupedEmbeddings: stored grouped embeddings');
+            // Add all embeddings to the index (maintains current behavior)
+            await this.addVectors(allEmbeddings);
+            console.log('addGroupedEmbeddings: addVectors completed');
+            // The saveIndex method will now use grouped format if groupedEmbeddings exists
+        }
+        catch (error) {
+            throw new Error(`Failed to add grouped embeddings to index: ${error instanceof Error ? error.message : 'Unknown error'}`);
+        }
+    }
     /**
      * Rebuild the entire index from scratch
      * Requirements: 5.2, 5.4 - Create full index rebuild functionality for model changes or document deletions
@@ -349,16 +384,122 @@ export class IndexManager {
         if (!this.isInitialized) {
             throw new Error('Index manager not initialized');
         }
-        await this.vectorIndex.saveIndex();
+        // If we have grouped embeddings, save in grouped format
+        if (this.groupedEmbeddings) {
+            console.log('IndexManager: Saving in grouped format');
+            await this.saveGroupedIndex(this.groupedEmbeddings.text, this.groupedEmbeddings.image);
+            // Clear grouped data after saving
+            this.groupedEmbeddings = undefined;
+        }
+        else {
+            console.log('IndexManager: Saving in standard format');
+            await this.vectorIndex.saveIndex();
+        }
+    }
+    /**
+     * Create specialized indexes for text and image content when grouped data is available
+     */
+    async createSpecializedIndexes() {
+        try {
+            // Load the index data to check if it has grouped information
+            const indexData = await BinaryIndexFormat.load(this.indexPath);
+            if (indexData.hasContentTypeGroups && indexData.textVectors && indexData.imageVectors) {
+                // Only create specialized indexes if we have both text and image vectors
+                // In text-only mode, textVectors would be populated but imageVectors empty
+                // In multimodal mode, both would be populated
+                const hasTextVectors = indexData.textVectors.length > 0;
+                const hasImageVectors = indexData.imageVectors.length > 0;
+                if (hasTextVectors && hasImageVectors) {
+                    console.log('Creating specialized indexes for content type filtering...');
+                    // Create text-only index
+                    this.textIndex = new VectorIndex(`${this.indexPath}.text`, this.vectorIndexOptions);
+                    await this.textIndex.initialize();
+                    this.textIndex.addVectors(indexData.textVectors);
+                    console.log(`✓ Text index created with ${indexData.textVectors.length} vectors`);
+                    // Create image-only index
+                    this.imageIndex = new VectorIndex(`${this.indexPath}.image`, this.vectorIndexOptions);
+                    await this.imageIndex.initialize();
+                    this.imageIndex.addVectors(indexData.imageVectors);
+                    console.log(`✓ Image index created with ${indexData.imageVectors.length} vectors`);
+                    console.log('✓ Specialized indexes ready for content type filtering');
+                }
+                else if (hasTextVectors) {
+                    console.log('Text-only index detected - using combined index for all searches');
+                    // In text-only mode, we don't need specialized indexes
+                    // The combined index (vectorIndex) already contains all text vectors
+                }
+            }
+        }
+        catch (error) {
+            console.warn('Failed to create specialized indexes, falling back to combined index:', error);
+            // Continue without specialized indexes - search will still work with combined index
+        }
+    }
+    /**
+     * Save index with content type grouping (for new grouped format)
+     */
+    async saveGroupedIndex(textEmbeddings, imageEmbeddings) {
+        if (!this.isInitialized) {
+            throw new Error('Index manager not initialized');
+        }
+        console.log(`saveGroupedIndex: text=${textEmbeddings.length}, image=${imageEmbeddings.length}`);
+        // Group vectors by content type
+        const textVectors = textEmbeddings.map((embedding) => ({
+            id: this.hashEmbeddingId(embedding.embedding_id),
+            vector: embedding.vector
+        }));
+        const imageVectors = imageEmbeddings.map((embedding) => ({
+            id: this.hashEmbeddingId(embedding.embedding_id),
+            vector: embedding.vector
+        }));
+        // Get index parameters
+        const options = this.vectorIndex.getOptions();
+        const allVectors = [...textVectors, ...imageVectors];
+        console.log(`saveGroupedIndex: dimensions=${options.dimensions}, totalVectors=${allVectors.length}`);
+        const indexData = {
+            dimensions: options.dimensions,
+            maxElements: options.maxElements,
+            M: options.M || 16,
+            efConstruction: options.efConstruction || 200,
+            seed: options.seed || 100,
+            currentSize: textVectors.length + imageVectors.length,
+            vectors: allVectors, // Required for backward compatibility
+            hasContentTypeGroups: true,
+            textVectors,
+            imageVectors
+        };
+        console.log('saveGroupedIndex: Calling BinaryIndexFormat.saveGrouped');
+        // Save using grouped format
+        await BinaryIndexFormat.saveGrouped(this.indexPath, indexData);
+        console.log(`✓ Saved grouped index with ${textVectors.length} text and ${imageVectors.length} image vectors`);
     }
     /**
      * Search for similar vectors
      */
-    search(queryVector, k = 5) {
+    search(queryVector, k = 5, contentType) {
         if (!this.isInitialized) {
             throw new Error('Index manager not initialized');
         }
-        const results = this.vectorIndex.search(queryVector, k);
+        // Select the appropriate index based on content type
+        let targetIndex;
+        // If we have specialized indexes (multimodal mode), use them for filtering
+        if (this.textIndex && this.imageIndex) {
+            if (contentType === 'text') {
+                targetIndex = this.textIndex;
+            }
+            else if (contentType === 'image') {
+                targetIndex = this.imageIndex;
+            }
+            else {
+                // 'combined' or undefined
+                targetIndex = this.vectorIndex;
+            }
+        }
+        else {
+            // No specialized indexes (text-only mode) - ignore contentType and use combined index
+            targetIndex = this.vectorIndex;
+        }
+        const results = targetIndex.search(queryVector, k);
         // Convert numeric IDs back to embedding IDs
         const embeddingIds = results.neighbors.map(id => this.unhashEmbeddingId(id));
         return {