rag-lite-ts 2.1.0 → 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/indexer.js +1 -1
- package/dist/cli/search.js +5 -10
- package/dist/core/binary-index-format.d.ts +28 -2
- package/dist/core/binary-index-format.js +196 -27
- package/dist/core/ingestion.d.ts +5 -1
- package/dist/core/ingestion.js +76 -9
- package/dist/core/reranking-strategies.js +4 -5
- package/dist/core/search.js +2 -1
- package/dist/core/types.d.ts +1 -1
- package/dist/core/vector-index.d.ts +4 -0
- package/dist/core/vector-index.js +6 -0
- package/dist/file-processor.d.ts +2 -0
- package/dist/file-processor.js +20 -0
- package/dist/index-manager.d.ts +17 -1
- package/dist/index-manager.js +148 -7
- package/dist/multimodal/clip-embedder.js +71 -66
- package/package.json +1 -1
package/dist/cli/indexer.js
CHANGED

@@ -198,7 +198,7 @@ export async function runIngest(path, options = {}) {
         showProgress: true,
         maxWaitMs: 15000 // Longer timeout for ingestion
     });
-    const result = await pipeline.ingestPath(resolvedPath);
+    const result = await pipeline.ingestPath(resolvedPath, { mode: factoryOptions.mode });
     // Display final results
     console.log('\n' + '='.repeat(50));
     console.log('INGESTION SUMMARY');
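The only functional change in the CLI indexer is that the ingestion mode is now forwarded into `ingestPath`. A minimal sketch of the new call shape (not from the package; the pipeline type is narrowed to the single method this hunk touches, and the option name comes straight from the diff):

```ts
// Sketch only: assumes a pipeline object exposing ingestPath(path, { mode }).
interface IngestOptions {
  mode?: 'text' | 'multimodal';
}

interface PipelineLike {
  ingestPath(path: string, options?: IngestOptions): Promise<unknown>;
}

async function runIngestWithMode(
  pipeline: PipelineLike,
  resolvedPath: string,
  mode: 'text' | 'multimodal'
) {
  // 2.1.0 called ingestPath(resolvedPath); 2.1.1 forwards the resolved mode as well,
  // so discovery and chunking can skip content types the mode cannot handle.
  return pipeline.ingestPath(resolvedPath, { mode });
}
```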
package/dist/cli/search.js
CHANGED

@@ -137,6 +137,11 @@ export async function runSearch(query, options = {}) {
     if (options['top-k'] !== undefined) {
         searchOptions.top_k = options['top-k'];
     }
+    // Set content type filter for search-level filtering
+    const contentTypeFilter = options['content-type'];
+    if (contentTypeFilter && contentTypeFilter !== 'all') {
+        searchOptions.contentType = contentTypeFilter;
+    }
     // Phase 2: Disable reranking for image-to-image searches to preserve visual similarity
     let rerankingForciblyDisabled = false;
     if (isImage && embedder) {
@@ -174,16 +179,6 @@ export async function runSearch(query, options = {}) {
         results = await searchEngine.search(query, searchOptions);
     }
     const searchTime = Date.now() - startTime;
-    // Apply content type filter if specified
-    const contentTypeFilter = options['content-type'];
-    if (contentTypeFilter && contentTypeFilter !== 'all') {
-        const originalCount = results.length;
-        results = results.filter(r => r.contentType === contentTypeFilter);
-        if (results.length < originalCount) {
-            console.log(`Filtered to ${results.length} ${contentTypeFilter} result${results.length === 1 ? '' : 's'} (from ${originalCount} total)`);
-            console.log('');
-        }
-    }
     // Display results
     if (results.length === 0) {
         console.log('No results found.');
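The search CLI change moves content-type filtering from a post-hoc filter over returned results to a `contentType` field on the search options, so the engine can pick the matching index and still return a full `top_k` of results. A rough sketch of the two behaviours side by side (types simplified; not the package's actual exports):

```ts
// Sketch only: simplified result/option shapes for illustration.
interface ResultLike {
  contentType: 'text' | 'image';
}

interface SearchOptionsLike {
  top_k?: number;
  contentType?: 'text' | 'image';
}

// 2.1.0 behaviour: search everything, then drop non-matching results afterwards,
// which could leave fewer than top_k results after filtering.
function postFilter(results: ResultLike[], filter?: string): ResultLike[] {
  if (!filter || filter === 'all') return results;
  return results.filter(r => r.contentType === filter);
}

// 2.1.1 behaviour: push the filter into the options before searching.
function buildSearchOptions(topK: number, filter?: 'text' | 'image' | 'all'): SearchOptionsLike {
  const options: SearchOptionsLike = { top_k: topK };
  if (filter && filter !== 'all') {
    options.contentType = filter;
  }
  return options;
}
```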
package/dist/core/binary-index-format.d.ts
CHANGED

@@ -25,10 +25,19 @@ export interface BinaryIndexData {
         id: number;
         vector: Float32Array;
     }>;
+    hasContentTypeGroups?: boolean;
+    textVectors?: Array<{
+        id: number;
+        vector: Float32Array;
+    }>;
+    imageVectors?: Array<{
+        id: number;
+        vector: Float32Array;
+    }>;
 }
 export declare class BinaryIndexFormat {
     /**
-     * Save index data to binary format
+     * Save index data to binary format (original format for backward compatibility)
      *
      * File structure:
      * - Header (24 bytes): dimensions, maxElements, M, efConstruction, seed, currentSize
@@ -39,7 +48,24 @@ export declare class BinaryIndexFormat {
      */
     static save(indexPath: string, data: BinaryIndexData): Promise<void>;
     /**
-     *
+     * Save index data to grouped binary format
+     *
+     * File structure:
+     * - Extended Header (40 bytes):
+     *   - Original 6 fields (24 bytes)
+     *   - hasGroups flag (4 bytes)
+     *   - textOffset (4 bytes)
+     *   - textCount (4 bytes)
+     *   - imageOffset (4 bytes)
+     *   - imageCount (4 bytes)
+     * - Data section: [text vectors...][image vectors...]
+     *
+     * @param indexPath Path to save the binary index file
+     * @param data Index data to serialize
+     */
+    static saveGrouped(indexPath: string, data: BinaryIndexData): Promise<void>;
+    /**
+     * Load index data from binary format (supports both original and grouped formats)
      *
      * Uses zero-copy Float32Array views for efficient loading.
      * Copies the views to ensure data persistence after buffer lifecycle.
package/dist/core/binary-index-format.js
CHANGED

@@ -17,7 +17,7 @@
 import { readFileSync, writeFileSync } from 'fs';
 export class BinaryIndexFormat {
     /**
-     * Save index data to binary format
+     * Save index data to binary format (original format for backward compatibility)
      *
      * File structure:
      * - Header (24 bytes): dimensions, maxElements, M, efConstruction, seed, currentSize
@@ -66,7 +66,115 @@ export class BinaryIndexFormat {
         writeFileSync(indexPath, Buffer.from(buffer));
     }
     /**
-     *
+     * Save index data to grouped binary format
+     *
+     * File structure:
+     * - Extended Header (40 bytes):
+     *   - Original 6 fields (24 bytes)
+     *   - hasGroups flag (4 bytes)
+     *   - textOffset (4 bytes)
+     *   - textCount (4 bytes)
+     *   - imageOffset (4 bytes)
+     *   - imageCount (4 bytes)
+     * - Data section: [text vectors...][image vectors...]
+     *
+     * @param indexPath Path to save the binary index file
+     * @param data Index data to serialize
+     */
+    static async saveGrouped(indexPath, data) {
+        if (!data.hasContentTypeGroups || !data.textVectors || !data.imageVectors) {
+            // Fallback to original format
+            return this.save(indexPath, data);
+        }
+        const headerSize = 44; // Extended header: 24 + 20 bytes (hasGroups + textOffset + textCount + imageOffset + imageCount)
+        const vectorSize = 4 + (data.dimensions * 4); // id + vector
+        // Calculate offsets and total size
+        const textOffset = headerSize;
+        const imageOffset = textOffset + (data.textVectors.length * vectorSize);
+        const totalSize = imageOffset + (data.imageVectors.length * vectorSize);
+        const buffer = new ArrayBuffer(totalSize);
+        const view = new DataView(buffer);
+        let offset = 0;
+        // Write extended header (40 bytes, all little-endian)
+        if (offset + 40 > buffer.byteLength) {
+            throw new Error(`Header write would exceed buffer bounds: offset=${offset}, headerSize=40, bufferSize=${buffer.byteLength}`);
+        }
+        view.setUint32(offset, data.dimensions, true);
+        offset += 4;
+        view.setUint32(offset, data.maxElements, true);
+        offset += 4;
+        view.setUint32(offset, data.M, true);
+        offset += 4;
+        view.setUint32(offset, data.efConstruction, true);
+        offset += 4;
+        view.setUint32(offset, data.seed, true);
+        offset += 4;
+        view.setUint32(offset, data.currentSize, true);
+        offset += 4;
+        // Extended fields
+        view.setUint32(offset, 1, true);
+        offset += 4; // hasGroups = 1
+        view.setUint32(offset, textOffset, true);
+        offset += 4;
+        view.setUint32(offset, data.textVectors.length, true);
+        offset += 4;
+        view.setUint32(offset, imageOffset, true);
+        offset += 4;
+        view.setUint32(offset, data.imageVectors.length, true);
+        offset += 4;
+        // Write text vectors
+        for (const item of data.textVectors) {
+            // Ensure 4-byte alignment
+            if (offset % 4 !== 0) {
+                throw new Error(`Offset ${offset} is not 4-byte aligned`);
+            }
+            // Check bounds before writing
+            if (offset + 4 > buffer.byteLength) {
+                throw new Error(`ID write would exceed buffer bounds: offset=${offset}, bufferSize=${buffer.byteLength}`);
+            }
+            // Write vector ID
+            view.setUint32(offset, item.id, true);
+            offset += 4;
+            // Check bounds for vector data
+            const vectorDataSize = item.vector.length * 4;
+            if (offset + vectorDataSize > buffer.byteLength) {
+                throw new Error(`Vector data write would exceed buffer bounds: offset=${offset}, dataSize=${vectorDataSize}, bufferSize=${buffer.byteLength}`);
+            }
+            // Write vector data
+            for (let i = 0; i < item.vector.length; i++) {
+                view.setFloat32(offset, item.vector[i], true);
+                offset += 4;
+            }
+        }
+        // Write image vectors
+        for (const item of data.imageVectors) {
+            // Ensure 4-byte alignment
+            if (offset % 4 !== 0) {
+                throw new Error(`Offset ${offset} is not 4-byte aligned`);
+            }
+            // Check bounds before writing
+            if (offset + 4 > buffer.byteLength) {
+                throw new Error(`ID write would exceed buffer bounds: offset=${offset}, bufferSize=${buffer.byteLength}`);
+            }
+            // Write vector ID
+            view.setUint32(offset, item.id, true);
+            offset += 4;
+            // Check bounds for vector data
+            const vectorDataSize = item.vector.length * 4;
+            if (offset + vectorDataSize > buffer.byteLength) {
+                throw new Error(`Vector data write would exceed buffer bounds: offset=${offset}, dataSize=${vectorDataSize}, bufferSize=${buffer.byteLength}`);
+            }
+            // Write vector data
+            for (let i = 0; i < item.vector.length; i++) {
+                view.setFloat32(offset, item.vector[i], true);
+                offset += 4;
+            }
+        }
+        // Write to file
+        writeFileSync(indexPath, Buffer.from(buffer));
+    }
+    /**
+     * Load index data from binary format (supports both original and grouped formats)
      *
      * Uses zero-copy Float32Array views for efficient loading.
      * Copies the views to ensure data persistence after buffer lifecycle.
@@ -78,7 +186,7 @@ export class BinaryIndexFormat {
         const buffer = readFileSync(indexPath);
         const view = new DataView(buffer.buffer, buffer.byteOffset, buffer.byteLength);
         let offset = 0;
-        // Read header (24 bytes, all little-endian)
+        // Read basic header (24 bytes, all little-endian)
         const dimensions = view.getUint32(offset, true);
         offset += 4;
         const maxElements = view.getUint32(offset, true);
@@ -91,32 +199,93 @@ export class BinaryIndexFormat {
         offset += 4;
         const currentSize = view.getUint32(offset, true);
         offset += 4;
-        //
-        const
-
-        //
-
-
+        // Check if this is the extended grouped format (40+ bytes header)
+        const hasGroups = buffer.byteLength >= 40 ? view.getUint32(offset, true) : 0;
+        if (hasGroups === 1 && buffer.byteLength >= 40) {
+            // Load grouped format
+            const textOffset = view.getUint32(offset + 4, true);
+            const textCount = view.getUint32(offset + 8, true);
+            const imageOffset = view.getUint32(offset + 12, true);
+            const imageCount = view.getUint32(offset + 16, true);
+            // Load text vectors
+            const textVectors = [];
+            offset = textOffset;
+            for (let i = 0; i < textCount; i++) {
+                // Ensure 4-byte alignment
+                if (offset % 4 !== 0) {
+                    throw new Error(`Offset ${offset} is not 4-byte aligned`);
+                }
+                // Read vector ID
+                const id = view.getUint32(offset, true);
+                offset += 4;
+                // Zero-copy Float32Array view
+                const vectorView = new Float32Array(buffer.buffer, buffer.byteOffset + offset, dimensions);
+                // Copy to avoid buffer lifecycle issues
+                const vector = new Float32Array(vectorView);
+                offset += dimensions * 4;
+                textVectors.push({ id, vector });
             }
-        //
-        const
-        offset
-
-
-
-
-
-
+            // Load image vectors
+            const imageVectors = [];
+            offset = imageOffset;
+            for (let i = 0; i < imageCount; i++) {
+                // Ensure 4-byte alignment
+                if (offset % 4 !== 0) {
+                    throw new Error(`Offset ${offset} is not 4-byte aligned`);
+                }
+                // Read vector ID
+                const id = view.getUint32(offset, true);
+                offset += 4;
+                // Zero-copy Float32Array view
+                const vectorView = new Float32Array(buffer.buffer, buffer.byteOffset + offset, dimensions);
+                // Copy to avoid buffer lifecycle issues
+                const vector = new Float32Array(vectorView);
+                offset += dimensions * 4;
+                imageVectors.push({ id, vector });
+            }
+            // Combine all vectors for backward compatibility
+            const allVectors = [...textVectors, ...imageVectors];
+            return {
+                dimensions,
+                maxElements,
+                M,
+                efConstruction,
+                seed,
+                currentSize,
+                vectors: allVectors,
+                hasContentTypeGroups: true,
+                textVectors,
+                imageVectors
+            };
+        }
+        else {
+            // Load original format
+            const vectors = [];
+            for (let i = 0; i < currentSize; i++) {
+                // Ensure 4-byte alignment (should always be true with our format)
+                if (offset % 4 !== 0) {
+                    throw new Error(`Offset ${offset} is not 4-byte aligned`);
+                }
+                // Read vector ID
+                const id = view.getUint32(offset, true);
+                offset += 4;
+                // Zero-copy Float32Array view (fast!)
+                const vectorView = new Float32Array(buffer.buffer, buffer.byteOffset + offset, dimensions);
+                // Copy to avoid buffer lifecycle issues
+                const vector = new Float32Array(vectorView);
+                offset += dimensions * 4;
+                vectors.push({ id, vector });
+            }
+            return {
+                dimensions,
+                maxElements,
+                M,
+                efConstruction,
+                seed,
+                currentSize,
+                vectors
+            };
         }
-        return {
-            dimensions,
-            maxElements,
-            M,
-            efConstruction,
-            seed,
-            currentSize,
-            vectors
-        };
     }
 }
 //# sourceMappingURL=binary-index-format.js.map
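The size and offset arithmetic used by `saveGrouped` can be summarised in a few lines. This is a sketch for reading the hunk above, not code from the package; it uses the same constants as the implementation (44-byte extended header, 4-byte little-endian id plus `dimensions` float32 values per vector):

```ts
// Sketch: layout of the grouped binary index file as written by saveGrouped.
function groupedLayout(dimensions: number, textCount: number, imageCount: number) {
  const headerSize = 44;                 // 24-byte base header + 5 extended uint32 fields
  const vectorSize = 4 + dimensions * 4; // uint32 id + dimensions * float32
  const textOffset = headerSize;         // text vectors start immediately after the header
  const imageOffset = textOffset + textCount * vectorSize;
  const totalSize = imageOffset + imageCount * vectorSize;
  return { headerSize, vectorSize, textOffset, imageOffset, totalSize };
}

// Example: 512-dimensional vectors, 1000 text + 200 image entries
// -> vectorSize = 2052 bytes, imageOffset = 44 + 1000 * 2052 = 2,052,044,
//    totalSize = 2,052,044 + 200 * 2052 = 2,462,444 bytes (~2.3 MB).
```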
package/dist/core/ingestion.d.ts
CHANGED

@@ -162,9 +162,13 @@ export declare class IngestionPipeline {
      */
     private storeDocumentsAndChunksWithContentTypes;
     /**
-     * Update vector index with new embeddings
+     * Update vector index with new embeddings (supports grouped content type storage)
      */
     private updateVectorIndex;
+    /**
+     * Filter documents based on ingestion mode to avoid processing incompatible content types
+     */
+    private filterDocumentsByMode;
     /**
      * Converts MIME type to simple content type for embedding function
      * @param mimeType - MIME type string (e.g., 'text/plain', 'image/jpeg')
package/dist/core/ingestion.js
CHANGED

@@ -287,21 +287,30 @@ export class IngestionPipeline {
         try {
             // Phase 1: File Discovery and Processing with Content-Type Detection
             console.log('\n--- Phase 1: File Discovery and Processing ---');
-            const
-
+            const mode = options.mode || 'text';
+            const fileOptions = {
+                recursive: true,
+                maxFileSize: 10 * 1024 * 1024, // 10MB
+                ...options.fileOptions,
+                mode
+            };
+            const fileResult = await discoverAndProcessFiles(path, fileOptions, this.pathManager);
+            // Additional filtering as fallback (should be minimal with mode-aware discovery)
+            const filteredResult = this.filterDocumentsByMode(fileResult, mode);
+            if (filteredResult.documents.length === 0) {
                 console.log('No documents found to process');
                 return {
                     documentsProcessed: 0,
                     chunksCreated: 0,
                     embeddingsGenerated: 0,
-                    documentErrors:
+                    documentErrors: filteredResult.processingResult.errors.length,
                     embeddingErrors: 0,
                     processingTimeMs: Date.now() - startTime,
                     contentIds: []
                 };
             }
             // Content-type detection and routing
-            const contentTypeStats = this.analyzeContentTypes(
+            const contentTypeStats = this.analyzeContentTypes(filteredResult.documents);
             console.log(`📊 Content analysis: ${contentTypeStats.text} text, ${contentTypeStats.image} image, ${contentTypeStats.other} other files`);
             // Phase 2: Document Chunking with Content-Type Awareness
             console.log('\n--- Phase 2: Document Chunking ---');
@@ -309,7 +318,7 @@ export class IngestionPipeline {
                 chunkSize: config.chunk_size,
                 chunkOverlap: config.chunk_overlap
             };
-            const chunkingResult = await this.chunkDocumentsWithContentTypes(
+            const chunkingResult = await this.chunkDocumentsWithContentTypes(filteredResult.documents, effectiveChunkConfig, options.mode);
             if (chunkingResult.totalChunks === 0) {
                 console.log('No chunks created from documents');
                 return {
@@ -334,10 +343,10 @@ export class IngestionPipeline {
             const endTime = Date.now();
             const processingTimeMs = endTime - startTime;
             const result = {
-                documentsProcessed:
+                documentsProcessed: filteredResult.documents.length,
                 chunksCreated: chunkingResult.totalChunks,
                 embeddingsGenerated: embeddingResult.embeddings.length,
-                documentErrors:
+                documentErrors: filteredResult.processingResult.errors.length,
                 embeddingErrors: embeddingResult.errors,
                 processingTimeMs,
                 contentIds
@@ -595,16 +604,35 @@ export class IngestionPipeline {
         return contentIds;
     }
     /**
-     * Update vector index with new embeddings
+     * Update vector index with new embeddings (supports grouped content type storage)
      */
     async updateVectorIndex(embeddings) {
+        console.log('updateVectorIndex called with', embeddings.length, 'embeddings');
         if (embeddings.length === 0) {
            console.log('No embeddings to add to vector index');
            return;
         }
         console.log(`Adding ${embeddings.length} vector${embeddings.length === 1 ? '' : 's'} to search index...`);
         try {
-
+            // Group embeddings by content type for optimized storage
+            const groupedEmbeddings = embeddings.reduce((groups, embedding) => {
+                const contentType = embedding.contentType || 'text';
+                if (!groups[contentType]) {
+                    groups[contentType] = [];
+                }
+                groups[contentType].push(embedding);
+                return groups;
+            }, {});
+            const textEmbeddings = groupedEmbeddings.text || [];
+            const imageEmbeddings = groupedEmbeddings.image || [];
+            console.log(`Grouped: ${textEmbeddings.length} text, ${imageEmbeddings.length} image vectors`);
+            // Use grouped storage method if available, fallback to regular method
+            if (this.indexManager.addGroupedEmbeddings) {
+                await this.indexManager.addGroupedEmbeddings(textEmbeddings, imageEmbeddings);
+            }
+            else {
+                await this.indexManager.addVectors(embeddings);
+            }
             console.log(`✓ Vector index updated successfully with ${embeddings.length} new vectors`);
         }
         catch (error) {
@@ -612,6 +640,45 @@ export class IngestionPipeline {
             throw error;
         }
     }
+    /**
+     * Filter documents based on ingestion mode to avoid processing incompatible content types
+     */
+    filterDocumentsByMode(fileResult, mode) {
+        if (mode === 'multimodal') {
+            // In multimodal mode, keep all documents
+            return fileResult;
+        }
+        // In text mode, filter out image documents
+        const filteredDocuments = fileResult.documents.filter(doc => {
+            const contentType = doc.metadata?.contentType || 'text';
+            const isCompatible = contentType === 'text' ||
+                contentType.startsWith('text/') ||
+                contentType === 'application/pdf' ||
+                contentType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document';
+            if (!isCompatible) {
+                console.log(`⚠️ Skipping ${doc.source} (${contentType}) - not compatible with text mode`);
+            }
+            return isCompatible;
+        });
+        // Update processing result to reflect filtering
+        const filteredProcessingResult = {
+            ...fileResult.processingResult,
+            skippedFiles: [
+                ...(fileResult.processingResult.skippedFiles || []),
+                ...fileResult.documents
+                    .filter(doc => !filteredDocuments.includes(doc))
+                    .map(doc => ({
+                    path: doc.source,
+                    reason: `Content type not compatible with ${mode} mode`
+                }))
+            ]
+        };
+        return {
+            documents: filteredDocuments,
+            discoveryResult: fileResult.discoveryResult,
+            processingResult: filteredProcessingResult
+        };
+    }
     /**
      * Converts MIME type to simple content type for embedding function
      * @param mimeType - MIME type string (e.g., 'text/plain', 'image/jpeg')
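The ingestion changes hang together as one flow: embeddings are bucketed by content type, handed to the index manager's grouped method when it exists, and the index is later persisted in the grouped binary format. The grouping step itself, written as a standalone helper with a minimal embedding type (the real `EmbeddingResult` carries more fields), looks roughly like this:

```ts
// Sketch only: minimal shape of an embedding for the grouping step.
interface EmbeddingLike {
  embedding_id: string;
  vector: Float32Array;
  contentType?: 'text' | 'image';
}

function groupByContentType(embeddings: EmbeddingLike[]) {
  const text: EmbeddingLike[] = [];
  const image: EmbeddingLike[] = [];
  for (const embedding of embeddings) {
    // Embeddings without an explicit contentType are treated as text, matching the diff.
    (embedding.contentType === 'image' ? image : text).push(embedding);
  }
  return { text, image };
}
```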
package/dist/core/reranking-strategies.js
CHANGED

@@ -194,7 +194,7 @@ export class TextDerivedRerankingStrategy {
         catch (error) {
             console.warn(`Failed to generate description for image ${imagePath}: ${error instanceof Error ? error.message : 'Unknown error'}`);
             // Fallback to filename-based description
-            const filename = imagePath.split('/').pop() || imagePath
+            const filename = imagePath.split('/').pop() || imagePath;
             return `Image file: ${filename}`;
         }
     }
@@ -211,17 +211,16 @@ export class TextDerivedRerankingStrategy {
         // Step 1: Convert images to text descriptions
         const processedResults = await Promise.all(results.map(async (result) => {
             if (result.contentType === 'image') {
-                // Generate text description for image
-                const description = await this.generateImageDescription(result.
+                // Generate text description for image
+                const description = await this.generateImageDescription(result.content);
                 return {
                     ...result,
                     content: description,
-                    contentType: 'text', // Change to 'text' so cross-encoder will process it
                     originalContent: result.content,
                     originalContentType: result.contentType,
                     metadata: {
                         ...result.metadata,
-                        originalImagePath: result.
+                        originalImagePath: result.content,
                         generatedDescription: description
                     }
                 };
package/dist/core/search.js
CHANGED

@@ -139,7 +139,8 @@ export class SearchEngine {
         const searchStartTime = performance.now();
         let searchResult;
         try {
-
+            const contentType = options.contentType;
+            searchResult = this.indexManager.search(queryVector, topK, contentType);
         }
         catch (error) {
             if (error instanceof Error && error.message.includes('No embedding ID found for hash')) {
package/dist/core/types.d.ts
CHANGED
package/dist/core/vector-index.d.ts
CHANGED

@@ -64,5 +64,9 @@ export declare class VectorIndex {
      * Resize index to accommodate more vectors
      */
     resizeIndex(newMaxElements: number): void;
+    /**
+     * Get index options (for external access to configuration)
+     */
+    getOptions(): VectorIndexOptions;
 }
 //# sourceMappingURL=vector-index.d.ts.map
package/dist/core/vector-index.js
CHANGED

@@ -321,5 +321,11 @@ export class VectorIndex {
             throw new Error(`Failed to resize index: ${error}`);
         }
     }
+    /**
+     * Get index options (for external access to configuration)
+     */
+    getOptions() {
+        return { ...this.options };
+    }
 }
 //# sourceMappingURL=vector-index.js.map
package/dist/file-processor.d.ts
CHANGED

@@ -8,6 +8,8 @@ export interface FileProcessorOptions {
     recursive?: boolean;
     /** Maximum file size in bytes (default: 10MB) */
     maxFileSize?: number;
+    /** Processing mode to filter compatible files */
+    mode?: 'text' | 'multimodal';
 }
 /**
  * Default options for file processing
package/dist/file-processor.js
CHANGED

@@ -188,6 +188,15 @@ async function discoverFilesRecursive(dirPath, options) {
             // Check file size based on content type
             const stats = await fs.stat(fullPath);
             const contentType = getContentType(fullPath);
+            // Filter by mode: skip incompatible content types
+            const mode = options.mode || 'text';
+            if (mode === 'text' && contentType === 'image') {
+                result.skipped.push({
+                    path: fullPath,
+                    reason: `Image files not supported in text mode. Use --mode multimodal for image processing.`
+                });
+                continue;
+            }
             // Different size limits for different content types
             const maxSize = contentType === 'image'
                 ? 50 * 1024 * 1024 // 50MB for images
@@ -250,6 +259,17 @@ export async function discoverFiles(path, options = DEFAULT_FILE_PROCESSOR_OPTIO
         };
     }
     const contentType = getContentType(resolvedPath);
+    // Filter by mode: skip incompatible content types
+    const mode = options.mode || 'text';
+    if (mode === 'text' && contentType === 'image') {
+        return {
+            files: [],
+            skipped: [{
+                    path: resolvedPath,
+                    reason: `Image files not supported in text mode. Use --mode multimodal for image processing.`
+                }]
+        };
+    }
     // Check file size based on content type
     const maxSize = contentType === 'image'
         ? 50 * 1024 * 1024 // 50MB for images
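For discovery, the new `mode` field on `FileProcessorOptions` decides whether image files are processed or reported as skipped. A small usage sketch (option values taken from the defaults visible in the hunks above; the variable names are illustrative only):

```ts
// Sketch: mode-aware discovery options. In text mode, image files are returned under
// `skipped` with the reason string shown above; in multimodal mode they pass through
// (subject to the 50MB image size limit).
const textModeOptions = {
  recursive: true,
  maxFileSize: 10 * 1024 * 1024, // 10MB default for text content
  mode: 'text' as const,
};

const multimodalOptions = {
  ...textModeOptions,
  mode: 'multimodal' as const,
};
```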
package/dist/index-manager.d.ts
CHANGED

@@ -7,12 +7,16 @@ export interface IndexStats {
 export declare class IndexManager {
     private modelName?;
     private vectorIndex;
+    private textIndex?;
+    private imageIndex?;
     private db;
     private indexPath;
     private dbPath;
     private isInitialized;
     private hashToEmbeddingId;
     private embeddingIdToHash;
+    private groupedEmbeddings?;
+    private vectorIndexOptions;
     constructor(indexPath: string, dbPath: string, dimensions: number, modelName?: string | undefined);
     /**
      * Initialize the index manager and load existing index if available
@@ -30,6 +34,10 @@ export declare class IndexManager {
      * Requirements: 5.3 - When new documents are added THEN system SHALL append new chunks and vectors without rebuilding existing index
      */
     addVectors(embeddings: EmbeddingResult[]): Promise<void>;
+    /**
+     * Add grouped embeddings by content type (for new grouped format)
+     */
+    addGroupedEmbeddings(textEmbeddings: EmbeddingResult[], imageEmbeddings: EmbeddingResult[]): Promise<void>;
     /**
      * Rebuild the entire index from scratch
      * Requirements: 5.2, 5.4 - Create full index rebuild functionality for model changes or document deletions
@@ -68,10 +76,18 @@ export declare class IndexManager {
      * Save the vector index to disk
      */
     saveIndex(): Promise<void>;
+    /**
+     * Create specialized indexes for text and image content when grouped data is available
+     */
+    private createSpecializedIndexes;
+    /**
+     * Save index with content type grouping (for new grouped format)
+     */
+    saveGroupedIndex(textEmbeddings: EmbeddingResult[], imageEmbeddings: EmbeddingResult[]): Promise<void>;
     /**
      * Search for similar vectors
      */
-    search(queryVector: Float32Array, k?: number): {
+    search(queryVector: Float32Array, k?: number, contentType?: 'text' | 'image' | 'combined'): {
         embeddingIds: string[];
         distances: number[];
     };
package/dist/index-manager.js
CHANGED

@@ -1,26 +1,33 @@
 import { VectorIndex } from './core/vector-index.js';
+import { BinaryIndexFormat } from './core/binary-index-format.js';
 import { openDatabase, getSystemInfo, setSystemInfo } from './core/db.js';
 import { config, getModelDefaults } from './core/config.js';
 export class IndexManager {
     modelName;
     vectorIndex;
+    textIndex;
+    imageIndex;
     db = null;
     indexPath;
     dbPath;
     isInitialized = false;
     hashToEmbeddingId = new Map();
     embeddingIdToHash = new Map();
+    groupedEmbeddings;
+    vectorIndexOptions;
     constructor(indexPath, dbPath, dimensions, modelName) {
         this.modelName = modelName;
         this.indexPath = indexPath;
         this.dbPath = dbPath;
-        //
-        this.
+        // Store options for creating specialized indexes
+        this.vectorIndexOptions = {
             dimensions: dimensions,
             maxElements: 100000, // Start with 100k capacity
             efConstruction: 200,
             M: 16
-        }
+        };
+        // Initialize with provided dimensions from config
+        this.vectorIndex = new VectorIndex(indexPath, this.vectorIndexOptions);
     }
     /**
      * Initialize the index manager and load existing index if available
@@ -47,6 +54,8 @@ export class IndexManager {
             // Only try to load existing index if not forcing recreation
             console.log('Loading existing vector index...');
             await this.vectorIndex.loadIndex();
+            // Check if the loaded index has grouped data and create specialized indexes
+            await this.createSpecializedIndexes();
         }
         // Always populate the embedding ID mapping from existing database entries
         // This is needed both for new and existing indexes
@@ -55,7 +64,8 @@ export class IndexManager {
             this.hashEmbeddingId(chunk.embedding_id); // This will populate the mapping
         }
         this.isInitialized = true;
-
+        const vectorCount = this.vectorIndex.getCurrentCount();
+        console.log(`Index manager initialized with ${vectorCount} vectors${this.textIndex && this.imageIndex ? ' (multi-graph mode)' : ''}`);
         }
         catch (error) {
             throw new Error(`Failed to initialize index manager: ${error}`);
@@ -153,6 +163,31 @@ export class IndexManager {
             throw new Error(`Failed to add vectors to index: ${error instanceof Error ? error.message : 'Unknown error'}`);
         }
     }
+    /**
+     * Add grouped embeddings by content type (for new grouped format)
+     */
+    async addGroupedEmbeddings(textEmbeddings, imageEmbeddings) {
+        if (!this.isInitialized) {
+            throw new Error('Index manager not initialized');
+        }
+        console.log(`addGroupedEmbeddings: text=${textEmbeddings.length}, image=${imageEmbeddings.length}`);
+        const allEmbeddings = [...textEmbeddings, ...imageEmbeddings];
+        if (allEmbeddings.length === 0) {
+            return;
+        }
+        try {
+            // Store grouped information for later saving
+            this.groupedEmbeddings = { text: textEmbeddings, image: imageEmbeddings };
+            console.log('addGroupedEmbeddings: stored grouped embeddings');
+            // Add all embeddings to the index (maintains current behavior)
+            await this.addVectors(allEmbeddings);
+            console.log('addGroupedEmbeddings: addVectors completed');
+            // The saveIndex method will now use grouped format if groupedEmbeddings exists
+        }
+        catch (error) {
+            throw new Error(`Failed to add grouped embeddings to index: ${error instanceof Error ? error.message : 'Unknown error'}`);
+        }
+    }
     /**
      * Rebuild the entire index from scratch
      * Requirements: 5.2, 5.4 - Create full index rebuild functionality for model changes or document deletions
@@ -349,16 +384,122 @@ export class IndexManager {
         if (!this.isInitialized) {
             throw new Error('Index manager not initialized');
         }
-
+        // If we have grouped embeddings, save in grouped format
+        if (this.groupedEmbeddings) {
+            console.log('IndexManager: Saving in grouped format');
+            await this.saveGroupedIndex(this.groupedEmbeddings.text, this.groupedEmbeddings.image);
+            // Clear grouped data after saving
+            this.groupedEmbeddings = undefined;
+        }
+        else {
+            console.log('IndexManager: Saving in standard format');
+            await this.vectorIndex.saveIndex();
+        }
+    }
+    /**
+     * Create specialized indexes for text and image content when grouped data is available
+     */
+    async createSpecializedIndexes() {
+        try {
+            // Load the index data to check if it has grouped information
+            const indexData = await BinaryIndexFormat.load(this.indexPath);
+            if (indexData.hasContentTypeGroups && indexData.textVectors && indexData.imageVectors) {
+                // Only create specialized indexes if we have both text and image vectors
+                // In text-only mode, textVectors would be populated but imageVectors empty
+                // In multimodal mode, both would be populated
+                const hasTextVectors = indexData.textVectors.length > 0;
+                const hasImageVectors = indexData.imageVectors.length > 0;
+                if (hasTextVectors && hasImageVectors) {
+                    console.log('Creating specialized indexes for content type filtering...');
+                    // Create text-only index
+                    this.textIndex = new VectorIndex(`${this.indexPath}.text`, this.vectorIndexOptions);
+                    await this.textIndex.initialize();
+                    this.textIndex.addVectors(indexData.textVectors);
+                    console.log(`✓ Text index created with ${indexData.textVectors.length} vectors`);
+                    // Create image-only index
+                    this.imageIndex = new VectorIndex(`${this.indexPath}.image`, this.vectorIndexOptions);
+                    await this.imageIndex.initialize();
+                    this.imageIndex.addVectors(indexData.imageVectors);
+                    console.log(`✓ Image index created with ${indexData.imageVectors.length} vectors`);
+                    console.log('✓ Specialized indexes ready for content type filtering');
+                }
+                else if (hasTextVectors) {
+                    console.log('Text-only index detected - using combined index for all searches');
+                    // In text-only mode, we don't need specialized indexes
+                    // The combined index (vectorIndex) already contains all text vectors
+                }
+            }
+        }
+        catch (error) {
+            console.warn('Failed to create specialized indexes, falling back to combined index:', error);
+            // Continue without specialized indexes - search will still work with combined index
+        }
+    }
+    /**
+     * Save index with content type grouping (for new grouped format)
+     */
+    async saveGroupedIndex(textEmbeddings, imageEmbeddings) {
+        if (!this.isInitialized) {
+            throw new Error('Index manager not initialized');
+        }
+        console.log(`saveGroupedIndex: text=${textEmbeddings.length}, image=${imageEmbeddings.length}`);
+        // Group vectors by content type
+        const textVectors = textEmbeddings.map((embedding) => ({
+            id: this.hashEmbeddingId(embedding.embedding_id),
+            vector: embedding.vector
+        }));
+        const imageVectors = imageEmbeddings.map((embedding) => ({
+            id: this.hashEmbeddingId(embedding.embedding_id),
+            vector: embedding.vector
+        }));
+        // Get index parameters
+        const options = this.vectorIndex.getOptions();
+        const allVectors = [...textVectors, ...imageVectors];
+        console.log(`saveGroupedIndex: dimensions=${options.dimensions}, totalVectors=${allVectors.length}`);
+        const indexData = {
+            dimensions: options.dimensions,
+            maxElements: options.maxElements,
+            M: options.M || 16,
+            efConstruction: options.efConstruction || 200,
+            seed: options.seed || 100,
+            currentSize: textVectors.length + imageVectors.length,
+            vectors: allVectors, // Required for backward compatibility
+            hasContentTypeGroups: true,
+            textVectors,
+            imageVectors
+        };
+        console.log('saveGroupedIndex: Calling BinaryIndexFormat.saveGrouped');
+        // Save using grouped format
+        await BinaryIndexFormat.saveGrouped(this.indexPath, indexData);
+        console.log(`✓ Saved grouped index with ${textVectors.length} text and ${imageVectors.length} image vectors`);
     }
     /**
      * Search for similar vectors
      */
-    search(queryVector, k = 5) {
+    search(queryVector, k = 5, contentType) {
         if (!this.isInitialized) {
             throw new Error('Index manager not initialized');
         }
-
+        // Select the appropriate index based on content type
+        let targetIndex;
+        // If we have specialized indexes (multimodal mode), use them for filtering
+        if (this.textIndex && this.imageIndex) {
+            if (contentType === 'text') {
+                targetIndex = this.textIndex;
+            }
+            else if (contentType === 'image') {
+                targetIndex = this.imageIndex;
+            }
+            else {
+                // 'combined' or undefined
+                targetIndex = this.vectorIndex;
+            }
+        }
+        else {
+            // No specialized indexes (text-only mode) - ignore contentType and use combined index
+            targetIndex = this.vectorIndex;
+        }
+        const results = targetIndex.search(queryVector, k);
         // Convert numeric IDs back to embedding IDs
         const embeddingIds = results.neighbors.map(id => this.unhashEmbeddingId(id));
         return {
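With both specialized graphs loaded, the extended `search` signature routes a query to the text-only, image-only, or combined index; in a text-only index the extra argument is simply ignored. A sketch of a caller using the new signature (the interface below mirrors only the declaration added in index-manager.d.ts):

```ts
// Sketch only: narrowed interface matching the new search signature.
interface IndexManagerLike {
  search(
    queryVector: Float32Array,
    k?: number,
    contentType?: 'text' | 'image' | 'combined'
  ): { embeddingIds: string[]; distances: number[] };
}

function searchImagesOnly(manager: IndexManagerLike, queryVector: Float32Array) {
  // Hits the image-only graph when specialized indexes exist; otherwise falls back
  // to the combined graph and the contentType argument has no effect.
  return manager.search(queryVector, 5, 'image');
}
```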
package/dist/multimodal/clip-embedder.js
CHANGED

@@ -338,73 +338,78 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
         if (!this.textModel || !this.tokenizer) {
             throw new Error('CLIP text model or tokenizer not initialized');
         }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        // Generate text embedding using CLIPTextModelWithProjection
-        const output = await this.textModel(tokens);
-        // Extract embedding from text_embeds (no pixel_values dependency)
-        const embedding = new Float32Array(output.text_embeds.data);
-        // Validate embedding dimensions and values
-        if (embedding.length !== this.dimensions) {
-            throw new Error(`CLIP embedding dimension mismatch: expected ${this.dimensions}, got ${embedding.length}`);
-        }
-        // Validate that all values are finite numbers
-        const invalidValues = Array.from(embedding).filter(val => !isFinite(val) || isNaN(val));
-        if (invalidValues.length > 0) {
-            throw new Error(`CLIP embedding contains ${invalidValues.length} invalid values`);
-        }
-        // Validate embedding quality - should not be all zeros
-        const nonZeroValues = Array.from(embedding).filter(val => Math.abs(val) > 1e-8);
-        if (nonZeroValues.length === 0) {
-            throw new Error('CLIP embedding is all zeros');
-        }
-        // Calculate embedding magnitude before normalization for quality assessment
-        const magnitudeBeforeNorm = Math.sqrt(Array.from(embedding).reduce((sum, val) => sum + val * val, 0));
-        if (magnitudeBeforeNorm < 1e-6) {
-            throw new Error(`CLIP embedding has critically low magnitude: ${magnitudeBeforeNorm.toExponential(3)}`);
-        }
-        // Apply L2-normalization (CLIP models are trained with normalized embeddings)
-        this.normalizeEmbedding(embedding);
-        // Verify normalization was successful
-        const magnitudeAfterNorm = Math.sqrt(Array.from(embedding).reduce((sum, val) => sum + val * val, 0));
-        if (Math.abs(magnitudeAfterNorm - 1.0) > 0.01) {
-            console.warn(`Warning: Embedding normalization may be imprecise (magnitude: ${magnitudeAfterNorm.toFixed(6)})`);
-        }
-        // Log text embedding generation
-        console.log(`[CLIP] Generated text embedding for: "${processedText.substring(0, 30)}${processedText.length > 30 ? '...' : ''}"`);
-        // Generate unique embedding ID
-        const embeddingId = this.generateEmbeddingId(processedText, 'text');
-        return {
-            embedding_id: embeddingId,
-            vector: embedding,
-            contentType: 'text',
-            metadata: {
-                originalText: text,
-                processedText: processedText,
-                textLength: processedText.length,
-                embeddingMagnitudeBeforeNorm: magnitudeBeforeNorm,
-                embeddingMagnitudeAfterNorm: magnitudeAfterNorm,
-                normalized: true,
-                modelName: this.modelName,
-                modelType: this.modelType,
-                dimensions: this.dimensions
+        try {
+            // Use the validated CLIPTextModelWithProjection approach (no pixel_values errors)
+            // Tokenize text with CLIP's requirements
+            // The tokenizer handles truncation at 77 TOKENS (not characters)
+            const tokens = await this.tokenizer(processedText, {
+                padding: true,
+                truncation: true,
+                max_length: 77, // CLIP's text sequence length limit (77 tokens)
+                return_tensors: 'pt'
+            });
+            // Log token information for debugging (only in development)
+            if (process.env.NODE_ENV === 'development') {
+                const tokenIds = tokens.input_ids?.data || [];
+                const actualTokenCount = Array.from(tokenIds).filter((id) => id !== 0).length;
+                if (actualTokenCount >= 77) {
+                    console.warn(`Text truncated by tokenizer: "${processedText.substring(0, 50)}..." (truncated to 77 tokens)`);
+                }
             }
-
+            // Generate text embedding using CLIPTextModelWithProjection
+            const output = await this.textModel(tokens);
+            // Extract embedding from text_embeds (no pixel_values dependency)
+            const embedding = new Float32Array(output.text_embeds.data);
+            // Validate embedding dimensions and values
+            if (embedding.length !== this.dimensions) {
+                throw new Error(`CLIP embedding dimension mismatch: expected ${this.dimensions}, got ${embedding.length}`);
+            }
+            // Validate that all values are finite numbers
+            const invalidValues = Array.from(embedding).filter(val => !isFinite(val) || isNaN(val));
+            if (invalidValues.length > 0) {
+                throw new Error(`CLIP embedding contains ${invalidValues.length} invalid values`);
+            }
+            // Validate embedding quality - should not be all zeros
+            const nonZeroValues = Array.from(embedding).filter(val => Math.abs(val) > 1e-8);
+            if (nonZeroValues.length === 0) {
+                throw new Error('CLIP embedding is all zeros');
+            }
+            // Calculate embedding magnitude before normalization for quality assessment
+            const magnitudeBeforeNorm = Math.sqrt(Array.from(embedding).reduce((sum, val) => sum + val * val, 0));
+            if (magnitudeBeforeNorm < 1e-6) {
+                throw new Error(`CLIP embedding has critically low magnitude: ${magnitudeBeforeNorm.toExponential(3)}`);
+            }
+            // Apply L2-normalization (CLIP models are trained with normalized embeddings)
+            this.normalizeEmbedding(embedding);
+            // Verify normalization was successful
+            const magnitudeAfterNorm = Math.sqrt(Array.from(embedding).reduce((sum, val) => sum + val * val, 0));
+            if (Math.abs(magnitudeAfterNorm - 1.0) > 0.01) {
+                console.warn(`Warning: Embedding normalization may be imprecise (magnitude: ${magnitudeAfterNorm.toFixed(6)})`);
+            }
+            // Log text embedding generation
+            console.log(`[CLIP] Generated text embedding for: "${processedText.substring(0, 30)}${processedText.length > 30 ? '...' : ''}"`);
+            // Generate unique embedding ID
+            const embeddingId = this.generateEmbeddingId(processedText, 'text');
+            return {
+                embedding_id: embeddingId,
+                vector: embedding,
+                contentType: 'text',
+                metadata: {
+                    originalText: text,
+                    processedText: processedText,
+                    textLength: processedText.length,
+                    embeddingMagnitudeBeforeNorm: magnitudeBeforeNorm,
+                    embeddingMagnitudeAfterNorm: magnitudeAfterNorm,
+                    normalized: true,
+                    modelName: this.modelName,
+                    modelType: this.modelType,
+                    dimensions: this.dimensions
+                }
+            };
+        }
+        catch (error) {
+            throw error;
+        }
     }
     // =============================================================================
     // IMAGE EMBEDDING METHODS
package/package.json
CHANGED