rag-lite-ts 2.0.4 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +815 -808
- package/dist/cli/indexer.js +2 -38
- package/dist/cli/search.d.ts +1 -1
- package/dist/cli/search.js +118 -9
- package/dist/cli.js +77 -94
- package/dist/config.js +3 -0
- package/dist/core/database-connection-manager.js +5 -9
- package/dist/core/db.js +173 -173
- package/dist/core/ingestion.js +50 -9
- package/dist/core/lazy-dependency-loader.d.ts +3 -8
- package/dist/core/lazy-dependency-loader.js +11 -29
- package/dist/core/mode-detection-service.js +1 -1
- package/dist/core/reranking-config.d.ts +1 -1
- package/dist/core/reranking-config.js +7 -16
- package/dist/core/reranking-factory.js +3 -184
- package/dist/core/reranking-strategies.js +5 -4
- package/dist/core/search.d.ts +10 -0
- package/dist/core/search.js +34 -11
- package/dist/factories/ingestion-factory.js +3 -1
- package/dist/mcp-server.js +147 -120
- package/dist/multimodal/clip-embedder.js +70 -71
- package/package.json +105 -105
package/dist/core/db.js
CHANGED

```diff
@@ -97,112 +97,112 @@ function enhanceSQLiteError(error, sql) {
 export async function initializeSchema(connection) {
     try {
         // Create documents table with content type support and content_id reference
-        await connection.run(`
-      CREATE TABLE IF NOT EXISTS documents (
-        id INTEGER PRIMARY KEY AUTOINCREMENT,
-        content_id TEXT, -- References content_metadata.id
-        source TEXT NOT NULL UNIQUE,
-        title TEXT NOT NULL,
-        content_type TEXT DEFAULT 'text',
-        metadata TEXT,
-        created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
-        updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,
-        FOREIGN KEY (content_id) REFERENCES content_metadata(id)
-      )
+        await connection.run(`
+      CREATE TABLE IF NOT EXISTS documents (
+        id INTEGER PRIMARY KEY AUTOINCREMENT,
+        content_id TEXT, -- References content_metadata.id
+        source TEXT NOT NULL UNIQUE,
+        title TEXT NOT NULL,
+        content_type TEXT DEFAULT 'text',
+        metadata TEXT,
+        created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
+        updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,
+        FOREIGN KEY (content_id) REFERENCES content_metadata(id)
+      )
         `);
         // Create chunks table with content type and metadata support
-        await connection.run(`
-      CREATE TABLE IF NOT EXISTS chunks (
-        id INTEGER PRIMARY KEY AUTOINCREMENT,
-        embedding_id TEXT NOT NULL UNIQUE,
-        document_id INTEGER NOT NULL,
-        content TEXT NOT NULL,
-        content_type TEXT DEFAULT 'text',
-        chunk_index INTEGER NOT NULL,
-        metadata TEXT,
-        created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
-        FOREIGN KEY (document_id) REFERENCES documents(id) ON DELETE CASCADE
-      )
+        await connection.run(`
+      CREATE TABLE IF NOT EXISTS chunks (
+        id INTEGER PRIMARY KEY AUTOINCREMENT,
+        embedding_id TEXT NOT NULL UNIQUE,
+        document_id INTEGER NOT NULL,
+        content TEXT NOT NULL,
+        content_type TEXT DEFAULT 'text',
+        chunk_index INTEGER NOT NULL,
+        metadata TEXT,
+        created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
+        FOREIGN KEY (document_id) REFERENCES documents(id) ON DELETE CASCADE
+      )
         `);
         // Create content_metadata table for unified content system
-        await connection.run(`
-      CREATE TABLE IF NOT EXISTS content_metadata (
-        id TEXT PRIMARY KEY, -- Hash-based content ID
-        storage_type TEXT NOT NULL CHECK (storage_type IN ('filesystem', 'content_dir')),
-        original_path TEXT, -- Original file path (filesystem only)
-        content_path TEXT NOT NULL, -- Actual storage path
-        display_name TEXT NOT NULL, -- User-friendly name
-        content_type TEXT NOT NULL, -- MIME type
-        file_size INTEGER NOT NULL, -- Size in bytes
-        content_hash TEXT NOT NULL, -- SHA-256 hash
-        created_at DATETIME DEFAULT CURRENT_TIMESTAMP
-      )
+        await connection.run(`
+      CREATE TABLE IF NOT EXISTS content_metadata (
+        id TEXT PRIMARY KEY, -- Hash-based content ID
+        storage_type TEXT NOT NULL CHECK (storage_type IN ('filesystem', 'content_dir')),
+        original_path TEXT, -- Original file path (filesystem only)
+        content_path TEXT NOT NULL, -- Actual storage path
+        display_name TEXT NOT NULL, -- User-friendly name
+        content_type TEXT NOT NULL, -- MIME type
+        file_size INTEGER NOT NULL, -- Size in bytes
+        content_hash TEXT NOT NULL, -- SHA-256 hash
+        created_at DATETIME DEFAULT CURRENT_TIMESTAMP
+      )
         `);
         // Create storage_stats table for basic content directory tracking
-        await connection.run(`
-      CREATE TABLE IF NOT EXISTS storage_stats (
-        id INTEGER PRIMARY KEY CHECK (id = 1),
-        content_dir_files INTEGER DEFAULT 0,
-        content_dir_size INTEGER DEFAULT 0,
-        filesystem_refs INTEGER DEFAULT 0,
-        last_cleanup DATETIME,
-        updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
-      )
+        await connection.run(`
+      CREATE TABLE IF NOT EXISTS storage_stats (
+        id INTEGER PRIMARY KEY CHECK (id = 1),
+        content_dir_files INTEGER DEFAULT 0,
+        content_dir_size INTEGER DEFAULT 0,
+        filesystem_refs INTEGER DEFAULT 0,
+        last_cleanup DATETIME,
+        updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
+      )
         `);
         // Create system_info table for mode persistence and model tracking
-        await connection.run(`
-      CREATE TABLE IF NOT EXISTS system_info (
-        id INTEGER PRIMARY KEY CHECK (id = 1),
-
-        -- Core mode and model information
-        mode TEXT NOT NULL DEFAULT 'text' CHECK (mode IN ('text', 'multimodal')),
-        model_name TEXT NOT NULL DEFAULT 'sentence-transformers/all-MiniLM-L6-v2',
-        model_type TEXT NOT NULL DEFAULT 'sentence-transformer' CHECK (model_type IN ('sentence-transformer', 'clip')),
-        model_dimensions INTEGER NOT NULL DEFAULT 384,
-        model_version TEXT NOT NULL DEFAULT '',
-
-        -- Content type support (JSON array)
-        supported_content_types TEXT NOT NULL DEFAULT '["text"]',
-
-        -- Reranking configuration
-        reranking_strategy TEXT DEFAULT 'cross-encoder' CHECK (
-          reranking_strategy IN ('cross-encoder', 'text-derived', '
-        ),
-        reranking_model TEXT,
-        reranking_config TEXT, -- JSON configuration for strategy-specific settings
-
-        -- Timestamps
-        created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
-        updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
-      )
+        await connection.run(`
+      CREATE TABLE IF NOT EXISTS system_info (
+        id INTEGER PRIMARY KEY CHECK (id = 1),
+
+        -- Core mode and model information
+        mode TEXT NOT NULL DEFAULT 'text' CHECK (mode IN ('text', 'multimodal')),
+        model_name TEXT NOT NULL DEFAULT 'sentence-transformers/all-MiniLM-L6-v2',
+        model_type TEXT NOT NULL DEFAULT 'sentence-transformer' CHECK (model_type IN ('sentence-transformer', 'clip')),
+        model_dimensions INTEGER NOT NULL DEFAULT 384,
+        model_version TEXT NOT NULL DEFAULT '',
+
+        -- Content type support (JSON array)
+        supported_content_types TEXT NOT NULL DEFAULT '["text"]',
+
+        -- Reranking configuration
+        reranking_strategy TEXT DEFAULT 'cross-encoder' CHECK (
+          reranking_strategy IN ('cross-encoder', 'text-derived', 'disabled')
+        ),
+        reranking_model TEXT,
+        reranking_config TEXT, -- JSON configuration for strategy-specific settings
+
+        -- Timestamps
+        created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
+        updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
+      )
         `);
         // Clean slate approach - no migration logic needed
         // Users will perform fresh ingestion with the new architecture
         // Create indexes for performance
-        await connection.run(`
-      CREATE INDEX IF NOT EXISTS idx_chunks_document_id ON chunks(document_id)
+        await connection.run(`
+      CREATE INDEX IF NOT EXISTS idx_chunks_document_id ON chunks(document_id)
         `);
-        await connection.run(`
-      CREATE INDEX IF NOT EXISTS idx_chunks_embedding_id ON chunks(embedding_id)
+        await connection.run(`
+      CREATE INDEX IF NOT EXISTS idx_chunks_embedding_id ON chunks(embedding_id)
         `);
-        await connection.run(`
-      CREATE INDEX IF NOT EXISTS idx_documents_source ON documents(source)
+        await connection.run(`
+      CREATE INDEX IF NOT EXISTS idx_documents_source ON documents(source)
         `);
-        await connection.run(`
-      CREATE INDEX IF NOT EXISTS idx_chunks_content_type ON chunks(content_type)
+        await connection.run(`
+      CREATE INDEX IF NOT EXISTS idx_chunks_content_type ON chunks(content_type)
         `);
-        await connection.run(`
-      CREATE INDEX IF NOT EXISTS idx_documents_content_type ON documents(content_type)
+        await connection.run(`
+      CREATE INDEX IF NOT EXISTS idx_documents_content_type ON documents(content_type)
         `);
-        await connection.run(`
-      CREATE INDEX IF NOT EXISTS idx_documents_content_id ON documents(content_id)
+        await connection.run(`
+      CREATE INDEX IF NOT EXISTS idx_documents_content_id ON documents(content_id)
         `);
         // Create indexes for content metadata table for efficient lookup
-        await connection.run(`
-      CREATE INDEX IF NOT EXISTS idx_content_hash ON content_metadata(content_hash)
+        await connection.run(`
+      CREATE INDEX IF NOT EXISTS idx_content_hash ON content_metadata(content_hash)
         `);
-        await connection.run(`
-      CREATE INDEX IF NOT EXISTS idx_storage_type ON content_metadata(storage_type)
+        await connection.run(`
+      CREATE INDEX IF NOT EXISTS idx_storage_type ON content_metadata(storage_type)
         `);
         console.log('Database schema initialized successfully');
     }
@@ -308,24 +308,24 @@ export async function getChunksByEmbeddingIds(connection, embeddingIds) {
     }
     try {
        const placeholders = embeddingIds.map(() => '?').join(',');
-        const sql = `
-      SELECT
-        c.id,
-        c.embedding_id,
-        c.document_id,
-        c.content,
-        c.content_type,
-        c.chunk_index,
-        c.metadata,
-        c.created_at,
-        d.source as document_source,
-        d.title as document_title,
-        d.content_type as document_content_type,
-        d.content_id as document_content_id
-      FROM chunks c
-      JOIN documents d ON c.document_id = d.id
-      WHERE c.embedding_id IN (${placeholders})
-      ORDER BY c.chunk_index
+        const sql = `
+      SELECT
+        c.id,
+        c.embedding_id,
+        c.document_id,
+        c.content,
+        c.content_type,
+        c.chunk_index,
+        c.metadata,
+        c.created_at,
+        d.source as document_source,
+        d.title as document_title,
+        d.content_type as document_content_type,
+        d.content_id as document_content_id
+      FROM chunks c
+      JOIN documents d ON c.document_id = d.id
+      WHERE c.embedding_id IN (${placeholders})
+      ORDER BY c.chunk_index
         `;
         const results = await connection.all(sql, embeddingIds);
         // Parse metadata JSON strings back to objects
@@ -381,12 +381,12 @@ function validateContentType(contentType) {
  */
 export async function getSystemInfo(connection) {
     try {
-        const result = await connection.get(`
-      SELECT
-        mode, model_name, model_type, model_dimensions, model_version,
-        supported_content_types, reranking_strategy, reranking_model,
-        reranking_config, created_at, updated_at
-      FROM system_info WHERE id = 1
+        const result = await connection.get(`
+      SELECT
+        mode, model_name, model_type, model_dimensions, model_version,
+        supported_content_types, reranking_strategy, reranking_model,
+        reranking_config, created_at, updated_at
+      FROM system_info WHERE id = 1
         `);
         if (!result) {
             return null;
@@ -492,12 +492,12 @@ export async function setSystemInfo(connection, systemInfo) {
     }
     else {
         // Insert new row with provided values and defaults
-        const insertSql = `
-      INSERT INTO system_info (
-        id, mode, model_name, model_type, model_dimensions, model_version,
-        supported_content_types, reranking_strategy, reranking_model, reranking_config,
-        created_at, updated_at
-      ) VALUES (1, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
+        const insertSql = `
+      INSERT INTO system_info (
+        id, mode, model_name, model_type, model_dimensions, model_version,
+        supported_content_types, reranking_strategy, reranking_model, reranking_config,
+        created_at, updated_at
+      ) VALUES (1, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
         `;
         await connection.run(insertSql, [
             systemInfo.mode || 'text',
@@ -556,24 +556,24 @@ export async function getDocumentsByContentType(connection, contentType) {
 export async function getChunksByContentType(connection, contentType) {
     try {
         validateContentType(contentType);
-        const sql = `
-      SELECT
-        c.id,
-        c.embedding_id,
-        c.document_id,
-        c.content,
-        c.content_type,
-        c.chunk_index,
-        c.metadata,
-        c.created_at,
-        d.source as document_source,
-        d.title as document_title,
-        d.content_type as document_content_type,
-        d.content_id as document_content_id
-      FROM chunks c
-      JOIN documents d ON c.document_id = d.id
-      WHERE c.content_type = ?
-      ORDER BY d.source, c.chunk_index
+        const sql = `
+      SELECT
+        c.id,
+        c.embedding_id,
+        c.document_id,
+        c.content,
+        c.content_type,
+        c.chunk_index,
+        c.metadata,
+        c.created_at,
+        d.source as document_source,
+        d.title as document_title,
+        d.content_type as document_content_type,
+        d.content_id as document_content_id
+      FROM chunks c
+      JOIN documents d ON c.document_id = d.id
+      WHERE c.content_type = ?
+      ORDER BY d.source, c.chunk_index
         `;
         const results = await connection.all(sql, [contentType]);
         // Parse metadata JSON strings back to objects
@@ -594,16 +594,16 @@ export async function getChunksByContentType(connection, contentType) {
 export async function getContentTypeStatistics(connection) {
     try {
         // Get document statistics
-        const docStats = await connection.all(`
-      SELECT content_type, COUNT(*) as count
-      FROM documents
-      GROUP BY content_type
+        const docStats = await connection.all(`
+      SELECT content_type, COUNT(*) as count
+      FROM documents
+      GROUP BY content_type
         `);
         // Get chunk statistics
-        const chunkStats = await connection.all(`
-      SELECT content_type, COUNT(*) as count
-      FROM chunks
-      GROUP BY content_type
+        const chunkStats = await connection.all(`
+      SELECT content_type, COUNT(*) as count
+      FROM chunks
+      GROUP BY content_type
         `);
         // Get totals
         const totalDocs = await connection.get('SELECT COUNT(*) as count FROM documents');
@@ -672,11 +672,11 @@ export async function updateChunkMetadata(connection, chunkId, metadata) {
  */
 export async function insertContentMetadata(connection, contentMetadata) {
     try {
-        await connection.run(`
-      INSERT INTO content_metadata (
-        id, storage_type, original_path, content_path, display_name,
-        content_type, file_size, content_hash
-      ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+        await connection.run(`
+      INSERT INTO content_metadata (
+        id, storage_type, original_path, content_path, display_name,
+        content_type, file_size, content_hash
+      ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
         `, [
             contentMetadata.id,
             contentMetadata.storageType,
@@ -703,11 +703,11 @@ export async function insertContentMetadata(connection, contentMetadata) {
  */
 export async function getContentMetadata(connection, contentId) {
     try {
-        const result = await connection.get(`
-      SELECT id, storage_type, original_path, content_path, display_name,
-             content_type, file_size, content_hash, created_at
-      FROM content_metadata
-      WHERE id = ?
+        const result = await connection.get(`
+      SELECT id, storage_type, original_path, content_path, display_name,
+             content_type, file_size, content_hash, created_at
+      FROM content_metadata
+      WHERE id = ?
         `, [contentId]);
         if (!result) {
             return null;
@@ -736,11 +736,11 @@ export async function getContentMetadata(connection, contentId) {
  */
 export async function getContentMetadataByHash(connection, contentHash) {
     try {
-        const result = await connection.get(`
-      SELECT id, storage_type, original_path, content_path, display_name,
-             content_type, file_size, content_hash, created_at
-      FROM content_metadata
-      WHERE content_hash = ?
+        const result = await connection.get(`
+      SELECT id, storage_type, original_path, content_path, display_name,
+             content_type, file_size, content_hash, created_at
+      FROM content_metadata
+      WHERE content_hash = ?
         `, [contentHash]);
         if (!result) {
             return null;
@@ -769,12 +769,12 @@ export async function getContentMetadataByHash(connection, contentHash) {
  */
 export async function getContentMetadataByStorageType(connection, storageType) {
     try {
-        const results = await connection.all(`
-      SELECT id, storage_type, original_path, content_path, display_name,
-             content_type, file_size, content_hash, created_at
-      FROM content_metadata
-      WHERE storage_type = ?
-      ORDER BY created_at DESC
+        const results = await connection.all(`
+      SELECT id, storage_type, original_path, content_path, display_name,
+             content_type, file_size, content_hash, created_at
+      FROM content_metadata
+      WHERE storage_type = ?
+      ORDER BY created_at DESC
         `, [storageType]);
         return results.map((result) => ({
             id: result.id,
@@ -814,11 +814,11 @@ export async function deleteContentMetadata(connection, contentId) {
  */
 export async function getStorageStats(connection) {
     try {
-        const result = await connection.get(`
-      SELECT content_dir_files, content_dir_size, filesystem_refs,
-             last_cleanup, updated_at
-      FROM storage_stats
-      WHERE id = 1
+        const result = await connection.get(`
+      SELECT content_dir_files, content_dir_size, filesystem_refs,
+             last_cleanup, updated_at
+      FROM storage_stats
+      WHERE id = 1
         `);
         if (!result) {
             return null;
@@ -874,11 +874,11 @@ export async function updateStorageStats(connection, stats) {
     }
     else {
         // Insert new row with provided values and defaults
-        const insertSql = `
-      INSERT INTO storage_stats (
-        id, content_dir_files, content_dir_size, filesystem_refs,
-        last_cleanup, updated_at
-      ) VALUES (1, ?, ?, ?, ?, CURRENT_TIMESTAMP)
+        const insertSql = `
+      INSERT INTO storage_stats (
+        id, content_dir_files, content_dir_size, filesystem_refs,
+        last_cleanup, updated_at
+      ) VALUES (1, ?, ?, ?, ?, CURRENT_TIMESTAMP)
         `;
         await connection.run(insertSql, [
             stats.contentDirFiles || 0,
```
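Apart from re-indented SQL template literals, the only substantive change in db.js is the narrowed CHECK constraint on `system_info.reranking_strategy`. A minimal sketch of what the constraint enforces on a freshly initialized 2.1.0 database; the `Connection` shape is assumed from the `connection.run`/`connection.get` calls above, and `'hybrid'` stands in for any removed legacy value:

```ts
// Sketch, not part of the package: effect of the narrowed CHECK constraint.
interface Connection {
  run(sql: string, params?: unknown[]): Promise<void>;
  get(sql: string, params?: unknown[]): Promise<any>;
}

async function demoRerankingStrategyCheck(connection: Connection): Promise<void> {
  // Accepted after 2.1.0: 'cross-encoder', 'text-derived', 'disabled'
  await connection.run(
    `UPDATE system_info SET reranking_strategy = ? WHERE id = 1`,
    ['disabled']
  );
  try {
    // Any other value now violates the CHECK constraint in SQLite
    await connection.run(
      `UPDATE system_info SET reranking_strategy = ? WHERE id = 1`,
      ['hybrid']
    );
  } catch (err) {
    console.error('Rejected by CHECK constraint:', err);
  }
}
```

Note that `CREATE TABLE IF NOT EXISTS` does not alter an existing table, which fits the "clean slate" comment in the diff: the narrowed constraint only applies to databases created by fresh ingestion.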
package/dist/core/ingestion.js
CHANGED

```diff
@@ -9,6 +9,9 @@ import { config } from './config.js';
 import { DocumentPathManager } from './path-manager.js';
 import { existsSync } from 'fs';
 import { ContentManager } from './content-manager.js';
+import { createRequire } from 'module';
+// Create require for CommonJS modules in ES module context
+const require = createRequire(import.meta.url);
 /**
  * Main ingestion pipeline class
  * Coordinates the entire process from file discovery to vector storage
```
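The added import block is the standard Node.js bridge for loading CommonJS modules from ES module code; nothing about it is specific to rag-lite. A self-contained sketch of the pattern (the required package name is illustrative, not from the diff):

```ts
import { createRequire } from 'module';

// ES modules have no ambient `require`; createRequire builds one that
// resolves module paths relative to this file's URL.
const require = createRequire(import.meta.url);

// Illustrative only: load a hypothetical CommonJS-only dependency.
// const legacyLib = require('some-commonjs-package');
```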
```diff
@@ -198,7 +201,23 @@
         try {
             // Convert MIME type to simple content type for embedding function
             const contentTypeForEmbedding = this.getContentTypeForEmbedding(document.metadata?.contentType);
-            const embedding = await this.embedFn(chunk.text, contentTypeForEmbedding);
+            // For images, use the image path from metadata instead of text description
+            let contentForEmbedding = chunk.text;
+            if (contentTypeForEmbedding === 'image' && document.metadata) {
+                // Try to get image path from metadata (contentPath, originalPath, or source)
+                // contentPath is where the image is stored (from contentResult)
+                const imagePath = document.metadata.contentPath ||
+                    document.metadata.originalPath ||
+                    document.metadata.source;
+                if (imagePath) {
+                    contentForEmbedding = imagePath;
+                }
+                else {
+                    // Fallback: try to extract path from source if available
+                    console.warn(`Image chunk ${i + 1} missing image path in metadata, using text content as fallback`);
+                }
+            }
+            const embedding = await this.embedFn(contentForEmbedding, contentTypeForEmbedding);
             // Enhance embedding result with content type metadata
             if (!embedding.contentType) {
                 embedding.contentType = contentTypeForEmbedding;
@@ -444,7 +463,20 @@
         try {
             // Convert MIME type to simple content type for embedding function
             const contentTypeForEmbedding = this.getContentTypeForEmbedding(chunk.contentType);
-            const embedding = await this.embedFn(chunk.text, contentTypeForEmbedding);
+            // For images, use the image path from metadata instead of text description
+            let contentForEmbedding = chunk.text;
+            if (contentTypeForEmbedding === 'image' && chunk.metadata) {
+                // Try to get image path from metadata (originalPath or contentPath)
+                const imagePath = chunk.metadata.originalPath || chunk.metadata.contentPath || chunk.metadata.source;
+                if (imagePath) {
+                    contentForEmbedding = imagePath;
+                }
+                else {
+                    // Fallback: try to extract path from source if available
+                    console.warn(`Image chunk ${i + 1} missing image path in metadata, using text content as fallback`);
+                }
+            }
+            const embedding = await this.embedFn(contentForEmbedding, contentTypeForEmbedding);
             // Enhance embedding result with content type metadata if not already present
             if (!embedding.contentType) {
                 embedding.contentType = contentTypeForEmbedding;
```
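The two hunks above apply the same rule at two call sites: when a chunk is an image, hand the embedder a path to the image rather than its text description, falling back to the text when no path was recorded. Note the call sites probe the metadata fields in different orders (`contentPath` first in the first hunk, `originalPath` first in the second). A standalone sketch of the rule; the helper name and metadata type are illustrative, not part of the package:

```ts
// Hypothetical helper mirroring the logic duplicated in the two hunks above.
interface ImagePathMetadata {
  contentPath?: string;   // stored copy of the image (from contentResult)
  originalPath?: string;  // original file location
  source?: string;        // document source identifier
}

function selectContentForEmbedding(
  chunkText: string,
  contentType: string,
  metadata?: ImagePathMetadata
): string {
  if (contentType !== 'image' || !metadata) {
    return chunkText; // non-image chunks embed their text, as before
  }
  const imagePath =
    metadata.contentPath ?? metadata.originalPath ?? metadata.source;
  if (!imagePath) {
    // Mirrors the diff's fallback: warn and embed the text description
    console.warn('Image chunk missing image path in metadata, using text content as fallback');
    return chunkText;
  }
  return imagePath; // the CLIP-style embedder receives the image path
}
```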
```diff
@@ -585,21 +617,28 @@
      * @param mimeType - MIME type string (e.g., 'text/plain', 'image/jpeg')
      * @returns Simple content type ('text', 'image', etc.)
      */
-    getContentTypeForEmbedding(mimeType) {
-        if (!mimeType) {
+    getContentTypeForEmbedding(contentType) {
+        if (!contentType) {
+            return 'text';
+        }
+        // Handle simple content type strings (used by chunking)
+        if (contentType === 'image') {
+            return 'image';
+        }
+        else if (contentType === 'text') {
             return 'text';
         }
-        // Convert MIME types to simple content types
-        if (mimeType.startsWith('text/')) {
+        // Convert MIME types to simple content types (legacy support)
+        if (contentType.startsWith('text/')) {
             return 'text';
         }
-        else if (mimeType.startsWith('image/')) {
+        else if (contentType.startsWith('image/')) {
             return 'image';
         }
-        else if (mimeType === 'application/pdf') {
+        else if (contentType === 'application/pdf') {
             return 'text'; // PDFs are processed as text
         }
-        else if (mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') {
+        else if (contentType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') {
             return 'text'; // DOCX files are processed as text
         }
         else {
```
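`getContentTypeForEmbedding` now accepts the simple `'text'`/`'image'` strings produced by chunking as well as legacy MIME types. A standalone mirror of the mapping for illustration; the function name is hypothetical, and since the trailing `else` branch falls outside the hunk, its default here is an assumption:

```ts
// Illustrative re-statement of the mapping; not the package's own export.
function mapContentTypeForEmbedding(contentType?: string): 'text' | 'image' {
  if (!contentType) return 'text';
  if (contentType === 'image') return 'image'; // simple string from chunking
  if (contentType === 'text') return 'text';   // simple string from chunking
  if (contentType.startsWith('text/')) return 'text';   // legacy MIME
  if (contentType.startsWith('image/')) return 'image'; // legacy MIME
  if (contentType === 'application/pdf') return 'text'; // PDFs processed as text
  if (contentType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') {
    return 'text'; // DOCX processed as text
  }
  return 'text'; // assumed default; the final else branch is outside the hunk
}

console.log(mapContentTypeForEmbedding('image/png')); // 'image'
console.log(mapContentTypeForEmbedding(undefined));   // 'text'
```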
```diff
@@ -668,6 +707,7 @@
                     contentType: 'image',
                     contentId: contentResult.contentId,
                     storageType: contentResult.storageType,
+                    contentPath: contentResult.contentPath, // Store contentPath for embedding
                     originalPath: metadata.originalPath,
                     ...imageMetadata // Spread all image metadata fields
                 }
@@ -684,6 +724,7 @@
                     contentType: 'image',
                     contentId: contentResult.contentId,
                     storageType: contentResult.storageType,
+                    contentPath: contentResult.contentPath, // Store contentPath for embedding
                     originalPath: metadata.originalPath,
                     processingError: error instanceof Error ? error.message : String(error)
                 }
```
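Both the success and error paths now record `contentPath` in the chunk metadata, which is exactly what the image-path lookup in the embedding hunks reads first. An illustrative sketch of the resulting metadata shape, assembled from the fields visible above; the interface name is hypothetical:

```ts
// Hypothetical shape; field list taken from the two metadata blocks above.
interface ImageChunkMetadata {
  contentType: 'image';
  contentId: string;         // contentResult.contentId
  storageType: string;       // contentResult.storageType
  contentPath: string;       // new in 2.1.0: stored path, used for embedding
  originalPath?: string;     // metadata.originalPath
  processingError?: string;  // only set on the failure path
  [extra: string]: unknown;  // ...imageMetadata spread on the success path
}
```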
package/dist/core/lazy-dependency-loader.d.ts
CHANGED

```diff
@@ -59,15 +59,10 @@ export declare class LazyRerankerLoader {
      */
     static loadTextDerivedReranker(): Promise<RerankFunction>;
     /**
-     * Lazily load
-     *
+     * Lazily load CLIP AutoProcessor for consistent image preprocessing
+     * Shares processor instances across embedder instances to ensure identical preprocessing
      */
-    static loadMetadataReranker(): Promise<RerankFunction>;
-    /**
-     * Lazily load hybrid reranker for multimodal mode
-     * Combines multiple reranking strategies (uses text-derived for now)
-     */
-    static loadHybridReranker(): Promise<RerankFunction>;
+    static loadCLIPAutoProcessor(modelName: string): Promise<any>;
     /**
      * Check if a reranker is already loaded in cache
      */
```
package/dist/core/lazy-dependency-loader.js
CHANGED

```diff
@@ -198,32 +198,18 @@ export class LazyRerankerLoader {
         });
     }
     /**
-     * Lazily load
-     *
+     * Lazily load CLIP AutoProcessor for consistent image preprocessing
+     * Shares processor instances across embedder instances to ensure identical preprocessing
      */
-    static async loadMetadataReranker() {
-        const cacheKey =
+    static async loadCLIPAutoProcessor(modelName) {
+        const cacheKey = `processor:clip:${modelName}`;
         return this.cache.getOrLoad(cacheKey, async () => {
-            console.log(
-            // Dynamic import - only loaded when
-            const {
-            const
-            console.log(
-            return
-        });
-    }
-    /**
-     * Lazily load hybrid reranker for multimodal mode
-     * Combines multiple reranking strategies (uses text-derived for now)
-     */
-    static async loadHybridReranker() {
-        const cacheKey = 'reranker:hybrid';
-        return this.cache.getOrLoad(cacheKey, async () => {
-            console.log('🔄 Lazy loading hybrid reranker (multimodal)');
-            // For now, hybrid reranking uses text-derived
-            // TODO: Implement proper hybrid reranking in future tasks
-            console.log('🔄 Hybrid reranking not yet implemented, using text-derived');
-            return this.loadTextDerivedReranker();
+            console.log(`🔄 Lazy loading CLIP AutoProcessor: ${modelName}`);
+            // Dynamic import - only loaded when CLIP models are used
+            const { AutoProcessor } = await import('@huggingface/transformers');
+            const processor = await AutoProcessor.from_pretrained(modelName);
+            console.log(`✅ CLIP AutoProcessor loaded: ${modelName}`);
+            return processor;
         });
     }
     /**
@@ -371,12 +357,8 @@ export class LazyDependencyManager {
                 return LazyRerankerLoader.loadTextReranker();
             case 'text-derived':
                 return LazyRerankerLoader.loadTextDerivedReranker();
-            case 'metadata':
-                return LazyRerankerLoader.loadMetadataReranker();
-            case 'hybrid':
-                return LazyRerankerLoader.loadHybridReranker();
             default:
-                throw new Error(`Unknown reranking strategy '${strategy}'. Supported strategies: cross-encoder, text-derived,
+                throw new Error(`Unknown reranking strategy '${strategy}'. Supported strategies: cross-encoder, text-derived, disabled`);
         }
     }
     /**
```
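The metadata and hybrid rerankers are gone; in their place, `LazyRerankerLoader` caches one CLIP `AutoProcessor` per model name so every embedder instance preprocesses images identically. A usage sketch, assuming `@huggingface/transformers` is installed and a cache that returns the same instance for the same key; the checkpoint name is illustrative:

```ts
// Sketch: repeated loads with the same model name hit the cache entry
// keyed `processor:clip:${modelName}`, so from_pretrained runs only once.
const modelName = 'Xenova/clip-vit-base-patch32'; // illustrative checkpoint

const first = await LazyRerankerLoader.loadCLIPAutoProcessor(modelName);
const second = await LazyRerankerLoader.loadCLIPAutoProcessor(modelName);

console.log(first === second); // true: both resolve to the cached processor
```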
package/dist/core/mode-detection-service.js
CHANGED

```diff
@@ -526,7 +526,7 @@ export class ModeDetectionService {
      * @private
      */
     validateRerankingStrategy(strategy) {
-        const validStrategies = ['cross-encoder', 'text-derived', '
+        const validStrategies = ['cross-encoder', 'text-derived', 'disabled'];
         if (!validStrategies.includes(strategy)) {
             throw createError.validation(`Invalid reranking strategy '${strategy}'. Must be one of: ${validStrategies.join(', ')}`);
         }
```
package/dist/core/reranking-config.d.ts
CHANGED

```diff
@@ -4,7 +4,7 @@
  * Provides straightforward configuration types and validation for different
  * reranking strategies without complex interface patterns.
  */
-export type RerankingStrategyType = 'cross-encoder' | 'text-derived' | '
+export type RerankingStrategyType = 'cross-encoder' | 'text-derived' | 'disabled';
 export interface RerankingConfig {
     strategy: RerankingStrategyType;
     model?: string;
```