rag-lite-ts 2.0.0 → 2.0.2

This diff shows the published contents of two package versions as they appear in their public registry, and is provided for informational purposes only.
@@ -268,6 +268,33 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
  }
  }
  // =============================================================================
+ // NORMALIZATION UTILITIES
+ // =============================================================================
+ /**
+ * Apply L2-normalization to an embedding vector
+ *
+ * L2-normalization ensures that all embeddings have unit length (magnitude = 1),
+ * which is essential for CLIP models as they were trained with normalized embeddings.
+ * This normalization makes cosine similarity calculations more reliable and ensures
+ * that vector magnitudes don't affect similarity scores.
+ *
+ * @param embedding - The embedding vector to normalize (modified in-place)
+ * @returns The normalized embedding vector (same reference as input)
+ * @private
+ */
+ normalizeEmbedding(embedding) {
+ // Calculate L2 norm (magnitude)
+ const magnitude = Math.sqrt(Array.from(embedding).reduce((sum, val) => sum + val * val, 0));
+ // Avoid division by zero
+ if (magnitude > 0) {
+ // Normalize each component by dividing by magnitude
+ for (let i = 0; i < embedding.length; i++) {
+ embedding[i] /= magnitude;
+ }
+ }
+ return embedding;
+ }
+ // =============================================================================
  // TEXT EMBEDDING METHODS
  // =============================================================================
  /**
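
A consequence of the unit-length guarantee introduced above: cosine similarity between two L2-normalized vectors reduces to a plain dot product, so downstream similarity code can skip the magnitude division entirely. A minimal standalone sketch of that contract (the vectors are illustrative, not values from the package):

```typescript
// Minimal sketch of the unit-length contract; vectors are illustrative.
function l2Normalize(v: Float32Array): Float32Array {
  const mag = Math.sqrt(v.reduce((s, x) => s + x * x, 0));
  if (mag > 0) {
    for (let i = 0; i < v.length; i++) v[i] /= mag;
  }
  return v;
}

const a = l2Normalize(new Float32Array([3, 4])); // -> [0.6, 0.8]
const b = l2Normalize(new Float32Array([6, 8])); // same direction -> [0.6, 0.8]
// For unit vectors, cosine similarity is just the dot product:
let dot = 0;
for (let i = 0; i < a.length; i++) dot += a[i] * b[i];
console.log(dot); // ≈ 1.0
```
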
@@ -277,11 +304,11 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
  * pixel_values errors. Text is tokenized with CLIP's 77 token limit and
  * automatically truncated if necessary.
  *
- * Returns a 512-dimensional embedding vector in the unified CLIP embedding space,
- * which is directly comparable to image embeddings for cross-modal search.
+ * Returns a 512-dimensional L2-normalized embedding vector in the unified CLIP
+ * embedding space, which is directly comparable to image embeddings for cross-modal search.
  *
  * @param text - The text to embed (will be trimmed and validated)
- * @returns EmbeddingResult with 512-dimensional vector and metadata
+ * @returns EmbeddingResult with 512-dimensional normalized vector and metadata
  * @throws {Error} If text is empty, model not loaded, or embedding fails
  *
  * @example
@@ -312,15 +339,13 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
  throw new Error('CLIP text model or tokenizer not initialized');
  }
  try {
- // Validate and truncate text if necessary (CLIP has a 77 token limit)
- this.validateTextLength(text);
- const finalProcessedText = this.truncateText(processedText);
  // Use the validated CLIPTextModelWithProjection approach (no pixel_values errors)
  // Tokenize text with CLIP's requirements
- const tokens = await this.tokenizer(finalProcessedText, {
+ // The tokenizer handles truncation at 77 TOKENS (not characters)
+ const tokens = await this.tokenizer(processedText, {
  padding: true,
  truncation: true,
- max_length: 77, // CLIP's text sequence length limit
+ max_length: 77, // CLIP's text sequence length limit (77 tokens)
  return_tensors: 'pt'
  });
  // Log token information for debugging (only in development)
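
The truncation change above moves the cut from characters to tokens: a 77-character cap and CLIP's 77-token limit are very different budgets, since a typical word costs one token but several characters, so pre-truncating by characters discards far more text than necessary. A sketch of token-level truncation with @huggingface/transformers (the model id is illustrative and not necessarily the checkpoint this package loads):

```typescript
import { AutoTokenizer } from '@huggingface/transformers';

// Illustrative checkpoint; the package's actual model may differ.
const tokenizer = await AutoTokenizer.from_pretrained('Xenova/clip-vit-base-patch32');
const tokens = await tokenizer('a very long caption that keeps going ...', {
  padding: true,
  truncation: true, // cut at max_length TOKENS, not characters
  max_length: 77,   // CLIP's text sequence limit
});
console.log(tokens.input_ids.dims); // e.g. [1, 77] once the limit is reached
```
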
@@ -328,7 +353,7 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
  const tokenIds = tokens.input_ids?.data || [];
  const actualTokenCount = Array.from(tokenIds).filter((id) => id !== 0).length;
  if (actualTokenCount >= 77) {
- console.warn(`Text truncated: "${finalProcessedText.substring(0, 50)}..." (${actualTokenCount}+ tokens -> 77 tokens)`);
+ console.warn(`Text truncated by tokenizer: "${processedText.substring(0, 50)}..." (truncated to 77 tokens)`);
  }
  }
  // Generate text embedding using CLIPTextModelWithProjection
@@ -349,22 +374,31 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
  if (nonZeroValues.length === 0) {
  throw new Error('CLIP embedding is all zeros');
  }
- // Calculate embedding magnitude for quality assessment
- const magnitude = Math.sqrt(Array.from(embedding).reduce((sum, val) => sum + val * val, 0));
- if (magnitude < 1e-6) {
- throw new Error(`CLIP embedding has critically low magnitude: ${magnitude.toExponential(3)}`);
+ // Calculate embedding magnitude before normalization for quality assessment
+ const magnitudeBeforeNorm = Math.sqrt(Array.from(embedding).reduce((sum, val) => sum + val * val, 0));
+ if (magnitudeBeforeNorm < 1e-6) {
+ throw new Error(`CLIP embedding has critically low magnitude: ${magnitudeBeforeNorm.toExponential(3)}`);
+ }
+ // Apply L2-normalization (CLIP models are trained with normalized embeddings)
+ this.normalizeEmbedding(embedding);
+ // Verify normalization was successful
+ const magnitudeAfterNorm = Math.sqrt(Array.from(embedding).reduce((sum, val) => sum + val * val, 0));
+ if (Math.abs(magnitudeAfterNorm - 1.0) > 0.01) {
+ console.warn(`Warning: Embedding normalization may be imprecise (magnitude: ${magnitudeAfterNorm.toFixed(6)})`);
  }
  // Generate unique embedding ID
- const embeddingId = this.generateEmbeddingId(finalProcessedText, 'text');
+ const embeddingId = this.generateEmbeddingId(processedText, 'text');
  return {
  embedding_id: embeddingId,
  vector: embedding,
  contentType: 'text',
  metadata: {
  originalText: text,
- processedText: finalProcessedText,
- textLength: finalProcessedText.length,
- embeddingMagnitude: magnitude,
+ processedText: processedText,
+ textLength: processedText.length,
+ embeddingMagnitudeBeforeNorm: magnitudeBeforeNorm,
+ embeddingMagnitudeAfterNorm: magnitudeAfterNorm,
+ normalized: true,
  modelName: this.modelName,
  modelType: this.modelType,
  dimensions: this.dimensions
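
The metadata now records both the pre- and post-normalization magnitudes plus a `normalized` flag, so callers can sanity-check an embedding without recomputing norms. A hedged consumer sketch (the `embedText` method name and embedder setup are assumptions; the result shape follows the diff):

```typescript
// `embedder` stands in for a loaded CLIPEmbedder instance; the method name
// `embedText` is an assumption — the diff shows only the method body.
declare const embedder: { embedText(text: string): Promise<any> };

const result = await embedder.embedText('a photo of a cat');
if (result.metadata.normalized) {
  console.log(result.metadata.embeddingMagnitudeBeforeNorm); // raw model-output magnitude
  console.log(result.metadata.embeddingMagnitudeAfterNorm);  // ≈ 1.0 after normalization
}
```
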
@@ -389,10 +423,10 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
  * - Converted to proper pixel_values format using AutoProcessor
  * - Normalized for CLIP vision model
  *
- * Returns a 512-dimensional embedding vector directly comparable to text embeddings.
+ * Returns a 512-dimensional L2-normalized embedding vector directly comparable to text embeddings.
  *
  * @param imagePath - Local file path or URL to the image
- * @returns EmbeddingResult with 512-dimensional vector and metadata
+ * @returns EmbeddingResult with 512-dimensional normalized vector and metadata
  * @throws {Error} If image not found, unsupported format, or embedding fails
  *
  * @example
@@ -459,10 +493,17 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
  if (nonZeroValues.length === 0) {
  throw new Error('CLIP image embedding is all zeros');
  }
- // Calculate embedding magnitude for quality assessment
- const magnitude = Math.sqrt(Array.from(embedding).reduce((sum, val) => sum + val * val, 0));
- if (magnitude < 1e-6) {
- throw new Error(`CLIP image embedding has critically low magnitude: ${magnitude.toExponential(3)}`);
+ // Calculate embedding magnitude before normalization for quality assessment
+ const magnitudeBeforeNorm = Math.sqrt(Array.from(embedding).reduce((sum, val) => sum + val * val, 0));
+ if (magnitudeBeforeNorm < 1e-6) {
+ throw new Error(`CLIP image embedding has critically low magnitude: ${magnitudeBeforeNorm.toExponential(3)}`);
+ }
+ // Apply L2-normalization (CLIP models are trained with normalized embeddings)
+ this.normalizeEmbedding(embedding);
+ // Verify normalization was successful
+ const magnitudeAfterNorm = Math.sqrt(Array.from(embedding).reduce((sum, val) => sum + val * val, 0));
+ if (Math.abs(magnitudeAfterNorm - 1.0) > 0.01) {
+ console.warn(`Warning: Image embedding normalization may be imprecise (magnitude: ${magnitudeAfterNorm.toFixed(6)})`);
  }
  // Generate unique embedding ID
  const embeddingId = this.generateEmbeddingId(processedPath, 'image');
@@ -472,7 +513,9 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
  contentType: 'image',
  metadata: {
  imagePath: processedPath,
- embeddingMagnitude: magnitude,
+ embeddingMagnitudeBeforeNorm: magnitudeBeforeNorm,
+ embeddingMagnitudeAfterNorm: magnitudeAfterNorm,
+ normalized: true,
  modelName: this.modelName,
  modelType: this.modelType,
  dimensions: this.dimensions
@@ -637,8 +680,9 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
  const { createTextBatchProcessor } = await import('../core/batch-processing-optimizer.js');
  const batchProcessor = createTextBatchProcessor();
  // Convert to EmbeddingBatchItem format
+ // Let tokenizer handle truncation at 77 tokens (not characters)
  const batchItems = textItems.map(item => ({
- content: this.truncateText(item.content.trim()),
+ content: item.content.trim(),
  contentType: item.contentType,
  metadata: item.metadata
  }));
@@ -728,7 +772,8 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
  */
  async processBatchText(textItems) {
  // Prepare texts for batch processing
- const texts = textItems.map(item => this.truncateText(item.content.trim()));
+ // Let tokenizer handle truncation at 77 tokens (not characters)
+ const texts = textItems.map(item => item.content.trim());
  // Tokenize all texts in batch
  const tokensBatch = await Promise.all(texts.map(text => this.tokenizer(text, {
  padding: true,
@@ -749,6 +794,8 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
  if (embedding.length !== this.dimensions) {
  throw new Error(`CLIP embedding dimension mismatch for item ${i}: expected ${this.dimensions}, got ${embedding.length}`);
  }
+ // Apply L2-normalization (CLIP models are trained with normalized embeddings)
+ this.normalizeEmbedding(embedding);
  const embeddingId = this.generateEmbeddingId(item.content, 'text');
  results.push({
  embedding_id: embeddingId,
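
Because the batch path now normalizes too, batch and single-item embeddings land on the same unit sphere and are directly comparable. A quick hedged check over a batch result (the `{ embedding_id, vector }` shape follows the single-item result above; names are illustrative):

```typescript
// Hedged check: every vector produced by the batch path should be unit length.
function assertUnitLength(results: Array<{ embedding_id: string; vector: Float32Array }>) {
  for (const r of results) {
    let sq = 0;
    for (const x of r.vector) sq += x * x;
    console.assert(Math.abs(Math.sqrt(sq) - 1) < 0.01, `non-unit vector: ${r.embedding_id}`);
  }
}
```
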
package/dist/search.d.ts CHANGED
@@ -1,25 +1,44 @@
  /**
- * Public API SearchEngine - Simple constructor interface with internal factory usage
+ * Public API SearchEngine - Simple constructor with Chameleon Architecture
  *
- * This class provides a clean, simple API while using the new core architecture
- * internally. It handles dependency injection automatically.
+ * This class provides a clean, simple API that automatically adapts to the mode
+ * (text or multimodal) stored in the database during ingestion. The system detects
+ * the mode and creates the appropriate embedder and reranker without user intervention.
+ *
+ * Chameleon Architecture Features:
+ * - Automatic mode detection from database configuration
+ * - Seamless switching between text and multimodal modes
+ * - Appropriate embedder selection (sentence-transformer or CLIP)
+ * - Mode-specific reranking strategies
  *
  * @example
  * ```typescript
- * // Simple usage
+ * // Simple usage - mode automatically detected from database
  * const search = new SearchEngine('./index.bin', './db.sqlite');
  * const results = await search.search('query');
  *
- * // With options
+ * // Works for both text and multimodal databases
+ * // Text mode: uses sentence-transformer embeddings
+ * // Multimodal mode: uses CLIP embeddings for cross-modal search
+ *
+ * // With options (advanced)
  * const search = new SearchEngine('./index.bin', './db.sqlite', {
- * embeddingModel: 'all-MiniLM-L6-v2',
  * enableReranking: true
  * });
  * ```
  */
- import { type TextSearchOptions } from './factories/index.js';
  import type { SearchResult, SearchOptions, EmbedFunction, RerankFunction } from './core/types.js';
- export interface SearchEngineOptions extends TextSearchOptions {
+ export interface SearchEngineOptions {
+ /** Embedding model name override */
+ embeddingModel?: string;
+ /** Embedding batch size override */
+ batchSize?: number;
+ /** Reranking model name override */
+ rerankingModel?: string;
+ /** Whether to enable reranking (default: true) */
+ enableReranking?: boolean;
+ /** Top-k results to return (default: from config) */
+ topK?: number;
  /** Custom embedding function (advanced usage) */
  embedFn?: EmbedFunction;
  /** Custom reranking function (advanced usage) */
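
With the `TextSearchOptions` inheritance dropped, every override is now declared inline and optional. A usage sketch (assuming `SearchEngine` is re-exported from the package root; paths and values are illustrative):

```typescript
import { SearchEngine } from 'rag-lite-ts';

// Every field is an optional override; the text/multimodal mode itself
// is detected from the database, not passed here.
const search = new SearchEngine('./index.bin', './db.sqlite', {
  enableReranking: true, // default: true
  topK: 10,              // default comes from config
  batchSize: 16,         // embedding batch size override
});
const results = await search.search('query');
```
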
@@ -33,7 +52,13 @@ export declare class SearchEngine {
  private initPromise;
  constructor(indexPath: string, dbPath: string, options?: SearchEngineOptions);
  /**
- * Initialize the search engine using the factory or direct injection
+ * Initialize the search engine using polymorphic factory or direct injection
+ *
+ * Chameleon Architecture Implementation:
+ * - Automatically detects mode from database (text or multimodal)
+ * - Creates appropriate embedder based on detected mode
+ * - Applies mode-specific reranking strategies
+ * - Provides seamless polymorphic behavior
  */
  private initialize;
  /**
package/dist/search.js CHANGED
@@ -1,24 +1,33 @@
  /**
- * Public API SearchEngine - Simple constructor interface with internal factory usage
+ * Public API SearchEngine - Simple constructor with Chameleon Architecture
  *
- * This class provides a clean, simple API while using the new core architecture
- * internally. It handles dependency injection automatically.
+ * This class provides a clean, simple API that automatically adapts to the mode
+ * (text or multimodal) stored in the database during ingestion. The system detects
+ * the mode and creates the appropriate embedder and reranker without user intervention.
+ *
+ * Chameleon Architecture Features:
+ * - Automatic mode detection from database configuration
+ * - Seamless switching between text and multimodal modes
+ * - Appropriate embedder selection (sentence-transformer or CLIP)
+ * - Mode-specific reranking strategies
  *
  * @example
  * ```typescript
- * // Simple usage
+ * // Simple usage - mode automatically detected from database
  * const search = new SearchEngine('./index.bin', './db.sqlite');
  * const results = await search.search('query');
  *
- * // With options
+ * // Works for both text and multimodal databases
+ * // Text mode: uses sentence-transformer embeddings
+ * // Multimodal mode: uses CLIP embeddings for cross-modal search
+ *
+ * // With options (advanced)
  * const search = new SearchEngine('./index.bin', './db.sqlite', {
- * embeddingModel: 'all-MiniLM-L6-v2',
  * enableReranking: true
  * });
  * ```
  */
  import { SearchEngine as CoreSearchEngine } from './core/search.js';
- import { TextSearchFactory } from './factories/index.js';
  export class SearchEngine {
  indexPath;
  dbPath;
@@ -42,7 +51,13 @@ export class SearchEngine {
  }
  }
  /**
- * Initialize the search engine using the factory or direct injection
+ * Initialize the search engine using polymorphic factory or direct injection
+ *
+ * Chameleon Architecture Implementation:
+ * - Automatically detects mode from database (text or multimodal)
+ * - Creates appropriate embedder based on detected mode
+ * - Applies mode-specific reranking strategies
+ * - Provides seamless polymorphic behavior
  */
  async initialize() {
  if (this.coreEngine) {
@@ -81,8 +96,11 @@ export class SearchEngine {
  this.coreEngine = new CoreSearchEngine(embedFn, indexManager, db, this.options.rerankFn, contentResolver);
  }
  else {
- // Use factory for standard initialization
- this.coreEngine = await TextSearchFactory.create(this.indexPath, this.dbPath, this.options);
+ // Use core polymorphic factory for automatic mode detection (Chameleon Architecture)
+ // This enables SearchEngine to automatically adapt to text or multimodal mode
+ // based on the configuration stored in the database during ingestion
+ const { PolymorphicSearchFactory } = await import('./core/polymorphic-search-factory.js');
+ this.coreEngine = await PolymorphicSearchFactory.create(this.indexPath, this.dbPath);
  }
  })();
  return this.initPromise;
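
The `await import(...)` above is a lazy-loading pattern: the polymorphic factory, and whatever embedders it pulls in, is only loaded when the standard initialization path actually runs. A sketch of the same pattern in isolation (the factory's internals are not part of this diff; the surrounding function name is illustrative):

```typescript
// Lazy dynamic import: the module's load cost is paid only on this code path.
async function buildCoreEngine(indexPath: string, dbPath: string) {
  const { PolymorphicSearchFactory } = await import('./core/polymorphic-search-factory.js');
  // The factory reads the ingestion mode from the database and wires up the
  // matching embedder/reranker (sentence-transformer for text, CLIP for multimodal).
  return PolymorphicSearchFactory.create(indexPath, dbPath);
}
```
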
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "rag-lite-ts",
- "version": "2.0.0",
+ "version": "2.0.2",
  "description": "Local-first TypeScript retrieval engine with Chameleon Multimodal Architecture for semantic search over text and image content",
  "type": "module",
  "main": "./dist/index.js",
@@ -31,9 +31,16 @@
  "build:test": "tsc --project tsconfig.test.json",
  "clean": "rimraf dist",
  "dev": "tsc --watch",
- "test": "npm run build:test && node --test dist/text/tokenizer.test.js dist/core/chunker.test.js dist/text/embedder.test.js dist/core/vector-index.test.js dist/index-manager.test.js dist/core/search.test.js dist/file-processor.test.js dist/mcp-server.test.js dist/preprocess.test.js dist/core/config.test.js dist/preprocessors/integration.test.js dist/cli/cli.test.js",
- "test:integration": "npm run build && npm run build:test && node --test dist/integration.test.js",
- "test:all": "npm run test && npm run test:integration",
+ "test": "npm run build:test && node --expose-gc --test --test-concurrency=1 dist/__tests__/core dist/__tests__/text dist/__tests__/preprocessors",
+ "test:verbose": "npm run build:test && node --expose-gc --test --test-concurrency=1 --test-reporter=tap dist/__tests__/core dist/__tests__/text dist/__tests__/preprocessors",
+ "test:core": "npm run build:test && node --expose-gc --test --test-concurrency=1 dist/__tests__/core",
+ "test:core:verbose": "npm run build:test && node --expose-gc --test --test-concurrency=1 --test-reporter=tap dist/__tests__/core",
+ "test:text": "npm run build:test && node --expose-gc --test --test-concurrency=1 dist/__tests__/text",
+ "test:preprocessors": "npm run build:test && node --expose-gc --test --test-concurrency=1 dist/__tests__/preprocessors",
+ "test:integration": "npm run build && npm run build:test && node --expose-gc --test --test-concurrency=1 dist/__tests__/integration",
+ "test:integration:verbose": "npm run build && npm run build:test && node --expose-gc --test --test-concurrency=1 --test-reporter=tap dist/__tests__/integration",
+ "test:all": "npm run build:test && node --expose-gc --test --test-concurrency=1 dist/__tests__",
+ "test:all:verbose": "npm run build:test && node --expose-gc --test --test-concurrency=1 --test-reporter=tap dist/__tests__",
  "prepublishOnly": "npm run clean && npm run build"
  },
  "keywords": [
@@ -71,6 +78,7 @@
  "dependencies": {
  "@huggingface/transformers": "^3.7.5",
  "@modelcontextprotocol/sdk": "^1.18.2",
+ "csv-parse": "^6.1.0",
  "hnswlib-wasm": "^0.8.2",
  "jsdom": "^27.0.0",
  "lru-cache": "^11.2.2",
@@ -84,6 +92,7 @@
  "@types/node": "^20.11.0",
  "js-yaml": "^4.1.0",
  "rimraf": "^5.0.5",
+ "tsx": "^4.20.6",
  "typescript": "^5.3.0"
  },
  "optionalDependencies": {