rag-lite-ts 2.0.0 → 2.0.2

This diff shows the published contents of two package versions as they appear in their public registry, and is provided for informational purposes only.
@@ -268,6 +268,33 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
  }
  }
  // =============================================================================
+ // NORMALIZATION UTILITIES
+ // =============================================================================
+ /**
+ * Apply L2-normalization to an embedding vector
+ *
+ * L2-normalization ensures that all embeddings have unit length (magnitude = 1),
+ * which is essential for CLIP models as they were trained with normalized embeddings.
+ * This normalization makes cosine similarity calculations more reliable and ensures
+ * that vector magnitudes don't affect similarity scores.
+ *
+ * @param embedding - The embedding vector to normalize (modified in-place)
+ * @returns The normalized embedding vector (same reference as input)
+ * @private
+ */
+ normalizeEmbedding(embedding) {
+ // Calculate L2 norm (magnitude)
+ const magnitude = Math.sqrt(Array.from(embedding).reduce((sum, val) => sum + val * val, 0));
+ // Avoid division by zero
+ if (magnitude > 0) {
+ // Normalize each component by dividing by magnitude
+ for (let i = 0; i < embedding.length; i++) {
+ embedding[i] /= magnitude;
+ }
+ }
+ return embedding;
+ }
+ // =============================================================================
  // TEXT EMBEDDING METHODS
  // =============================================================================
  /**
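
A consequence of the unit-length guarantee introduced above: cosine similarity between two L2-normalized vectors reduces to a plain dot product, so downstream similarity code can skip the magnitude division entirely. A minimal standalone sketch of that contract (the vectors are illustrative, not values from the package):

```typescript
// Minimal sketch of the unit-length contract; vectors are illustrative.
function l2Normalize(v: Float32Array): Float32Array {
  const mag = Math.sqrt(v.reduce((s, x) => s + x * x, 0));
  if (mag > 0) {
    for (let i = 0; i < v.length; i++) v[i] /= mag;
  }
  return v;
}

const a = l2Normalize(new Float32Array([3, 4])); // -> [0.6, 0.8]
const b = l2Normalize(new Float32Array([6, 8])); // same direction -> [0.6, 0.8]
// For unit vectors, cosine similarity is just the dot product:
let dot = 0;
for (let i = 0; i < a.length; i++) dot += a[i] * b[i];
console.log(dot); // ≈ 1.0
```
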
@@ -277,11 +304,11 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
  * pixel_values errors. Text is tokenized with CLIP's 77 token limit and
  * automatically truncated if necessary.
  *
- * Returns a 512-dimensional embedding vector in the unified CLIP embedding space,
- * which is directly comparable to image embeddings for cross-modal search.
+ * Returns a 512-dimensional L2-normalized embedding vector in the unified CLIP
+ * embedding space, which is directly comparable to image embeddings for cross-modal search.
  *
  * @param text - The text to embed (will be trimmed and validated)
- * @returns EmbeddingResult with 512-dimensional vector and metadata
+ * @returns EmbeddingResult with 512-dimensional normalized vector and metadata
  * @throws {Error} If text is empty, model not loaded, or embedding fails
  *
  * @example
@@ -312,15 +339,13 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
  throw new Error('CLIP text model or tokenizer not initialized');
  }
  try {
- // Validate and truncate text if necessary (CLIP has a 77 token limit)
- this.validateTextLength(text);
- const finalProcessedText = this.truncateText(processedText);
  // Use the validated CLIPTextModelWithProjection approach (no pixel_values errors)
  // Tokenize text with CLIP's requirements
- const tokens = await this.tokenizer(finalProcessedText, {
+ // The tokenizer handles truncation at 77 TOKENS (not characters)
+ const tokens = await this.tokenizer(processedText, {
  padding: true,
  truncation: true,
- max_length: 77, // CLIP's text sequence length limit
+ max_length: 77, // CLIP's text sequence length limit (77 tokens)
  return_tensors: 'pt'
  });
  // Log token information for debugging (only in development)
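
The truncation change above moves the cut from characters to tokens: a 77-character cap and CLIP's 77-token limit are very different budgets, since a typical word costs one token but several characters, so pre-truncating by characters discards far more text than necessary. A sketch of token-level truncation with @huggingface/transformers (the model id is illustrative and not necessarily the checkpoint this package loads):

```typescript
import { AutoTokenizer } from '@huggingface/transformers';

// Illustrative checkpoint; the package's actual model may differ.
const tokenizer = await AutoTokenizer.from_pretrained('Xenova/clip-vit-base-patch32');
const tokens = await tokenizer('a very long caption that keeps going ...', {
  padding: true,
  truncation: true, // cut at max_length TOKENS, not characters
  max_length: 77,   // CLIP's text sequence limit
});
console.log(tokens.input_ids.dims); // e.g. [1, 77] once the limit is reached
```
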
@@ -328,7 +353,7 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
  const tokenIds = tokens.input_ids?.data || [];
  const actualTokenCount = Array.from(tokenIds).filter((id) => id !== 0).length;
  if (actualTokenCount >= 77) {
- console.warn(`Text truncated: "${finalProcessedText.substring(0, 50)}..." (${actualTokenCount}+ tokens -> 77 tokens)`);
+ console.warn(`Text truncated by tokenizer: "${processedText.substring(0, 50)}..." (truncated to 77 tokens)`);
  }
  }
  // Generate text embedding using CLIPTextModelWithProjection
@@ -349,22 +374,31 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
  if (nonZeroValues.length === 0) {
  throw new Error('CLIP embedding is all zeros');
  }
- // Calculate embedding magnitude for quality assessment
- const magnitude = Math.sqrt(Array.from(embedding).reduce((sum, val) => sum + val * val, 0));
- if (magnitude < 1e-6) {
- throw new Error(`CLIP embedding has critically low magnitude: ${magnitude.toExponential(3)}`);
+ // Calculate embedding magnitude before normalization for quality assessment
+ const magnitudeBeforeNorm = Math.sqrt(Array.from(embedding).reduce((sum, val) => sum + val * val, 0));
+ if (magnitudeBeforeNorm < 1e-6) {
+ throw new Error(`CLIP embedding has critically low magnitude: ${magnitudeBeforeNorm.toExponential(3)}`);
+ }
+ // Apply L2-normalization (CLIP models are trained with normalized embeddings)
+ this.normalizeEmbedding(embedding);
+ // Verify normalization was successful
+ const magnitudeAfterNorm = Math.sqrt(Array.from(embedding).reduce((sum, val) => sum + val * val, 0));
+ if (Math.abs(magnitudeAfterNorm - 1.0) > 0.01) {
+ console.warn(`Warning: Embedding normalization may be imprecise (magnitude: ${magnitudeAfterNorm.toFixed(6)})`);
  }
  // Generate unique embedding ID
- const embeddingId = this.generateEmbeddingId(finalProcessedText, 'text');
+ const embeddingId = this.generateEmbeddingId(processedText, 'text');
  return {
  embedding_id: embeddingId,
  vector: embedding,
  contentType: 'text',
  metadata: {
  originalText: text,
- processedText: finalProcessedText,
- textLength: finalProcessedText.length,
- embeddingMagnitude: magnitude,
+ processedText: processedText,
+ textLength: processedText.length,
+ embeddingMagnitudeBeforeNorm: magnitudeBeforeNorm,
+ embeddingMagnitudeAfterNorm: magnitudeAfterNorm,
+ normalized: true,
  modelName: this.modelName,
  modelType: this.modelType,
  dimensions: this.dimensions
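
The metadata now records both the pre- and post-normalization magnitudes plus a `normalized` flag, so callers can sanity-check an embedding without recomputing norms. A hedged consumer sketch (the `embedText` method name and embedder setup are assumptions; the result shape follows the diff):

```typescript
// `embedder` stands in for a loaded CLIPEmbedder instance; the method name
// `embedText` is an assumption — the diff shows only the method body.
declare const embedder: { embedText(text: string): Promise<any> };

const result = await embedder.embedText('a photo of a cat');
if (result.metadata.normalized) {
  console.log(result.metadata.embeddingMagnitudeBeforeNorm); // raw model-output magnitude
  console.log(result.metadata.embeddingMagnitudeAfterNorm);  // ≈ 1.0 after normalization
}
```
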
@@ -389,10 +423,10 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
  * - Converted to proper pixel_values format using AutoProcessor
  * - Normalized for CLIP vision model
  *
- * Returns a 512-dimensional embedding vector directly comparable to text embeddings.
+ * Returns a 512-dimensional L2-normalized embedding vector directly comparable to text embeddings.
  *
  * @param imagePath - Local file path or URL to the image
- * @returns EmbeddingResult with 512-dimensional vector and metadata
+ * @returns EmbeddingResult with 512-dimensional normalized vector and metadata
  * @throws {Error} If image not found, unsupported format, or embedding fails
  *
  * @example
@@ -459,10 +493,17 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
  if (nonZeroValues.length === 0) {
  throw new Error('CLIP image embedding is all zeros');
  }
- // Calculate embedding magnitude for quality assessment
- const magnitude = Math.sqrt(Array.from(embedding).reduce((sum, val) => sum + val * val, 0));
- if (magnitude < 1e-6) {
- throw new Error(`CLIP image embedding has critically low magnitude: ${magnitude.toExponential(3)}`);
+ // Calculate embedding magnitude before normalization for quality assessment
+ const magnitudeBeforeNorm = Math.sqrt(Array.from(embedding).reduce((sum, val) => sum + val * val, 0));
+ if (magnitudeBeforeNorm < 1e-6) {
+ throw new Error(`CLIP image embedding has critically low magnitude: ${magnitudeBeforeNorm.toExponential(3)}`);
+ }
+ // Apply L2-normalization (CLIP models are trained with normalized embeddings)
+ this.normalizeEmbedding(embedding);
+ // Verify normalization was successful
+ const magnitudeAfterNorm = Math.sqrt(Array.from(embedding).reduce((sum, val) => sum + val * val, 0));
+ if (Math.abs(magnitudeAfterNorm - 1.0) > 0.01) {
+ console.warn(`Warning: Image embedding normalization may be imprecise (magnitude: ${magnitudeAfterNorm.toFixed(6)})`);
  }
  // Generate unique embedding ID
  const embeddingId = this.generateEmbeddingId(processedPath, 'image');
@@ -472,7 +513,9 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
  contentType: 'image',
  metadata: {
  imagePath: processedPath,
- embeddingMagnitude: magnitude,
+ embeddingMagnitudeBeforeNorm: magnitudeBeforeNorm,
+ embeddingMagnitudeAfterNorm: magnitudeAfterNorm,
+ normalized: true,
  modelName: this.modelName,
  modelType: this.modelType,
  dimensions: this.dimensions
@@ -637,8 +680,9 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
  const { createTextBatchProcessor } = await import('../core/batch-processing-optimizer.js');
  const batchProcessor = createTextBatchProcessor();
  // Convert to EmbeddingBatchItem format
+ // Let tokenizer handle truncation at 77 tokens (not characters)
  const batchItems = textItems.map(item => ({
- content: this.truncateText(item.content.trim()),
+ content: item.content.trim(),
  contentType: item.contentType,
  metadata: item.metadata
  }));
@@ -728,7 +772,8 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
  */
  async processBatchText(textItems) {
  // Prepare texts for batch processing
- const texts = textItems.map(item => this.truncateText(item.content.trim()));
+ // Let tokenizer handle truncation at 77 tokens (not characters)
+ const texts = textItems.map(item => item.content.trim());
  // Tokenize all texts in batch
  const tokensBatch = await Promise.all(texts.map(text => this.tokenizer(text, {
  padding: true,
@@ -749,6 +794,8 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
  if (embedding.length !== this.dimensions) {
  throw new Error(`CLIP embedding dimension mismatch for item ${i}: expected ${this.dimensions}, got ${embedding.length}`);
  }
+ // Apply L2-normalization (CLIP models are trained with normalized embeddings)
+ this.normalizeEmbedding(embedding);
  const embeddingId = this.generateEmbeddingId(item.content, 'text');
  results.push({
  embedding_id: embeddingId,
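
Because the batch path now normalizes too, batch and single-item embeddings land on the same unit sphere and are directly comparable. A quick hedged check over a batch result (the `{ embedding_id, vector }` shape follows the single-item result above; names are illustrative):

```typescript
// Hedged check: every vector produced by the batch path should be unit length.
function assertUnitLength(results: Array<{ embedding_id: string; vector: Float32Array }>) {
  for (const r of results) {
    let sq = 0;
    for (const x of r.vector) sq += x * x;
    console.assert(Math.abs(Math.sqrt(sq) - 1) < 0.01, `non-unit vector: ${r.embedding_id}`);
  }
}
```
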
package/dist/search.d.ts CHANGED
@@ -1,25 +1,44 @@
  /**
- * Public API SearchEngine - Simple constructor interface with internal factory usage
+ * Public API SearchEngine - Simple constructor with Chameleon Architecture
  *
- * This class provides a clean, simple API while using the new core architecture
- * internally. It handles dependency injection automatically.
+ * This class provides a clean, simple API that automatically adapts to the mode
+ * (text or multimodal) stored in the database during ingestion. The system detects
+ * the mode and creates the appropriate embedder and reranker without user intervention.
+ *
+ * Chameleon Architecture Features:
+ * - Automatic mode detection from database configuration
+ * - Seamless switching between text and multimodal modes
+ * - Appropriate embedder selection (sentence-transformer or CLIP)
+ * - Mode-specific reranking strategies
  *
  * @example
  * ```typescript
- * // Simple usage
+ * // Simple usage - mode automatically detected from database
  * const search = new SearchEngine('./index.bin', './db.sqlite');
  * const results = await search.search('query');
  *
- * // With options
+ * // Works for both text and multimodal databases
+ * // Text mode: uses sentence-transformer embeddings
+ * // Multimodal mode: uses CLIP embeddings for cross-modal search
+ *
+ * // With options (advanced)
  * const search = new SearchEngine('./index.bin', './db.sqlite', {
- * embeddingModel: 'all-MiniLM-L6-v2',
  * enableReranking: true
  * });
  * ```
  */
- import { type TextSearchOptions } from './factories/index.js';
  import type { SearchResult, SearchOptions, EmbedFunction, RerankFunction } from './core/types.js';
- export interface SearchEngineOptions extends TextSearchOptions {
+ export interface SearchEngineOptions {
+ /** Embedding model name override */
+ embeddingModel?: string;
+ /** Embedding batch size override */
+ batchSize?: number;
+ /** Reranking model name override */
+ rerankingModel?: string;
+ /** Whether to enable reranking (default: true) */
+ enableReranking?: boolean;
+ /** Top-k results to return (default: from config) */
+ topK?: number;
  /** Custom embedding function (advanced usage) */
  embedFn?: EmbedFunction;
  /** Custom reranking function (advanced usage) */
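
With the `TextSearchOptions` inheritance dropped, every override is now declared inline and optional. A usage sketch (assuming `SearchEngine` is re-exported from the package root; paths and values are illustrative):

```typescript
import { SearchEngine } from 'rag-lite-ts';

// Every field is an optional override; the text/multimodal mode itself
// is detected from the database, not passed here.
const search = new SearchEngine('./index.bin', './db.sqlite', {
  enableReranking: true, // default: true
  topK: 10,              // default comes from config
  batchSize: 16,         // embedding batch size override
});
const results = await search.search('query');
```
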
@@ -33,7 +52,13 @@ export declare class SearchEngine {
  private initPromise;
  constructor(indexPath: string, dbPath: string, options?: SearchEngineOptions);
  /**
- * Initialize the search engine using the factory or direct injection
+ * Initialize the search engine using polymorphic factory or direct injection
+ *
+ * Chameleon Architecture Implementation:
+ * - Automatically detects mode from database (text or multimodal)
+ * - Creates appropriate embedder based on detected mode
+ * - Applies mode-specific reranking strategies
+ * - Provides seamless polymorphic behavior
  */
  private initialize;
  /**
package/dist/search.js CHANGED
@@ -1,24 +1,33 @@
  /**
- * Public API SearchEngine - Simple constructor interface with internal factory usage
+ * Public API SearchEngine - Simple constructor with Chameleon Architecture
  *
- * This class provides a clean, simple API while using the new core architecture
- * internally. It handles dependency injection automatically.
+ * This class provides a clean, simple API that automatically adapts to the mode
+ * (text or multimodal) stored in the database during ingestion. The system detects
+ * the mode and creates the appropriate embedder and reranker without user intervention.
+ *
+ * Chameleon Architecture Features:
+ * - Automatic mode detection from database configuration
+ * - Seamless switching between text and multimodal modes
+ * - Appropriate embedder selection (sentence-transformer or CLIP)
+ * - Mode-specific reranking strategies
  *
  * @example
  * ```typescript
- * // Simple usage
+ * // Simple usage - mode automatically detected from database
  * const search = new SearchEngine('./index.bin', './db.sqlite');
  * const results = await search.search('query');
  *
- * // With options
+ * // Works for both text and multimodal databases
+ * // Text mode: uses sentence-transformer embeddings
+ * // Multimodal mode: uses CLIP embeddings for cross-modal search
+ *
+ * // With options (advanced)
  * const search = new SearchEngine('./index.bin', './db.sqlite', {
- * embeddingModel: 'all-MiniLM-L6-v2',
  * enableReranking: true
  * });
  * ```
  */
  import { SearchEngine as CoreSearchEngine } from './core/search.js';
- import { TextSearchFactory } from './factories/index.js';
  export class SearchEngine {
  indexPath;
  dbPath;
@@ -42,7 +51,13 @@ export class SearchEngine {
  }
  }
  /**
- * Initialize the search engine using the factory or direct injection
+ * Initialize the search engine using polymorphic factory or direct injection
+ *
+ * Chameleon Architecture Implementation:
+ * - Automatically detects mode from database (text or multimodal)
+ * - Creates appropriate embedder based on detected mode
+ * - Applies mode-specific reranking strategies
+ * - Provides seamless polymorphic behavior
  */
  async initialize() {
  if (this.coreEngine) {
@@ -81,8 +96,11 @@ export class SearchEngine {
  this.coreEngine = new CoreSearchEngine(embedFn, indexManager, db, this.options.rerankFn, contentResolver);
  }
  else {
- // Use factory for standard initialization
- this.coreEngine = await TextSearchFactory.create(this.indexPath, this.dbPath, this.options);
+ // Use core polymorphic factory for automatic mode detection (Chameleon Architecture)
+ // This enables SearchEngine to automatically adapt to text or multimodal mode
+ // based on the configuration stored in the database during ingestion
+ const { PolymorphicSearchFactory } = await import('./core/polymorphic-search-factory.js');
+ this.coreEngine = await PolymorphicSearchFactory.create(this.indexPath, this.dbPath);
  }
  })();
  return this.initPromise;
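
The `await import(...)` above is a lazy-loading pattern: the polymorphic factory, and whatever embedders it pulls in, is only loaded when the standard initialization path actually runs. A sketch of the same pattern in isolation (the factory's internals are not part of this diff; the surrounding function name is illustrative):

```typescript
// Lazy dynamic import: the module's load cost is paid only on this code path.
async function buildCoreEngine(indexPath: string, dbPath: string) {
  const { PolymorphicSearchFactory } = await import('./core/polymorphic-search-factory.js');
  // The factory reads the ingestion mode from the database and wires up the
  // matching embedder/reranker (sentence-transformer for text, CLIP for multimodal).
  return PolymorphicSearchFactory.create(indexPath, dbPath);
}
```
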
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "rag-lite-ts",
- "version": "2.0.0",
+ "version": "2.0.2",
  "description": "Local-first TypeScript retrieval engine with Chameleon Multimodal Architecture for semantic search over text and image content",
  "type": "module",
  "main": "./dist/index.js",
@@ -31,9 +31,16 @@
  "build:test": "tsc --project tsconfig.test.json",
  "clean": "rimraf dist",
  "dev": "tsc --watch",
- "test": "npm run build:test && node --test dist/text/tokenizer.test.js dist/core/chunker.test.js dist/text/embedder.test.js dist/core/vector-index.test.js dist/index-manager.test.js dist/core/search.test.js dist/file-processor.test.js dist/mcp-server.test.js dist/preprocess.test.js dist/core/config.test.js dist/preprocessors/integration.test.js dist/cli/cli.test.js",
- "test:integration": "npm run build && npm run build:test && node --test dist/integration.test.js",
- "test:all": "npm run test && npm run test:integration",
+ "test": "npm run build:test && node --expose-gc --test --test-concurrency=1 dist/__tests__/core dist/__tests__/text dist/__tests__/preprocessors",
+ "test:verbose": "npm run build:test && node --expose-gc --test --test-concurrency=1 --test-reporter=tap dist/__tests__/core dist/__tests__/text dist/__tests__/preprocessors",
+ "test:core": "npm run build:test && node --expose-gc --test --test-concurrency=1 dist/__tests__/core",
+ "test:core:verbose": "npm run build:test && node --expose-gc --test --test-concurrency=1 --test-reporter=tap dist/__tests__/core",
+ "test:text": "npm run build:test && node --expose-gc --test --test-concurrency=1 dist/__tests__/text",
+ "test:preprocessors": "npm run build:test && node --expose-gc --test --test-concurrency=1 dist/__tests__/preprocessors",
+ "test:integration": "npm run build && npm run build:test && node --expose-gc --test --test-concurrency=1 dist/__tests__/integration",
+ "test:integration:verbose": "npm run build && npm run build:test && node --expose-gc --test --test-concurrency=1 --test-reporter=tap dist/__tests__/integration",
+ "test:all": "npm run build:test && node --expose-gc --test --test-concurrency=1 dist/__tests__",
+ "test:all:verbose": "npm run build:test && node --expose-gc --test --test-concurrency=1 --test-reporter=tap dist/__tests__",
  "prepublishOnly": "npm run clean && npm run build"
  },
  "keywords": [
@@ -71,6 +78,7 @@
  "dependencies": {
  "@huggingface/transformers": "^3.7.5",
  "@modelcontextprotocol/sdk": "^1.18.2",
+ "csv-parse": "^6.1.0",
  "hnswlib-wasm": "^0.8.2",
  "jsdom": "^27.0.0",
  "lru-cache": "^11.2.2",
@@ -84,6 +92,7 @@
  "@types/node": "^20.11.0",
  "js-yaml": "^4.1.0",
  "rimraf": "^5.0.5",
+ "tsx": "^4.20.6",
  "typescript": "^5.3.0"
  },
  "optionalDependencies": {