rag-lite-ts 2.0.1 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,8 +15,8 @@ export const DEFAULT_BATCH_CONFIG = {
     textBatchSize: 16,
     imageBatchSize: 4, // Smaller for memory-intensive image processing
     maxConcurrentBatches: 2,
-    // Memory management (256MB threshold)
-    memoryThresholdMB: 256,
+    // Memory management (512MB threshold for multimodal processing)
+    memoryThresholdMB: 512,
     enableMemoryMonitoring: true,
     enableGarbageCollection: true,
     // Progress reporting every 5 batches
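The raised default matters because the optimizer presumably compares current heap usage against memoryThresholdMB between batches (its internals are not shown in this diff). A minimal sketch of such a check, using Node's real process.memoryUsage() API:

  // Illustrative only: decide whether to pause or trigger GC when the heap
  // crosses the configured threshold from DEFAULT_BATCH_CONFIG.
  function isOverMemoryThreshold(thresholdMB: number): boolean {
    const heapUsedMB = process.memoryUsage().heapUsed / (1024 * 1024);
    return heapUsedMB > thresholdMB;
  }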
@@ -402,13 +402,8 @@ export class BatchProcessingOptimizer {
      */
     async preloadImageProcessingModels() {
         try {
-            if (!this.resourcePool.has('imageToText')) {
-                console.log('Preloading image-to-text processor...');
-                const processor = await LazyMultimodalLoader.loadImageToTextProcessor();
-                this.resourcePool.set('imageToText', processor);
-                // Register with resource manager
-                this.resourceManager.registerImageProcessor(processor, 'image-to-text');
-            }
+            // Note: Image-to-text processor is loaded on-demand by file-processor.ts
+            // to avoid conflicts with different pipeline configurations
             if (!this.resourcePool.has('metadataExtractor')) {
                 console.log('Preloading image metadata extractor...');
                 const extractor = await LazyMultimodalLoader.loadImageMetadataExtractor();
@@ -519,7 +514,7 @@ export function createImageBatchProcessor() {
     return new BatchProcessingOptimizer({
         imageBatchSize: 2, // Very small batches for memory efficiency
         textBatchSize: 8,
-        memoryThresholdMB: 128, // Lower threshold for images
+        memoryThresholdMB: 512, // Higher threshold for memory-intensive image processing
         enableMemoryMonitoring: true,
         enableGarbageCollection: true,
         enableParallelProcessing: false, // Sequential for better memory control
@@ -534,7 +529,7 @@ export function createTextBatchProcessor() {
         textBatchSize: 32, // Larger batches for text
         imageBatchSize: 4,
         enableParallelProcessing: true, // Parallel processing for text
-        memoryThresholdMB: 512, // Higher threshold for text
+        memoryThresholdMB: 256, // Lower threshold sufficient for text processing
         progressReportInterval: 10
     });
 }
@@ -290,7 +290,7 @@ export class IngestionPipeline {
             chunkSize: config.chunk_size,
             chunkOverlap: config.chunk_overlap
         };
-        const chunkingResult = await this.chunkDocumentsWithContentTypes(fileResult.documents, effectiveChunkConfig);
+        const chunkingResult = await this.chunkDocumentsWithContentTypes(fileResult.documents, effectiveChunkConfig, options.mode);
         if (chunkingResult.totalChunks === 0) {
             console.log('No chunks created from documents');
             return {
@@ -364,7 +364,7 @@ export class IngestionPipeline {
      * Chunk all documents and organize results with content-type awareness
      * Enhanced to handle different content types appropriately
      */
-    async chunkDocumentsWithContentTypes(documents, chunkConfig) {
+    async chunkDocumentsWithContentTypes(documents, chunkConfig, mode) {
         const documentChunks = [];
         const allChunks = [];
         let totalChunks = 0;
@@ -384,8 +384,18 @@ export class IngestionPipeline {
                         metadata: document.metadata
                     }];
             }
+            else if (mode === 'multimodal') {
+                // In multimodal mode, don't chunk text - CLIP handles truncation at 77 tokens
+                // Chunking doesn't make sense because CLIP can't handle long text anyway
+                chunks = [{
+                        text: document.content,
+                        chunkIndex: 0,
+                        contentType: 'text',
+                        metadata: document.metadata
+                    }];
+            }
             else {
-                // For text documents, use normal chunking
+                // For text mode, use normal chunking
                 const textChunks = await chunkDocument(document, chunkConfig);
                 chunks = textChunks.map(chunk => ({
                     ...chunk,
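In multimodal mode the whole document becomes a single chunk: CLIP truncates text at 77 tokens regardless, so pre-splitting would only multiply near-identical embeddings. A standalone illustration of the branch added above (the document shape and mode values are assumed from the diff, not exported by the package):

  type Doc = { content: string; metadata: Record<string, unknown> };
  type Chunk = { text: string; chunkIndex: number; contentType: 'text'; metadata: Record<string, unknown> };

  function chunkDecision(document: Doc, mode?: string): Chunk[] | 'use-normal-chunking' {
    if (mode === 'multimodal') {
      // CLIP truncates at 77 tokens, so one chunk per document is enough.
      return [{ text: document.content, chunkIndex: 0, contentType: 'text', metadata: document.metadata }];
    }
    return 'use-normal-chunking'; // text mode: defer to chunkDocument with chunkSize/chunkOverlap
  }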
@@ -69,7 +69,7 @@ export const SUPPORTED_MODELS = {
             supportsMetadata: true,
             supportsMultimodal: true, // True cross-modal search capabilities
             maxBatchSize: 8,
-            maxTextLength: 77, // CLIP's text sequence length limit
+            maxTextLength: 77, // CLIP's token limit (tokenizer handles truncation)
             supportedImageFormats: ['jpg', 'jpeg', 'png', 'webp', 'gif']
         },
         requirements: {
@@ -92,7 +92,7 @@ export const SUPPORTED_MODELS = {
             supportsMetadata: true,
             supportsMultimodal: true, // True cross-modal search capabilities
             maxBatchSize: 4,
-            maxTextLength: 77, // CLIP's text sequence length limit
+            maxTextLength: 77, // CLIP's token limit (tokenizer handles truncation)
             supportedImageFormats: ['jpg', 'jpeg', 'png', 'webp', 'gif']
         },
         requirements: {
@@ -194,9 +194,9 @@ export class ModelRegistry {
             suggestions.push('Use smaller batch sizes for optimal performance');
         }
         // Text length limitations
-        if (modelInfo.capabilities.maxTextLength && modelInfo.capabilities.maxTextLength < 512) {
+        if (modelInfo.capabilities.maxTextLength && modelInfo.capabilities.maxTextLength < 256) {
             warnings.push(`Model has limited text length: ${modelInfo.capabilities.maxTextLength} characters`);
-            suggestions.push('Consider chunking long texts before processing');
+            suggestions.push('Long texts will be truncated by the tokenizer');
         }
         // Image format support
         if (modelInfo.capabilities.supportsImages && modelInfo.capabilities.supportedImageFormats) {
@@ -97,20 +97,10 @@ export declare class TextDerivedRerankingStrategy implements RerankingStrategy {
     readonly supportedContentTypes: string[];
     isEnabled: boolean;
     private crossEncoderReranker;
-    private imageToTextModel;
-    private imageToTextModelName;
-    private initialized;
     constructor(imageToTextModelName?: string, crossEncoderModelName?: string);
-    /**
-     * Initialize the image-to-text model if not already done
-     */
-    private ensureInitialized;
-    /**
-     * Ensure DOM polyfills are set up for transformers.js
-     */
-    private ensurePolyfills;
     /**
      * Generate text description for an image
+     * Uses the shared image-to-text functionality from file-processor
      */
     private generateImageDescription;
     /**
@@ -128,11 +118,6 @@ export declare class TextDerivedRerankingStrategy implements RerankingStrategy {
         description: string;
         requiredModels: string[];
         configOptions: {
-            imageToTextModel: {
-                type: string;
-                description: string;
-                default: string;
-            };
             crossEncoderModel: {
                 type: string;
                 description: string;
@@ -174,69 +174,22 @@ export class TextDerivedRerankingStrategy {
     supportedContentTypes = ['text', 'image'];
     isEnabled = true;
     crossEncoderReranker;
-    imageToTextModel = null;
-    imageToTextModelName = 'Xenova/vit-gpt2-image-captioning';
-    initialized = false;
     constructor(imageToTextModelName, crossEncoderModelName) {
-        if (imageToTextModelName) {
-            this.imageToTextModelName = imageToTextModelName;
-        }
+        // Note: imageToTextModelName parameter is kept for backward compatibility
+        // but is no longer used since we delegate to file-processor's implementation
         // Create the underlying cross-encoder strategy
         this.crossEncoderReranker = new CrossEncoderRerankingStrategy(crossEncoderModelName);
     }
-    /**
-     * Initialize the image-to-text model if not already done
-     */
-    async ensureInitialized() {
-        if (!this.initialized) {
-            try {
-                console.log(`Loading image-to-text model: ${this.imageToTextModelName}`);
-                // Set up polyfills for transformers.js
-                this.ensurePolyfills();
-                const { pipeline } = await import('@huggingface/transformers');
-                this.imageToTextModel = await pipeline('image-to-text', this.imageToTextModelName);
-                this.initialized = true;
-                console.log(`Image-to-text model loaded successfully: ${this.imageToTextModelName}`);
-            }
-            catch (error) {
-                console.warn(`Image-to-text model initialization failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
-                this.isEnabled = false;
-            }
-        }
-    }
-    /**
-     * Ensure DOM polyfills are set up for transformers.js
-     */
-    ensurePolyfills() {
-        if (typeof window === 'undefined' && typeof globalThis !== 'undefined') {
-            if (typeof globalThis.self === 'undefined') {
-                globalThis.self = globalThis;
-            }
-            if (typeof global.self === 'undefined') {
-                global.self = global;
-            }
-        }
-    }
     /**
      * Generate text description for an image
+     * Uses the shared image-to-text functionality from file-processor
      */
     async generateImageDescription(imagePath) {
-        await this.ensureInitialized();
-        if (!this.imageToTextModel) {
-            throw new Error('Image-to-text model not loaded');
-        }
         try {
-            const result = await this.imageToTextModel(imagePath);
-            // Handle different response formats from the pipeline
-            if (Array.isArray(result) && result.length > 0) {
-                return result[0].generated_text || result[0].text || String(result[0]);
-            }
-            else if (result && typeof result === 'object') {
-                return result.generated_text || result.text || String(result);
-            }
-            else {
-                return String(result);
-            }
+            // Use the file-processor's image description function which has proven to work reliably
+            const { generateImageDescriptionForFile } = await import('../file-processor.js');
+            const result = await generateImageDescriptionForFile(imagePath);
+            return result.description;
         }
         catch (error) {
             console.warn(`Failed to generate description for image ${imagePath}: ${error instanceof Error ? error.message : 'Unknown error'}`);
@@ -249,22 +202,11 @@ export class TextDerivedRerankingStrategy {
      * Rerank search results using text-derived approach
      */
     rerank = async (query, results, contentType) => {
-        // If strategy is disabled, return results unchanged
-        if (!this.isEnabled) {
-            return results;
-        }
         // Validate content type
         if (contentType && !this.supportedContentTypes.includes(contentType)) {
             throw new Error(`Text-derived strategy does not support content type '${contentType}'. ` +
                 `Supported types: ${this.supportedContentTypes.join(', ')}`);
         }
-        // Ensure models are initialized
-        await this.ensureInitialized();
-        // If initialization failed, return results unchanged
-        if (!this.isEnabled) {
-            console.warn('Text-derived reranker not enabled, returning results unchanged');
-            return results;
-        }
         try {
             // Step 1: Convert images to text descriptions
             const processedResults = await Promise.all(results.map(async (result) => {
@@ -314,12 +256,8 @@ export class TextDerivedRerankingStrategy {
      * Configure the reranking strategy
      */
     configure(config) {
-        if (config.imageToTextModel && typeof config.imageToTextModel === 'string') {
-            this.imageToTextModelName = config.imageToTextModel;
-            // Reset initialization to use new model
-            this.initialized = false;
-            this.imageToTextModel = null;
-        }
+        // Note: imageToTextModel configuration is no longer used
+        // since we delegate to file-processor's implementation
         if (config.crossEncoderModel && typeof config.crossEncoderModel === 'string') {
             this.crossEncoderReranker.configure({ modelName: config.crossEncoderModel });
         }
@@ -334,15 +272,10 @@ export class TextDerivedRerankingStrategy {
         return {
             description: 'Text-derived reranking that converts images to text descriptions then applies cross-encoder reranking',
             requiredModels: [
-                'Xenova/vit-gpt2-image-captioning', // Image-to-text model
+                'Xenova/vit-gpt2-image-captioning', // Image-to-text model (via file-processor)
                 'Xenova/ms-marco-MiniLM-L-6-v2' // Cross-encoder model
             ],
             configOptions: {
-                imageToTextModel: {
-                    type: 'string',
-                    description: 'Image-to-text model name for generating descriptions',
-                    default: 'Xenova/vit-gpt2-image-captioning'
-                },
                 crossEncoderModel: {
                     type: 'string',
                     description: 'Cross-encoder model name for text reranking',
@@ -360,16 +293,15 @@ export class TextDerivedRerankingStrategy {
      * Check if the strategy is ready to use
      */
     async isReady() {
-        await this.ensureInitialized();
         const crossEncoderReady = await this.crossEncoderReranker.isReady();
-        return this.isEnabled && this.imageToTextModel !== null && crossEncoderReady;
+        return this.isEnabled && crossEncoderReady;
     }
     /**
      * Get the current model names being used
      */
     getModelNames() {
         return {
-            imageToText: this.imageToTextModelName,
+            imageToText: 'Xenova/vit-gpt2-image-captioning', // Fixed model via file-processor
             crossEncoder: this.crossEncoderReranker.getModelName()
         };
     }
@@ -377,8 +309,6 @@ export class TextDerivedRerankingStrategy {
      * Clean up resources
      */
     async cleanup() {
-        this.initialized = false;
-        this.imageToTextModel = null;
         await this.crossEncoderReranker.cleanup();
     }
 }
@@ -30,11 +30,8 @@ if (typeof window === 'undefined') {
     if (typeof globalThis.navigator === 'undefined') {
         globalThis.navigator = dom.window.navigator;
     }
-    // Polyfill createImageBitmap if needed (for image processing)
-    if (typeof globalThis.createImageBitmap === 'undefined') {
-        globalThis.createImageBitmap = dom.window.createImageBitmap || (() => {
-            throw new Error('createImageBitmap not available in Node.js environment');
-        });
-    }
+    // Note: Do NOT polyfill createImageBitmap with a fake implementation
+    // RawImage.fromURL() will handle image loading correctly without it
+    // Setting a fake createImageBitmap that throws errors breaks image loading
 }
 //# sourceMappingURL=dom-polyfills.js.map
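The removed stub was worse than no polyfill at all: code that feature-detects createImageBitmap sees a function that exists but throws, so the detection succeeds and the actual call fails. An illustrative sketch of that failure mode (the detection logic here is assumed, not quoted from transformers.js):

  // With the throwing stub installed, the typeof check passes and the call
  // throws; with no global at all, a Node-compatible decode path is chosen.
  async function decode(blob: Blob): Promise<unknown> {
    if (typeof globalThis.createImageBitmap === 'function') {
      return await globalThis.createImageBitmap(blob); // the old stub would throw here
    }
    return null; // fall back to a pure-JS decoder such as RawImage
  }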
@@ -346,24 +346,35 @@ function extractTitle(content, filePath) {
  * Cache for image-to-text pipeline to avoid reloading
  */
 let imageToTextPipeline = null;
+let imageToTextPipelinePromise = null;
 /**
- * Initialize the image-to-text pipeline
+ * Initialize the image-to-text pipeline with proper async locking
  */
 async function initializeImageToTextPipeline(modelName = 'Xenova/vit-gpt2-image-captioning') {
+    // Return cached pipeline if available
     if (imageToTextPipeline) {
         return imageToTextPipeline;
     }
-    try {
-        const { pipeline } = await import('@huggingface/transformers');
-        console.log(`Loading image-to-text model: ${modelName}`);
-        imageToTextPipeline = await pipeline('image-to-text', modelName);
-        console.log(`Successfully loaded image-to-text model: ${modelName}`);
-        return imageToTextPipeline;
-    }
-    catch (error) {
-        console.error(`Failed to load image-to-text model ${modelName}:`, error);
-        throw new Error(`Failed to initialize image-to-text pipeline: ${error instanceof Error ? error.message : String(error)}`);
+    // If pipeline is currently loading, wait for it
+    if (imageToTextPipelinePromise) {
+        return imageToTextPipelinePromise;
     }
+    // Start loading pipeline
+    imageToTextPipelinePromise = (async () => {
+        try {
+            const { pipeline } = await import('@huggingface/transformers');
+            console.log(`Loading image-to-text model: ${modelName}`);
+            imageToTextPipeline = await pipeline('image-to-text', modelName);
+            console.log(`Successfully loaded image-to-text model: ${modelName}`);
+            return imageToTextPipeline;
+        }
+        catch (error) {
+            console.error(`Failed to load image-to-text model ${modelName}:`, error);
+            imageToTextPipelinePromise = null; // Reset on error so it can be retried
+            throw new Error(`Failed to initialize image-to-text pipeline: ${error instanceof Error ? error.message : String(error)}`);
+        }
+    })();
+    return imageToTextPipelinePromise;
 }
 /**
  * Parse PNG image dimensions from file buffer
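The fix above is promise memoization: cache the in-flight promise rather than only the resolved value, so concurrent callers await the same load instead of racing to create duplicate pipelines. The pattern in isolation (a generic sketch, not code from the package):

  // Wrap any async initializer so it runs at most once at a time; a failed
  // attempt clears the cache so the next caller can retry.
  function memoizeAsync<T>(load: () => Promise<T>): () => Promise<T> {
    let inflight: Promise<T> | null = null;
    return () => {
      if (!inflight) {
        inflight = load().catch((err) => {
          inflight = null; // reset on error so a later call can retry
          throw err;
        });
      }
      return inflight;
    };
  }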
@@ -545,8 +556,11 @@ async function extractImageMetadata(imagePath) {
 async function generateImageDescription(imagePath, options = DEFAULT_IMAGE_TO_TEXT_OPTIONS) {
     try {
         const pipeline = await initializeImageToTextPipeline(options.model);
-        // Generate description
-        const result = await pipeline(imagePath, {
+        // Load image using RawImage.fromURL which works with local file paths
+        const { RawImage } = await import('@huggingface/transformers');
+        const image = await RawImage.fromURL(imagePath);
+        // Generate description with loaded image
+        const result = await pipeline(image, {
             max_length: options.maxLength || 50,
             num_beams: 4,
             early_stopping: true
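Together with the polyfill change, the captioning path now decodes the image explicitly before invoking the pipeline. An end-to-end sketch of that flow (the file path is a placeholder; per the diff's comment, RawImage.fromURL also accepts local paths in Node):

  import { pipeline, RawImage } from '@huggingface/transformers';

  const captioner = await pipeline('image-to-text', 'Xenova/vit-gpt2-image-captioning');
  const image = await RawImage.fromURL('./photos/example.png'); // hypothetical path
  const [output] = await captioner(image, { max_length: 50, num_beams: 4, early_stopping: true });
  console.log(output.generated_text);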
@@ -597,93 +611,6 @@ async function generateImageDescriptionsBatch(imagePaths, options = DEFAULT_IMAGE_TO_TEXT_OPTIONS) {
     }
     return results;
 }
-/**
- * Generate text descriptions for multiple images using optimized batch processing
- * Uses BatchProcessingOptimizer for memory-efficient processing of large image collections
- */
-async function generateImageDescriptionsBatchOptimized(imagePaths, options = DEFAULT_IMAGE_TO_TEXT_OPTIONS) {
-    // For small batches, use the existing implementation
-    if (imagePaths.length <= 10) {
-        return generateImageDescriptionsBatch(imagePaths, options);
-    }
-    try {
-        // Import batch processing optimizer
-        const { createImageBatchProcessor } = await import('./core/batch-processing-optimizer.js');
-        const batchProcessor = createImageBatchProcessor();
-        // Convert image paths to batch items
-        const batchItems = imagePaths.map(path => ({
-            content: path,
-            contentType: 'image',
-            metadata: { originalPath: path }
-        }));
-        // Create image description function
-        const imageDescriptionFunction = async (item) => {
-            try {
-                const result = await generateImageDescription(item.content, options);
-                return {
-                    embedding_id: `img_desc_${Date.now()}_${Math.random()}`,
-                    vector: new Float32Array([0]), // Placeholder vector
-                    contentType: 'image',
-                    metadata: {
-                        path: item.content,
-                        description: result.description,
-                        confidence: result.confidence,
-                        model: result.model
-                    }
-                };
-            }
-            catch (error) {
-                throw new Error(`Failed to generate description for ${item.content}: ${error instanceof Error ? error.message : String(error)}`);
-            }
-        };
-        // Process with optimization and progress reporting
-        const batchResult = await batchProcessor.processBatch(batchItems, imageDescriptionFunction, (stats) => {
-            console.log(`Image description progress: ${stats.processedItems}/${stats.totalItems} (${Math.round((stats.processedItems / stats.totalItems) * 100)}%)`);
-            console.log(` Memory usage: ${stats.memoryUsageMB}MB (peak: ${stats.peakMemoryUsageMB}MB)`);
-            if (stats.failedItems > 0) {
-                console.log(` Failed items: ${stats.failedItems}`);
-            }
-        });
-        // Log final statistics
-        console.log(`✓ Image description generation complete:`);
-        console.log(` Processed: ${batchResult.stats.processedItems}/${batchResult.stats.totalItems}`);
-        console.log(` Failed: ${batchResult.stats.failedItems}`);
-        console.log(` Processing time: ${Math.round(batchResult.stats.processingTimeMs / 1000)}s`);
-        console.log(` Rate: ${Math.round(batchResult.stats.itemsPerSecond)} images/sec`);
-        console.log(` Peak memory usage: ${batchResult.stats.peakMemoryUsageMB}MB`);
-        if (batchResult.stats.retryCount > 0) {
-            console.log(` Retries: ${batchResult.stats.retryCount}`);
-        }
-        // Convert results back to expected format
-        const results = [];
-        // Add successful results
-        for (const result of batchResult.results) {
-            if (result.metadata?.description) {
-                results.push({
-                    path: result.metadata.path,
-                    result: {
-                        description: result.metadata.description,
-                        confidence: result.metadata.confidence,
-                        model: result.metadata.model
-                    }
-                });
-            }
-        }
-        // Add failed results
-        for (const error of batchResult.errors) {
-            results.push({
-                path: error.item.content,
-                error: error.error
-            });
-        }
-        return results;
-    }
-    catch (error) {
-        console.warn(`Optimized batch processing failed, falling back to standard batch processing: ${error instanceof Error ? error.message : String(error)}`);
-        // Fall back to existing implementation
-        return generateImageDescriptionsBatch(imagePaths, options);
-    }
-}
 /**
  * Process image file to extract text description and metadata
  */
@@ -834,8 +761,8 @@ export async function processFiles(filePaths, pathManager, imageToTextOptions) {
     if (imageFiles.length > 0) {
         console.log(`Processing ${imageFiles.length} image files with optimized batch processing`);
         try {
-            // Use optimized batch processing for image descriptions
-            const batchResults = await generateImageDescriptionsBatchOptimized(imageFiles, imageToTextOptions);
+            // Use batch processing for image descriptions
+            const batchResults = await generateImageDescriptionsBatch(imageFiles, imageToTextOptions);
             // Convert batch results to documents with metadata extraction
             for (const batchResult of batchResults) {
                 try {
@@ -961,6 +888,7 @@ export async function cleanupImageProcessingResources() {
             await imageToTextPipeline.dispose();
         }
         imageToTextPipeline = null;
+        imageToTextPipelinePromise = null;
         console.log('Image-to-text pipeline cleaned up');
     }
     catch (error) {
package/dist/ingestion.js CHANGED
@@ -64,7 +64,12 @@ export class IngestionPipeline {
         if (!this.corePipeline) {
             throw new Error('IngestionPipeline failed to initialize');
         }
-        return this.corePipeline.ingestFile(filePath, options);
+        // Merge mode from constructor options with runtime options
+        const mergedOptions = {
+            ...options,
+            mode: options?.mode || this.options.mode
+        };
+        return this.corePipeline.ingestFile(filePath, mergedOptions);
     }
     /**
      * Ingest all documents in a directory
@@ -74,7 +79,12 @@ export class IngestionPipeline {
         if (!this.corePipeline) {
             throw new Error('IngestionPipeline failed to initialize');
         }
-        return this.corePipeline.ingestDirectory(directoryPath, options);
+        // Merge mode from constructor options with runtime options
+        const mergedOptions = {
+            ...options,
+            mode: options?.mode || this.options.mode
+        };
+        return this.corePipeline.ingestDirectory(directoryPath, mergedOptions);
     }
     /**
      * Ingest content from memory buffer
@@ -95,7 +105,12 @@ export class IngestionPipeline {
         if (!this.corePipeline) {
             throw new Error('IngestionPipeline failed to initialize');
         }
-        return this.corePipeline.ingestFromMemory(content, metadata, options);
+        // Merge mode from constructor options with runtime options
+        const mergedOptions = {
+            ...options,
+            mode: options?.mode || this.options.mode
+        };
+        return this.corePipeline.ingestFromMemory(content, metadata, mergedOptions);
     }
     /**
      * Clean up resources
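All three ingest entry points now apply the same precedence rule: a mode passed at call time wins over the mode given to the constructor. The rule in isolation (types are assumed for illustration):

  type IngestOptions = { mode?: 'text' | 'multimodal' };

  // Per-call options take precedence; the constructor's mode is the default.
  // The diff uses ||, so an explicitly falsy mode also falls back to the default.
  function mergeMode(defaults: IngestOptions, options?: IngestOptions): IngestOptions {
    return { ...options, mode: options?.mode || defaults.mode };
  }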
@@ -339,15 +339,13 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
             throw new Error('CLIP text model or tokenizer not initialized');
         }
         try {
-            // Validate and truncate text if necessary (CLIP has a 77 token limit)
-            this.validateTextLength(text);
-            const finalProcessedText = this.truncateText(processedText);
             // Use the validated CLIPTextModelWithProjection approach (no pixel_values errors)
             // Tokenize text with CLIP's requirements
-            const tokens = await this.tokenizer(finalProcessedText, {
+            // The tokenizer handles truncation at 77 TOKENS (not characters)
+            const tokens = await this.tokenizer(processedText, {
                 padding: true,
                 truncation: true,
-                max_length: 77, // CLIP's text sequence length limit
+                max_length: 77, // CLIP's text sequence length limit (77 tokens)
                 return_tensors: 'pt'
             });
             // Log token information for debugging (only in development)
@@ -355,7 +353,7 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
                 const tokenIds = tokens.input_ids?.data || [];
                 const actualTokenCount = Array.from(tokenIds).filter((id) => id !== 0).length;
                 if (actualTokenCount >= 77) {
-                    console.warn(`Text truncated: "${finalProcessedText.substring(0, 50)}..." (${actualTokenCount}+ tokens -> 77 tokens)`);
+                    console.warn(`Text truncated by tokenizer: "${processedText.substring(0, 50)}..." (truncated to 77 tokens)`);
                 }
             }
             // Generate text embedding using CLIPTextModelWithProjection
@@ -389,15 +387,15 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
                 console.warn(`Warning: Embedding normalization may be imprecise (magnitude: ${magnitudeAfterNorm.toFixed(6)})`);
             }
             // Generate unique embedding ID
-            const embeddingId = this.generateEmbeddingId(finalProcessedText, 'text');
+            const embeddingId = this.generateEmbeddingId(processedText, 'text');
             return {
                 embedding_id: embeddingId,
                 vector: embedding,
                 contentType: 'text',
                 metadata: {
                     originalText: text,
-                    processedText: finalProcessedText,
-                    textLength: finalProcessedText.length,
+                    processedText: processedText,
+                    textLength: processedText.length,
                     embeddingMagnitudeBeforeNorm: magnitudeBeforeNorm,
                     embeddingMagnitudeAfterNorm: magnitudeAfterNorm,
                     normalized: true,
@@ -682,8 +680,9 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
         const { createTextBatchProcessor } = await import('../core/batch-processing-optimizer.js');
         const batchProcessor = createTextBatchProcessor();
         // Convert to EmbeddingBatchItem format
+        // Let tokenizer handle truncation at 77 tokens (not characters)
         const batchItems = textItems.map(item => ({
-            content: this.truncateText(item.content.trim()),
+            content: item.content.trim(),
             contentType: item.contentType,
             metadata: item.metadata
         }));
@@ -773,7 +772,8 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
      */
     async processBatchText(textItems) {
         // Prepare texts for batch processing
-        const texts = textItems.map(item => this.truncateText(item.content.trim()));
+        // Let tokenizer handle truncation at 77 tokens (not characters)
+        const texts = textItems.map(item => item.content.trim());
         // Tokenize all texts in batch
         const tokensBatch = await Promise.all(texts.map(text => this.tokenizer(text, {
             padding: true,
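The common thread in the CLIPEmbedder changes: character-based truncation (truncateText) was removed everywhere in favor of tokenizer-level truncation, since CLIP's 77-unit limit is measured in tokens and a fixed character cut can land on either side of it. A standalone sketch of the tokenizer call doing the clamping (assuming the Xenova CLIP checkpoint for illustration):

  import { AutoTokenizer } from '@huggingface/transformers';

  const tokenizer = await AutoTokenizer.from_pretrained('Xenova/clip-vit-base-patch32');
  const tokens = await tokenizer('an arbitrarily long caption ...', {
      padding: true,
      truncation: true,
      max_length: 77 // clamps at 77 tokens, not 77 characters
  });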
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "rag-lite-ts",
-  "version": "2.0.1",
+  "version": "2.0.2",
   "description": "Local-first TypeScript retrieval engine with Chameleon Multimodal Architecture for semantic search over text and image content",
   "type": "module",
   "main": "./dist/index.js",