rag-lite-ts 2.0.2 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -438,6 +438,33 @@ Now Claude can search your docs directly! Works with any MCP-compatible AI tool.
438
438
  </tr>
439
439
  </table>
440
440
 
441
+ ### 📁 Supported File Formats
442
+
443
+ RAG-lite TS supports the following file formats with full processing implementations:
444
+
445
+ **Text Mode:**
446
+ - Markdown: `.md`, `.mdx`
447
+ - Plain text: `.txt`
448
+ - Documents: `.pdf`, `.docx`
449
+
450
+ **Multimodal Mode** (includes all text formats plus):
451
+ - Images: `.jpg`, `.jpeg`, `.png`, `.gif`, `.webp`, `.bmp`
452
+
453
+ All formats work seamlessly with both single file and directory ingestion:
454
+
455
+ ```bash
456
+ # Single file ingestion
457
+ raglite ingest ./document.pdf
458
+ raglite ingest ./readme.md
459
+ raglite ingest ./notes.txt
460
+
461
+ # Directory ingestion (processes all supported formats)
462
+ raglite ingest ./docs/
463
+
464
+ # Multimodal ingestion (includes images)
465
+ raglite ingest ./mixed-content/ --mode multimodal
466
+ ```
467
+
441
468
  ## 🔧 How It Works
442
469
 
443
470
  RAG-lite TS follows a clean, efficient pipeline:
@@ -148,12 +148,31 @@ export async function runIngest(path, options = {}) {
148
148
  const pathType = stats.isDirectory() ? 'directory' : 'file';
149
149
  // Validate file type for single files
150
150
  if (stats.isFile()) {
151
- const validExtensions = ['.md', '.txt'];
151
+ const mode = options.mode || 'text';
152
+ // Only formats with actual processing implementations
153
+ const textExtensions = ['.md', '.txt', '.mdx', '.pdf', '.docx'];
154
+ const imageExtensions = ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'];
155
+ const validExtensions = mode === 'multimodal'
156
+ ? [...textExtensions, ...imageExtensions]
157
+ : textExtensions;
152
158
  const hasValidExtension = validExtensions.some(ext => path.toLowerCase().endsWith(ext));
153
159
  if (!hasValidExtension) {
154
160
  console.error(`Error: Unsupported file type: ${path}`);
155
161
  console.error('');
156
- console.error('Supported file types: .md, .txt');
162
+ if (mode === 'multimodal') {
163
+ console.error('Supported file types in multimodal mode:');
164
+ console.error(' Text: .md, .txt, .mdx');
165
+ console.error(' Documents: .pdf, .docx');
166
+ console.error(' Images: .jpg, .jpeg, .png, .gif, .webp, .bmp');
167
+ }
168
+ else {
169
+ console.error('Supported file types in text mode:');
170
+ console.error(' Text: .md, .txt, .mdx');
171
+ console.error(' Documents: .pdf, .docx');
172
+ console.error('');
173
+ console.error('For image files, use --mode multimodal:');
174
+ console.error(' raglite ingest <path> --mode multimodal');
175
+ }
157
176
  console.error('');
158
177
  console.error('If you want to ingest multiple files, provide a directory path instead.');
159
178
  process.exit(EXIT_CODES.INVALID_ARGUMENTS);
package/dist/cli.js CHANGED
@@ -26,7 +26,7 @@ Commands:
26
26
  help Show this help message
27
27
 
28
28
  Examples:
29
- raglite ingest ./docs/ # Ingest all .md/.txt files in docs/
29
+ raglite ingest ./docs/ # Ingest all .md/.txt/.docx/.pdf files in docs/
30
30
  raglite ingest ./readme.md # Ingest single file
31
31
  raglite ingest ./docs/ --model Xenova/all-mpnet-base-v2 # Use higher quality model
32
32
  raglite ingest ./docs/ --mode multimodal # Enable multimodal processing
@@ -126,7 +126,7 @@ function validateArgs(command, args, options) {
126
126
  console.error('Usage: raglite ingest <path>');
127
127
  console.error('');
128
128
  console.error('Examples:');
129
- console.error(' raglite ingest ./docs/ # Ingest all .md/.txt files in docs/');
129
+ console.error(' raglite ingest ./docs/ # Ingest all .md/.txt/.docx/.pdf files in docs/');
130
130
  console.error(' raglite ingest ./readme.md # Ingest single file');
131
131
  console.error(' raglite ingest ./docs/ --model Xenova/all-mpnet-base-v2 # Use higher quality model');
132
132
  console.error(' raglite ingest ./docs/ --mode multimodal # Enable multimodal processing');
@@ -421,18 +421,35 @@ export class TextIngestionFactory {
421
421
  console.log(`📁 Creating index directory: ${indexDir}`);
422
422
  mkdirSync(indexDir, { recursive: true });
423
423
  }
424
- // Step 1: Get model-specific defaults and merge with options
425
- const modelDefaults = getModelDefaults(options.embeddingModel || config.embedding_model);
424
+ // Step 1: Determine effective mode and select appropriate default model
425
+ const effectiveMode = options.mode || 'text';
426
+ // Step 1.5: Select model based on mode if not explicitly provided
427
+ let effectiveModel;
428
+ if (options.embeddingModel) {
429
+ // Use explicitly provided model
430
+ effectiveModel = options.embeddingModel;
431
+ }
432
+ else {
433
+ // Select default model based on mode
434
+ if (effectiveMode === 'multimodal') {
435
+ const { DEFAULT_MODELS } = await import('../core/model-registry.js');
436
+ effectiveModel = DEFAULT_MODELS['clip'];
437
+ console.log(`📊 No model specified for multimodal mode, using default: ${effectiveModel}`);
438
+ }
439
+ else {
440
+ effectiveModel = config.embedding_model;
441
+ }
442
+ }
443
+ // Step 2: Get model-specific defaults and merge with options
444
+ const modelDefaults = getModelDefaults(effectiveModel);
426
445
  const effectiveBatchSize = options.batchSize ?? modelDefaults.batch_size;
427
446
  const effectiveChunkSize = options.chunkSize ?? modelDefaults.chunk_size;
428
447
  const effectiveChunkOverlap = options.chunkOverlap ?? modelDefaults.chunk_overlap;
429
- // Step 1.5: Validate mode-model compatibility at creation time
430
- const effectiveMode = options.mode || 'text';
431
- const effectiveModel = options.embeddingModel || config.embedding_model;
448
+ // Step 3: Validate mode-model compatibility at creation time
432
449
  console.log('🔍 Validating mode-model compatibility...');
433
450
  validateModeModelCompatibilityOrThrow(effectiveMode, effectiveModel);
434
451
  console.log('✓ Mode-model compatibility validated');
435
- // Step 2: Initialize embedding function based on mode
452
+ // Step 4: Initialize embedding function based on mode
436
453
  let embedFn;
437
454
  if (effectiveMode === 'multimodal') {
438
455
  console.log('📊 Loading CLIP embedding model for multimodal mode...');
@@ -463,10 +480,10 @@ export class TextIngestionFactory {
463
480
  await initializeSchema(db);
464
481
  console.log('✓ Database connection established');
465
482
  // Step 3.1: Handle mode storage during ingestion
466
- await this.handleModeStorage(db, options, modelDefaults);
467
- // Step 4: Initialize index manager
483
+ await this.handleModeStorage(db, options, modelDefaults, effectiveModel);
484
+ // Step 5: Initialize index manager
468
485
  console.log('📇 Initializing vector index...');
469
- const indexManager = new IndexManager(indexPath, dbPath, modelDefaults.dimensions, options.embeddingModel || config.embedding_model);
486
+ const indexManager = new IndexManager(indexPath, dbPath, modelDefaults.dimensions, effectiveModel);
470
487
  // Check if we need to force recreation due to model change
471
488
  let forceRecreate = false;
472
489
  if (options.forceRebuild && existsSync(indexPath) && existsSync(dbPath)) {
@@ -477,9 +494,8 @@ export class TextIngestionFactory {
477
494
  const tempDb = await openDatabase(dbPath);
478
495
  try {
479
496
  const storedModel = await getStoredModelInfo(tempDb);
480
- const currentModel = options.embeddingModel || config.embedding_model;
481
- if (storedModel && storedModel.modelName !== currentModel) {
482
- console.log(`🔄 Model change detected: ${storedModel.modelName} → ${currentModel}`);
497
+ if (storedModel && storedModel.modelName !== effectiveModel) {
498
+ console.log(`🔄 Model change detected: ${storedModel.modelName} → ${effectiveModel}`);
483
499
  console.log(`🔄 Dimensions change: ${storedModel.dimensions} → ${modelDefaults.dimensions}`);
484
500
  }
485
501
  else if (storedModel && storedModel.dimensions !== modelDefaults.dimensions) {
@@ -503,9 +519,8 @@ export class TextIngestionFactory {
503
519
  // Update stored model info when rebuilding or creating new index
504
520
  if (options.forceRebuild || forceRecreate) {
505
521
  const { setStoredModelInfo } = await import('../core/db.js');
506
- const currentModel = options.embeddingModel || config.embedding_model;
507
- await setStoredModelInfo(db, currentModel, modelDefaults.dimensions);
508
- console.log(`✓ Updated stored model info: ${currentModel} (${modelDefaults.dimensions} dimensions)`);
522
+ await setStoredModelInfo(db, effectiveModel, modelDefaults.dimensions);
523
+ console.log(`✓ Updated stored model info: ${effectiveModel} (${modelDefaults.dimensions} dimensions)`);
509
524
  }
510
525
  }
511
526
  else {
@@ -555,11 +570,10 @@ export class TextIngestionFactory {
555
570
  * Creates or validates system info based on the provided mode and options
556
571
  * @private
557
572
  */
558
- static async handleModeStorage(db, options, modelDefaults) {
573
+ static async handleModeStorage(db, options, modelDefaults, effectiveModel) {
559
574
  const { getSystemInfo, setSystemInfo } = await import('../core/db.js');
560
- // Determine the effective mode and model
575
+ // Determine the effective mode and reranking strategy
561
576
  const effectiveMode = options.mode || 'text';
562
- const effectiveModel = options.embeddingModel || config.embedding_model;
563
577
  const effectiveRerankingStrategy = options.rerankingStrategy || 'cross-encoder';
564
578
  // Determine model type based on model name
565
579
  let modelType;
package/dist/indexer.js CHANGED
@@ -16,11 +16,14 @@ async function main() {
16
16
  console.error(' <path> File or directory path to ingest (.md and .txt files)');
17
17
  console.error('');
18
18
  console.error('Examples:');
19
- console.error(' node indexer.js ./docs/ # Ingest all .md/.txt files in docs/');
19
+ console.error(' node indexer.js ./docs/ # Ingest all .md/.txt/.pdf/.docx files in docs/');
20
20
  console.error(' node indexer.js ./readme.md # Ingest single file');
21
21
  console.error(' node indexer.js ../project/docs/ # Ingest from parent directory');
22
22
  console.error('');
23
- console.error('Supported file types: .md (Markdown), .txt (Plain text)');
23
+ console.error('Supported file types:');
24
+ console.error(' Text: .md, .txt, .mdx');
25
+ console.error(' Documents: .pdf, .docx');
26
+ console.error(' Images (multimodal mode): .jpg, .jpeg, .png, .gif, .webp, .bmp');
24
27
  console.error('');
25
28
  console.error('After ingestion, use: node search.js "your query"');
26
29
  process.exit(EXIT_CODES.INVALID_ARGUMENTS);
@@ -501,16 +501,23 @@ class RagLiteMCPServer {
501
501
  catch (error) {
502
502
  throw new Error(`Cannot access path: ${args.path}. Check permissions.`);
503
503
  }
504
- // Validate file type for single files
504
+ // Validate mode parameter
505
+ const mode = args.mode || 'text';
506
+ // Validate file type for single files (only formats with actual processing implementations)
505
507
  if (stats.isFile()) {
506
- const validExtensions = ['.md', '.txt'];
508
+ const textExtensions = ['.md', '.txt', '.mdx', '.pdf', '.docx'];
509
+ const imageExtensions = ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'];
510
+ const validExtensions = mode === 'multimodal'
511
+ ? [...textExtensions, ...imageExtensions]
512
+ : textExtensions;
507
513
  const hasValidExtension = validExtensions.some(ext => args.path.toLowerCase().endsWith(ext));
508
514
  if (!hasValidExtension) {
509
- throw new Error(`Unsupported file type: ${args.path}. Supported types: .md, .txt`);
515
+ const supportedTypes = mode === 'multimodal'
516
+ ? '.md, .txt, .mdx, .pdf, .docx, .jpg, .jpeg, .png, .gif, .webp, .bmp'
517
+ : '.md, .txt, .mdx, .pdf, .docx';
518
+ throw new Error(`Unsupported file type: ${args.path}. Supported types: ${supportedTypes}`);
510
519
  }
511
520
  }
512
- // Validate mode parameter
513
- const mode = args.mode || 'text';
514
521
  if (!['text', 'multimodal'].includes(mode)) {
515
522
  throw new Error(`Invalid mode: ${mode}. Supported modes: text, multimodal`);
516
523
  }
@@ -585,8 +592,8 @@ class RagLiteMCPServer {
585
592
  chunks_per_second: result.processingTimeMs > 0 ?
586
593
  Math.round(result.chunksCreated / (result.processingTimeMs / 1000) * 100) / 100 : 0,
587
594
  supported_file_types: mode === 'multimodal'
588
- ? ['md', 'txt', 'jpg', 'jpeg', 'png', 'gif', 'webp']
589
- : ['md', 'txt'],
595
+ ? ['md', 'txt', 'mdx', 'pdf', 'docx', 'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp']
596
+ : ['md', 'txt', 'mdx', 'pdf', 'docx'],
590
597
  success: true
591
598
  };
592
599
  return {
@@ -1132,7 +1139,7 @@ class RagLiteMCPServer {
1132
1139
  text_search: true,
1133
1140
  image_search: false,
1134
1141
  multimodal_reranking: false,
1135
- supported_file_types: ['md', 'txt']
1142
+ supported_file_types: ['md', 'txt', 'mdx', 'pdf', 'docx']
1136
1143
  };
1137
1144
  }
1138
1145
  else if (systemInfo.mode === 'multimodal') {
@@ -1140,7 +1147,7 @@ class RagLiteMCPServer {
1140
1147
  text_search: true,
1141
1148
  image_search: true,
1142
1149
  multimodal_reranking: true,
1143
- supported_file_types: ['md', 'txt', 'jpg', 'png', 'gif', 'webp']
1150
+ supported_file_types: ['md', 'txt', 'mdx', 'pdf', 'docx', 'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp']
1144
1151
  };
1145
1152
  }
1146
1153
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "rag-lite-ts",
3
- "version": "2.0.2",
3
+ "version": "2.0.3",
4
4
  "description": "Local-first TypeScript retrieval engine with Chameleon Multimodal Architecture for semantic search over text and image content",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",