rag-lite-ts 2.0.5 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -31,6 +31,22 @@ import { IngestionFactory } from './factories/ingestion-factory.js';
31
31
  import { getSystemInfo } from './core/db.js';
32
32
  import { DatabaseConnectionManager } from './core/database-connection-manager.js';
33
33
  import { config, validateCoreConfig, ConfigurationError } from './core/config.js';
34
/**
 * Detect an image MIME type from a file path or bare extension.
 *
 * The text after the last '.' in the path is lower-cased and looked up in a
 * fixed table of image extensions. Anything that is not in the table falls
 * back to 'image/jpeg'.
 *
 * @param {string} filePath - File path (or extension) to inspect.
 * @returns {string} The matching image MIME type, or 'image/jpeg' when the
 *   extension is unknown or `filePath` is not a string.
 */
function getMimeTypeFromPath(filePath) {
    const DEFAULT_MIME_TYPE = 'image/jpeg'; // Default to JPEG if unknown
    // A non-string value has no extension to inspect; return the default
    // instead of throwing on `.toLowerCase()`.
    if (typeof filePath !== 'string') {
        return DEFAULT_MIME_TYPE;
    }
    // A Map (rather than an object literal) guarantees that extensions such as
    // "constructor" or "__proto__" never resolve to inherited Object.prototype
    // members, so the result is always a MIME type string.
    const mimeTypes = new Map([
        ['jpg', 'image/jpeg'],
        ['jpeg', 'image/jpeg'],
        ['png', 'image/png'],
        ['gif', 'image/gif'],
        ['webp', 'image/webp'],
        ['bmp', 'image/bmp'],
        ['svg', 'image/svg+xml']
    ]);
    const ext = filePath.toLowerCase().split('.').pop() || '';
    return mimeTypes.get(ext) || DEFAULT_MIME_TYPE;
}
34
50
  /**
35
51
  * MCP Server class that wraps RAG-lite TS functionality
36
52
  * Implements MCP protocol interface without creating REST/GraphQL endpoints
@@ -118,8 +134,8 @@ class RagLiteMCPServer {
118
134
  },
119
135
  rerank_strategy: {
120
136
  type: 'string',
121
- description: 'Reranking strategy for multimodal mode. Options: text-derived (default), metadata, hybrid, disabled',
122
- enum: ['text-derived', 'metadata', 'hybrid', 'disabled']
137
+ description: 'Reranking strategy for multimodal mode. Options: text-derived (default), disabled',
138
+ enum: ['text-derived', 'disabled']
123
139
  },
124
140
  force_rebuild: {
125
141
  type: 'boolean',
@@ -152,8 +168,8 @@ class RagLiteMCPServer {
152
168
  },
153
169
  rerank_strategy: {
154
170
  type: 'string',
155
- description: 'Reranking strategy for multimodal mode. Options: text-derived (default), metadata, hybrid, disabled',
156
- enum: ['text-derived', 'metadata', 'hybrid', 'disabled'],
171
+ description: 'Reranking strategy for multimodal mode. Options: text-derived (default), disabled',
172
+ enum: ['text-derived', 'disabled'],
157
173
  default: 'text-derived'
158
174
  },
159
175
  title: {
@@ -375,50 +391,60 @@ class RagLiteMCPServer {
375
391
  const startTime = Date.now();
376
392
  const results = await this.searchEngine.search(args.query, searchOptions);
377
393
  const searchTime = Date.now() - startTime;
378
- // Format results for MCP response with content type information
379
- const formattedResults = {
394
+ // Format results for MCP response with proper image content support
395
+ const textResults = {
380
396
  query: args.query,
381
397
  results_count: results.length,
382
398
  search_time_ms: searchTime,
383
- results: await Promise.all(results.map(async (result, index) => {
384
- const formattedResult = {
385
- rank: index + 1,
386
- score: Math.round(result.score * 100) / 100, // Round to 2 decimal places
387
- content_type: result.contentType,
388
- document: {
389
- id: result.document.id,
390
- title: result.document.title,
391
- source: result.document.source,
392
- content_type: result.document.contentType
393
- },
394
- text: result.content
395
- };
396
- // For image content, include base64-encoded image data for MCP clients
397
- if (result.contentType === 'image' && result.document.contentId) {
398
- try {
399
- const imageData = await this.searchEngine.getContent(result.document.contentId, 'base64');
400
- formattedResult.image_data = imageData;
401
- formattedResult.image_format = 'base64';
402
- }
403
- catch (error) {
404
- // If image retrieval fails, include error but don't fail the entire search
405
- formattedResult.image_error = error instanceof Error ? error.message : 'Failed to retrieve image';
406
- }
407
- }
408
- // Include metadata if available
409
- if (result.metadata) {
410
- formattedResult.metadata = result.metadata;
411
- }
412
- return formattedResult;
399
+ results: results.map((result, index) => ({
400
+ rank: index + 1,
401
+ score: Math.round(result.score * 100) / 100,
402
+ content_type: result.contentType,
403
+ document: {
404
+ id: result.document.id,
405
+ title: result.document.title,
406
+ source: result.document.source,
407
+ content_type: result.document.contentType
408
+ },
409
+ text: result.content,
410
+ metadata: result.metadata,
411
+ // Reference to image content if applicable
412
+ has_image: result.contentType === 'image' && !!result.document.contentId
413
413
  }))
414
414
  };
415
+ // Build MCP response content array
416
+ const responseContent = [
417
+ {
418
+ type: 'text',
419
+ text: JSON.stringify(textResults, null, 2)
420
+ }
421
+ ];
422
+ // Add proper MCP image content for each image result
423
+ for (const result of results) {
424
+ if (result.contentType === 'image' && result.document.contentId) {
425
+ try {
426
+ const imageData = await this.searchEngine.getContent(result.document.contentId, 'base64');
427
+ const mimeType = getMimeTypeFromPath(result.document.source);
428
+ responseContent.push({
429
+ type: 'image',
430
+ data: imageData,
431
+ mimeType: mimeType,
432
+ annotations: {
433
+ audience: ['user'],
434
+ priority: 0.8,
435
+ title: result.document.title,
436
+ source: result.document.source
437
+ }
438
+ });
439
+ }
440
+ catch (error) {
441
+ // If image retrieval fails, log but don't fail the entire search
442
+ console.error(`Failed to retrieve image for ${result.document.source}:`, error);
443
+ }
444
+ }
445
+ }
415
446
  return {
416
- content: [
417
- {
418
- type: 'text',
419
- text: JSON.stringify(formattedResults, null, 2),
420
- },
421
- ],
447
+ content: responseContent
422
448
  };
423
449
  }
424
450
  catch (error) {
@@ -549,7 +575,7 @@ class RagLiteMCPServer {
549
575
  if (mode === 'text') {
550
576
  throw new Error('Reranking strategy parameter is only supported in multimodal mode');
551
577
  }
552
- const validStrategies = ['text-derived', 'metadata', 'hybrid', 'disabled'];
578
+ const validStrategies = ['text-derived', 'disabled'];
553
579
  if (!validStrategies.includes(args.rerank_strategy)) {
554
580
  throw new Error(`Invalid reranking strategy: ${args.rerank_strategy}. Supported strategies: ${validStrategies.join(', ')}`);
555
581
  }
@@ -1223,48 +1249,61 @@ class RagLiteMCPServer {
1223
1249
  const startTime = Date.now();
1224
1250
  const results = await this.searchEngine.search(args.query, searchOptions);
1225
1251
  const searchTime = Date.now() - startTime;
1226
- // Format results for MCP response with content type information and image data
1227
- const formattedResults = {
1252
+ // Format results for MCP response with proper image content support
1253
+ const textResults = {
1228
1254
  query: args.query,
1229
1255
  content_type_filter: args.content_type || 'all',
1230
1256
  results_count: results.length,
1231
1257
  search_time_ms: searchTime,
1232
- results: await Promise.all(results.map(async (result, index) => {
1233
- const formattedResult = {
1234
- rank: index + 1,
1235
- score: Math.round(result.score * 100) / 100,
1236
- content_type: result.contentType,
1237
- document: {
1238
- id: result.document.id,
1239
- title: result.document.title,
1240
- source: result.document.source,
1241
- content_type: result.document.contentType
1242
- },
1243
- text: result.content,
1244
- metadata: result.metadata
1245
- };
1246
- // For image content, include base64-encoded image data for MCP clients
1247
- if (result.contentType === 'image' && result.document.contentId) {
1248
- try {
1249
- const imageData = await this.searchEngine.getContent(result.document.contentId, 'base64');
1250
- formattedResult.image_data = imageData;
1251
- formattedResult.image_format = 'base64';
1252
- }
1253
- catch (error) {
1254
- // If image retrieval fails, include error but don't fail the entire search
1255
- formattedResult.image_error = error instanceof Error ? error.message : 'Failed to retrieve image';
1256
- }
1257
- }
1258
- return formattedResult;
1258
+ results: results.map((result, index) => ({
1259
+ rank: index + 1,
1260
+ score: Math.round(result.score * 100) / 100,
1261
+ content_type: result.contentType,
1262
+ document: {
1263
+ id: result.document.id,
1264
+ title: result.document.title,
1265
+ source: result.document.source,
1266
+ content_type: result.document.contentType
1267
+ },
1268
+ text: result.content,
1269
+ metadata: result.metadata,
1270
+ // Reference to image content if applicable
1271
+ has_image: result.contentType === 'image' && !!result.document.contentId
1259
1272
  }))
1260
1273
  };
1274
+ // Build MCP response content array
1275
+ const responseContent = [
1276
+ {
1277
+ type: 'text',
1278
+ text: JSON.stringify(textResults, null, 2)
1279
+ }
1280
+ ];
1281
+ // Add proper MCP image content for each image result
1282
+ for (const result of results) {
1283
+ if (result.contentType === 'image' && result.document.contentId) {
1284
+ try {
1285
+ const imageData = await this.searchEngine.getContent(result.document.contentId, 'base64');
1286
+ const mimeType = getMimeTypeFromPath(result.document.source);
1287
+ responseContent.push({
1288
+ type: 'image',
1289
+ data: imageData,
1290
+ mimeType: mimeType,
1291
+ annotations: {
1292
+ audience: ['user'],
1293
+ priority: 0.8,
1294
+ title: result.document.title,
1295
+ source: result.document.source
1296
+ }
1297
+ });
1298
+ }
1299
+ catch (error) {
1300
+ // If image retrieval fails, log but don't fail the entire search
1301
+ console.error(`Failed to retrieve image for ${result.document.source}:`, error);
1302
+ }
1303
+ }
1304
+ }
1261
1305
  return {
1262
- content: [
1263
- {
1264
- type: 'text',
1265
- text: JSON.stringify(formattedResults, null, 2),
1266
- },
1267
- ],
1306
+ content: responseContent
1268
1307
  };
1269
1308
  }
1270
1309
  catch (error) {
@@ -1418,23 +1457,6 @@ class RagLiteMCPServer {
1418
1457
  strategyInfo.accuracy = 'high';
1419
1458
  strategyInfo.use_cases = ['Mixed content with images', 'Visual documentation', 'Diagrams and charts'];
1420
1459
  break;
1421
- case 'metadata':
1422
- strategyInfo.description = 'Uses file metadata, filenames, and content properties for scoring without model inference';
1423
- strategyInfo.requirements = ['None - uses file system metadata only'];
1424
- strategyInfo.supported_content_types = ['text', 'image', 'pdf', 'docx'];
1425
- strategyInfo.performance_impact = 'low';
1426
- strategyInfo.accuracy = 'medium';
1427
- strategyInfo.use_cases = ['Fast retrieval', 'Filename-based search', 'Content type filtering'];
1428
- break;
1429
- case 'hybrid':
1430
- strategyInfo.description = 'Combines multiple reranking signals (semantic + metadata) with configurable weights';
1431
- strategyInfo.requirements = ['Text-derived reranker', 'Metadata reranker'];
1432
- strategyInfo.supported_content_types = ['text', 'image', 'pdf', 'docx'];
1433
- strategyInfo.performance_impact = 'high';
1434
- strategyInfo.accuracy = 'very high';
1435
- strategyInfo.use_cases = ['Best overall accuracy', 'Complex multimodal collections', 'Production systems'];
1436
- strategyInfo.default_weights = { semantic: 0.7, metadata: 0.3 };
1437
- break;
1438
1460
  case 'disabled':
1439
1461
  strategyInfo.description = 'No reranking applied - results ordered by vector similarity scores only';
1440
1462
  strategyInfo.requirements = ['None'];
@@ -1455,8 +1477,8 @@ class RagLiteMCPServer {
1455
1477
  strategies_by_mode: strategiesByMode,
1456
1478
  recommendations: {
1457
1479
  text_mode: 'Use cross-encoder for best accuracy, disabled for best performance',
1458
- multimodal_mode: 'Use hybrid for best accuracy, text-derived for good balance, metadata for fast retrieval',
1459
- development: 'Start with disabled or metadata for fast iteration, upgrade to cross-encoder/text-derived for production'
1480
+ multimodal_mode: 'Use text-derived for best accuracy, disabled for best performance',
1481
+ development: 'Start with disabled for fast iteration, upgrade to cross-encoder/text-derived for production'
1460
1482
  }
1461
1483
  };
1462
1484
  return {
@@ -1505,16 +1527,16 @@ class RagLiteMCPServer {
1505
1527
  const db = await DatabaseConnectionManager.getConnection(config.db_file);
1506
1528
  try {
1507
1529
  // Get document count by content type
1508
- const docsByType = await db.all(`
1509
- SELECT content_type, COUNT(*) as count
1510
- FROM documents
1511
- GROUP BY content_type
1530
+ const docsByType = await db.all(`
1531
+ SELECT content_type, COUNT(*) as count
1532
+ FROM documents
1533
+ GROUP BY content_type
1512
1534
  `);
1513
1535
  // Get chunk count by content type
1514
- const chunksByType = await db.all(`
1515
- SELECT content_type, COUNT(*) as count
1516
- FROM chunks
1517
- GROUP BY content_type
1536
+ const chunksByType = await db.all(`
1537
+ SELECT content_type, COUNT(*) as count
1538
+ FROM chunks
1539
+ GROUP BY content_type
1518
1540
  `);
1519
1541
  enhancedStats.content_breakdown = {
1520
1542
  documents_by_type: docsByType.reduce((acc, row) => {
@@ -386,6 +386,8 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
386
386
  if (Math.abs(magnitudeAfterNorm - 1.0) > 0.01) {
387
387
  console.warn(`Warning: Embedding normalization may be imprecise (magnitude: ${magnitudeAfterNorm.toFixed(6)})`);
388
388
  }
389
+ // Log text embedding generation
390
+ console.log(`[CLIP] Generated text embedding for: "${processedText.substring(0, 30)}${processedText.length > 30 ? '...' : ''}"`);
389
391
  // Generate unique embedding ID
390
392
  const embeddingId = this.generateEmbeddingId(processedText, 'text');
391
393
  return {
@@ -602,9 +604,11 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
602
604
  const absolutePath = path.resolve(imagePath);
603
605
  // Try to use Sharp for better Node.js support
604
606
  try {
605
- const sharp = await import('sharp');
607
+ const sharpModule = await import('sharp');
608
+ const sharp = sharpModule.default;
609
+ sharp.concurrency(2);
606
610
  // Use Sharp to load and get raw pixel data
607
- const { data, info } = await sharp.default(absolutePath)
611
+ const { data, info } = await sharp(absolutePath)
608
612
  .resize(variant.imageSize, variant.imageSize, {
609
613
  fit: 'cover',
610
614
  position: 'center'
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "rag-lite-ts",
3
- "version": "2.0.5",
3
+ "version": "2.1.1",
4
4
  "description": "Local-first TypeScript retrieval engine with Chameleon Multimodal Architecture for semantic search over text and image content",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",