rag-lite-ts 2.0.5 → 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +815 -808
- package/dist/cli/indexer.js +3 -39
- package/dist/cli/search.d.ts +1 -1
- package/dist/cli/search.js +123 -19
- package/dist/cli.js +77 -94
- package/dist/core/binary-index-format.d.ts +28 -2
- package/dist/core/binary-index-format.js +196 -27
- package/dist/core/db.js +173 -173
- package/dist/core/ingestion.d.ts +5 -1
- package/dist/core/ingestion.js +123 -18
- package/dist/core/lazy-dependency-loader.d.ts +3 -8
- package/dist/core/lazy-dependency-loader.js +11 -29
- package/dist/core/mode-detection-service.js +1 -1
- package/dist/core/reranking-config.d.ts +1 -1
- package/dist/core/reranking-config.js +7 -16
- package/dist/core/reranking-factory.js +3 -184
- package/dist/core/search.d.ts +10 -0
- package/dist/core/search.js +35 -11
- package/dist/core/types.d.ts +1 -1
- package/dist/core/vector-index.d.ts +4 -0
- package/dist/core/vector-index.js +6 -0
- package/dist/factories/ingestion-factory.js +3 -1
- package/dist/file-processor.d.ts +2 -0
- package/dist/file-processor.js +20 -0
- package/dist/index-manager.d.ts +17 -1
- package/dist/index-manager.js +148 -7
- package/dist/mcp-server.js +127 -105
- package/dist/multimodal/clip-embedder.js +6 -2
- package/package.json +1 -1
package/dist/mcp-server.js
CHANGED
|
@@ -31,6 +31,22 @@ import { IngestionFactory } from './factories/ingestion-factory.js';
|
|
|
31
31
|
import { getSystemInfo } from './core/db.js';
|
|
32
32
|
import { DatabaseConnectionManager } from './core/database-connection-manager.js';
|
|
33
33
|
import { config, validateCoreConfig, ConfigurationError } from './core/config.js';
|
|
34
|
+
/**
 * Detect the image MIME type from a file path or bare extension.
 *
 * The extension is the text after the last '.' in the input, compared
 * case-insensitively. An input without any '.' is treated as a bare
 * extension (e.g. "png").
 *
 * @param {string} filePath - File path, file name, or bare extension.
 * @returns {string} The matching image MIME type, or 'image/jpeg' when the
 *   extension is not recognized or filePath is not a string.
 */
function getMimeTypeFromPath(filePath) {
    const defaultMimeType = 'image/jpeg'; // Default to JPEG if unknown
    // A missing or non-string path should not throw: the caller has already
    // fetched the image data and only needs a best-effort MIME type.
    if (typeof filePath !== 'string') {
        return defaultMimeType;
    }
    const ext = filePath.toLowerCase().split('.').pop() || '';
    const mimeTypes = {
        'jpg': 'image/jpeg',
        'jpeg': 'image/jpeg',
        'png': 'image/png',
        'gif': 'image/gif',
        'webp': 'image/webp',
        'bmp': 'image/bmp',
        'svg': 'image/svg+xml'
    };
    // Own-property check: a plain `mimeTypes[ext]` lookup would also return
    // inherited Object.prototype members (e.g. "constructor", "__proto__"),
    // yielding a function/object instead of a MIME string.
    if (Object.prototype.hasOwnProperty.call(mimeTypes, ext)) {
        return mimeTypes[ext];
    }
    return defaultMimeType;
}
|
|
34
50
|
/**
|
|
35
51
|
* MCP Server class that wraps RAG-lite TS functionality
|
|
36
52
|
* Implements MCP protocol interface without creating REST/GraphQL endpoints
|
|
@@ -118,8 +134,8 @@ class RagLiteMCPServer {
|
|
|
118
134
|
},
|
|
119
135
|
rerank_strategy: {
|
|
120
136
|
type: 'string',
|
|
121
|
-
description: 'Reranking strategy for multimodal mode. Options: text-derived (default),
|
|
122
|
-
enum: ['text-derived', '
|
|
137
|
+
description: 'Reranking strategy for multimodal mode. Options: text-derived (default), disabled',
|
|
138
|
+
enum: ['text-derived', 'disabled']
|
|
123
139
|
},
|
|
124
140
|
force_rebuild: {
|
|
125
141
|
type: 'boolean',
|
|
@@ -152,8 +168,8 @@ class RagLiteMCPServer {
|
|
|
152
168
|
},
|
|
153
169
|
rerank_strategy: {
|
|
154
170
|
type: 'string',
|
|
155
|
-
description: 'Reranking strategy for multimodal mode. Options: text-derived (default),
|
|
156
|
-
enum: ['text-derived', '
|
|
171
|
+
description: 'Reranking strategy for multimodal mode. Options: text-derived (default), disabled',
|
|
172
|
+
enum: ['text-derived', 'disabled'],
|
|
157
173
|
default: 'text-derived'
|
|
158
174
|
},
|
|
159
175
|
title: {
|
|
@@ -375,50 +391,60 @@ class RagLiteMCPServer {
|
|
|
375
391
|
const startTime = Date.now();
|
|
376
392
|
const results = await this.searchEngine.search(args.query, searchOptions);
|
|
377
393
|
const searchTime = Date.now() - startTime;
|
|
378
|
-
// Format results for MCP response with content
|
|
379
|
-
const
|
|
394
|
+
// Format results for MCP response with proper image content support
|
|
395
|
+
const textResults = {
|
|
380
396
|
query: args.query,
|
|
381
397
|
results_count: results.length,
|
|
382
398
|
search_time_ms: searchTime,
|
|
383
|
-
results:
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
if (result.contentType === 'image' && result.document.contentId) {
|
|
398
|
-
try {
|
|
399
|
-
const imageData = await this.searchEngine.getContent(result.document.contentId, 'base64');
|
|
400
|
-
formattedResult.image_data = imageData;
|
|
401
|
-
formattedResult.image_format = 'base64';
|
|
402
|
-
}
|
|
403
|
-
catch (error) {
|
|
404
|
-
// If image retrieval fails, include error but don't fail the entire search
|
|
405
|
-
formattedResult.image_error = error instanceof Error ? error.message : 'Failed to retrieve image';
|
|
406
|
-
}
|
|
407
|
-
}
|
|
408
|
-
// Include metadata if available
|
|
409
|
-
if (result.metadata) {
|
|
410
|
-
formattedResult.metadata = result.metadata;
|
|
411
|
-
}
|
|
412
|
-
return formattedResult;
|
|
399
|
+
results: results.map((result, index) => ({
|
|
400
|
+
rank: index + 1,
|
|
401
|
+
score: Math.round(result.score * 100) / 100,
|
|
402
|
+
content_type: result.contentType,
|
|
403
|
+
document: {
|
|
404
|
+
id: result.document.id,
|
|
405
|
+
title: result.document.title,
|
|
406
|
+
source: result.document.source,
|
|
407
|
+
content_type: result.document.contentType
|
|
408
|
+
},
|
|
409
|
+
text: result.content,
|
|
410
|
+
metadata: result.metadata,
|
|
411
|
+
// Reference to image content if applicable
|
|
412
|
+
has_image: result.contentType === 'image' && !!result.document.contentId
|
|
413
413
|
}))
|
|
414
414
|
};
|
|
415
|
+
// Build MCP response content array
|
|
416
|
+
const responseContent = [
|
|
417
|
+
{
|
|
418
|
+
type: 'text',
|
|
419
|
+
text: JSON.stringify(textResults, null, 2)
|
|
420
|
+
}
|
|
421
|
+
];
|
|
422
|
+
// Add proper MCP image content for each image result
|
|
423
|
+
for (const result of results) {
|
|
424
|
+
if (result.contentType === 'image' && result.document.contentId) {
|
|
425
|
+
try {
|
|
426
|
+
const imageData = await this.searchEngine.getContent(result.document.contentId, 'base64');
|
|
427
|
+
const mimeType = getMimeTypeFromPath(result.document.source);
|
|
428
|
+
responseContent.push({
|
|
429
|
+
type: 'image',
|
|
430
|
+
data: imageData,
|
|
431
|
+
mimeType: mimeType,
|
|
432
|
+
annotations: {
|
|
433
|
+
audience: ['user'],
|
|
434
|
+
priority: 0.8,
|
|
435
|
+
title: result.document.title,
|
|
436
|
+
source: result.document.source
|
|
437
|
+
}
|
|
438
|
+
});
|
|
439
|
+
}
|
|
440
|
+
catch (error) {
|
|
441
|
+
// If image retrieval fails, log but don't fail the entire search
|
|
442
|
+
console.error(`Failed to retrieve image for ${result.document.source}:`, error);
|
|
443
|
+
}
|
|
444
|
+
}
|
|
445
|
+
}
|
|
415
446
|
return {
|
|
416
|
-
content:
|
|
417
|
-
{
|
|
418
|
-
type: 'text',
|
|
419
|
-
text: JSON.stringify(formattedResults, null, 2),
|
|
420
|
-
},
|
|
421
|
-
],
|
|
447
|
+
content: responseContent
|
|
422
448
|
};
|
|
423
449
|
}
|
|
424
450
|
catch (error) {
|
|
@@ -549,7 +575,7 @@ class RagLiteMCPServer {
|
|
|
549
575
|
if (mode === 'text') {
|
|
550
576
|
throw new Error('Reranking strategy parameter is only supported in multimodal mode');
|
|
551
577
|
}
|
|
552
|
-
const validStrategies = ['text-derived', '
|
|
578
|
+
const validStrategies = ['text-derived', 'disabled'];
|
|
553
579
|
if (!validStrategies.includes(args.rerank_strategy)) {
|
|
554
580
|
throw new Error(`Invalid reranking strategy: ${args.rerank_strategy}. Supported strategies: ${validStrategies.join(', ')}`);
|
|
555
581
|
}
|
|
@@ -1223,48 +1249,61 @@ class RagLiteMCPServer {
|
|
|
1223
1249
|
const startTime = Date.now();
|
|
1224
1250
|
const results = await this.searchEngine.search(args.query, searchOptions);
|
|
1225
1251
|
const searchTime = Date.now() - startTime;
|
|
1226
|
-
// Format results for MCP response with
|
|
1227
|
-
const
|
|
1252
|
+
// Format results for MCP response with proper image content support
|
|
1253
|
+
const textResults = {
|
|
1228
1254
|
query: args.query,
|
|
1229
1255
|
content_type_filter: args.content_type || 'all',
|
|
1230
1256
|
results_count: results.length,
|
|
1231
1257
|
search_time_ms: searchTime,
|
|
1232
|
-
results:
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
|
|
1245
|
-
|
|
1246
|
-
// For image content, include base64-encoded image data for MCP clients
|
|
1247
|
-
if (result.contentType === 'image' && result.document.contentId) {
|
|
1248
|
-
try {
|
|
1249
|
-
const imageData = await this.searchEngine.getContent(result.document.contentId, 'base64');
|
|
1250
|
-
formattedResult.image_data = imageData;
|
|
1251
|
-
formattedResult.image_format = 'base64';
|
|
1252
|
-
}
|
|
1253
|
-
catch (error) {
|
|
1254
|
-
// If image retrieval fails, include error but don't fail the entire search
|
|
1255
|
-
formattedResult.image_error = error instanceof Error ? error.message : 'Failed to retrieve image';
|
|
1256
|
-
}
|
|
1257
|
-
}
|
|
1258
|
-
return formattedResult;
|
|
1258
|
+
results: results.map((result, index) => ({
|
|
1259
|
+
rank: index + 1,
|
|
1260
|
+
score: Math.round(result.score * 100) / 100,
|
|
1261
|
+
content_type: result.contentType,
|
|
1262
|
+
document: {
|
|
1263
|
+
id: result.document.id,
|
|
1264
|
+
title: result.document.title,
|
|
1265
|
+
source: result.document.source,
|
|
1266
|
+
content_type: result.document.contentType
|
|
1267
|
+
},
|
|
1268
|
+
text: result.content,
|
|
1269
|
+
metadata: result.metadata,
|
|
1270
|
+
// Reference to image content if applicable
|
|
1271
|
+
has_image: result.contentType === 'image' && !!result.document.contentId
|
|
1259
1272
|
}))
|
|
1260
1273
|
};
|
|
1274
|
+
// Build MCP response content array
|
|
1275
|
+
const responseContent = [
|
|
1276
|
+
{
|
|
1277
|
+
type: 'text',
|
|
1278
|
+
text: JSON.stringify(textResults, null, 2)
|
|
1279
|
+
}
|
|
1280
|
+
];
|
|
1281
|
+
// Add proper MCP image content for each image result
|
|
1282
|
+
for (const result of results) {
|
|
1283
|
+
if (result.contentType === 'image' && result.document.contentId) {
|
|
1284
|
+
try {
|
|
1285
|
+
const imageData = await this.searchEngine.getContent(result.document.contentId, 'base64');
|
|
1286
|
+
const mimeType = getMimeTypeFromPath(result.document.source);
|
|
1287
|
+
responseContent.push({
|
|
1288
|
+
type: 'image',
|
|
1289
|
+
data: imageData,
|
|
1290
|
+
mimeType: mimeType,
|
|
1291
|
+
annotations: {
|
|
1292
|
+
audience: ['user'],
|
|
1293
|
+
priority: 0.8,
|
|
1294
|
+
title: result.document.title,
|
|
1295
|
+
source: result.document.source
|
|
1296
|
+
}
|
|
1297
|
+
});
|
|
1298
|
+
}
|
|
1299
|
+
catch (error) {
|
|
1300
|
+
// If image retrieval fails, log but don't fail the entire search
|
|
1301
|
+
console.error(`Failed to retrieve image for ${result.document.source}:`, error);
|
|
1302
|
+
}
|
|
1303
|
+
}
|
|
1304
|
+
}
|
|
1261
1305
|
return {
|
|
1262
|
-
content:
|
|
1263
|
-
{
|
|
1264
|
-
type: 'text',
|
|
1265
|
-
text: JSON.stringify(formattedResults, null, 2),
|
|
1266
|
-
},
|
|
1267
|
-
],
|
|
1306
|
+
content: responseContent
|
|
1268
1307
|
};
|
|
1269
1308
|
}
|
|
1270
1309
|
catch (error) {
|
|
@@ -1418,23 +1457,6 @@ class RagLiteMCPServer {
|
|
|
1418
1457
|
strategyInfo.accuracy = 'high';
|
|
1419
1458
|
strategyInfo.use_cases = ['Mixed content with images', 'Visual documentation', 'Diagrams and charts'];
|
|
1420
1459
|
break;
|
|
1421
|
-
case 'metadata':
|
|
1422
|
-
strategyInfo.description = 'Uses file metadata, filenames, and content properties for scoring without model inference';
|
|
1423
|
-
strategyInfo.requirements = ['None - uses file system metadata only'];
|
|
1424
|
-
strategyInfo.supported_content_types = ['text', 'image', 'pdf', 'docx'];
|
|
1425
|
-
strategyInfo.performance_impact = 'low';
|
|
1426
|
-
strategyInfo.accuracy = 'medium';
|
|
1427
|
-
strategyInfo.use_cases = ['Fast retrieval', 'Filename-based search', 'Content type filtering'];
|
|
1428
|
-
break;
|
|
1429
|
-
case 'hybrid':
|
|
1430
|
-
strategyInfo.description = 'Combines multiple reranking signals (semantic + metadata) with configurable weights';
|
|
1431
|
-
strategyInfo.requirements = ['Text-derived reranker', 'Metadata reranker'];
|
|
1432
|
-
strategyInfo.supported_content_types = ['text', 'image', 'pdf', 'docx'];
|
|
1433
|
-
strategyInfo.performance_impact = 'high';
|
|
1434
|
-
strategyInfo.accuracy = 'very high';
|
|
1435
|
-
strategyInfo.use_cases = ['Best overall accuracy', 'Complex multimodal collections', 'Production systems'];
|
|
1436
|
-
strategyInfo.default_weights = { semantic: 0.7, metadata: 0.3 };
|
|
1437
|
-
break;
|
|
1438
1460
|
case 'disabled':
|
|
1439
1461
|
strategyInfo.description = 'No reranking applied - results ordered by vector similarity scores only';
|
|
1440
1462
|
strategyInfo.requirements = ['None'];
|
|
@@ -1455,8 +1477,8 @@ class RagLiteMCPServer {
|
|
|
1455
1477
|
strategies_by_mode: strategiesByMode,
|
|
1456
1478
|
recommendations: {
|
|
1457
1479
|
text_mode: 'Use cross-encoder for best accuracy, disabled for best performance',
|
|
1458
|
-
multimodal_mode: 'Use
|
|
1459
|
-
development: 'Start with disabled
|
|
1480
|
+
multimodal_mode: 'Use text-derived for best accuracy, disabled for best performance',
|
|
1481
|
+
development: 'Start with disabled for fast iteration, upgrade to cross-encoder/text-derived for production'
|
|
1460
1482
|
}
|
|
1461
1483
|
};
|
|
1462
1484
|
return {
|
|
@@ -1505,16 +1527,16 @@ class RagLiteMCPServer {
|
|
|
1505
1527
|
const db = await DatabaseConnectionManager.getConnection(config.db_file);
|
|
1506
1528
|
try {
|
|
1507
1529
|
// Get document count by content type
|
|
1508
|
-
const docsByType = await db.all(`
|
|
1509
|
-
SELECT content_type, COUNT(*) as count
|
|
1510
|
-
FROM documents
|
|
1511
|
-
GROUP BY content_type
|
|
1530
|
+
const docsByType = await db.all(`
|
|
1531
|
+
SELECT content_type, COUNT(*) as count
|
|
1532
|
+
FROM documents
|
|
1533
|
+
GROUP BY content_type
|
|
1512
1534
|
`);
|
|
1513
1535
|
// Get chunk count by content type
|
|
1514
|
-
const chunksByType = await db.all(`
|
|
1515
|
-
SELECT content_type, COUNT(*) as count
|
|
1516
|
-
FROM chunks
|
|
1517
|
-
GROUP BY content_type
|
|
1536
|
+
const chunksByType = await db.all(`
|
|
1537
|
+
SELECT content_type, COUNT(*) as count
|
|
1538
|
+
FROM chunks
|
|
1539
|
+
GROUP BY content_type
|
|
1518
1540
|
`);
|
|
1519
1541
|
enhancedStats.content_breakdown = {
|
|
1520
1542
|
documents_by_type: docsByType.reduce((acc, row) => {
|
|
@@ -386,6 +386,8 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
|
|
|
386
386
|
if (Math.abs(magnitudeAfterNorm - 1.0) > 0.01) {
|
|
387
387
|
console.warn(`Warning: Embedding normalization may be imprecise (magnitude: ${magnitudeAfterNorm.toFixed(6)})`);
|
|
388
388
|
}
|
|
389
|
+
// Log text embedding generation
|
|
390
|
+
console.log(`[CLIP] Generated text embedding for: "${processedText.substring(0, 30)}${processedText.length > 30 ? '...' : ''}"`);
|
|
389
391
|
// Generate unique embedding ID
|
|
390
392
|
const embeddingId = this.generateEmbeddingId(processedText, 'text');
|
|
391
393
|
return {
|
|
@@ -602,9 +604,11 @@ export class CLIPEmbedder extends BaseUniversalEmbedder {
|
|
|
602
604
|
const absolutePath = path.resolve(imagePath);
|
|
603
605
|
// Try to use Sharp for better Node.js support
|
|
604
606
|
try {
|
|
605
|
-
const
|
|
607
|
+
const sharpModule = await import('sharp');
|
|
608
|
+
const sharp = sharpModule.default;
|
|
609
|
+
sharp.concurrency(2);
|
|
606
610
|
// Use Sharp to load and get raw pixel data
|
|
607
|
-
const { data, info } = await sharp
|
|
611
|
+
const { data, info } = await sharp(absolutePath)
|
|
608
612
|
.resize(variant.imageSize, variant.imageSize, {
|
|
609
613
|
fit: 'cover',
|
|
610
614
|
position: 'center'
|
package/package.json
CHANGED