rag-lite-ts 1.0.1 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +651 -109
- package/dist/cli/indexer.js +262 -46
- package/dist/cli/search.js +54 -32
- package/dist/cli.js +185 -28
- package/dist/config.d.ts +34 -73
- package/dist/config.js +50 -255
- package/dist/core/abstract-embedder.d.ts +125 -0
- package/dist/core/abstract-embedder.js +264 -0
- package/dist/core/actionable-error-messages.d.ts +60 -0
- package/dist/core/actionable-error-messages.js +397 -0
- package/dist/core/adapters.d.ts +93 -0
- package/dist/core/adapters.js +139 -0
- package/dist/core/batch-processing-optimizer.d.ts +155 -0
- package/dist/core/batch-processing-optimizer.js +541 -0
- package/dist/core/chunker.d.ts +119 -0
- package/dist/core/chunker.js +73 -0
- package/dist/core/cli-database-utils.d.ts +53 -0
- package/dist/core/cli-database-utils.js +239 -0
- package/dist/core/config.d.ts +102 -0
- package/dist/core/config.js +247 -0
- package/dist/core/content-errors.d.ts +111 -0
- package/dist/core/content-errors.js +362 -0
- package/dist/core/content-manager.d.ts +343 -0
- package/dist/core/content-manager.js +1504 -0
- package/dist/core/content-performance-optimizer.d.ts +150 -0
- package/dist/core/content-performance-optimizer.js +516 -0
- package/dist/core/content-resolver.d.ts +104 -0
- package/dist/core/content-resolver.js +285 -0
- package/dist/core/cross-modal-search.d.ts +164 -0
- package/dist/core/cross-modal-search.js +342 -0
- package/dist/core/database-connection-manager.d.ts +109 -0
- package/dist/core/database-connection-manager.js +304 -0
- package/dist/core/db.d.ts +245 -0
- package/dist/core/db.js +952 -0
- package/dist/core/embedder-factory.d.ts +176 -0
- package/dist/core/embedder-factory.js +338 -0
- package/dist/{error-handler.d.ts → core/error-handler.d.ts} +23 -2
- package/dist/{error-handler.js → core/error-handler.js} +51 -8
- package/dist/core/index.d.ts +59 -0
- package/dist/core/index.js +69 -0
- package/dist/core/ingestion.d.ts +213 -0
- package/dist/core/ingestion.js +812 -0
- package/dist/core/interfaces.d.ts +408 -0
- package/dist/core/interfaces.js +106 -0
- package/dist/core/lazy-dependency-loader.d.ts +152 -0
- package/dist/core/lazy-dependency-loader.js +453 -0
- package/dist/core/mode-detection-service.d.ts +150 -0
- package/dist/core/mode-detection-service.js +565 -0
- package/dist/core/mode-model-validator.d.ts +92 -0
- package/dist/core/mode-model-validator.js +203 -0
- package/dist/core/model-registry.d.ts +120 -0
- package/dist/core/model-registry.js +415 -0
- package/dist/core/model-validator.d.ts +217 -0
- package/dist/core/model-validator.js +782 -0
- package/dist/{path-manager.d.ts → core/path-manager.d.ts} +5 -0
- package/dist/{path-manager.js → core/path-manager.js} +5 -0
- package/dist/core/polymorphic-search-factory.d.ts +154 -0
- package/dist/core/polymorphic-search-factory.js +344 -0
- package/dist/core/raglite-paths.d.ts +121 -0
- package/dist/core/raglite-paths.js +145 -0
- package/dist/core/reranking-config.d.ts +42 -0
- package/dist/core/reranking-config.js +156 -0
- package/dist/core/reranking-factory.d.ts +92 -0
- package/dist/core/reranking-factory.js +591 -0
- package/dist/core/reranking-strategies.d.ts +325 -0
- package/dist/core/reranking-strategies.js +720 -0
- package/dist/core/resource-cleanup.d.ts +163 -0
- package/dist/core/resource-cleanup.js +371 -0
- package/dist/core/resource-manager.d.ts +212 -0
- package/dist/core/resource-manager.js +564 -0
- package/dist/core/search-pipeline.d.ts +111 -0
- package/dist/core/search-pipeline.js +287 -0
- package/dist/core/search.d.ts +131 -0
- package/dist/core/search.js +296 -0
- package/dist/core/streaming-operations.d.ts +145 -0
- package/dist/core/streaming-operations.js +409 -0
- package/dist/core/types.d.ts +66 -0
- package/dist/core/types.js +6 -0
- package/dist/core/universal-embedder.d.ts +177 -0
- package/dist/core/universal-embedder.js +139 -0
- package/dist/core/validation-messages.d.ts +99 -0
- package/dist/core/validation-messages.js +334 -0
- package/dist/{vector-index.d.ts → core/vector-index.d.ts} +4 -0
- package/dist/{vector-index.js → core/vector-index.js} +21 -3
- package/dist/dom-polyfills.d.ts +6 -0
- package/dist/dom-polyfills.js +40 -0
- package/dist/factories/index.d.ts +43 -0
- package/dist/factories/index.js +44 -0
- package/dist/factories/text-factory.d.ts +560 -0
- package/dist/factories/text-factory.js +968 -0
- package/dist/file-processor.d.ts +90 -4
- package/dist/file-processor.js +723 -20
- package/dist/index-manager.d.ts +3 -2
- package/dist/index-manager.js +13 -11
- package/dist/index.d.ts +72 -8
- package/dist/index.js +102 -16
- package/dist/indexer.js +1 -1
- package/dist/ingestion.d.ts +44 -154
- package/dist/ingestion.js +75 -671
- package/dist/mcp-server.d.ts +35 -3
- package/dist/mcp-server.js +1186 -79
- package/dist/multimodal/clip-embedder.d.ts +314 -0
- package/dist/multimodal/clip-embedder.js +945 -0
- package/dist/multimodal/index.d.ts +6 -0
- package/dist/multimodal/index.js +6 -0
- package/dist/preprocess.js +1 -1
- package/dist/run-error-recovery-tests.d.ts +7 -0
- package/dist/run-error-recovery-tests.js +101 -0
- package/dist/search-standalone.js +1 -1
- package/dist/search.d.ts +51 -69
- package/dist/search.js +117 -412
- package/dist/test-utils.d.ts +8 -26
- package/dist/text/chunker.d.ts +33 -0
- package/dist/{chunker.js → text/chunker.js} +98 -75
- package/dist/{embedder.d.ts → text/embedder.d.ts} +22 -1
- package/dist/{embedder.js → text/embedder.js} +84 -10
- package/dist/text/index.d.ts +8 -0
- package/dist/text/index.js +9 -0
- package/dist/text/preprocessors/index.d.ts +17 -0
- package/dist/text/preprocessors/index.js +38 -0
- package/dist/text/preprocessors/mdx.d.ts +25 -0
- package/dist/text/preprocessors/mdx.js +101 -0
- package/dist/text/preprocessors/mermaid.d.ts +68 -0
- package/dist/text/preprocessors/mermaid.js +330 -0
- package/dist/text/preprocessors/registry.d.ts +56 -0
- package/dist/text/preprocessors/registry.js +180 -0
- package/dist/text/reranker.d.ts +59 -0
- package/dist/{reranker.js → text/reranker.js} +138 -53
- package/dist/text/sentence-transformer-embedder.d.ts +96 -0
- package/dist/text/sentence-transformer-embedder.js +340 -0
- package/dist/{tokenizer.d.ts → text/tokenizer.d.ts} +1 -0
- package/dist/{tokenizer.js → text/tokenizer.js} +7 -2
- package/dist/types.d.ts +40 -1
- package/dist/utils/vector-math.d.ts +31 -0
- package/dist/utils/vector-math.js +70 -0
- package/package.json +16 -4
- package/dist/api-errors.d.ts.map +0 -1
- package/dist/api-errors.js.map +0 -1
- package/dist/chunker.d.ts +0 -47
- package/dist/chunker.d.ts.map +0 -1
- package/dist/chunker.js.map +0 -1
- package/dist/cli/indexer.d.ts.map +0 -1
- package/dist/cli/indexer.js.map +0 -1
- package/dist/cli/search.d.ts.map +0 -1
- package/dist/cli/search.js.map +0 -1
- package/dist/cli.d.ts.map +0 -1
- package/dist/cli.js.map +0 -1
- package/dist/config.d.ts.map +0 -1
- package/dist/config.js.map +0 -1
- package/dist/db.d.ts +0 -90
- package/dist/db.d.ts.map +0 -1
- package/dist/db.js +0 -340
- package/dist/db.js.map +0 -1
- package/dist/embedder.d.ts.map +0 -1
- package/dist/embedder.js.map +0 -1
- package/dist/error-handler.d.ts.map +0 -1
- package/dist/error-handler.js.map +0 -1
- package/dist/file-processor.d.ts.map +0 -1
- package/dist/file-processor.js.map +0 -1
- package/dist/index-manager.d.ts.map +0 -1
- package/dist/index-manager.js.map +0 -1
- package/dist/index.d.ts.map +0 -1
- package/dist/index.js.map +0 -1
- package/dist/indexer.d.ts.map +0 -1
- package/dist/indexer.js.map +0 -1
- package/dist/ingestion.d.ts.map +0 -1
- package/dist/ingestion.js.map +0 -1
- package/dist/mcp-server.d.ts.map +0 -1
- package/dist/mcp-server.js.map +0 -1
- package/dist/path-manager.d.ts.map +0 -1
- package/dist/path-manager.js.map +0 -1
- package/dist/preprocess.d.ts.map +0 -1
- package/dist/preprocess.js.map +0 -1
- package/dist/preprocessors/index.d.ts.map +0 -1
- package/dist/preprocessors/index.js.map +0 -1
- package/dist/preprocessors/mdx.d.ts.map +0 -1
- package/dist/preprocessors/mdx.js.map +0 -1
- package/dist/preprocessors/mermaid.d.ts.map +0 -1
- package/dist/preprocessors/mermaid.js.map +0 -1
- package/dist/preprocessors/registry.d.ts.map +0 -1
- package/dist/preprocessors/registry.js.map +0 -1
- package/dist/reranker.d.ts +0 -40
- package/dist/reranker.d.ts.map +0 -1
- package/dist/reranker.js.map +0 -1
- package/dist/resource-manager-demo.d.ts +0 -7
- package/dist/resource-manager-demo.d.ts.map +0 -1
- package/dist/resource-manager-demo.js +0 -52
- package/dist/resource-manager-demo.js.map +0 -1
- package/dist/resource-manager.d.ts +0 -129
- package/dist/resource-manager.d.ts.map +0 -1
- package/dist/resource-manager.js +0 -389
- package/dist/resource-manager.js.map +0 -1
- package/dist/search-standalone.d.ts.map +0 -1
- package/dist/search-standalone.js.map +0 -1
- package/dist/search.d.ts.map +0 -1
- package/dist/search.js.map +0 -1
- package/dist/test-utils.d.ts.map +0 -1
- package/dist/test-utils.js.map +0 -1
- package/dist/tokenizer.d.ts.map +0 -1
- package/dist/tokenizer.js.map +0 -1
- package/dist/types.d.ts.map +0 -1
- package/dist/types.js.map +0 -1
- package/dist/vector-index.d.ts.map +0 -1
- package/dist/vector-index.js.map +0 -1
package/dist/cli/indexer.js
CHANGED
|
@@ -1,7 +1,110 @@
|
|
|
1
1
|
import { existsSync, statSync } from 'fs';
|
|
2
2
|
import { resolve } from 'path';
|
|
3
|
-
import {
|
|
4
|
-
import {
|
|
3
|
+
import { TextIngestionFactory } from '../factories/text-factory.js';
|
|
4
|
+
import { withCLIDatabaseAccess, setupCLICleanup, isDatabaseBusy } from '../core/cli-database-utils.js';
|
|
5
|
+
import { EXIT_CODES, ConfigurationError } from '../core/config.js';
|
|
6
|
+
/**
|
|
7
|
+
* Validate mode-specific model and strategy combinations
|
|
8
|
+
* Ensures that the selected model is compatible with the chosen mode
|
|
9
|
+
* and that reranking strategies are valid for the mode
|
|
10
|
+
*/
|
|
11
|
+
async function validateModeConfiguration(options) {
|
|
12
|
+
const mode = options.mode || 'text';
|
|
13
|
+
const model = options.embeddingModel;
|
|
14
|
+
const rerankingStrategy = options.rerankingStrategy;
|
|
15
|
+
// Define supported models for each mode
|
|
16
|
+
const textModels = [
|
|
17
|
+
'sentence-transformers/all-MiniLM-L6-v2',
|
|
18
|
+
'Xenova/all-mpnet-base-v2'
|
|
19
|
+
];
|
|
20
|
+
const multimodalModels = [
|
|
21
|
+
'Xenova/clip-vit-base-patch32'
|
|
22
|
+
];
|
|
23
|
+
// Validate model compatibility with mode
|
|
24
|
+
if (model) {
|
|
25
|
+
if (mode === 'text' && !textModels.includes(model)) {
|
|
26
|
+
if (multimodalModels.includes(model)) {
|
|
27
|
+
throw new ConfigurationError(`Model '${model}' is a multimodal model but text mode was selected.\n` +
|
|
28
|
+
`\n` +
|
|
29
|
+
`To use this model, specify multimodal mode:\n` +
|
|
30
|
+
` raglite ingest <path> --mode multimodal --model ${model}\n` +
|
|
31
|
+
`\n` +
|
|
32
|
+
`Or choose a text model for text mode:\n` +
|
|
33
|
+
` ${textModels.map(m => `raglite ingest <path> --model ${m}`).join('\n ')}\n`, EXIT_CODES.INVALID_ARGUMENTS);
|
|
34
|
+
}
|
|
35
|
+
else {
|
|
36
|
+
throw new ConfigurationError(`Model '${model}' is not supported for text mode.\n` +
|
|
37
|
+
`\n` +
|
|
38
|
+
`Supported models for text mode:\n` +
|
|
39
|
+
` ${textModels.join('\n ')}\n` +
|
|
40
|
+
`\n` +
|
|
41
|
+
`Examples:\n` +
|
|
42
|
+
` raglite ingest <path> --model sentence-transformers/all-MiniLM-L6-v2\n` +
|
|
43
|
+
` raglite ingest <path> --model Xenova/all-mpnet-base-v2\n`, EXIT_CODES.INVALID_ARGUMENTS);
|
|
44
|
+
}
|
|
45
|
+
}
|
|
46
|
+
if (mode === 'multimodal' && !multimodalModels.includes(model)) {
|
|
47
|
+
if (textModels.includes(model)) {
|
|
48
|
+
throw new ConfigurationError(`Model '${model}' is a text-only model but multimodal mode was selected.\n` +
|
|
49
|
+
`\n` +
|
|
50
|
+
`To use this model, specify text mode:\n` +
|
|
51
|
+
` raglite ingest <path> --mode text --model ${model}\n` +
|
|
52
|
+
`\n` +
|
|
53
|
+
`Or choose a multimodal model for multimodal mode:\n` +
|
|
54
|
+
` ${multimodalModels.map(m => `raglite ingest <path> --mode multimodal --model ${m}`).join('\n ')}\n`, EXIT_CODES.INVALID_ARGUMENTS);
|
|
55
|
+
}
|
|
56
|
+
else {
|
|
57
|
+
throw new ConfigurationError(`Model '${model}' is not supported for multimodal mode.\n` +
|
|
58
|
+
`\n` +
|
|
59
|
+
`Supported models for multimodal mode:\n` +
|
|
60
|
+
` ${multimodalModels.join('\n ')}\n` +
|
|
61
|
+
`\n` +
|
|
62
|
+
`Example:\n` +
|
|
63
|
+
` raglite ingest <path> --mode multimodal --model Xenova/clip-vit-base-patch32\n`, EXIT_CODES.INVALID_ARGUMENTS);
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
// Validate reranking strategy compatibility with mode
|
|
68
|
+
if (rerankingStrategy) {
|
|
69
|
+
const textStrategies = ['cross-encoder', 'disabled'];
|
|
70
|
+
const multimodalStrategies = ['text-derived', 'metadata', 'disabled'];
|
|
71
|
+
if (mode === 'text' && !textStrategies.includes(rerankingStrategy)) {
|
|
72
|
+
throw new ConfigurationError(`Reranking strategy '${rerankingStrategy}' is not supported for text mode.\n` +
|
|
73
|
+
`\n` +
|
|
74
|
+
`Supported strategies for text mode:\n` +
|
|
75
|
+
` cross-encoder Use cross-encoder model for reranking (default)\n` +
|
|
76
|
+
` disabled No reranking, use vector similarity only\n` +
|
|
77
|
+
`\n` +
|
|
78
|
+
`Examples:\n` +
|
|
79
|
+
` raglite ingest <path> --mode text --rerank-strategy cross-encoder\n` +
|
|
80
|
+
` raglite ingest <path> --mode text --rerank-strategy disabled\n`, EXIT_CODES.INVALID_ARGUMENTS);
|
|
81
|
+
}
|
|
82
|
+
if (mode === 'multimodal' && !multimodalStrategies.includes(rerankingStrategy)) {
|
|
83
|
+
throw new ConfigurationError(`Reranking strategy '${rerankingStrategy}' is not supported for multimodal mode.\n` +
|
|
84
|
+
`\n` +
|
|
85
|
+
`Supported strategies for multimodal mode:\n` +
|
|
86
|
+
` text-derived Convert images to text, then use cross-encoder (default)\n` +
|
|
87
|
+
` metadata Use filename and metadata-based scoring\n` +
|
|
88
|
+
` disabled No reranking, use vector similarity only\n` +
|
|
89
|
+
`\n` +
|
|
90
|
+
`Examples:\n` +
|
|
91
|
+
` raglite ingest <path> --mode multimodal --rerank-strategy text-derived\n` +
|
|
92
|
+
` raglite ingest <path> --mode multimodal --rerank-strategy metadata\n` +
|
|
93
|
+
` raglite ingest <path> --mode multimodal --rerank-strategy disabled\n`, EXIT_CODES.INVALID_ARGUMENTS);
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
// Log the final configuration
|
|
97
|
+
console.log('✅ Mode configuration validated successfully');
|
|
98
|
+
if (mode !== 'text') {
|
|
99
|
+
console.log(` Mode: ${mode}`);
|
|
100
|
+
}
|
|
101
|
+
if (model) {
|
|
102
|
+
console.log(` Model: ${model}`);
|
|
103
|
+
}
|
|
104
|
+
if (rerankingStrategy) {
|
|
105
|
+
console.log(` Reranking: ${rerankingStrategy}`);
|
|
106
|
+
}
|
|
107
|
+
}
|
|
5
108
|
/**
|
|
6
109
|
* Run document ingestion from CLI
|
|
7
110
|
* @param path - File or directory path to ingest
|
|
@@ -9,6 +112,7 @@ import { EXIT_CODES, ConfigurationError } from '../config.js';
|
|
|
9
112
|
*/
|
|
10
113
|
export async function runIngest(path, options = {}) {
|
|
11
114
|
try {
|
|
115
|
+
// Handle --rebuild-if-needed flag immediately to prevent dimension mismatch error
|
|
12
116
|
// Validate path exists
|
|
13
117
|
const resolvedPath = resolve(path);
|
|
14
118
|
if (!existsSync(resolvedPath)) {
|
|
@@ -58,50 +162,59 @@ export async function runIngest(path, options = {}) {
|
|
|
58
162
|
console.log(`Starting ingestion of ${pathType}: ${resolvedPath}`);
|
|
59
163
|
console.log('This may take a while for large document collections...');
|
|
60
164
|
console.log('');
|
|
61
|
-
//
|
|
62
|
-
const
|
|
165
|
+
// Prepare factory options
|
|
166
|
+
const factoryOptions = {};
|
|
63
167
|
if (options.model) {
|
|
64
|
-
|
|
168
|
+
factoryOptions.embeddingModel = options.model;
|
|
65
169
|
console.log(`Using embedding model: ${options.model}`);
|
|
66
170
|
}
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
if (options['path-strategy'] || options['path-base']) {
|
|
71
|
-
const strategy = options['path-strategy'] || 'relative';
|
|
72
|
-
const basePath = options['path-base'] || process.cwd();
|
|
73
|
-
pipeline.setPathStorageStrategy(strategy, basePath);
|
|
74
|
-
console.log(`Using path storage strategy: ${strategy} (base: ${basePath})`);
|
|
171
|
+
if (options.mode) {
|
|
172
|
+
factoryOptions.mode = options.mode;
|
|
173
|
+
console.log(`Using processing mode: ${options.mode}`);
|
|
75
174
|
}
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
175
|
+
if (options['rerank-strategy']) {
|
|
176
|
+
factoryOptions.rerankingStrategy = options['rerank-strategy'];
|
|
177
|
+
console.log(`Using reranking strategy: ${options['rerank-strategy']}`);
|
|
178
|
+
}
|
|
179
|
+
if (options.rebuildIfNeeded) {
|
|
180
|
+
factoryOptions.forceRebuild = true;
|
|
181
|
+
console.log('Force rebuild enabled due to rebuildIfNeeded option');
|
|
182
|
+
// Delete old index file immediately to prevent dimension mismatch errors
|
|
183
|
+
const indexPath = process.env.RAG_INDEX_FILE || './vector-index.bin';
|
|
184
|
+
const { existsSync, unlinkSync } = await import('fs');
|
|
185
|
+
if (existsSync(indexPath)) {
|
|
79
186
|
try {
|
|
80
|
-
|
|
187
|
+
unlinkSync(indexPath);
|
|
188
|
+
console.log('🗑️ Removed old index file to prevent dimension mismatch');
|
|
81
189
|
}
|
|
82
190
|
catch (error) {
|
|
83
|
-
|
|
84
|
-
console.log('⚠️ Model mismatch detected. Rebuilding index automatically...');
|
|
85
|
-
console.log('⚠️ WARNING: This will regenerate ALL embeddings and may take a while.');
|
|
86
|
-
console.log('');
|
|
87
|
-
// Create a new pipeline with the same config overrides for rebuild
|
|
88
|
-
const rebuildPipeline = new IngestionPipeline();
|
|
89
|
-
rebuildPipeline.setConfigOverrides(configOverrides);
|
|
90
|
-
try {
|
|
91
|
-
await rebuildPipeline.initialize();
|
|
92
|
-
await rebuildPipeline.rebuildIndex();
|
|
93
|
-
}
|
|
94
|
-
finally {
|
|
95
|
-
await rebuildPipeline.cleanup();
|
|
96
|
-
}
|
|
97
|
-
console.log('✅ Rebuild completed. Continuing with ingestion...');
|
|
98
|
-
console.log('');
|
|
99
|
-
}
|
|
100
|
-
else {
|
|
101
|
-
throw error;
|
|
102
|
-
}
|
|
191
|
+
console.warn(`⚠️ Could not remove old index file: ${error}`);
|
|
103
192
|
}
|
|
104
193
|
}
|
|
194
|
+
}
|
|
195
|
+
// Validate mode-specific model and strategy combinations
|
|
196
|
+
await validateModeConfiguration(factoryOptions);
|
|
197
|
+
const dbPath = process.env.RAG_DB_FILE || './db.sqlite';
|
|
198
|
+
const indexPath = process.env.RAG_INDEX_FILE || './vector-index.bin';
|
|
199
|
+
// Setup graceful cleanup
|
|
200
|
+
setupCLICleanup(dbPath);
|
|
201
|
+
// Check if database is busy before starting
|
|
202
|
+
const busyStatus = await isDatabaseBusy(dbPath);
|
|
203
|
+
if (busyStatus.isBusy) {
|
|
204
|
+
console.log('⚠️ Database appears to be in use by another process');
|
|
205
|
+
console.log(` Reason: ${busyStatus.reason}`);
|
|
206
|
+
console.log(' Attempting to proceed anyway...');
|
|
207
|
+
console.log('');
|
|
208
|
+
}
|
|
209
|
+
// Create ingestion pipeline using factory
|
|
210
|
+
let pipeline;
|
|
211
|
+
try {
|
|
212
|
+
// Create ingestion pipeline using TextIngestionFactory with database protection
|
|
213
|
+
pipeline = await withCLIDatabaseAccess(dbPath, () => TextIngestionFactory.create(dbPath, indexPath, factoryOptions), {
|
|
214
|
+
commandName: 'Ingestion command',
|
|
215
|
+
showProgress: true,
|
|
216
|
+
maxWaitMs: 15000 // Longer timeout for ingestion
|
|
217
|
+
});
|
|
105
218
|
const result = await pipeline.ingestPath(resolvedPath);
|
|
106
219
|
// Display final results
|
|
107
220
|
console.log('\n' + '='.repeat(50));
|
|
@@ -122,10 +235,26 @@ export async function runIngest(path, options = {}) {
|
|
|
122
235
|
console.log(`Processing rate: ${chunksPerSecond} chunks/second`);
|
|
123
236
|
}
|
|
124
237
|
console.log('\nIngestion completed successfully!');
|
|
238
|
+
// Display mode-specific information
|
|
239
|
+
const mode = options.mode || 'text';
|
|
240
|
+
if (mode === 'multimodal') {
|
|
241
|
+
console.log('✨ Multimodal mode enabled - you can now search across text and image content');
|
|
242
|
+
}
|
|
125
243
|
console.log('You can now search your documents using: raglite search "your query"');
|
|
244
|
+
console.log('');
|
|
245
|
+
console.log('💡 The search command will automatically detect and use the ingestion mode.');
|
|
126
246
|
}
|
|
127
247
|
finally {
|
|
128
|
-
|
|
248
|
+
if (pipeline) {
|
|
249
|
+
await pipeline.cleanup();
|
|
250
|
+
}
|
|
251
|
+
// Ensure clean exit for CLI commands
|
|
252
|
+
const { DatabaseConnectionManager } = await import('../core/database-connection-manager.js');
|
|
253
|
+
await DatabaseConnectionManager.closeAllConnections();
|
|
254
|
+
// Force exit for CLI commands to prevent hanging
|
|
255
|
+
setTimeout(() => {
|
|
256
|
+
process.exit(0);
|
|
257
|
+
}, 100);
|
|
129
258
|
}
|
|
130
259
|
}
|
|
131
260
|
catch (error) {
|
|
@@ -205,14 +334,101 @@ export async function runRebuild() {
|
|
|
205
334
|
console.log('');
|
|
206
335
|
console.log('Progress will be shown below...');
|
|
207
336
|
console.log('');
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
337
|
+
// Detect mode from existing database for rebuild
|
|
338
|
+
const dbPath = process.env.RAG_DB_FILE || './db.sqlite';
|
|
339
|
+
const indexPath = process.env.RAG_INDEX_FILE || './vector-index.bin';
|
|
340
|
+
let rebuildOptions = { forceRebuild: true };
|
|
341
|
+
if (existsSync(dbPath)) {
|
|
342
|
+
try {
|
|
343
|
+
// Import mode detection service
|
|
344
|
+
const { ModeDetectionService } = await import('../core/mode-detection-service.js');
|
|
345
|
+
const modeService = new ModeDetectionService(dbPath);
|
|
346
|
+
const systemInfo = await modeService.detectMode();
|
|
347
|
+
console.log(`🎯 Detected existing configuration:`);
|
|
348
|
+
console.log(` Mode: ${systemInfo.mode}`);
|
|
349
|
+
console.log(` Model: ${systemInfo.modelName}`);
|
|
350
|
+
console.log(` Reranking: ${systemInfo.rerankingStrategy}`);
|
|
351
|
+
console.log('');
|
|
352
|
+
// Use the detected configuration for rebuild
|
|
353
|
+
rebuildOptions.mode = systemInfo.mode;
|
|
354
|
+
rebuildOptions.embeddingModel = systemInfo.modelName;
|
|
355
|
+
rebuildOptions.rerankingStrategy = systemInfo.rerankingStrategy;
|
|
356
|
+
}
|
|
357
|
+
catch (error) {
|
|
358
|
+
console.warn('⚠️ Could not detect existing mode configuration, using defaults');
|
|
359
|
+
console.warn(` Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
|
360
|
+
}
|
|
361
|
+
}
|
|
362
|
+
// Create ingestion pipeline with force rebuild using factory
|
|
363
|
+
const pipeline = await TextIngestionFactory.create(dbPath, indexPath, rebuildOptions);
|
|
364
|
+
try {
|
|
365
|
+
// Get all documents from database and re-ingest them
|
|
366
|
+
const { openDatabase } = await import('../core/db.js');
|
|
367
|
+
const db = await openDatabase(dbPath);
|
|
368
|
+
try {
|
|
369
|
+
const documents = await db.all('SELECT DISTINCT source FROM documents ORDER BY source');
|
|
370
|
+
if (documents.length === 0) {
|
|
371
|
+
throw new Error('No documents found in database. Nothing to rebuild.');
|
|
372
|
+
}
|
|
373
|
+
console.log(`Found ${documents.length} document${documents.length === 1 ? '' : 's'} to rebuild`);
|
|
374
|
+
let totalResult = {
|
|
375
|
+
documentsProcessed: 0,
|
|
376
|
+
chunksCreated: 0,
|
|
377
|
+
embeddingsGenerated: 0,
|
|
378
|
+
documentErrors: 0,
|
|
379
|
+
embeddingErrors: 0,
|
|
380
|
+
processingTimeMs: 0
|
|
381
|
+
};
|
|
382
|
+
// Re-ingest each document
|
|
383
|
+
for (const doc of documents) {
|
|
384
|
+
if (existsSync(doc.source)) {
|
|
385
|
+
console.log(`Rebuilding: ${doc.source}`);
|
|
386
|
+
const result = await pipeline.ingestFile(doc.source);
|
|
387
|
+
totalResult.documentsProcessed += result.documentsProcessed;
|
|
388
|
+
totalResult.chunksCreated += result.chunksCreated;
|
|
389
|
+
totalResult.embeddingsGenerated += result.embeddingsGenerated;
|
|
390
|
+
totalResult.documentErrors += result.documentErrors;
|
|
391
|
+
totalResult.embeddingErrors += result.embeddingErrors;
|
|
392
|
+
totalResult.processingTimeMs += result.processingTimeMs;
|
|
393
|
+
}
|
|
394
|
+
else {
|
|
395
|
+
console.warn(`⚠️ Document not found, skipping: ${doc.source}`);
|
|
396
|
+
totalResult.documentErrors++;
|
|
397
|
+
}
|
|
398
|
+
}
|
|
399
|
+
console.log('\n' + '='.repeat(50));
|
|
400
|
+
console.log('REBUILD COMPLETE');
|
|
401
|
+
console.log('='.repeat(50));
|
|
402
|
+
console.log(`Documents processed: ${totalResult.documentsProcessed}`);
|
|
403
|
+
console.log(`Chunks created: ${totalResult.chunksCreated}`);
|
|
404
|
+
console.log(`Embeddings generated: ${totalResult.embeddingsGenerated}`);
|
|
405
|
+
if (totalResult.documentErrors > 0) {
|
|
406
|
+
console.log(`Document errors: ${totalResult.documentErrors}`);
|
|
407
|
+
}
|
|
408
|
+
if (totalResult.embeddingErrors > 0) {
|
|
409
|
+
console.log(`Embedding errors: ${totalResult.embeddingErrors}`);
|
|
410
|
+
}
|
|
411
|
+
console.log(`Total processing time: ${(totalResult.processingTimeMs / 1000).toFixed(2)} seconds`);
|
|
412
|
+
console.log('');
|
|
413
|
+
console.log('The vector index has been successfully rebuilt.');
|
|
414
|
+
console.log('All embeddings have been regenerated with the current model.');
|
|
415
|
+
console.log('');
|
|
416
|
+
console.log('You can now search your documents using: raglite search "your query"');
|
|
417
|
+
}
|
|
418
|
+
finally {
|
|
419
|
+
await db.close();
|
|
420
|
+
}
|
|
421
|
+
}
|
|
422
|
+
finally {
|
|
423
|
+
await pipeline.cleanup();
|
|
424
|
+
// Ensure clean exit for CLI commands
|
|
425
|
+
const { DatabaseConnectionManager } = await import('../core/database-connection-manager.js');
|
|
426
|
+
await DatabaseConnectionManager.closeAllConnections();
|
|
427
|
+
// Force exit for CLI commands to prevent hanging
|
|
428
|
+
setTimeout(() => {
|
|
429
|
+
process.exit(0);
|
|
430
|
+
}, 100);
|
|
431
|
+
}
|
|
216
432
|
}
|
|
217
433
|
catch (error) {
|
|
218
434
|
console.error('\n' + '='.repeat(50));
|
package/dist/cli/search.js
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import { existsSync } from 'fs';
|
|
2
|
-
import {
|
|
3
|
-
import {
|
|
2
|
+
import { PolymorphicSearchFactory } from '../core/polymorphic-search-factory.js';
|
|
3
|
+
import { withCLIDatabaseAccess, setupCLICleanup } from '../core/cli-database-utils.js';
|
|
4
|
+
import { config, EXIT_CODES, ConfigurationError } from '../core/config.js';
|
|
4
5
|
/**
|
|
5
6
|
* Run search from CLI
|
|
6
7
|
* @param query - Search query string
|
|
@@ -53,33 +54,17 @@ export async function runSearch(query, options = {}) {
|
|
|
53
54
|
process.exit(EXIT_CODES.INDEX_ERROR);
|
|
54
55
|
}
|
|
55
56
|
console.log(`Searching for: "${query}"`);
|
|
56
|
-
console.log('Loading search engine...');
|
|
57
57
|
console.log('');
|
|
58
|
-
//
|
|
59
|
-
|
|
58
|
+
// Setup graceful cleanup
|
|
59
|
+
setupCLICleanup(effectiveConfig.db_file);
|
|
60
|
+
// Initialize search engine using polymorphic factory with database protection
|
|
61
|
+
let searchEngine;
|
|
60
62
|
try {
|
|
61
|
-
//
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
const modelToUse = storedModel ? storedModel.modelName : effectiveConfig.embedding_model;
|
|
67
|
-
console.log(`Using model from ingestion: ${modelToUse}`);
|
|
68
|
-
// Initialize embedding engine with the correct model
|
|
69
|
-
const { EmbeddingEngine } = await import('../embedder.js');
|
|
70
|
-
embedder = new EmbeddingEngine(modelToUse);
|
|
71
|
-
await embedder.loadModel();
|
|
72
|
-
// Initialize index manager (which handles the vector index and hash mapping)
|
|
73
|
-
const { IndexManager } = await import('../index-manager.js');
|
|
74
|
-
const { getModelDefaults } = await import('../config.js');
|
|
75
|
-
const modelDefaults = getModelDefaults(modelToUse);
|
|
76
|
-
indexManager = new IndexManager(effectiveConfig.index_file, effectiveConfig.db_file, modelDefaults.dimensions, modelToUse);
|
|
77
|
-
await indexManager.initialize();
|
|
78
|
-
// Create search engine with index manager
|
|
79
|
-
const enableReranking = options.rerank !== undefined ? options.rerank : effectiveConfig.rerank_enabled;
|
|
80
|
-
const { SearchEngine } = await import('../search.js');
|
|
81
|
-
searchEngine = SearchEngine.createWithComponents(embedder, indexManager, db, enableReranking);
|
|
82
|
-
await searchEngine.initialize();
|
|
63
|
+
// Create search engine using PolymorphicSearchFactory (auto-detects mode)
|
|
64
|
+
searchEngine = await withCLIDatabaseAccess(effectiveConfig.db_file, () => PolymorphicSearchFactory.create(effectiveConfig.index_file, effectiveConfig.db_file), {
|
|
65
|
+
commandName: 'Search command',
|
|
66
|
+
showProgress: true
|
|
67
|
+
});
|
|
83
68
|
// Prepare search options
|
|
84
69
|
const searchOptions = {};
|
|
85
70
|
if (options['top-k'] !== undefined) {
|
|
@@ -90,8 +75,18 @@ export async function runSearch(query, options = {}) {
|
|
|
90
75
|
}
|
|
91
76
|
// Perform search
|
|
92
77
|
const startTime = Date.now();
|
|
93
|
-
|
|
78
|
+
let results = await searchEngine.search(query, searchOptions);
|
|
94
79
|
const searchTime = Date.now() - startTime;
|
|
80
|
+
// Apply content type filter if specified
|
|
81
|
+
const contentTypeFilter = options['content-type'];
|
|
82
|
+
if (contentTypeFilter && contentTypeFilter !== 'all') {
|
|
83
|
+
const originalCount = results.length;
|
|
84
|
+
results = results.filter(r => r.contentType === contentTypeFilter);
|
|
85
|
+
if (results.length < originalCount) {
|
|
86
|
+
console.log(`Filtered to ${results.length} ${contentTypeFilter} result${results.length === 1 ? '' : 's'} (from ${originalCount} total)`);
|
|
87
|
+
console.log('');
|
|
88
|
+
}
|
|
89
|
+
}
|
|
95
90
|
// Display results
|
|
96
91
|
if (results.length === 0) {
|
|
97
92
|
console.log('No results found.');
|
|
@@ -103,10 +98,30 @@ export async function runSearch(query, options = {}) {
|
|
|
103
98
|
else {
|
|
104
99
|
console.log(`Found ${results.length} result${results.length === 1 ? '' : 's'} in ${searchTime}ms:\n`);
|
|
105
100
|
results.forEach((result, index) => {
|
|
106
|
-
|
|
101
|
+
// Add content type icon for visual distinction
|
|
102
|
+
const contentTypeIcon = result.contentType === 'image' ? '🖼️ ' : '📄 ';
|
|
103
|
+
const contentTypeLabel = result.contentType === 'image' ? '[IMAGE]' : '[TEXT]';
|
|
104
|
+
console.log(`${index + 1}. ${contentTypeIcon}${result.document.title}`);
|
|
107
105
|
console.log(` Source: ${result.document.source}`);
|
|
106
|
+
console.log(` Type: ${contentTypeLabel}`);
|
|
108
107
|
console.log(` Score: ${(result.score * 100).toFixed(1)}%`);
|
|
109
|
-
|
|
108
|
+
// Display content differently based on type
|
|
109
|
+
if (result.contentType === 'image') {
|
|
110
|
+
// For images, show metadata if available
|
|
111
|
+
if (result.metadata?.description) {
|
|
112
|
+
console.log(` Description: ${truncateText(result.metadata.description, 200)}`);
|
|
113
|
+
}
|
|
114
|
+
if (result.metadata?.dimensions) {
|
|
115
|
+
console.log(` Dimensions: ${result.metadata.dimensions}`);
|
|
116
|
+
}
|
|
117
|
+
if (result.metadata?.format) {
|
|
118
|
+
console.log(` Format: ${result.metadata.format}`);
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
else {
|
|
122
|
+
// For text, show content preview
|
|
123
|
+
console.log(` Text: ${truncateText(result.content, 200)}`);
|
|
124
|
+
}
|
|
110
125
|
console.log('');
|
|
111
126
|
});
|
|
112
127
|
// Show search statistics
|
|
@@ -121,9 +136,16 @@ export async function runSearch(query, options = {}) {
|
|
|
121
136
|
}
|
|
122
137
|
finally {
|
|
123
138
|
// Cleanup resources
|
|
124
|
-
if (
|
|
125
|
-
await
|
|
139
|
+
if (searchEngine) {
|
|
140
|
+
await searchEngine.cleanup();
|
|
126
141
|
}
|
|
142
|
+
// Ensure clean exit for CLI commands
|
|
143
|
+
const { DatabaseConnectionManager } = await import('../core/database-connection-manager.js');
|
|
144
|
+
await DatabaseConnectionManager.closeAllConnections();
|
|
145
|
+
// Force exit for CLI commands to prevent hanging
|
|
146
|
+
setTimeout(() => {
|
|
147
|
+
process.exit(0);
|
|
148
|
+
}, 100);
|
|
127
149
|
}
|
|
128
150
|
}
|
|
129
151
|
catch (error) {
|