rag-lite-ts 1.0.1 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +651 -109
- package/dist/cli/indexer.js +262 -46
- package/dist/cli/search.js +54 -32
- package/dist/cli.js +185 -28
- package/dist/config.d.ts +34 -73
- package/dist/config.js +50 -255
- package/dist/core/abstract-embedder.d.ts +125 -0
- package/dist/core/abstract-embedder.js +264 -0
- package/dist/core/actionable-error-messages.d.ts +60 -0
- package/dist/core/actionable-error-messages.js +397 -0
- package/dist/core/adapters.d.ts +93 -0
- package/dist/core/adapters.js +139 -0
- package/dist/core/batch-processing-optimizer.d.ts +155 -0
- package/dist/core/batch-processing-optimizer.js +541 -0
- package/dist/core/chunker.d.ts +119 -0
- package/dist/core/chunker.js +73 -0
- package/dist/core/cli-database-utils.d.ts +53 -0
- package/dist/core/cli-database-utils.js +239 -0
- package/dist/core/config.d.ts +102 -0
- package/dist/core/config.js +247 -0
- package/dist/core/content-errors.d.ts +111 -0
- package/dist/core/content-errors.js +362 -0
- package/dist/core/content-manager.d.ts +343 -0
- package/dist/core/content-manager.js +1504 -0
- package/dist/core/content-performance-optimizer.d.ts +150 -0
- package/dist/core/content-performance-optimizer.js +516 -0
- package/dist/core/content-resolver.d.ts +104 -0
- package/dist/core/content-resolver.js +285 -0
- package/dist/core/cross-modal-search.d.ts +164 -0
- package/dist/core/cross-modal-search.js +342 -0
- package/dist/core/database-connection-manager.d.ts +109 -0
- package/dist/core/database-connection-manager.js +304 -0
- package/dist/core/db.d.ts +245 -0
- package/dist/core/db.js +952 -0
- package/dist/core/embedder-factory.d.ts +176 -0
- package/dist/core/embedder-factory.js +338 -0
- package/dist/{error-handler.d.ts → core/error-handler.d.ts} +23 -2
- package/dist/{error-handler.js → core/error-handler.js} +51 -8
- package/dist/core/index.d.ts +59 -0
- package/dist/core/index.js +69 -0
- package/dist/core/ingestion.d.ts +213 -0
- package/dist/core/ingestion.js +812 -0
- package/dist/core/interfaces.d.ts +408 -0
- package/dist/core/interfaces.js +106 -0
- package/dist/core/lazy-dependency-loader.d.ts +152 -0
- package/dist/core/lazy-dependency-loader.js +453 -0
- package/dist/core/mode-detection-service.d.ts +150 -0
- package/dist/core/mode-detection-service.js +565 -0
- package/dist/core/mode-model-validator.d.ts +92 -0
- package/dist/core/mode-model-validator.js +203 -0
- package/dist/core/model-registry.d.ts +120 -0
- package/dist/core/model-registry.js +415 -0
- package/dist/core/model-validator.d.ts +217 -0
- package/dist/core/model-validator.js +782 -0
- package/dist/{path-manager.d.ts → core/path-manager.d.ts} +5 -0
- package/dist/{path-manager.js → core/path-manager.js} +5 -0
- package/dist/core/polymorphic-search-factory.d.ts +154 -0
- package/dist/core/polymorphic-search-factory.js +344 -0
- package/dist/core/raglite-paths.d.ts +121 -0
- package/dist/core/raglite-paths.js +145 -0
- package/dist/core/reranking-config.d.ts +42 -0
- package/dist/core/reranking-config.js +156 -0
- package/dist/core/reranking-factory.d.ts +92 -0
- package/dist/core/reranking-factory.js +591 -0
- package/dist/core/reranking-strategies.d.ts +325 -0
- package/dist/core/reranking-strategies.js +720 -0
- package/dist/core/resource-cleanup.d.ts +163 -0
- package/dist/core/resource-cleanup.js +371 -0
- package/dist/core/resource-manager.d.ts +212 -0
- package/dist/core/resource-manager.js +564 -0
- package/dist/core/search-pipeline.d.ts +111 -0
- package/dist/core/search-pipeline.js +287 -0
- package/dist/core/search.d.ts +131 -0
- package/dist/core/search.js +296 -0
- package/dist/core/streaming-operations.d.ts +145 -0
- package/dist/core/streaming-operations.js +409 -0
- package/dist/core/types.d.ts +66 -0
- package/dist/core/types.js +6 -0
- package/dist/core/universal-embedder.d.ts +177 -0
- package/dist/core/universal-embedder.js +139 -0
- package/dist/core/validation-messages.d.ts +99 -0
- package/dist/core/validation-messages.js +334 -0
- package/dist/{vector-index.d.ts → core/vector-index.d.ts} +4 -0
- package/dist/{vector-index.js → core/vector-index.js} +21 -3
- package/dist/dom-polyfills.d.ts +6 -0
- package/dist/dom-polyfills.js +40 -0
- package/dist/factories/index.d.ts +43 -0
- package/dist/factories/index.js +44 -0
- package/dist/factories/text-factory.d.ts +560 -0
- package/dist/factories/text-factory.js +968 -0
- package/dist/file-processor.d.ts +90 -4
- package/dist/file-processor.js +723 -20
- package/dist/index-manager.d.ts +3 -2
- package/dist/index-manager.js +13 -11
- package/dist/index.d.ts +72 -8
- package/dist/index.js +102 -16
- package/dist/indexer.js +1 -1
- package/dist/ingestion.d.ts +44 -154
- package/dist/ingestion.js +75 -671
- package/dist/mcp-server.d.ts +35 -3
- package/dist/mcp-server.js +1186 -79
- package/dist/multimodal/clip-embedder.d.ts +314 -0
- package/dist/multimodal/clip-embedder.js +945 -0
- package/dist/multimodal/index.d.ts +6 -0
- package/dist/multimodal/index.js +6 -0
- package/dist/preprocess.js +1 -1
- package/dist/run-error-recovery-tests.d.ts +7 -0
- package/dist/run-error-recovery-tests.js +101 -0
- package/dist/search-standalone.js +1 -1
- package/dist/search.d.ts +51 -69
- package/dist/search.js +117 -412
- package/dist/test-utils.d.ts +8 -26
- package/dist/text/chunker.d.ts +33 -0
- package/dist/{chunker.js → text/chunker.js} +98 -75
- package/dist/{embedder.d.ts → text/embedder.d.ts} +22 -1
- package/dist/{embedder.js → text/embedder.js} +84 -10
- package/dist/text/index.d.ts +8 -0
- package/dist/text/index.js +9 -0
- package/dist/text/preprocessors/index.d.ts +17 -0
- package/dist/text/preprocessors/index.js +38 -0
- package/dist/text/preprocessors/mdx.d.ts +25 -0
- package/dist/text/preprocessors/mdx.js +101 -0
- package/dist/text/preprocessors/mermaid.d.ts +68 -0
- package/dist/text/preprocessors/mermaid.js +330 -0
- package/dist/text/preprocessors/registry.d.ts +56 -0
- package/dist/text/preprocessors/registry.js +180 -0
- package/dist/text/reranker.d.ts +59 -0
- package/dist/{reranker.js → text/reranker.js} +138 -53
- package/dist/text/sentence-transformer-embedder.d.ts +96 -0
- package/dist/text/sentence-transformer-embedder.js +340 -0
- package/dist/{tokenizer.d.ts → text/tokenizer.d.ts} +1 -0
- package/dist/{tokenizer.js → text/tokenizer.js} +7 -2
- package/dist/types.d.ts +40 -1
- package/dist/utils/vector-math.d.ts +31 -0
- package/dist/utils/vector-math.js +70 -0
- package/package.json +16 -4
- package/dist/api-errors.d.ts.map +0 -1
- package/dist/api-errors.js.map +0 -1
- package/dist/chunker.d.ts +0 -47
- package/dist/chunker.d.ts.map +0 -1
- package/dist/chunker.js.map +0 -1
- package/dist/cli/indexer.d.ts.map +0 -1
- package/dist/cli/indexer.js.map +0 -1
- package/dist/cli/search.d.ts.map +0 -1
- package/dist/cli/search.js.map +0 -1
- package/dist/cli.d.ts.map +0 -1
- package/dist/cli.js.map +0 -1
- package/dist/config.d.ts.map +0 -1
- package/dist/config.js.map +0 -1
- package/dist/db.d.ts +0 -90
- package/dist/db.d.ts.map +0 -1
- package/dist/db.js +0 -340
- package/dist/db.js.map +0 -1
- package/dist/embedder.d.ts.map +0 -1
- package/dist/embedder.js.map +0 -1
- package/dist/error-handler.d.ts.map +0 -1
- package/dist/error-handler.js.map +0 -1
- package/dist/file-processor.d.ts.map +0 -1
- package/dist/file-processor.js.map +0 -1
- package/dist/index-manager.d.ts.map +0 -1
- package/dist/index-manager.js.map +0 -1
- package/dist/index.d.ts.map +0 -1
- package/dist/index.js.map +0 -1
- package/dist/indexer.d.ts.map +0 -1
- package/dist/indexer.js.map +0 -1
- package/dist/ingestion.d.ts.map +0 -1
- package/dist/ingestion.js.map +0 -1
- package/dist/mcp-server.d.ts.map +0 -1
- package/dist/mcp-server.js.map +0 -1
- package/dist/path-manager.d.ts.map +0 -1
- package/dist/path-manager.js.map +0 -1
- package/dist/preprocess.d.ts.map +0 -1
- package/dist/preprocess.js.map +0 -1
- package/dist/preprocessors/index.d.ts.map +0 -1
- package/dist/preprocessors/index.js.map +0 -1
- package/dist/preprocessors/mdx.d.ts.map +0 -1
- package/dist/preprocessors/mdx.js.map +0 -1
- package/dist/preprocessors/mermaid.d.ts.map +0 -1
- package/dist/preprocessors/mermaid.js.map +0 -1
- package/dist/preprocessors/registry.d.ts.map +0 -1
- package/dist/preprocessors/registry.js.map +0 -1
- package/dist/reranker.d.ts +0 -40
- package/dist/reranker.d.ts.map +0 -1
- package/dist/reranker.js.map +0 -1
- package/dist/resource-manager-demo.d.ts +0 -7
- package/dist/resource-manager-demo.d.ts.map +0 -1
- package/dist/resource-manager-demo.js +0 -52
- package/dist/resource-manager-demo.js.map +0 -1
- package/dist/resource-manager.d.ts +0 -129
- package/dist/resource-manager.d.ts.map +0 -1
- package/dist/resource-manager.js +0 -389
- package/dist/resource-manager.js.map +0 -1
- package/dist/search-standalone.d.ts.map +0 -1
- package/dist/search-standalone.js.map +0 -1
- package/dist/search.d.ts.map +0 -1
- package/dist/search.js.map +0 -1
- package/dist/test-utils.d.ts.map +0 -1
- package/dist/test-utils.js.map +0 -1
- package/dist/tokenizer.d.ts.map +0 -1
- package/dist/tokenizer.js.map +0 -1
- package/dist/types.d.ts.map +0 -1
- package/dist/types.js.map +0 -1
- package/dist/vector-index.d.ts.map +0 -1
- package/dist/vector-index.js.map +0 -1
package/dist/config.js
CHANGED
|
@@ -1,281 +1,76 @@
|
|
|
1
|
-
import { homedir } from 'os';
|
|
2
|
-
import { join } from 'path';
|
|
3
1
|
/**
|
|
4
|
-
*
|
|
2
|
+
* Main configuration file with text-specific settings
|
|
3
|
+
* Extends core configuration with implementation-specific properties
|
|
5
4
|
*/
|
|
6
|
-
|
|
7
|
-
mode: 'balanced'
|
|
8
|
-
};
|
|
5
|
+
import { getDefaultModelCachePath } from './core/config.js';
|
|
9
6
|
/**
|
|
10
|
-
*
|
|
11
|
-
* @returns Default cache path (~/.raglite/models/)
|
|
7
|
+
* Default configuration object with both core and text-specific settings
|
|
12
8
|
*/
|
|
13
|
-
function getDefaultModelCachePath() {
|
|
14
|
-
return join(homedir(), '.raglite', 'models');
|
|
15
|
-
}
|
|
16
|
-
/**
|
|
17
|
-
* Returns model-specific default configuration values
|
|
18
|
-
* @param modelName - The embedding model name
|
|
19
|
-
* @returns Model-specific defaults for dimensions, chunk_size, chunk_overlap, and batch_size
|
|
20
|
-
*/
|
|
21
|
-
export function getModelDefaults(modelName) {
|
|
22
|
-
if (modelName === 'Xenova/all-mpnet-base-v2') {
|
|
23
|
-
return {
|
|
24
|
-
dimensions: 768,
|
|
25
|
-
chunk_size: 400,
|
|
26
|
-
chunk_overlap: 80,
|
|
27
|
-
batch_size: 8
|
|
28
|
-
};
|
|
29
|
-
}
|
|
30
|
-
// Default to sentence-transformers/all-MiniLM-L6-v2 settings (current defaults)
|
|
31
|
-
return {
|
|
32
|
-
dimensions: 384,
|
|
33
|
-
chunk_size: 250,
|
|
34
|
-
chunk_overlap: 50,
|
|
35
|
-
batch_size: 16
|
|
36
|
-
};
|
|
37
|
-
}
|
|
38
|
-
// Create config with model-specific defaults
|
|
39
|
-
const embeddingModel = process.env.RAG_EMBEDDING_MODEL || 'sentence-transformers/all-MiniLM-L6-v2';
|
|
40
|
-
const modelDefaults = getModelDefaults(embeddingModel);
|
|
41
9
|
export const config = {
|
|
42
|
-
|
|
43
|
-
chunk_size: parseInt(process.env.RAG_CHUNK_SIZE ||
|
|
44
|
-
chunk_overlap: parseInt(process.env.RAG_CHUNK_OVERLAP ||
|
|
45
|
-
batch_size: parseInt(process.env.RAG_BATCH_SIZE ||
|
|
10
|
+
// Core settings
|
|
11
|
+
chunk_size: parseInt(process.env.RAG_CHUNK_SIZE || '250', 10),
|
|
12
|
+
chunk_overlap: parseInt(process.env.RAG_CHUNK_OVERLAP || '50', 10),
|
|
13
|
+
batch_size: parseInt(process.env.RAG_BATCH_SIZE || '16', 10),
|
|
46
14
|
top_k: parseInt(process.env.RAG_TOP_K || '10', 10),
|
|
47
15
|
db_file: process.env.RAG_DB_FILE || 'db.sqlite',
|
|
48
16
|
index_file: process.env.RAG_INDEX_FILE || 'vector-index.bin',
|
|
49
|
-
rerank_enabled: process.env.RAG_RERANK_ENABLED === 'true',
|
|
50
|
-
preprocessing: defaultPreprocessingConfig,
|
|
51
17
|
model_cache_path: process.env.RAG_MODEL_CACHE_PATH || getDefaultModelCachePath(),
|
|
52
|
-
path_storage_strategy: process.env.RAG_PATH_STORAGE_STRATEGY || 'relative'
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
DATABASE_ERROR: 5,
|
|
64
|
-
MODEL_ERROR: 6,
|
|
65
|
-
INDEX_ERROR: 7,
|
|
66
|
-
PERMISSION_ERROR: 8
|
|
67
|
-
};
|
|
68
|
-
/**
|
|
69
|
-
* Configuration validation error with specific exit code
|
|
70
|
-
*/
|
|
71
|
-
export class ConfigurationError extends Error {
|
|
72
|
-
exitCode;
|
|
73
|
-
constructor(message, exitCode = EXIT_CODES.CONFIGURATION_ERROR) {
|
|
74
|
-
super(message);
|
|
75
|
-
this.exitCode = exitCode;
|
|
76
|
-
this.name = 'ConfigurationError';
|
|
77
|
-
}
|
|
78
|
-
}
|
|
79
|
-
/**
|
|
80
|
-
* Validates preprocessing configuration
|
|
81
|
-
* @param config - Preprocessing configuration to validate
|
|
82
|
-
* @throws {ConfigurationError} If preprocessing configuration is invalid
|
|
83
|
-
*/
|
|
84
|
-
export function validatePreprocessingConfig(config) {
|
|
85
|
-
if (!config || typeof config !== 'object') {
|
|
86
|
-
throw new ConfigurationError('Preprocessing configuration must be an object');
|
|
87
|
-
}
|
|
88
|
-
// Validate mode
|
|
89
|
-
const validModes = ['strict', 'balanced', 'rich'];
|
|
90
|
-
if (!config.mode || !validModes.includes(config.mode)) {
|
|
91
|
-
throw new ConfigurationError(`Configuration error: preprocessing.mode must be one of: ${validModes.join(', ')}.\n` +
|
|
92
|
-
`Current value: ${JSON.stringify(config.mode)}\n` +
|
|
93
|
-
`Please set it to 'strict', 'balanced', or 'rich'.`);
|
|
18
|
+
path_storage_strategy: process.env.RAG_PATH_STORAGE_STRATEGY || 'relative',
|
|
19
|
+
// Text-specific settings
|
|
20
|
+
embedding_model: process.env.RAG_EMBEDDING_MODEL || 'sentence-transformers/all-MiniLM-L6-v2',
|
|
21
|
+
rerank_enabled: process.env.RAG_RERANK_ENABLED !== 'false', // Default to true unless explicitly disabled
|
|
22
|
+
rerank_model: process.env.RAG_RERANK_MODEL || 'cross-encoder/ms-marco-MiniLM-L-6-v2',
|
|
23
|
+
// Preprocessing settings
|
|
24
|
+
preprocessing: {
|
|
25
|
+
enabled: process.env.RAG_PREPROCESSING_ENABLED !== 'false',
|
|
26
|
+
mdx: process.env.RAG_PREPROCESSING_MDX !== 'false',
|
|
27
|
+
mermaid: process.env.RAG_PREPROCESSING_MERMAID !== 'false',
|
|
28
|
+
code_blocks: process.env.RAG_PREPROCESSING_CODE_BLOCKS !== 'false'
|
|
94
29
|
}
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
throw new ConfigurationError(`Configuration error: preprocessing.overrides must be an object.\n` +
|
|
99
|
-
`Current value: ${JSON.stringify(config.overrides)}`);
|
|
100
|
-
}
|
|
101
|
-
// Validate MDX override
|
|
102
|
-
if (config.overrides.mdx !== undefined) {
|
|
103
|
-
const validMdxOptions = ['strip', 'keep', 'placeholder'];
|
|
104
|
-
if (!validMdxOptions.includes(config.overrides.mdx)) {
|
|
105
|
-
throw new ConfigurationError(`Configuration error: preprocessing.overrides.mdx must be one of: ${validMdxOptions.join(', ')}.\n` +
|
|
106
|
-
`Current value: ${JSON.stringify(config.overrides.mdx)}`);
|
|
107
|
-
}
|
|
108
|
-
}
|
|
109
|
-
// Validate Mermaid override
|
|
110
|
-
if (config.overrides.mermaid !== undefined) {
|
|
111
|
-
const validMermaidOptions = ['strip', 'extract', 'placeholder'];
|
|
112
|
-
if (!validMermaidOptions.includes(config.overrides.mermaid)) {
|
|
113
|
-
throw new ConfigurationError(`Configuration error: preprocessing.overrides.mermaid must be one of: ${validMermaidOptions.join(', ')}.\n` +
|
|
114
|
-
`Current value: ${JSON.stringify(config.overrides.mermaid)}`);
|
|
115
|
-
}
|
|
116
|
-
}
|
|
117
|
-
// Validate code override
|
|
118
|
-
if (config.overrides.code !== undefined) {
|
|
119
|
-
const validCodeOptions = ['strip', 'keep', 'placeholder'];
|
|
120
|
-
if (!validCodeOptions.includes(config.overrides.code)) {
|
|
121
|
-
throw new ConfigurationError(`Configuration error: preprocessing.overrides.code must be one of: ${validCodeOptions.join(', ')}.\n` +
|
|
122
|
-
`Current value: ${JSON.stringify(config.overrides.code)}`);
|
|
123
|
-
}
|
|
124
|
-
}
|
|
125
|
-
}
|
|
126
|
-
}
|
|
127
|
-
/**
|
|
128
|
-
* Merges preprocessing mode with overrides to create final configuration
|
|
129
|
-
* @param config - Base preprocessing configuration
|
|
130
|
-
* @returns Resolved preprocessing options for each content type
|
|
131
|
-
*/
|
|
132
|
-
export function mergePreprocessingConfig(config) {
|
|
133
|
-
// Define mode defaults
|
|
134
|
-
const modeDefaults = {
|
|
135
|
-
strict: {
|
|
136
|
-
mdx: 'strip',
|
|
137
|
-
mermaid: 'strip',
|
|
138
|
-
code: 'strip'
|
|
139
|
-
},
|
|
140
|
-
balanced: {
|
|
141
|
-
mdx: 'placeholder',
|
|
142
|
-
mermaid: 'placeholder',
|
|
143
|
-
code: 'keep'
|
|
144
|
-
},
|
|
145
|
-
rich: {
|
|
146
|
-
mdx: 'keep',
|
|
147
|
-
mermaid: 'extract',
|
|
148
|
-
code: 'keep'
|
|
149
|
-
}
|
|
150
|
-
};
|
|
151
|
-
// Start with mode defaults
|
|
152
|
-
const result = { ...modeDefaults[config.mode] };
|
|
153
|
-
// Apply overrides (shallow override only)
|
|
154
|
-
if (config.overrides) {
|
|
155
|
-
if (config.overrides.mdx !== undefined) {
|
|
156
|
-
result.mdx = config.overrides.mdx;
|
|
157
|
-
}
|
|
158
|
-
if (config.overrides.mermaid !== undefined) {
|
|
159
|
-
result.mermaid = config.overrides.mermaid;
|
|
160
|
-
}
|
|
161
|
-
if (config.overrides.code !== undefined) {
|
|
162
|
-
result.code = config.overrides.code;
|
|
163
|
-
}
|
|
164
|
-
}
|
|
165
|
-
return result;
|
|
166
|
-
}
|
|
30
|
+
};
|
|
31
|
+
// Re-export everything from core config
|
|
32
|
+
export * from './core/config.js';
|
|
167
33
|
/**
|
|
168
|
-
*
|
|
169
|
-
* @param config - Configuration object to validate
|
|
170
|
-
* @throws {ConfigurationError} If configuration is invalid
|
|
34
|
+
* Validate complete configuration including text-specific settings
|
|
171
35
|
*/
|
|
172
36
|
export function validateConfig(config) {
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
//
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
if (!config[field] || typeof config[field] !== 'string') {
|
|
180
|
-
throw new ConfigurationError(`Configuration error: '${field}' must be a non-empty string.\n` +
|
|
181
|
-
`Current value: ${JSON.stringify(config[field])}\n` +
|
|
182
|
-
`Please check your configuration file.`);
|
|
183
|
-
}
|
|
184
|
-
}
|
|
185
|
-
// Validate path_storage_strategy
|
|
186
|
-
if (!['absolute', 'relative'].includes(config.path_storage_strategy)) {
|
|
187
|
-
throw new ConfigurationError(`Configuration error: 'path_storage_strategy' must be either 'absolute' or 'relative'.\n` +
|
|
188
|
-
`Current value: ${JSON.stringify(config.path_storage_strategy)}\n` +
|
|
189
|
-
`Please set it to 'absolute' or 'relative'.`);
|
|
37
|
+
// First validate core config
|
|
38
|
+
const { validateCoreConfig } = require('./core/config.js');
|
|
39
|
+
validateCoreConfig(config);
|
|
40
|
+
// Validate text-specific settings
|
|
41
|
+
if (!config.embedding_model || typeof config.embedding_model !== 'string') {
|
|
42
|
+
throw new Error('embedding_model must be a non-empty string');
|
|
190
43
|
}
|
|
191
|
-
// Check required numeric fields are positive
|
|
192
|
-
const requiredNumbers = ['chunk_size', 'chunk_overlap', 'batch_size', 'top_k'];
|
|
193
|
-
for (const field of requiredNumbers) {
|
|
194
|
-
if (typeof config[field] !== 'number' || config[field] <= 0) {
|
|
195
|
-
throw new ConfigurationError(`Configuration error: '${field}' must be a positive number.\n` +
|
|
196
|
-
`Current value: ${JSON.stringify(config[field])}\n` +
|
|
197
|
-
`Please ensure all numeric values are greater than 0.`);
|
|
198
|
-
}
|
|
199
|
-
}
|
|
200
|
-
// Check boolean fields
|
|
201
44
|
if (typeof config.rerank_enabled !== 'boolean') {
|
|
202
|
-
throw new
|
|
203
|
-
`Current value: ${JSON.stringify(config.rerank_enabled)}\n` +
|
|
204
|
-
`Please set it to either true or false.`);
|
|
205
|
-
}
|
|
206
|
-
// Validate preprocessing configuration
|
|
207
|
-
validatePreprocessingConfig(config.preprocessing);
|
|
208
|
-
// Validate optional model_cache_path field
|
|
209
|
-
if (config.model_cache_path !== undefined && (typeof config.model_cache_path !== 'string' || config.model_cache_path.trim() === '')) {
|
|
210
|
-
throw new ConfigurationError(`Configuration error: 'model_cache_path' must be a non-empty string when provided.\n` +
|
|
211
|
-
`Current value: ${JSON.stringify(config.model_cache_path)}\n` +
|
|
212
|
-
`Please provide a valid directory path or remove the field to use default caching.`);
|
|
213
|
-
}
|
|
214
|
-
// Validate chunk_overlap is less than chunk_size
|
|
215
|
-
if (config.chunk_overlap >= config.chunk_size) {
|
|
216
|
-
throw new ConfigurationError(`Configuration error: chunk_overlap (${config.chunk_overlap}) must be less than chunk_size (${config.chunk_size}).\n` +
|
|
217
|
-
`Recommended: Set chunk_overlap to about 20% of chunk_size (e.g., chunk_size: 250, chunk_overlap: 50).`);
|
|
45
|
+
throw new Error('rerank_enabled must be a boolean');
|
|
218
46
|
}
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
console.warn(`Warning: Large chunk_size (${config.chunk_size}) may impact performance. Recommended range: 200-400 tokens.`);
|
|
47
|
+
if (!config.rerank_model || typeof config.rerank_model !== 'string') {
|
|
48
|
+
throw new Error('rerank_model must be a non-empty string');
|
|
222
49
|
}
|
|
223
|
-
if (config.
|
|
224
|
-
|
|
50
|
+
if (!config.preprocessing || typeof config.preprocessing !== 'object') {
|
|
51
|
+
throw new Error('preprocessing must be an object');
|
|
225
52
|
}
|
|
226
53
|
}
|
|
227
54
|
/**
|
|
228
|
-
*
|
|
229
|
-
* Logs error and exits immediately with appropriate exit code
|
|
230
|
-
* @param error - Error object or message
|
|
231
|
-
* @param context - Context where the error occurred
|
|
232
|
-
* @param exitCode - Exit code to use (defaults to GENERAL_ERROR)
|
|
55
|
+
* Validate preprocessing configuration
|
|
233
56
|
*/
|
|
234
|
-
export function
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
case EXIT_CODES.CONFIGURATION_ERROR:
|
|
242
|
-
console.error('\nPlease check your configuration and try again.');
|
|
243
|
-
break;
|
|
244
|
-
case EXIT_CODES.DATABASE_ERROR:
|
|
245
|
-
console.error('\nTry running "raglite rebuild" to fix database issues.');
|
|
246
|
-
break;
|
|
247
|
-
case EXIT_CODES.MODEL_ERROR:
|
|
248
|
-
console.error('\nEnsure you have internet connection for model download and sufficient disk space.');
|
|
249
|
-
break;
|
|
250
|
-
case EXIT_CODES.INDEX_ERROR:
|
|
251
|
-
console.error('\nTry running "raglite rebuild" to recreate the vector index.');
|
|
252
|
-
break;
|
|
253
|
-
case EXIT_CODES.FILE_NOT_FOUND:
|
|
254
|
-
console.error('\nPlease check that the specified files or directories exist and are accessible.');
|
|
255
|
-
break;
|
|
256
|
-
case EXIT_CODES.PERMISSION_ERROR:
|
|
257
|
-
console.error('\nPlease check file and directory permissions.');
|
|
258
|
-
break;
|
|
259
|
-
default:
|
|
260
|
-
console.error('\nIf this problem persists, please report it as a bug.');
|
|
261
|
-
}
|
|
262
|
-
process.exit(exitCode);
|
|
57
|
+
export function validatePreprocessingConfig(config) {
|
|
58
|
+
return config &&
|
|
59
|
+
typeof config === 'object' &&
|
|
60
|
+
typeof config.enabled === 'boolean' &&
|
|
61
|
+
typeof config.mdx === 'boolean' &&
|
|
62
|
+
typeof config.mermaid === 'boolean' &&
|
|
63
|
+
typeof config.code_blocks === 'boolean';
|
|
263
64
|
}
|
|
264
65
|
/**
|
|
265
|
-
*
|
|
266
|
-
* @param error - Error to log
|
|
267
|
-
* @param context - Context where error occurred
|
|
268
|
-
* @param skipError - Whether to skip this error and continue (default: false)
|
|
66
|
+
* Merge preprocessing configurations
|
|
269
67
|
*/
|
|
270
|
-
export function
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
}
|
|
68
|
+
export function mergePreprocessingConfig(base, override) {
|
|
69
|
+
return {
|
|
70
|
+
enabled: override.enabled !== undefined ? override.enabled : base.enabled,
|
|
71
|
+
mdx: override.mdx !== undefined ? override.mdx : base.mdx,
|
|
72
|
+
mermaid: override.mermaid !== undefined ? override.mermaid : base.mermaid,
|
|
73
|
+
code_blocks: override.code_blocks !== undefined ? override.code_blocks : base.code_blocks
|
|
74
|
+
};
|
|
278
75
|
}
|
|
279
|
-
// Validate the default config on module load
|
|
280
|
-
validateConfig(config);
|
|
281
76
|
//# sourceMappingURL=config.js.map
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CORE MODULE — Abstract Base Embedder
|
|
3
|
+
*
|
|
4
|
+
* Provides model-agnostic base functionality for all embedder implementations.
|
|
5
|
+
* This is an abstract base class, not a concrete implementation.
|
|
6
|
+
*
|
|
7
|
+
* ARCHITECTURAL NOTE:
|
|
8
|
+
* While this contains implementation logic, it remains in the core layer because:
|
|
9
|
+
* 1. It's model-agnostic (no knowledge of specific models or transformers.js)
|
|
10
|
+
* 2. It's shared by multiple implementation layers (text, multimodal)
|
|
11
|
+
* 3. It provides common infrastructure (lifecycle, validation, batch processing)
|
|
12
|
+
* 4. Moving it would create awkward cross-layer dependencies
|
|
13
|
+
*
|
|
14
|
+
* This follows the "shared base class" pattern common in framework design,
|
|
15
|
+
* similar to React.Component, Django Model, or other framework base classes.
|
|
16
|
+
*
|
|
17
|
+
* RESPONSIBILITIES:
|
|
18
|
+
* - Model lifecycle management (loading, cleanup, disposal)
|
|
19
|
+
* - Batch processing coordination
|
|
20
|
+
* - Input validation and text truncation
|
|
21
|
+
* - Error handling with helpful messages
|
|
22
|
+
* - Embedding ID generation
|
|
23
|
+
* - Common utility methods
|
|
24
|
+
*
|
|
25
|
+
* IMPLEMENTATION LAYERS:
|
|
26
|
+
* - Text: SentenceTransformerEmbedder extends this class
|
|
27
|
+
* - Multimodal: CLIPEmbedder extends this class
|
|
28
|
+
*/
|
|
29
|
+
import type { UniversalEmbedder, ModelInfo, ModelType, EmbeddingBatchItem } from './universal-embedder.js';
|
|
30
|
+
import type { EmbeddingResult } from '../types.js';
|
|
31
|
+
/**
|
|
32
|
+
* Abstract base class for universal embedders
|
|
33
|
+
* Provides common functionality and lifecycle management
|
|
34
|
+
*/
|
|
35
|
+
export declare abstract class BaseUniversalEmbedder implements UniversalEmbedder {
|
|
36
|
+
readonly modelName: string;
|
|
37
|
+
protected readonly options: EmbedderOptions;
|
|
38
|
+
protected _isLoaded: boolean;
|
|
39
|
+
protected _modelInfo: ModelInfo;
|
|
40
|
+
constructor(modelName: string, options?: EmbedderOptions);
|
|
41
|
+
get modelType(): ModelType;
|
|
42
|
+
get dimensions(): number;
|
|
43
|
+
get supportedContentTypes(): readonly string[];
|
|
44
|
+
isLoaded(): boolean;
|
|
45
|
+
getModelInfo(): ModelInfo;
|
|
46
|
+
/**
|
|
47
|
+
* Load the model - must be implemented by subclasses
|
|
48
|
+
*/
|
|
49
|
+
abstract loadModel(): Promise<void>;
|
|
50
|
+
/**
|
|
51
|
+
* Embed text content - must be implemented by subclasses
|
|
52
|
+
*/
|
|
53
|
+
abstract embedText(text: string): Promise<EmbeddingResult>;
|
|
54
|
+
/**
|
|
55
|
+
* Clean up resources - must be implemented by subclasses
|
|
56
|
+
*/
|
|
57
|
+
abstract cleanup(): Promise<void>;
|
|
58
|
+
/**
|
|
59
|
+
* Dispose of all resources and prepare for garbage collection
|
|
60
|
+
* This method should be called when the embedder is no longer needed
|
|
61
|
+
*/
|
|
62
|
+
dispose(): Promise<void>;
|
|
63
|
+
/**
|
|
64
|
+
* Embed image content - optional, only implemented by multimodal embedders
|
|
65
|
+
*/
|
|
66
|
+
embedImage?(imagePath: string): Promise<EmbeddingResult>;
|
|
67
|
+
/**
|
|
68
|
+
* Batch embedding with default implementation
|
|
69
|
+
* Subclasses can override for more efficient batch processing
|
|
70
|
+
*/
|
|
71
|
+
embedBatch(items: EmbeddingBatchItem[]): Promise<EmbeddingResult[]>;
|
|
72
|
+
/**
|
|
73
|
+
* Process a single batch of items
|
|
74
|
+
* Can be overridden by subclasses for more efficient batch processing
|
|
75
|
+
*/
|
|
76
|
+
protected processBatch(batch: EmbeddingBatchItem[]): Promise<EmbeddingResult[]>;
|
|
77
|
+
/**
|
|
78
|
+
* Validate that the model is loaded before operations
|
|
79
|
+
*/
|
|
80
|
+
protected ensureLoaded(): void;
|
|
81
|
+
/**
|
|
82
|
+
* Generate a unique embedding ID
|
|
83
|
+
*/
|
|
84
|
+
protected generateEmbeddingId(content: string, contentType?: string): string;
|
|
85
|
+
/**
|
|
86
|
+
* Simple hash function for content identification
|
|
87
|
+
*/
|
|
88
|
+
private simpleHash;
|
|
89
|
+
/**
|
|
90
|
+
* Validate text length against model constraints
|
|
91
|
+
*/
|
|
92
|
+
protected validateTextLength(text: string): void;
|
|
93
|
+
/**
|
|
94
|
+
* Truncate text to model's maximum length
|
|
95
|
+
*/
|
|
96
|
+
protected truncateText(text: string): string;
|
|
97
|
+
/**
|
|
98
|
+
* Log model loading progress
|
|
99
|
+
*/
|
|
100
|
+
protected logModelLoading(stage: string, details?: string): void;
|
|
101
|
+
/**
|
|
102
|
+
* Handle model loading errors with helpful messages
|
|
103
|
+
*/
|
|
104
|
+
protected handleLoadingError(error: Error): Error;
|
|
105
|
+
}
|
|
106
|
+
/**
|
|
107
|
+
* Options for configuring embedder instances
|
|
108
|
+
*/
|
|
109
|
+
export interface EmbedderOptions {
|
|
110
|
+
cachePath?: string;
|
|
111
|
+
maxBatchSize?: number;
|
|
112
|
+
timeout?: number;
|
|
113
|
+
enableGPU?: boolean;
|
|
114
|
+
customConfig?: Record<string, any>;
|
|
115
|
+
logLevel?: 'debug' | 'info' | 'warn' | 'error' | 'silent';
|
|
116
|
+
}
|
|
117
|
+
/**
|
|
118
|
+
* Create embedder options with defaults
|
|
119
|
+
*/
|
|
120
|
+
export declare function createEmbedderOptions(options?: Partial<EmbedderOptions>): EmbedderOptions;
|
|
121
|
+
/**
|
|
122
|
+
* Validate embedder options
|
|
123
|
+
*/
|
|
124
|
+
export declare function validateEmbedderOptions(options: EmbedderOptions): void;
|
|
125
|
+
//# sourceMappingURL=abstract-embedder.d.ts.map
|