rag-lite-ts 1.0.1 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204) hide show
  1. package/README.md +651 -109
  2. package/dist/cli/indexer.js +262 -46
  3. package/dist/cli/search.js +54 -32
  4. package/dist/cli.js +185 -28
  5. package/dist/config.d.ts +34 -73
  6. package/dist/config.js +50 -255
  7. package/dist/core/abstract-embedder.d.ts +125 -0
  8. package/dist/core/abstract-embedder.js +264 -0
  9. package/dist/core/actionable-error-messages.d.ts +60 -0
  10. package/dist/core/actionable-error-messages.js +397 -0
  11. package/dist/core/adapters.d.ts +93 -0
  12. package/dist/core/adapters.js +139 -0
  13. package/dist/core/batch-processing-optimizer.d.ts +155 -0
  14. package/dist/core/batch-processing-optimizer.js +541 -0
  15. package/dist/core/chunker.d.ts +119 -0
  16. package/dist/core/chunker.js +73 -0
  17. package/dist/core/cli-database-utils.d.ts +53 -0
  18. package/dist/core/cli-database-utils.js +239 -0
  19. package/dist/core/config.d.ts +102 -0
  20. package/dist/core/config.js +247 -0
  21. package/dist/core/content-errors.d.ts +111 -0
  22. package/dist/core/content-errors.js +362 -0
  23. package/dist/core/content-manager.d.ts +343 -0
  24. package/dist/core/content-manager.js +1504 -0
  25. package/dist/core/content-performance-optimizer.d.ts +150 -0
  26. package/dist/core/content-performance-optimizer.js +516 -0
  27. package/dist/core/content-resolver.d.ts +104 -0
  28. package/dist/core/content-resolver.js +285 -0
  29. package/dist/core/cross-modal-search.d.ts +164 -0
  30. package/dist/core/cross-modal-search.js +342 -0
  31. package/dist/core/database-connection-manager.d.ts +109 -0
  32. package/dist/core/database-connection-manager.js +304 -0
  33. package/dist/core/db.d.ts +245 -0
  34. package/dist/core/db.js +952 -0
  35. package/dist/core/embedder-factory.d.ts +176 -0
  36. package/dist/core/embedder-factory.js +338 -0
  37. package/dist/{error-handler.d.ts → core/error-handler.d.ts} +23 -2
  38. package/dist/{error-handler.js → core/error-handler.js} +51 -8
  39. package/dist/core/index.d.ts +59 -0
  40. package/dist/core/index.js +69 -0
  41. package/dist/core/ingestion.d.ts +213 -0
  42. package/dist/core/ingestion.js +812 -0
  43. package/dist/core/interfaces.d.ts +408 -0
  44. package/dist/core/interfaces.js +106 -0
  45. package/dist/core/lazy-dependency-loader.d.ts +152 -0
  46. package/dist/core/lazy-dependency-loader.js +453 -0
  47. package/dist/core/mode-detection-service.d.ts +150 -0
  48. package/dist/core/mode-detection-service.js +565 -0
  49. package/dist/core/mode-model-validator.d.ts +92 -0
  50. package/dist/core/mode-model-validator.js +203 -0
  51. package/dist/core/model-registry.d.ts +120 -0
  52. package/dist/core/model-registry.js +415 -0
  53. package/dist/core/model-validator.d.ts +217 -0
  54. package/dist/core/model-validator.js +782 -0
  55. package/dist/{path-manager.d.ts → core/path-manager.d.ts} +5 -0
  56. package/dist/{path-manager.js → core/path-manager.js} +5 -0
  57. package/dist/core/polymorphic-search-factory.d.ts +154 -0
  58. package/dist/core/polymorphic-search-factory.js +344 -0
  59. package/dist/core/raglite-paths.d.ts +121 -0
  60. package/dist/core/raglite-paths.js +145 -0
  61. package/dist/core/reranking-config.d.ts +42 -0
  62. package/dist/core/reranking-config.js +156 -0
  63. package/dist/core/reranking-factory.d.ts +92 -0
  64. package/dist/core/reranking-factory.js +591 -0
  65. package/dist/core/reranking-strategies.d.ts +325 -0
  66. package/dist/core/reranking-strategies.js +720 -0
  67. package/dist/core/resource-cleanup.d.ts +163 -0
  68. package/dist/core/resource-cleanup.js +371 -0
  69. package/dist/core/resource-manager.d.ts +212 -0
  70. package/dist/core/resource-manager.js +564 -0
  71. package/dist/core/search-pipeline.d.ts +111 -0
  72. package/dist/core/search-pipeline.js +287 -0
  73. package/dist/core/search.d.ts +131 -0
  74. package/dist/core/search.js +296 -0
  75. package/dist/core/streaming-operations.d.ts +145 -0
  76. package/dist/core/streaming-operations.js +409 -0
  77. package/dist/core/types.d.ts +66 -0
  78. package/dist/core/types.js +6 -0
  79. package/dist/core/universal-embedder.d.ts +177 -0
  80. package/dist/core/universal-embedder.js +139 -0
  81. package/dist/core/validation-messages.d.ts +99 -0
  82. package/dist/core/validation-messages.js +334 -0
  83. package/dist/{vector-index.d.ts → core/vector-index.d.ts} +4 -0
  84. package/dist/{vector-index.js → core/vector-index.js} +21 -3
  85. package/dist/dom-polyfills.d.ts +6 -0
  86. package/dist/dom-polyfills.js +40 -0
  87. package/dist/factories/index.d.ts +43 -0
  88. package/dist/factories/index.js +44 -0
  89. package/dist/factories/text-factory.d.ts +560 -0
  90. package/dist/factories/text-factory.js +968 -0
  91. package/dist/file-processor.d.ts +90 -4
  92. package/dist/file-processor.js +723 -20
  93. package/dist/index-manager.d.ts +3 -2
  94. package/dist/index-manager.js +13 -11
  95. package/dist/index.d.ts +72 -8
  96. package/dist/index.js +102 -16
  97. package/dist/indexer.js +1 -1
  98. package/dist/ingestion.d.ts +44 -154
  99. package/dist/ingestion.js +75 -671
  100. package/dist/mcp-server.d.ts +35 -3
  101. package/dist/mcp-server.js +1186 -79
  102. package/dist/multimodal/clip-embedder.d.ts +314 -0
  103. package/dist/multimodal/clip-embedder.js +945 -0
  104. package/dist/multimodal/index.d.ts +6 -0
  105. package/dist/multimodal/index.js +6 -0
  106. package/dist/preprocess.js +1 -1
  107. package/dist/run-error-recovery-tests.d.ts +7 -0
  108. package/dist/run-error-recovery-tests.js +101 -0
  109. package/dist/search-standalone.js +1 -1
  110. package/dist/search.d.ts +51 -69
  111. package/dist/search.js +117 -412
  112. package/dist/test-utils.d.ts +8 -26
  113. package/dist/text/chunker.d.ts +33 -0
  114. package/dist/{chunker.js → text/chunker.js} +98 -75
  115. package/dist/{embedder.d.ts → text/embedder.d.ts} +22 -1
  116. package/dist/{embedder.js → text/embedder.js} +84 -10
  117. package/dist/text/index.d.ts +8 -0
  118. package/dist/text/index.js +9 -0
  119. package/dist/text/preprocessors/index.d.ts +17 -0
  120. package/dist/text/preprocessors/index.js +38 -0
  121. package/dist/text/preprocessors/mdx.d.ts +25 -0
  122. package/dist/text/preprocessors/mdx.js +101 -0
  123. package/dist/text/preprocessors/mermaid.d.ts +68 -0
  124. package/dist/text/preprocessors/mermaid.js +330 -0
  125. package/dist/text/preprocessors/registry.d.ts +56 -0
  126. package/dist/text/preprocessors/registry.js +180 -0
  127. package/dist/text/reranker.d.ts +59 -0
  128. package/dist/{reranker.js → text/reranker.js} +138 -53
  129. package/dist/text/sentence-transformer-embedder.d.ts +96 -0
  130. package/dist/text/sentence-transformer-embedder.js +340 -0
  131. package/dist/{tokenizer.d.ts → text/tokenizer.d.ts} +1 -0
  132. package/dist/{tokenizer.js → text/tokenizer.js} +7 -2
  133. package/dist/types.d.ts +40 -1
  134. package/dist/utils/vector-math.d.ts +31 -0
  135. package/dist/utils/vector-math.js +70 -0
  136. package/package.json +16 -4
  137. package/dist/api-errors.d.ts.map +0 -1
  138. package/dist/api-errors.js.map +0 -1
  139. package/dist/chunker.d.ts +0 -47
  140. package/dist/chunker.d.ts.map +0 -1
  141. package/dist/chunker.js.map +0 -1
  142. package/dist/cli/indexer.d.ts.map +0 -1
  143. package/dist/cli/indexer.js.map +0 -1
  144. package/dist/cli/search.d.ts.map +0 -1
  145. package/dist/cli/search.js.map +0 -1
  146. package/dist/cli.d.ts.map +0 -1
  147. package/dist/cli.js.map +0 -1
  148. package/dist/config.d.ts.map +0 -1
  149. package/dist/config.js.map +0 -1
  150. package/dist/db.d.ts +0 -90
  151. package/dist/db.d.ts.map +0 -1
  152. package/dist/db.js +0 -340
  153. package/dist/db.js.map +0 -1
  154. package/dist/embedder.d.ts.map +0 -1
  155. package/dist/embedder.js.map +0 -1
  156. package/dist/error-handler.d.ts.map +0 -1
  157. package/dist/error-handler.js.map +0 -1
  158. package/dist/file-processor.d.ts.map +0 -1
  159. package/dist/file-processor.js.map +0 -1
  160. package/dist/index-manager.d.ts.map +0 -1
  161. package/dist/index-manager.js.map +0 -1
  162. package/dist/index.d.ts.map +0 -1
  163. package/dist/index.js.map +0 -1
  164. package/dist/indexer.d.ts.map +0 -1
  165. package/dist/indexer.js.map +0 -1
  166. package/dist/ingestion.d.ts.map +0 -1
  167. package/dist/ingestion.js.map +0 -1
  168. package/dist/mcp-server.d.ts.map +0 -1
  169. package/dist/mcp-server.js.map +0 -1
  170. package/dist/path-manager.d.ts.map +0 -1
  171. package/dist/path-manager.js.map +0 -1
  172. package/dist/preprocess.d.ts.map +0 -1
  173. package/dist/preprocess.js.map +0 -1
  174. package/dist/preprocessors/index.d.ts.map +0 -1
  175. package/dist/preprocessors/index.js.map +0 -1
  176. package/dist/preprocessors/mdx.d.ts.map +0 -1
  177. package/dist/preprocessors/mdx.js.map +0 -1
  178. package/dist/preprocessors/mermaid.d.ts.map +0 -1
  179. package/dist/preprocessors/mermaid.js.map +0 -1
  180. package/dist/preprocessors/registry.d.ts.map +0 -1
  181. package/dist/preprocessors/registry.js.map +0 -1
  182. package/dist/reranker.d.ts +0 -40
  183. package/dist/reranker.d.ts.map +0 -1
  184. package/dist/reranker.js.map +0 -1
  185. package/dist/resource-manager-demo.d.ts +0 -7
  186. package/dist/resource-manager-demo.d.ts.map +0 -1
  187. package/dist/resource-manager-demo.js +0 -52
  188. package/dist/resource-manager-demo.js.map +0 -1
  189. package/dist/resource-manager.d.ts +0 -129
  190. package/dist/resource-manager.d.ts.map +0 -1
  191. package/dist/resource-manager.js +0 -389
  192. package/dist/resource-manager.js.map +0 -1
  193. package/dist/search-standalone.d.ts.map +0 -1
  194. package/dist/search-standalone.js.map +0 -1
  195. package/dist/search.d.ts.map +0 -1
  196. package/dist/search.js.map +0 -1
  197. package/dist/test-utils.d.ts.map +0 -1
  198. package/dist/test-utils.js.map +0 -1
  199. package/dist/tokenizer.d.ts.map +0 -1
  200. package/dist/tokenizer.js.map +0 -1
  201. package/dist/types.d.ts.map +0 -1
  202. package/dist/types.js.map +0 -1
  203. package/dist/vector-index.d.ts.map +0 -1
  204. package/dist/vector-index.js.map +0 -1
package/dist/config.js CHANGED
@@ -1,281 +1,76 @@
1
- import { homedir } from 'os';
2
- import { join } from 'path';
3
1
  /**
4
- * Default preprocessing configuration - "balanced" mode
2
+ * Main configuration file with text-specific settings
3
+ * Extends core configuration with implementation-specific properties
5
4
  */
6
- export const defaultPreprocessingConfig = {
7
- mode: 'balanced'
8
- };
5
+ import { getDefaultModelCachePath } from './core/config.js';
9
6
  /**
10
- * Get the default model cache path as specified in the requirements
11
- * @returns Default cache path (~/.raglite/models/)
7
+ * Default configuration object with both core and text-specific settings
12
8
  */
13
- function getDefaultModelCachePath() {
14
- return join(homedir(), '.raglite', 'models');
15
- }
16
- /**
17
- * Returns model-specific default configuration values
18
- * @param modelName - The embedding model name
19
- * @returns Model-specific defaults for dimensions, chunk_size, chunk_overlap, and batch_size
20
- */
21
- export function getModelDefaults(modelName) {
22
- if (modelName === 'Xenova/all-mpnet-base-v2') {
23
- return {
24
- dimensions: 768,
25
- chunk_size: 400,
26
- chunk_overlap: 80,
27
- batch_size: 8
28
- };
29
- }
30
- // Default to sentence-transformers/all-MiniLM-L6-v2 settings (current defaults)
31
- return {
32
- dimensions: 384,
33
- chunk_size: 250,
34
- chunk_overlap: 50,
35
- batch_size: 16
36
- };
37
- }
38
- // Create config with model-specific defaults
39
- const embeddingModel = process.env.RAG_EMBEDDING_MODEL || 'sentence-transformers/all-MiniLM-L6-v2';
40
- const modelDefaults = getModelDefaults(embeddingModel);
41
9
  export const config = {
42
- embedding_model: embeddingModel,
43
- chunk_size: parseInt(process.env.RAG_CHUNK_SIZE || modelDefaults.chunk_size.toString(), 10),
44
- chunk_overlap: parseInt(process.env.RAG_CHUNK_OVERLAP || modelDefaults.chunk_overlap.toString(), 10),
45
- batch_size: parseInt(process.env.RAG_BATCH_SIZE || modelDefaults.batch_size.toString(), 10),
10
+ // Core settings
11
+ chunk_size: parseInt(process.env.RAG_CHUNK_SIZE || '250', 10),
12
+ chunk_overlap: parseInt(process.env.RAG_CHUNK_OVERLAP || '50', 10),
13
+ batch_size: parseInt(process.env.RAG_BATCH_SIZE || '16', 10),
46
14
  top_k: parseInt(process.env.RAG_TOP_K || '10', 10),
47
15
  db_file: process.env.RAG_DB_FILE || 'db.sqlite',
48
16
  index_file: process.env.RAG_INDEX_FILE || 'vector-index.bin',
49
- rerank_enabled: process.env.RAG_RERANK_ENABLED === 'true',
50
- preprocessing: defaultPreprocessingConfig,
51
17
  model_cache_path: process.env.RAG_MODEL_CACHE_PATH || getDefaultModelCachePath(),
52
- path_storage_strategy: process.env.RAG_PATH_STORAGE_STRATEGY || 'relative'
53
- };
54
- /**
55
- * Standard exit codes for different error conditions
56
- */
57
- export const EXIT_CODES = {
58
- SUCCESS: 0,
59
- GENERAL_ERROR: 1,
60
- INVALID_ARGUMENTS: 2,
61
- CONFIGURATION_ERROR: 3,
62
- FILE_NOT_FOUND: 4,
63
- DATABASE_ERROR: 5,
64
- MODEL_ERROR: 6,
65
- INDEX_ERROR: 7,
66
- PERMISSION_ERROR: 8
67
- };
68
- /**
69
- * Configuration validation error with specific exit code
70
- */
71
- export class ConfigurationError extends Error {
72
- exitCode;
73
- constructor(message, exitCode = EXIT_CODES.CONFIGURATION_ERROR) {
74
- super(message);
75
- this.exitCode = exitCode;
76
- this.name = 'ConfigurationError';
77
- }
78
- }
79
- /**
80
- * Validates preprocessing configuration
81
- * @param config - Preprocessing configuration to validate
82
- * @throws {ConfigurationError} If preprocessing configuration is invalid
83
- */
84
- export function validatePreprocessingConfig(config) {
85
- if (!config || typeof config !== 'object') {
86
- throw new ConfigurationError('Preprocessing configuration must be an object');
87
- }
88
- // Validate mode
89
- const validModes = ['strict', 'balanced', 'rich'];
90
- if (!config.mode || !validModes.includes(config.mode)) {
91
- throw new ConfigurationError(`Configuration error: preprocessing.mode must be one of: ${validModes.join(', ')}.\n` +
92
- `Current value: ${JSON.stringify(config.mode)}\n` +
93
- `Please set it to 'strict', 'balanced', or 'rich'.`);
18
+ path_storage_strategy: process.env.RAG_PATH_STORAGE_STRATEGY || 'relative',
19
+ // Text-specific settings
20
+ embedding_model: process.env.RAG_EMBEDDING_MODEL || 'sentence-transformers/all-MiniLM-L6-v2',
21
+ rerank_enabled: process.env.RAG_RERANK_ENABLED !== 'false', // Default to true unless explicitly disabled
22
+ rerank_model: process.env.RAG_RERANK_MODEL || 'cross-encoder/ms-marco-MiniLM-L-6-v2',
23
+ // Preprocessing settings
24
+ preprocessing: {
25
+ enabled: process.env.RAG_PREPROCESSING_ENABLED !== 'false',
26
+ mdx: process.env.RAG_PREPROCESSING_MDX !== 'false',
27
+ mermaid: process.env.RAG_PREPROCESSING_MERMAID !== 'false',
28
+ code_blocks: process.env.RAG_PREPROCESSING_CODE_BLOCKS !== 'false'
94
29
  }
95
- // Validate overrides if present
96
- if (config.overrides !== undefined) {
97
- if (typeof config.overrides !== 'object' || config.overrides === null) {
98
- throw new ConfigurationError(`Configuration error: preprocessing.overrides must be an object.\n` +
99
- `Current value: ${JSON.stringify(config.overrides)}`);
100
- }
101
- // Validate MDX override
102
- if (config.overrides.mdx !== undefined) {
103
- const validMdxOptions = ['strip', 'keep', 'placeholder'];
104
- if (!validMdxOptions.includes(config.overrides.mdx)) {
105
- throw new ConfigurationError(`Configuration error: preprocessing.overrides.mdx must be one of: ${validMdxOptions.join(', ')}.\n` +
106
- `Current value: ${JSON.stringify(config.overrides.mdx)}`);
107
- }
108
- }
109
- // Validate Mermaid override
110
- if (config.overrides.mermaid !== undefined) {
111
- const validMermaidOptions = ['strip', 'extract', 'placeholder'];
112
- if (!validMermaidOptions.includes(config.overrides.mermaid)) {
113
- throw new ConfigurationError(`Configuration error: preprocessing.overrides.mermaid must be one of: ${validMermaidOptions.join(', ')}.\n` +
114
- `Current value: ${JSON.stringify(config.overrides.mermaid)}`);
115
- }
116
- }
117
- // Validate code override
118
- if (config.overrides.code !== undefined) {
119
- const validCodeOptions = ['strip', 'keep', 'placeholder'];
120
- if (!validCodeOptions.includes(config.overrides.code)) {
121
- throw new ConfigurationError(`Configuration error: preprocessing.overrides.code must be one of: ${validCodeOptions.join(', ')}.\n` +
122
- `Current value: ${JSON.stringify(config.overrides.code)}`);
123
- }
124
- }
125
- }
126
- }
127
- /**
128
- * Merges preprocessing mode with overrides to create final configuration
129
- * @param config - Base preprocessing configuration
130
- * @returns Resolved preprocessing options for each content type
131
- */
132
- export function mergePreprocessingConfig(config) {
133
- // Define mode defaults
134
- const modeDefaults = {
135
- strict: {
136
- mdx: 'strip',
137
- mermaid: 'strip',
138
- code: 'strip'
139
- },
140
- balanced: {
141
- mdx: 'placeholder',
142
- mermaid: 'placeholder',
143
- code: 'keep'
144
- },
145
- rich: {
146
- mdx: 'keep',
147
- mermaid: 'extract',
148
- code: 'keep'
149
- }
150
- };
151
- // Start with mode defaults
152
- const result = { ...modeDefaults[config.mode] };
153
- // Apply overrides (shallow override only)
154
- if (config.overrides) {
155
- if (config.overrides.mdx !== undefined) {
156
- result.mdx = config.overrides.mdx;
157
- }
158
- if (config.overrides.mermaid !== undefined) {
159
- result.mermaid = config.overrides.mermaid;
160
- }
161
- if (config.overrides.code !== undefined) {
162
- result.code = config.overrides.code;
163
- }
164
- }
165
- return result;
166
- }
30
+ };
31
+ // Re-export everything from core config
32
+ export * from './core/config.js';
167
33
  /**
168
- * Validates the configuration object
169
- * @param config - Configuration object to validate
170
- * @throws {ConfigurationError} If configuration is invalid
34
+ * Validate complete configuration including text-specific settings
171
35
  */
172
36
  export function validateConfig(config) {
173
- if (!config || typeof config !== 'object') {
174
- throw new ConfigurationError('Configuration must be an object');
175
- }
176
- // Check required string fields
177
- const requiredStrings = ['embedding_model', 'db_file', 'index_file'];
178
- for (const field of requiredStrings) {
179
- if (!config[field] || typeof config[field] !== 'string') {
180
- throw new ConfigurationError(`Configuration error: '${field}' must be a non-empty string.\n` +
181
- `Current value: ${JSON.stringify(config[field])}\n` +
182
- `Please check your configuration file.`);
183
- }
184
- }
185
- // Validate path_storage_strategy
186
- if (!['absolute', 'relative'].includes(config.path_storage_strategy)) {
187
- throw new ConfigurationError(`Configuration error: 'path_storage_strategy' must be either 'absolute' or 'relative'.\n` +
188
- `Current value: ${JSON.stringify(config.path_storage_strategy)}\n` +
189
- `Please set it to 'absolute' or 'relative'.`);
37
+ // First validate core config
38
+ const { validateCoreConfig } = require('./core/config.js');
39
+ validateCoreConfig(config);
40
+ // Validate text-specific settings
41
+ if (!config.embedding_model || typeof config.embedding_model !== 'string') {
42
+ throw new Error('embedding_model must be a non-empty string');
190
43
  }
191
- // Check required numeric fields are positive
192
- const requiredNumbers = ['chunk_size', 'chunk_overlap', 'batch_size', 'top_k'];
193
- for (const field of requiredNumbers) {
194
- if (typeof config[field] !== 'number' || config[field] <= 0) {
195
- throw new ConfigurationError(`Configuration error: '${field}' must be a positive number.\n` +
196
- `Current value: ${JSON.stringify(config[field])}\n` +
197
- `Please ensure all numeric values are greater than 0.`);
198
- }
199
- }
200
- // Check boolean fields
201
44
  if (typeof config.rerank_enabled !== 'boolean') {
202
- throw new ConfigurationError(`Configuration error: 'rerank_enabled' must be a boolean (true or false).\n` +
203
- `Current value: ${JSON.stringify(config.rerank_enabled)}\n` +
204
- `Please set it to either true or false.`);
205
- }
206
- // Validate preprocessing configuration
207
- validatePreprocessingConfig(config.preprocessing);
208
- // Validate optional model_cache_path field
209
- if (config.model_cache_path !== undefined && (typeof config.model_cache_path !== 'string' || config.model_cache_path.trim() === '')) {
210
- throw new ConfigurationError(`Configuration error: 'model_cache_path' must be a non-empty string when provided.\n` +
211
- `Current value: ${JSON.stringify(config.model_cache_path)}\n` +
212
- `Please provide a valid directory path or remove the field to use default caching.`);
213
- }
214
- // Validate chunk_overlap is less than chunk_size
215
- if (config.chunk_overlap >= config.chunk_size) {
216
- throw new ConfigurationError(`Configuration error: chunk_overlap (${config.chunk_overlap}) must be less than chunk_size (${config.chunk_size}).\n` +
217
- `Recommended: Set chunk_overlap to about 20% of chunk_size (e.g., chunk_size: 250, chunk_overlap: 50).`);
45
+ throw new Error('rerank_enabled must be a boolean');
218
46
  }
219
- // Validate reasonable ranges for performance
220
- if (config.chunk_size > 1000) {
221
- console.warn(`Warning: Large chunk_size (${config.chunk_size}) may impact performance. Recommended range: 200-400 tokens.`);
47
+ if (!config.rerank_model || typeof config.rerank_model !== 'string') {
48
+ throw new Error('rerank_model must be a non-empty string');
222
49
  }
223
- if (config.batch_size > 64) {
224
- console.warn(`Warning: Large batch_size (${config.batch_size}) may cause memory issues. Recommended range: 8-32.`);
50
+ if (!config.preprocessing || typeof config.preprocessing !== 'object') {
51
+ throw new Error('preprocessing must be an object');
225
52
  }
226
53
  }
227
54
  /**
228
- * Utility function to handle unrecoverable errors with descriptive messages
229
- * Logs error and exits immediately with appropriate exit code
230
- * @param error - Error object or message
231
- * @param context - Context where the error occurred
232
- * @param exitCode - Exit code to use (defaults to GENERAL_ERROR)
55
+ * Validate preprocessing configuration
233
56
  */
234
- export function handleUnrecoverableError(error, context, exitCode = EXIT_CODES.GENERAL_ERROR) {
235
- const errorMessage = error instanceof Error ? error.message : String(error);
236
- console.error(`\nFatal Error in ${context}:`);
237
- console.error(errorMessage);
238
- console.error('\nThe system cannot continue and will exit immediately.');
239
- // Provide context-specific guidance
240
- switch (exitCode) {
241
- case EXIT_CODES.CONFIGURATION_ERROR:
242
- console.error('\nPlease check your configuration and try again.');
243
- break;
244
- case EXIT_CODES.DATABASE_ERROR:
245
- console.error('\nTry running "raglite rebuild" to fix database issues.');
246
- break;
247
- case EXIT_CODES.MODEL_ERROR:
248
- console.error('\nEnsure you have internet connection for model download and sufficient disk space.');
249
- break;
250
- case EXIT_CODES.INDEX_ERROR:
251
- console.error('\nTry running "raglite rebuild" to recreate the vector index.');
252
- break;
253
- case EXIT_CODES.FILE_NOT_FOUND:
254
- console.error('\nPlease check that the specified files or directories exist and are accessible.');
255
- break;
256
- case EXIT_CODES.PERMISSION_ERROR:
257
- console.error('\nPlease check file and directory permissions.');
258
- break;
259
- default:
260
- console.error('\nIf this problem persists, please report it as a bug.');
261
- }
262
- process.exit(exitCode);
57
+ export function validatePreprocessingConfig(config) {
58
+ return config &&
59
+ typeof config === 'object' &&
60
+ typeof config.enabled === 'boolean' &&
61
+ typeof config.mdx === 'boolean' &&
62
+ typeof config.mermaid === 'boolean' &&
63
+ typeof config.code_blocks === 'boolean';
263
64
  }
264
65
  /**
265
- * Utility function for safe error logging with context
266
- * @param error - Error to log
267
- * @param context - Context where error occurred
268
- * @param skipError - Whether to skip this error and continue (default: false)
66
+ * Merge preprocessing configurations
269
67
  */
270
- export function logError(error, context, skipError = false) {
271
- const errorMessage = error instanceof Error ? error.message : String(error);
272
- if (skipError) {
273
- console.error(`Warning in ${context}: ${errorMessage} (skipping and continuing)`);
274
- }
275
- else {
276
- console.error(`Error in ${context}: ${errorMessage}`);
277
- }
68
+ export function mergePreprocessingConfig(base, override) {
69
+ return {
70
+ enabled: override.enabled !== undefined ? override.enabled : base.enabled,
71
+ mdx: override.mdx !== undefined ? override.mdx : base.mdx,
72
+ mermaid: override.mermaid !== undefined ? override.mermaid : base.mermaid,
73
+ code_blocks: override.code_blocks !== undefined ? override.code_blocks : base.code_blocks
74
+ };
278
75
  }
279
- // Validate the default config on module load
280
- validateConfig(config);
281
76
  //# sourceMappingURL=config.js.map
@@ -0,0 +1,125 @@
1
+ /**
2
+ * CORE MODULE — Abstract Base Embedder
3
+ *
4
+ * Provides model-agnostic base functionality for all embedder implementations.
5
+ * This is an abstract base class, not a concrete implementation.
6
+ *
7
+ * ARCHITECTURAL NOTE:
8
+ * While this contains implementation logic, it remains in the core layer because:
9
+ * 1. It's model-agnostic (no knowledge of specific models or transformers.js)
10
+ * 2. It's shared by multiple implementation layers (text, multimodal)
11
+ * 3. It provides common infrastructure (lifecycle, validation, batch processing)
12
+ * 4. Moving it would create awkward cross-layer dependencies
13
+ *
14
+ * This follows the "shared base class" pattern common in framework design,
15
+ * similar to React.Component, Django Model, or other framework base classes.
16
+ *
17
+ * RESPONSIBILITIES:
18
+ * - Model lifecycle management (loading, cleanup, disposal)
19
+ * - Batch processing coordination
20
+ * - Input validation and text truncation
21
+ * - Error handling with helpful messages
22
+ * - Embedding ID generation
23
+ * - Common utility methods
24
+ *
25
+ * IMPLEMENTATION LAYERS:
26
+ * - Text: SentenceTransformerEmbedder extends this class
27
+ * - Multimodal: CLIPEmbedder extends this class
28
+ */
29
+ import type { UniversalEmbedder, ModelInfo, ModelType, EmbeddingBatchItem } from './universal-embedder.js';
30
+ import type { EmbeddingResult } from '../types.js';
31
+ /**
32
+ * Abstract base class for universal embedders
33
+ * Provides common functionality and lifecycle management
34
+ */
35
+ export declare abstract class BaseUniversalEmbedder implements UniversalEmbedder {
36
+ readonly modelName: string;
37
+ protected readonly options: EmbedderOptions;
38
+ protected _isLoaded: boolean;
39
+ protected _modelInfo: ModelInfo;
40
+ constructor(modelName: string, options?: EmbedderOptions);
41
+ get modelType(): ModelType;
42
+ get dimensions(): number;
43
+ get supportedContentTypes(): readonly string[];
44
+ isLoaded(): boolean;
45
+ getModelInfo(): ModelInfo;
46
+ /**
47
+ * Load the model - must be implemented by subclasses
48
+ */
49
+ abstract loadModel(): Promise<void>;
50
+ /**
51
+ * Embed text content - must be implemented by subclasses
52
+ */
53
+ abstract embedText(text: string): Promise<EmbeddingResult>;
54
+ /**
55
+ * Clean up resources - must be implemented by subclasses
56
+ */
57
+ abstract cleanup(): Promise<void>;
58
+ /**
59
+ * Dispose of all resources and prepare for garbage collection
60
+ * This method should be called when the embedder is no longer needed
61
+ */
62
+ dispose(): Promise<void>;
63
+ /**
64
+ * Embed image content - optional, only implemented by multimodal embedders
65
+ */
66
+ embedImage?(imagePath: string): Promise<EmbeddingResult>;
67
+ /**
68
+ * Batch embedding with default implementation
69
+ * Subclasses can override for more efficient batch processing
70
+ */
71
+ embedBatch(items: EmbeddingBatchItem[]): Promise<EmbeddingResult[]>;
72
+ /**
73
+ * Process a single batch of items
74
+ * Can be overridden by subclasses for more efficient batch processing
75
+ */
76
+ protected processBatch(batch: EmbeddingBatchItem[]): Promise<EmbeddingResult[]>;
77
+ /**
78
+ * Validate that the model is loaded before operations
79
+ */
80
+ protected ensureLoaded(): void;
81
+ /**
82
+ * Generate a unique embedding ID
83
+ */
84
+ protected generateEmbeddingId(content: string, contentType?: string): string;
85
+ /**
86
+ * Simple hash function for content identification
87
+ */
88
+ private simpleHash;
89
+ /**
90
+ * Validate text length against model constraints
91
+ */
92
+ protected validateTextLength(text: string): void;
93
+ /**
94
+ * Truncate text to model's maximum length
95
+ */
96
+ protected truncateText(text: string): string;
97
+ /**
98
+ * Log model loading progress
99
+ */
100
+ protected logModelLoading(stage: string, details?: string): void;
101
+ /**
102
+ * Handle model loading errors with helpful messages
103
+ */
104
+ protected handleLoadingError(error: Error): Error;
105
+ }
106
+ /**
107
+ * Options for configuring embedder instances
108
+ */
109
+ export interface EmbedderOptions {
110
+ cachePath?: string;
111
+ maxBatchSize?: number;
112
+ timeout?: number;
113
+ enableGPU?: boolean;
114
+ customConfig?: Record<string, any>;
115
+ logLevel?: 'debug' | 'info' | 'warn' | 'error' | 'silent';
116
+ }
117
+ /**
118
+ * Create embedder options with defaults
119
+ */
120
+ export declare function createEmbedderOptions(options?: Partial<EmbedderOptions>): EmbedderOptions;
121
+ /**
122
+ * Validate embedder options
123
+ */
124
+ export declare function validateEmbedderOptions(options: EmbedderOptions): void;
125
+ //# sourceMappingURL=abstract-embedder.d.ts.map