rag-lite-ts 1.0.1 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204)
  1. package/README.md +651 -109
  2. package/dist/cli/indexer.js +262 -46
  3. package/dist/cli/search.js +54 -32
  4. package/dist/cli.js +185 -28
  5. package/dist/config.d.ts +34 -73
  6. package/dist/config.js +50 -255
  7. package/dist/core/abstract-embedder.d.ts +125 -0
  8. package/dist/core/abstract-embedder.js +264 -0
  9. package/dist/core/actionable-error-messages.d.ts +60 -0
  10. package/dist/core/actionable-error-messages.js +397 -0
  11. package/dist/core/adapters.d.ts +93 -0
  12. package/dist/core/adapters.js +139 -0
  13. package/dist/core/batch-processing-optimizer.d.ts +155 -0
  14. package/dist/core/batch-processing-optimizer.js +541 -0
  15. package/dist/core/chunker.d.ts +119 -0
  16. package/dist/core/chunker.js +73 -0
  17. package/dist/core/cli-database-utils.d.ts +53 -0
  18. package/dist/core/cli-database-utils.js +239 -0
  19. package/dist/core/config.d.ts +102 -0
  20. package/dist/core/config.js +247 -0
  21. package/dist/core/content-errors.d.ts +111 -0
  22. package/dist/core/content-errors.js +362 -0
  23. package/dist/core/content-manager.d.ts +343 -0
  24. package/dist/core/content-manager.js +1504 -0
  25. package/dist/core/content-performance-optimizer.d.ts +150 -0
  26. package/dist/core/content-performance-optimizer.js +516 -0
  27. package/dist/core/content-resolver.d.ts +104 -0
  28. package/dist/core/content-resolver.js +285 -0
  29. package/dist/core/cross-modal-search.d.ts +164 -0
  30. package/dist/core/cross-modal-search.js +342 -0
  31. package/dist/core/database-connection-manager.d.ts +109 -0
  32. package/dist/core/database-connection-manager.js +304 -0
  33. package/dist/core/db.d.ts +245 -0
  34. package/dist/core/db.js +952 -0
  35. package/dist/core/embedder-factory.d.ts +176 -0
  36. package/dist/core/embedder-factory.js +338 -0
  37. package/dist/{error-handler.d.ts → core/error-handler.d.ts} +23 -2
  38. package/dist/{error-handler.js → core/error-handler.js} +51 -8
  39. package/dist/core/index.d.ts +59 -0
  40. package/dist/core/index.js +69 -0
  41. package/dist/core/ingestion.d.ts +213 -0
  42. package/dist/core/ingestion.js +812 -0
  43. package/dist/core/interfaces.d.ts +408 -0
  44. package/dist/core/interfaces.js +106 -0
  45. package/dist/core/lazy-dependency-loader.d.ts +152 -0
  46. package/dist/core/lazy-dependency-loader.js +453 -0
  47. package/dist/core/mode-detection-service.d.ts +150 -0
  48. package/dist/core/mode-detection-service.js +565 -0
  49. package/dist/core/mode-model-validator.d.ts +92 -0
  50. package/dist/core/mode-model-validator.js +203 -0
  51. package/dist/core/model-registry.d.ts +120 -0
  52. package/dist/core/model-registry.js +415 -0
  53. package/dist/core/model-validator.d.ts +217 -0
  54. package/dist/core/model-validator.js +782 -0
  55. package/dist/{path-manager.d.ts → core/path-manager.d.ts} +5 -0
  56. package/dist/{path-manager.js → core/path-manager.js} +5 -0
  57. package/dist/core/polymorphic-search-factory.d.ts +154 -0
  58. package/dist/core/polymorphic-search-factory.js +344 -0
  59. package/dist/core/raglite-paths.d.ts +121 -0
  60. package/dist/core/raglite-paths.js +145 -0
  61. package/dist/core/reranking-config.d.ts +42 -0
  62. package/dist/core/reranking-config.js +156 -0
  63. package/dist/core/reranking-factory.d.ts +92 -0
  64. package/dist/core/reranking-factory.js +591 -0
  65. package/dist/core/reranking-strategies.d.ts +325 -0
  66. package/dist/core/reranking-strategies.js +720 -0
  67. package/dist/core/resource-cleanup.d.ts +163 -0
  68. package/dist/core/resource-cleanup.js +371 -0
  69. package/dist/core/resource-manager.d.ts +212 -0
  70. package/dist/core/resource-manager.js +564 -0
  71. package/dist/core/search-pipeline.d.ts +111 -0
  72. package/dist/core/search-pipeline.js +287 -0
  73. package/dist/core/search.d.ts +131 -0
  74. package/dist/core/search.js +296 -0
  75. package/dist/core/streaming-operations.d.ts +145 -0
  76. package/dist/core/streaming-operations.js +409 -0
  77. package/dist/core/types.d.ts +66 -0
  78. package/dist/core/types.js +6 -0
  79. package/dist/core/universal-embedder.d.ts +177 -0
  80. package/dist/core/universal-embedder.js +139 -0
  81. package/dist/core/validation-messages.d.ts +99 -0
  82. package/dist/core/validation-messages.js +334 -0
  83. package/dist/{vector-index.d.ts → core/vector-index.d.ts} +4 -0
  84. package/dist/{vector-index.js → core/vector-index.js} +21 -3
  85. package/dist/dom-polyfills.d.ts +6 -0
  86. package/dist/dom-polyfills.js +40 -0
  87. package/dist/factories/index.d.ts +43 -0
  88. package/dist/factories/index.js +44 -0
  89. package/dist/factories/text-factory.d.ts +560 -0
  90. package/dist/factories/text-factory.js +968 -0
  91. package/dist/file-processor.d.ts +90 -4
  92. package/dist/file-processor.js +723 -20
  93. package/dist/index-manager.d.ts +3 -2
  94. package/dist/index-manager.js +13 -11
  95. package/dist/index.d.ts +72 -8
  96. package/dist/index.js +102 -16
  97. package/dist/indexer.js +1 -1
  98. package/dist/ingestion.d.ts +44 -154
  99. package/dist/ingestion.js +75 -671
  100. package/dist/mcp-server.d.ts +35 -3
  101. package/dist/mcp-server.js +1186 -79
  102. package/dist/multimodal/clip-embedder.d.ts +314 -0
  103. package/dist/multimodal/clip-embedder.js +945 -0
  104. package/dist/multimodal/index.d.ts +6 -0
  105. package/dist/multimodal/index.js +6 -0
  106. package/dist/preprocess.js +1 -1
  107. package/dist/run-error-recovery-tests.d.ts +7 -0
  108. package/dist/run-error-recovery-tests.js +101 -0
  109. package/dist/search-standalone.js +1 -1
  110. package/dist/search.d.ts +51 -69
  111. package/dist/search.js +117 -412
  112. package/dist/test-utils.d.ts +8 -26
  113. package/dist/text/chunker.d.ts +33 -0
  114. package/dist/{chunker.js → text/chunker.js} +98 -75
  115. package/dist/{embedder.d.ts → text/embedder.d.ts} +22 -1
  116. package/dist/{embedder.js → text/embedder.js} +84 -10
  117. package/dist/text/index.d.ts +8 -0
  118. package/dist/text/index.js +9 -0
  119. package/dist/text/preprocessors/index.d.ts +17 -0
  120. package/dist/text/preprocessors/index.js +38 -0
  121. package/dist/text/preprocessors/mdx.d.ts +25 -0
  122. package/dist/text/preprocessors/mdx.js +101 -0
  123. package/dist/text/preprocessors/mermaid.d.ts +68 -0
  124. package/dist/text/preprocessors/mermaid.js +330 -0
  125. package/dist/text/preprocessors/registry.d.ts +56 -0
  126. package/dist/text/preprocessors/registry.js +180 -0
  127. package/dist/text/reranker.d.ts +59 -0
  128. package/dist/{reranker.js → text/reranker.js} +138 -53
  129. package/dist/text/sentence-transformer-embedder.d.ts +96 -0
  130. package/dist/text/sentence-transformer-embedder.js +340 -0
  131. package/dist/{tokenizer.d.ts → text/tokenizer.d.ts} +1 -0
  132. package/dist/{tokenizer.js → text/tokenizer.js} +7 -2
  133. package/dist/types.d.ts +40 -1
  134. package/dist/utils/vector-math.d.ts +31 -0
  135. package/dist/utils/vector-math.js +70 -0
  136. package/package.json +16 -4
  137. package/dist/api-errors.d.ts.map +0 -1
  138. package/dist/api-errors.js.map +0 -1
  139. package/dist/chunker.d.ts +0 -47
  140. package/dist/chunker.d.ts.map +0 -1
  141. package/dist/chunker.js.map +0 -1
  142. package/dist/cli/indexer.d.ts.map +0 -1
  143. package/dist/cli/indexer.js.map +0 -1
  144. package/dist/cli/search.d.ts.map +0 -1
  145. package/dist/cli/search.js.map +0 -1
  146. package/dist/cli.d.ts.map +0 -1
  147. package/dist/cli.js.map +0 -1
  148. package/dist/config.d.ts.map +0 -1
  149. package/dist/config.js.map +0 -1
  150. package/dist/db.d.ts +0 -90
  151. package/dist/db.d.ts.map +0 -1
  152. package/dist/db.js +0 -340
  153. package/dist/db.js.map +0 -1
  154. package/dist/embedder.d.ts.map +0 -1
  155. package/dist/embedder.js.map +0 -1
  156. package/dist/error-handler.d.ts.map +0 -1
  157. package/dist/error-handler.js.map +0 -1
  158. package/dist/file-processor.d.ts.map +0 -1
  159. package/dist/file-processor.js.map +0 -1
  160. package/dist/index-manager.d.ts.map +0 -1
  161. package/dist/index-manager.js.map +0 -1
  162. package/dist/index.d.ts.map +0 -1
  163. package/dist/index.js.map +0 -1
  164. package/dist/indexer.d.ts.map +0 -1
  165. package/dist/indexer.js.map +0 -1
  166. package/dist/ingestion.d.ts.map +0 -1
  167. package/dist/ingestion.js.map +0 -1
  168. package/dist/mcp-server.d.ts.map +0 -1
  169. package/dist/mcp-server.js.map +0 -1
  170. package/dist/path-manager.d.ts.map +0 -1
  171. package/dist/path-manager.js.map +0 -1
  172. package/dist/preprocess.d.ts.map +0 -1
  173. package/dist/preprocess.js.map +0 -1
  174. package/dist/preprocessors/index.d.ts.map +0 -1
  175. package/dist/preprocessors/index.js.map +0 -1
  176. package/dist/preprocessors/mdx.d.ts.map +0 -1
  177. package/dist/preprocessors/mdx.js.map +0 -1
  178. package/dist/preprocessors/mermaid.d.ts.map +0 -1
  179. package/dist/preprocessors/mermaid.js.map +0 -1
  180. package/dist/preprocessors/registry.d.ts.map +0 -1
  181. package/dist/preprocessors/registry.js.map +0 -1
  182. package/dist/reranker.d.ts +0 -40
  183. package/dist/reranker.d.ts.map +0 -1
  184. package/dist/reranker.js.map +0 -1
  185. package/dist/resource-manager-demo.d.ts +0 -7
  186. package/dist/resource-manager-demo.d.ts.map +0 -1
  187. package/dist/resource-manager-demo.js +0 -52
  188. package/dist/resource-manager-demo.js.map +0 -1
  189. package/dist/resource-manager.d.ts +0 -129
  190. package/dist/resource-manager.d.ts.map +0 -1
  191. package/dist/resource-manager.js +0 -389
  192. package/dist/resource-manager.js.map +0 -1
  193. package/dist/search-standalone.d.ts.map +0 -1
  194. package/dist/search-standalone.js.map +0 -1
  195. package/dist/search.d.ts.map +0 -1
  196. package/dist/search.js.map +0 -1
  197. package/dist/test-utils.d.ts.map +0 -1
  198. package/dist/test-utils.js.map +0 -1
  199. package/dist/tokenizer.d.ts.map +0 -1
  200. package/dist/tokenizer.js.map +0 -1
  201. package/dist/types.d.ts.map +0 -1
  202. package/dist/types.js.map +0 -1
  203. package/dist/vector-index.d.ts.map +0 -1
  204. package/dist/vector-index.js.map +0 -1
package/dist/search.js CHANGED
@@ -1,453 +1,158 @@
1
- import { initializeEmbeddingEngine } from './embedder.js';
2
- import { IndexManager } from './index-manager.js';
3
- import { getChunksByEmbeddingIds, openDatabase, getStoredModelInfo } from './db.js';
4
- import { CrossEncoderReranker } from './reranker.js';
5
- import { config } from './config.js';
6
- import { join, resolve } from 'path';
7
- import { existsSync } from 'fs';
8
1
  /**
9
- * User-friendly error class with actionable suggestions
10
- */
11
- export class SearchError extends Error {
12
- code;
13
- suggestions;
14
- constructor(message, code, suggestions) {
15
- super(message);
16
- this.code = code;
17
- this.suggestions = suggestions;
18
- this.name = 'SearchError';
19
- }
20
- }
21
- /**
22
- * Resolves paths for the search engine based on provided paths or defaults
23
- * @param indexPath - Path to vector index file (optional)
24
- * @param dbPath - Path to database file (optional)
25
- * @returns Resolved paths for index and database files
26
- */
27
- function resolveSearchPaths(indexPath, dbPath) {
28
- const currentDir = process.cwd();
29
- return {
30
- indexPath: indexPath ? resolve(indexPath) : join(currentDir, 'vector-index.bin'),
31
- dbPath: dbPath ? resolve(dbPath) : join(currentDir, 'db.sqlite')
32
- };
33
- }
34
- /**
35
- * Search engine that provides semantic search capabilities
36
- * Implements the core search pipeline: query embedding → vector search → metadata retrieval → optional reranking
37
- * Supports concurrent read operations for multiple simultaneous queries
2
+ * Public API SearchEngine - Simple constructor interface with internal factory usage
3
+ *
4
+ * This class provides a clean, simple API while using the new core architecture
5
+ * internally. It handles dependency injection automatically.
6
+ *
7
+ * @example
8
+ * ```typescript
9
+ * // Simple usage
10
+ * const search = new SearchEngine('./index.bin', './db.sqlite');
11
+ * const results = await search.search('query');
12
+ *
13
+ * // With options
14
+ * const search = new SearchEngine('./index.bin', './db.sqlite', {
15
+ * embeddingModel: 'all-MiniLM-L6-v2',
16
+ * enableReranking: true
17
+ * });
18
+ * ```
38
19
  */
20
+ import { SearchEngine as CoreSearchEngine } from './core/search.js';
21
+ import { TextSearchFactory } from './factories/index.js';
39
22
  export class SearchEngine {
40
- // Static properties for automatic resource management (Requirement 5.1, 5.2)
41
- static instances = new Set();
42
- static cleanupHandlersSet = false;
43
- embedder = null;
44
- indexManager = null;
45
- dbConnection = null;
46
- reranker = null;
47
- isInitialized = false;
48
23
  indexPath;
49
24
  dbPath;
50
- enableReranking = false;
51
- /**
52
- * Creates a new SearchEngine with simplified constructor
53
- * Search engine is ready to use immediately without requiring initialization calls (Requirement 3.5)
54
- * @param indexPath - Path to vector index file (defaults to './vector-index.bin')
55
- * @param dbPath - Path to database file (defaults to './db.sqlite')
56
- */
57
- constructor(indexPath, dbPath) {
58
- // Validate parameters
59
- if (indexPath !== undefined && (typeof indexPath !== 'string' || indexPath.trim() === '')) {
60
- throw new Error('indexPath must be a non-empty string when provided');
61
- }
62
- if (dbPath !== undefined && (typeof dbPath !== 'string' || dbPath.trim() === '')) {
63
- throw new Error('dbPath must be a non-empty string when provided');
64
- }
65
- // Resolve paths automatically
66
- const pathConfig = resolveSearchPaths(indexPath, dbPath);
67
- this.indexPath = pathConfig.indexPath;
68
- this.dbPath = pathConfig.dbPath;
69
- // Set up automatic cleanup on process exit (Requirement 5.5)
70
- this.setupAutomaticCleanup();
71
- }
72
- /**
73
- * Legacy constructor for backward compatibility
74
- * @deprecated Use the simple constructor new SearchEngine(indexPath?, dbPath?) instead
75
- */
76
- static createWithComponents(embedder, indexManager, dbConnection, enableReranking = false) {
77
- const engine = new SearchEngine();
78
- engine.embedder = embedder;
79
- engine.indexManager = indexManager;
80
- engine.dbConnection = dbConnection;
81
- engine.enableReranking = enableReranking;
82
- // Initialize reranker if enabled
83
- if (enableReranking) {
84
- engine.reranker = new CrossEncoderReranker();
25
+ options;
26
+ coreEngine = null;
27
+ initPromise = null;
28
+ constructor(indexPath, dbPath, options = {}) {
29
+ this.indexPath = indexPath;
30
+ this.dbPath = dbPath;
31
+ this.options = options;
32
+ // Validate required parameters
33
+ if (!indexPath || typeof indexPath !== 'string' || indexPath.trim() === '') {
34
+ throw new Error('Both indexPath and dbPath are required.\n' +
35
+ 'Example: const search = new SearchEngine("./index.bin", "./db.sqlite");\n' +
36
+ 'Or use: const search = await SearchFactory.create("./index.bin", "./db.sqlite");');
37
+ }
38
+ if (!dbPath || typeof dbPath !== 'string' || dbPath.trim() === '') {
39
+ throw new Error('Both indexPath and dbPath are required.\n' +
40
+ 'Example: const search = new SearchEngine("./index.bin", "./db.sqlite");\n' +
41
+ 'Or use: const search = await SearchFactory.create("./index.bin", "./db.sqlite");');
85
42
  }
86
- engine.isInitialized = true;
87
- return engine;
88
43
  }
89
44
  /**
90
- * Automatically initialize resources on first use with user-friendly error handling
91
- * Implements lazy initialization as required by Requirements 3.5, 4.3, 5.1, 5.2
45
+ * Initialize the search engine using the factory or direct injection
92
46
  */
93
- async ensureInitialized() {
94
- if (this.isInitialized) {
95
- return;
96
- }
97
- // Check if required files exist first (before any initialization attempts)
98
- if (!existsSync(this.dbPath)) {
99
- throw this.createUserFriendlyError(new Error(`Database file not found: ${this.dbPath}`), 'missing_database');
100
- }
101
- if (!existsSync(this.indexPath)) {
102
- throw this.createUserFriendlyError(new Error(`Vector index file not found: ${this.indexPath}`), 'missing_index');
103
- }
104
- try {
105
- console.log('Initializing search engine...');
106
- // Initialize database connection
107
- console.log('Opening database connection...');
108
- this.dbConnection = await openDatabase(this.dbPath);
109
- // Read stored model info from database (Requirement 4.3)
110
- console.log('Reading stored model information...');
111
- const storedModelInfo = await getStoredModelInfo(this.dbConnection);
112
- if (!storedModelInfo) {
113
- throw this.createUserFriendlyError(new Error('No model information found in database'), 'missing_model_info');
114
- }
115
- // Initialize embedder with stored model info (Requirement 3.5)
116
- console.log(`Loading embedding model: ${storedModelInfo.modelName}...`);
117
- try {
118
- this.embedder = await initializeEmbeddingEngine(storedModelInfo.modelName);
119
- }
120
- catch (error) {
121
- throw this.createUserFriendlyError(error, 'model_loading');
122
- }
123
- // Initialize index manager with model compatibility validation
124
- console.log('Initializing index manager...');
125
- try {
126
- this.indexManager = new IndexManager(this.indexPath, this.dbPath, storedModelInfo.dimensions, storedModelInfo.modelName);
127
- await this.indexManager.initialize();
128
- }
129
- catch (error) {
130
- // Check if this is a model compatibility issue
131
- const errorMessage = error instanceof Error ? error.message : String(error);
132
- if (errorMessage.includes('mismatch') || errorMessage.includes('version') || errorMessage.includes('model')) {
133
- throw this.createUserFriendlyError(error, 'model_compatibility');
134
- }
135
- throw error;
136
- }
137
- // Load reranker model if enabled
138
- if (this.enableReranking) {
139
- this.reranker = new CrossEncoderReranker();
140
- console.log('Loading reranker model...');
141
- try {
142
- await this.reranker.loadModel();
47
+ async initialize() {
48
+ if (this.coreEngine) {
49
+ return; // Already initialized
50
+ }
51
+ if (this.initPromise) {
52
+ return this.initPromise; // Initialization in progress
53
+ }
54
+ this.initPromise = (async () => {
55
+ // If custom functions are provided, use direct dependency injection
56
+ if (this.options.embedFn || this.options.rerankFn) {
57
+ const { IndexManager } = await import('./index-manager.js');
58
+ const { openDatabase } = await import('./core/db.js');
59
+ const { createTextEmbedFunction } = await import('./text/embedder.js');
60
+ const { existsSync } = await import('fs');
61
+ // Validate files exist
62
+ if (!existsSync(this.indexPath)) {
63
+ throw new Error(`Vector index not found at: ${this.indexPath}`);
143
64
  }
144
- catch (error) {
145
- console.warn(`Reranker initialization failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
146
- console.warn('Continuing with vector search only (reranking disabled)');
147
- this.reranker = null; // Disable reranker for this session
65
+ if (!existsSync(this.dbPath)) {
66
+ throw new Error(`Database not found at: ${this.dbPath}`);
148
67
  }
149
- }
150
- this.isInitialized = true;
151
- const stats = await this.indexManager.getStats();
152
- console.log(`Search engine initialized with ${stats.totalVectors} chunks${this.reranker && this.reranker.isLoaded() ? ' and reranking enabled' : ''}`);
153
- }
154
- catch (error) {
155
- await this.cleanup();
156
- if (error instanceof SearchError) {
157
- throw error;
68
+ // Use custom embedFn or create default
69
+ const embedFn = this.options.embedFn || createTextEmbedFunction(this.options.embeddingModel);
70
+ // Get model defaults for dimensions
71
+ const { getModelDefaults, config } = await import('./core/config.js');
72
+ const modelDefaults = getModelDefaults(this.options.embeddingModel || config.embedding_model);
73
+ // Initialize dependencies
74
+ const db = await openDatabase(this.dbPath);
75
+ const indexManager = new IndexManager(this.indexPath, this.dbPath, modelDefaults.dimensions, this.options.embeddingModel);
76
+ await indexManager.initialize();
77
+ // Create ContentResolver for unified content system
78
+ const { ContentResolver } = await import('./core/content-resolver.js');
79
+ const contentResolver = new ContentResolver(db);
80
+ // Create core engine with dependency injection
81
+ this.coreEngine = new CoreSearchEngine(embedFn, indexManager, db, this.options.rerankFn, contentResolver);
158
82
  }
159
83
  else {
160
- throw this.createUserFriendlyError(error, 'initialization');
84
+ // Use factory for standard initialization
85
+ this.coreEngine = await TextSearchFactory.create(this.indexPath, this.dbPath, this.options);
161
86
  }
162
- }
87
+ })();
88
+ return this.initPromise;
163
89
  }
164
90
  /**
165
- * Create user-friendly error messages with actionable suggestions
166
- * Implements requirement 5.3: Clear, actionable error messages with specific next steps
91
+ * Perform semantic search
167
92
  */
168
- createUserFriendlyError(error, context) {
169
- const errorMessage = error instanceof Error ? error.message : String(error);
170
- // Handle common error scenarios with specific guidance
171
- if (context === 'missing_database') {
172
- return new SearchError(`Database file not found: ${this.dbPath}`, 'DATABASE_NOT_FOUND', [
173
- 'Run ingestion first to create the database: pipeline.ingestDirectory("./docs/")',
174
- 'Check that the database path is correct',
175
- 'Ensure the ingestion process completed successfully'
176
- ]);
93
+ async search(query, options) {
94
+ await this.initialize();
95
+ if (!this.coreEngine) {
96
+ throw new Error('SearchEngine failed to initialize');
177
97
  }
178
- if (context === 'missing_index') {
179
- return new SearchError(`Vector index file not found: ${this.indexPath}`, 'INDEX_NOT_FOUND', [
180
- 'Run ingestion first to create the index: pipeline.ingestDirectory("./docs/")',
181
- 'Check that the index path is correct',
182
- 'Ensure the ingestion process completed successfully'
183
- ]);
184
- }
185
- if (context === 'missing_model_info') {
186
- return new SearchError('No embedding model information found in database. The database may be from an older version or corrupted.', 'MODEL_INFO_NOT_FOUND', [
187
- 'Run ingestion again to store model information: pipeline.ingestDirectory("./docs/")',
188
- 'If the problem persists, delete the database and index files and run ingestion from scratch',
189
- 'Check that the database was created with a compatible version of the library'
190
- ]);
191
- }
192
- if (context === 'model_loading') {
193
- return new SearchError(`Failed to load embedding model: ${errorMessage}`, 'MODEL_LOADING_FAILED', [
194
- 'Check that the model name is correct and supported',
195
- 'Ensure you have internet connection for model download',
196
- 'Try running ingestion again with a supported model',
197
- 'Check the model configuration in your setup'
198
- ]);
199
- }
200
- if (context === 'model_compatibility' || (errorMessage.includes('model') && errorMessage.includes('mismatch'))) {
201
- return new SearchError(`Model compatibility issue detected: ${errorMessage}`, 'MODEL_COMPATIBILITY', [
202
- 'The stored model information doesn\'t match the current configuration',
203
- 'Run pipeline.rebuildIndex() to rebuild with the current model',
204
- 'Or ensure you\'re using the same model that was used during ingestion',
205
- 'Check that the index and database files are from the same ingestion run'
206
- ]);
207
- }
208
- if (errorMessage.includes('ENOENT') || errorMessage.includes('no such file')) {
209
- return new SearchError(`Required files not found: ${errorMessage}`, 'FILES_NOT_FOUND', [
210
- 'Run ingestion first to create the required files',
211
- 'Check that the file paths are correct',
212
- 'Ensure you have read permissions for the files'
213
- ]);
214
- }
215
- if (errorMessage.includes('EACCES') || errorMessage.includes('permission denied')) {
216
- return new SearchError(`Permission denied: ${errorMessage}`, 'PERMISSION_DENIED', [
217
- 'Check that you have read permissions for the database and index files',
218
- 'Ensure the files are not locked by another process',
219
- 'Try running with appropriate permissions'
220
- ]);
221
- }
222
- if (errorMessage.includes('database') || errorMessage.includes('sqlite')) {
223
- return new SearchError(`Database error: ${errorMessage}`, 'DATABASE_ERROR', [
224
- 'Check that the database file is not corrupted',
225
- 'Ensure no other processes are using the database',
226
- 'Try recreating the database by running ingestion again'
227
- ]);
228
- }
229
- // Generic error with basic suggestions
230
- return new SearchError(`Search engine ${context} failed: ${errorMessage}`, 'GENERAL_ERROR', [
231
- 'Check the error message above for specific details',
232
- 'Ensure all required files exist and are accessible',
233
- 'Try running ingestion first if you haven\'t already',
234
- 'Contact support if the issue persists'
235
- ]);
98
+ return this.coreEngine.search(query, options);
236
99
  }
237
100
  /**
238
- * Initialize the search engine (public method for backward compatibility)
239
- * Sets up database, index manager, and embedding engine
101
+ * Retrieve content by ID in the specified format
102
+ * @param contentId - Content ID to retrieve
103
+ * @param format - Format to return ('file' for CLI clients, 'base64' for MCP clients)
104
+ * @returns Promise that resolves to content in requested format
240
105
  */
241
- async initialize() {
242
- await this.ensureInitialized();
243
- }
244
- /**
245
- * Perform semantic search on the indexed documents (matches README API)
246
- * Automatically initializes resources on first use (Requirements 4.1, 4.2, 4.4, 4.5)
247
- * Supports concurrent read operations for multiple simultaneous queries
248
- * @param query - Search query string
249
- * @param options - Search options including top_k and rerank settings
250
- * @returns Promise resolving to array of search results
251
- */
252
- async search(query, options = {}) {
253
- // Automatic initialization on first use (Requirement 4.1, 4.2)
254
- await this.ensureInitialized();
255
- if (!query || query.trim().length === 0) {
256
- return [];
257
- }
258
- const startTime = performance.now();
259
- const topK = options.top_k || config.top_k || 10;
260
- const shouldRerank = options.rerank !== undefined ? options.rerank : config.rerank_enabled;
261
- try {
262
- // Ensure all components are initialized
263
- if (!this.embedder || !this.indexManager || !this.dbConnection) {
264
- throw new Error('Search engine components not properly initialized');
265
- }
266
- // Step 1: Build query embedding using same model as document chunks
267
- const embeddingStartTime = performance.now();
268
- const queryEmbedding = await this.embedder.embedSingle(query);
269
- const embeddingTime = performance.now() - embeddingStartTime;
270
- // Step 2: Search using IndexManager (which handles hash mapping properly)
271
- const searchStartTime = performance.now();
272
- let searchResult;
273
- try {
274
- searchResult = this.indexManager.search(queryEmbedding.vector, topK);
275
- }
276
- catch (error) {
277
- if (error instanceof Error && error.message.includes('No embedding ID found for hash')) {
278
- console.warn(`Hash mapping issue detected: ${error.message}`);
279
- console.warn('This may indicate index/database synchronization issues. Consider running: raglite rebuild');
280
- return [];
281
- }
282
- throw error;
283
- }
284
- const vectorSearchTime = performance.now() - searchStartTime;
285
- if (searchResult.embeddingIds.length === 0) {
286
- const totalTime = performance.now() - startTime;
287
- console.log(`No similar documents found (${totalTime.toFixed(2)}ms total)`);
288
- return [];
289
- }
290
- // Step 3: Retrieve chunks from database using embedding IDs
291
- const retrievalStartTime = performance.now();
292
- const chunks = await getChunksByEmbeddingIds(this.dbConnection, searchResult.embeddingIds);
293
- const retrievalTime = performance.now() - retrievalStartTime;
294
- // Step 4: Format results as JSON with text, score, and document metadata
295
- let results = this.formatSearchResults(chunks, searchResult.distances, searchResult.embeddingIds);
296
- // Step 5: Optional reranking with cross-encoder when enabled
297
- let rerankTime = 0;
298
- if (shouldRerank && this.reranker && this.reranker.isLoaded() && results.length > 1) {
299
- try {
300
- const rerankStartTime = performance.now();
301
- results = await this.reranker.rerank(query, results);
302
- rerankTime = performance.now() - rerankStartTime;
303
- }
304
- catch (error) {
305
- // Fallback to vector search results and log the error
306
- console.warn(`Reranking failed, using vector search results: ${error instanceof Error ? error.message : 'Unknown error'}`);
307
- }
308
- }
309
- const totalTime = performance.now() - startTime;
310
- // Measure latency without premature optimization - just log for monitoring
311
- console.log(`Search completed: ${results.length} results in ${totalTime.toFixed(2)}ms ` +
312
- `(embed: ${embeddingTime.toFixed(2)}ms, vector: ${vectorSearchTime.toFixed(2)}ms, ` +
313
- `retrieval: ${retrievalTime.toFixed(2)}ms${rerankTime > 0 ? `, rerank: ${rerankTime.toFixed(2)}ms` : ''})`);
314
- return results;
315
- }
316
- catch (error) {
317
- throw new Error(`Search failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
106
+ async getContent(contentId, format = 'file') {
107
+ await this.initialize();
108
+ if (!this.coreEngine) {
109
+ throw new Error('SearchEngine failed to initialize');
318
110
  }
111
+ return this.coreEngine.getContent(contentId, format);
319
112
  }
320
113
  /**
321
- * Format search results with proper structure
322
- * @param chunks - Database chunks with metadata
323
- * @param distances - Similarity distances from vector search
324
- * @param embeddingIds - Embedding IDs in search result order
325
- * @returns Formatted search results
114
+ * Retrieve multiple content items efficiently in batch
115
+ * @param contentIds - Array of content IDs to retrieve
116
+ * @param format - Format to return ('file' for CLI clients, 'base64' for MCP clients)
117
+ * @returns Promise that resolves to array of content in requested format
326
118
  */
327
- formatSearchResults(chunks, distances, embeddingIds) {
328
- const results = [];
329
- // Create a map for quick chunk lookup by embedding_id
330
- const chunkMap = new Map();
331
- chunks.forEach(chunk => {
332
- chunkMap.set(chunk.embedding_id, chunk);
333
- });
334
- // Build results in the order of search results
335
- for (let i = 0; i < embeddingIds.length; i++) {
336
- const embeddingId = embeddingIds[i];
337
- const chunk = chunkMap.get(embeddingId);
338
- if (chunk) {
339
- // Convert cosine distance to similarity score (1 - distance)
340
- // hnswlib-wasm returns cosine distance, we want similarity
341
- const score = Math.max(0, 1 - distances[i]);
342
- results.push({
343
- text: chunk.text,
344
- score: score,
345
- document: {
346
- id: chunk.document_id,
347
- source: chunk.document_source,
348
- title: chunk.document_title
349
- }
350
- });
351
- }
119
+ async getContentBatch(contentIds, format = 'file') {
120
+ await this.initialize();
121
+ if (!this.coreEngine) {
122
+ throw new Error('SearchEngine failed to initialize');
352
123
  }
353
- return results;
124
+ return this.coreEngine.getContentBatch(contentIds, format);
354
125
  }
355
126
  /**
356
- * Get search engine statistics
357
- * @returns Object with current search engine stats
127
+ * Retrieve content metadata for result enhancement
128
+ * @param contentId - Content ID to get metadata for
129
+ * @returns Promise that resolves to content metadata
358
130
  */
359
- async getStats() {
360
- if (!this.isInitialized) {
361
- return {
362
- totalChunks: 0,
363
- indexSize: 0,
364
- rerankingEnabled: false,
365
- isInitialized: false
366
- };
131
+ async getContentMetadata(contentId) {
132
+ await this.initialize();
133
+ if (!this.coreEngine) {
134
+ throw new Error('SearchEngine failed to initialize');
367
135
  }
368
- const indexStats = await this.indexManager.getStats();
369
- return {
370
- totalChunks: indexStats.totalVectors,
371
- indexSize: indexStats.totalVectors,
372
- rerankingEnabled: this.reranker !== null && this.reranker.isLoaded(),
373
- isInitialized: this.isInitialized
374
- };
136
+ return this.coreEngine.getContentMetadata(contentId);
375
137
  }
376
138
  /**
377
- * Set up automatic cleanup on process exit (Requirement 5.5)
139
+ * Verify that content exists and is accessible
140
+ * @param contentId - Content ID to verify
141
+ * @returns Promise that resolves to true if content exists, false otherwise
378
142
  */
379
- setupAutomaticCleanup() {
380
- // Track this instance for cleanup
381
- SearchEngine.instances.add(this);
382
- // Set up process exit handlers only once
383
- if (!SearchEngine.cleanupHandlersSet) {
384
- SearchEngine.cleanupHandlersSet = true;
385
- const cleanupAll = async () => {
386
- const instances = Array.from(SearchEngine.instances);
387
- await Promise.all(instances.map(instance => instance.cleanup()));
388
- };
389
- // Handle various exit scenarios
390
- process.on('exit', () => {
391
- // Synchronous cleanup for exit event
392
- for (const instance of SearchEngine.instances) {
393
- try {
394
- if (instance.dbConnection) {
395
- // Synchronous close for exit handler
396
- instance.dbConnection = null;
397
- }
398
- if (instance.indexManager) {
399
- instance.indexManager = null;
400
- }
401
- instance.embedder = null;
402
- instance.reranker = null;
403
- instance.isInitialized = false;
404
- }
405
- catch (error) {
406
- // Silent cleanup on exit
407
- }
408
- }
409
- });
410
- process.on('SIGINT', async () => {
411
- await cleanupAll();
412
- process.exit(0);
413
- });
414
- process.on('SIGTERM', async () => {
415
- await cleanupAll();
416
- process.exit(0);
417
- });
418
- process.on('uncaughtException', async (error) => {
419
- console.error('Uncaught exception:', error);
420
- await cleanupAll();
421
- process.exit(1);
422
- });
423
- process.on('unhandledRejection', async (reason) => {
424
- console.error('Unhandled rejection:', reason);
425
- await cleanupAll();
426
- process.exit(1);
427
- });
143
+ async verifyContentExists(contentId) {
144
+ await this.initialize();
145
+ if (!this.coreEngine) {
146
+ throw new Error('SearchEngine failed to initialize');
428
147
  }
148
+ return this.coreEngine.verifyContentExists(contentId);
429
149
  }
430
150
  /**
431
- * Clean up resources (Requirement 5.5)
151
+ * Clean up resources
432
152
  */
433
153
  async cleanup() {
434
- try {
435
- if (this.dbConnection) {
436
- await this.dbConnection.close();
437
- this.dbConnection = null;
438
- }
439
- if (this.indexManager) {
440
- await this.indexManager.close();
441
- this.indexManager = null;
442
- }
443
- this.embedder = null;
444
- this.reranker = null;
445
- this.isInitialized = false;
446
- // Remove from instances set
447
- SearchEngine.instances.delete(this);
448
- }
449
- catch (error) {
450
- console.error('Error during SearchEngine cleanup:', error instanceof Error ? error.message : String(error));
154
+ if (this.coreEngine) {
155
+ await this.coreEngine.cleanup();
451
156
  }
452
157
  }
453
158
  }
@@ -2,35 +2,17 @@
2
2
  * Test utilities for multi-model support
3
3
  * Provides common configurations and helpers for testing with different embedding models
4
4
  */
5
- export declare const TEST_MODELS: readonly [{
6
- readonly name: "sentence-transformers/all-MiniLM-L6-v2";
7
- readonly dimensions: 384;
8
- readonly chunkSize: 250;
9
- readonly batchSize: 16;
10
- }, {
11
- readonly name: "Xenova/all-mpnet-base-v2";
12
- readonly dimensions: 768;
13
- readonly chunkSize: 400;
14
- readonly batchSize: 8;
15
- }];
5
+ export interface TestModel {
6
+ name: string;
7
+ dimensions: number;
8
+ chunkSize: number;
9
+ batchSize: number;
10
+ }
11
+ export declare const TEST_MODELS: TestModel[];
16
12
  /**
17
13
  * Retrieve model configuration by name
18
14
  * @param modelName - The name of the model to retrieve
19
15
  * @returns Model configuration object or undefined if not found
20
16
  */
21
- export declare function getTestModel(modelName: string): {
22
- readonly name: "sentence-transformers/all-MiniLM-L6-v2";
23
- readonly dimensions: 384;
24
- readonly chunkSize: 250;
25
- readonly batchSize: 16;
26
- } | {
27
- readonly name: "Xenova/all-mpnet-base-v2";
28
- readonly dimensions: 768;
29
- readonly chunkSize: 400;
30
- readonly batchSize: 8;
31
- } | undefined;
32
- /**
33
- * Type for test model configuration
34
- */
35
- export type TestModel = typeof TEST_MODELS[number];
17
+ export declare function getTestModel(modelName: string): TestModel | undefined;
36
18
  //# sourceMappingURL=test-utils.d.ts.map
@@ -0,0 +1,33 @@
1
+ /**
2
+ * Text-specific chunking implementation
3
+ * Implements the ChunkingStrategy interface for text content
4
+ */
5
+ import '../dom-polyfills.js';
6
+ import { ChunkingStrategy, GenericDocument, GenericChunk, ChunkConfig } from '../core/chunker.js';
7
+ /**
8
+ * Document interface for text chunking
9
+ */
10
+ export interface Document {
11
+ source: string;
12
+ title: string;
13
+ content: string;
14
+ metadata?: Record<string, any>;
15
+ }
16
+ export interface Chunk {
17
+ text: string;
18
+ chunkIndex: number;
19
+ tokenCount: number;
20
+ }
21
+ /**
22
+ * Text chunking strategy implementation
23
+ */
24
+ export declare class TextChunkingStrategy implements ChunkingStrategy {
25
+ appliesTo(contentType: string): boolean;
26
+ chunk(document: GenericDocument, config: ChunkConfig): Promise<GenericChunk[]>;
27
+ }
28
+ /**
29
+ * Text document chunking function
30
+ * Converts between text-specific and generic interfaces
31
+ */
32
+ export declare function chunkDocument(document: Document, config?: ChunkConfig): Promise<Chunk[]>;
33
+ //# sourceMappingURL=chunker.d.ts.map