rag-lite-ts 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182)
  1. package/README.md +94 -65
  2. package/dist/cli/indexer.d.ts.map +1 -1
  3. package/dist/cli/indexer.js +78 -50
  4. package/dist/cli/indexer.js.map +1 -1
  5. package/dist/cli/search.d.ts.map +1 -1
  6. package/dist/cli/search.js +13 -30
  7. package/dist/cli/search.js.map +1 -1
  8. package/dist/cli.js +2 -2
  9. package/dist/cli.js.map +1 -1
  10. package/dist/config.d.ts +34 -73
  11. package/dist/config.d.ts.map +1 -1
  12. package/dist/config.js +50 -255
  13. package/dist/config.js.map +1 -1
  14. package/dist/core/adapters.d.ts +93 -0
  15. package/dist/core/adapters.d.ts.map +1 -0
  16. package/dist/core/adapters.js +139 -0
  17. package/dist/core/adapters.js.map +1 -0
  18. package/dist/core/chunker.d.ts +117 -0
  19. package/dist/core/chunker.d.ts.map +1 -0
  20. package/dist/core/chunker.js +73 -0
  21. package/dist/core/chunker.js.map +1 -0
  22. package/dist/core/config.d.ts +102 -0
  23. package/dist/core/config.d.ts.map +1 -0
  24. package/dist/core/config.js +240 -0
  25. package/dist/core/config.js.map +1 -0
  26. package/dist/{db.d.ts → core/db.d.ts} +25 -9
  27. package/dist/core/db.d.ts.map +1 -0
  28. package/dist/{db.js → core/db.js} +86 -16
  29. package/dist/core/db.js.map +1 -0
  30. package/dist/{error-handler.d.ts → core/error-handler.d.ts} +23 -2
  31. package/dist/core/error-handler.d.ts.map +1 -0
  32. package/dist/{error-handler.js → core/error-handler.js} +51 -8
  33. package/dist/core/error-handler.js.map +1 -0
  34. package/dist/core/index.d.ts +57 -0
  35. package/dist/core/index.d.ts.map +1 -0
  36. package/dist/core/index.js +66 -0
  37. package/dist/core/index.js.map +1 -0
  38. package/dist/core/ingestion.d.ts +143 -0
  39. package/dist/core/ingestion.d.ts.map +1 -0
  40. package/dist/core/ingestion.js +347 -0
  41. package/dist/core/ingestion.js.map +1 -0
  42. package/dist/core/interfaces.d.ts +408 -0
  43. package/dist/core/interfaces.d.ts.map +1 -0
  44. package/dist/core/interfaces.js +106 -0
  45. package/dist/core/interfaces.js.map +1 -0
  46. package/dist/{path-manager.d.ts → core/path-manager.d.ts} +5 -0
  47. package/dist/core/path-manager.d.ts.map +1 -0
  48. package/dist/{path-manager.js → core/path-manager.js} +5 -0
  49. package/dist/core/path-manager.js.map +1 -0
  50. package/dist/core/search-example.d.ts +25 -0
  51. package/dist/core/search-example.d.ts.map +1 -0
  52. package/dist/core/search-example.js +138 -0
  53. package/dist/core/search-example.js.map +1 -0
  54. package/dist/core/search-pipeline-example.d.ts +21 -0
  55. package/dist/core/search-pipeline-example.d.ts.map +1 -0
  56. package/dist/core/search-pipeline-example.js +188 -0
  57. package/dist/core/search-pipeline-example.js.map +1 -0
  58. package/dist/core/search-pipeline.d.ts +111 -0
  59. package/dist/core/search-pipeline.d.ts.map +1 -0
  60. package/dist/core/search-pipeline.js +287 -0
  61. package/dist/core/search-pipeline.js.map +1 -0
  62. package/dist/core/search.d.ts +104 -0
  63. package/dist/core/search.d.ts.map +1 -0
  64. package/dist/core/search.js +218 -0
  65. package/dist/core/search.js.map +1 -0
  66. package/dist/core/types.d.ts +63 -0
  67. package/dist/core/types.d.ts.map +1 -0
  68. package/dist/core/types.js +6 -0
  69. package/dist/core/types.js.map +1 -0
  70. package/dist/{vector-index.d.ts → core/vector-index.d.ts} +4 -0
  71. package/dist/core/vector-index.d.ts.map +1 -0
  72. package/dist/{vector-index.js → core/vector-index.js} +19 -0
  73. package/dist/core/vector-index.js.map +1 -0
  74. package/dist/dom-polyfills.d.ts +6 -0
  75. package/dist/dom-polyfills.d.ts.map +1 -0
  76. package/dist/dom-polyfills.js +40 -0
  77. package/dist/dom-polyfills.js.map +1 -0
  78. package/dist/examples/clean-api-examples.d.ts +44 -0
  79. package/dist/examples/clean-api-examples.d.ts.map +1 -0
  80. package/dist/examples/clean-api-examples.js +206 -0
  81. package/dist/examples/clean-api-examples.js.map +1 -0
  82. package/dist/factories/index.d.ts +43 -0
  83. package/dist/factories/index.d.ts.map +1 -0
  84. package/dist/factories/index.js +44 -0
  85. package/dist/factories/index.js.map +1 -0
  86. package/dist/factories/text-factory.d.ts +466 -0
  87. package/dist/factories/text-factory.d.ts.map +1 -0
  88. package/dist/factories/text-factory.js +719 -0
  89. package/dist/factories/text-factory.js.map +1 -0
  90. package/dist/file-processor.d.ts +2 -2
  91. package/dist/file-processor.d.ts.map +1 -1
  92. package/dist/file-processor.js +3 -3
  93. package/dist/file-processor.js.map +1 -1
  94. package/dist/index-manager.d.ts +3 -2
  95. package/dist/index-manager.d.ts.map +1 -1
  96. package/dist/index-manager.js +13 -11
  97. package/dist/index-manager.js.map +1 -1
  98. package/dist/index.d.ts +63 -8
  99. package/dist/index.d.ts.map +1 -1
  100. package/dist/index.js +91 -16
  101. package/dist/index.js.map +1 -1
  102. package/dist/indexer.js +1 -1
  103. package/dist/indexer.js.map +1 -1
  104. package/dist/ingestion.d.ts +30 -156
  105. package/dist/ingestion.d.ts.map +1 -1
  106. package/dist/ingestion.js +58 -675
  107. package/dist/ingestion.js.map +1 -1
  108. package/dist/mcp-server.js +86 -55
  109. package/dist/mcp-server.js.map +1 -1
  110. package/dist/preprocess.js +1 -1
  111. package/dist/preprocess.js.map +1 -1
  112. package/dist/search-standalone.js +1 -1
  113. package/dist/search-standalone.js.map +1 -1
  114. package/dist/search.d.ts +32 -76
  115. package/dist/search.d.ts.map +1 -1
  116. package/dist/search.js +80 -428
  117. package/dist/search.js.map +1 -1
  118. package/dist/text/chunker.d.ts +32 -0
  119. package/dist/text/chunker.d.ts.map +1 -0
  120. package/dist/{chunker.js → text/chunker.js} +98 -75
  121. package/dist/text/chunker.js.map +1 -0
  122. package/dist/{embedder.d.ts → text/embedder.d.ts} +22 -1
  123. package/dist/text/embedder.d.ts.map +1 -0
  124. package/dist/{embedder.js → text/embedder.js} +71 -4
  125. package/dist/text/embedder.js.map +1 -0
  126. package/dist/text/index.d.ts +7 -0
  127. package/dist/text/index.d.ts.map +1 -0
  128. package/dist/text/index.js +8 -0
  129. package/dist/text/index.js.map +1 -0
  130. package/dist/text/preprocessors/index.d.ts +17 -0
  131. package/dist/text/preprocessors/index.d.ts.map +1 -0
  132. package/dist/text/preprocessors/index.js +38 -0
  133. package/dist/text/preprocessors/index.js.map +1 -0
  134. package/dist/text/preprocessors/mdx.d.ts +25 -0
  135. package/dist/text/preprocessors/mdx.d.ts.map +1 -0
  136. package/dist/text/preprocessors/mdx.js +101 -0
  137. package/dist/text/preprocessors/mdx.js.map +1 -0
  138. package/dist/text/preprocessors/mermaid.d.ts +68 -0
  139. package/dist/text/preprocessors/mermaid.d.ts.map +1 -0
  140. package/dist/text/preprocessors/mermaid.js +330 -0
  141. package/dist/text/preprocessors/mermaid.js.map +1 -0
  142. package/dist/text/preprocessors/registry.d.ts +56 -0
  143. package/dist/text/preprocessors/registry.d.ts.map +1 -0
  144. package/dist/text/preprocessors/registry.js +180 -0
  145. package/dist/text/preprocessors/registry.js.map +1 -0
  146. package/dist/text/reranker.d.ts +60 -0
  147. package/dist/text/reranker.d.ts.map +1 -0
  148. package/dist/{reranker.js → text/reranker.js} +134 -19
  149. package/dist/text/reranker.js.map +1 -0
  150. package/dist/{tokenizer.d.ts → text/tokenizer.d.ts} +1 -0
  151. package/dist/text/tokenizer.d.ts.map +1 -0
  152. package/dist/{tokenizer.js → text/tokenizer.js} +7 -2
  153. package/dist/text/tokenizer.js.map +1 -0
  154. package/dist/types.d.ts +1 -1
  155. package/dist/types.d.ts.map +1 -1
  156. package/package.json +2 -2
  157. package/dist/chunker.d.ts +0 -47
  158. package/dist/chunker.d.ts.map +0 -1
  159. package/dist/chunker.js.map +0 -1
  160. package/dist/db.d.ts.map +0 -1
  161. package/dist/db.js.map +0 -1
  162. package/dist/embedder.d.ts.map +0 -1
  163. package/dist/embedder.js.map +0 -1
  164. package/dist/error-handler.d.ts.map +0 -1
  165. package/dist/error-handler.js.map +0 -1
  166. package/dist/path-manager.d.ts.map +0 -1
  167. package/dist/path-manager.js.map +0 -1
  168. package/dist/reranker.d.ts +0 -40
  169. package/dist/reranker.d.ts.map +0 -1
  170. package/dist/reranker.js.map +0 -1
  171. package/dist/resource-manager-demo.d.ts +0 -7
  172. package/dist/resource-manager-demo.d.ts.map +0 -1
  173. package/dist/resource-manager-demo.js +0 -52
  174. package/dist/resource-manager-demo.js.map +0 -1
  175. package/dist/resource-manager.d.ts +0 -129
  176. package/dist/resource-manager.d.ts.map +0 -1
  177. package/dist/resource-manager.js +0 -389
  178. package/dist/resource-manager.js.map +0 -1
  179. package/dist/tokenizer.d.ts.map +0 -1
  180. package/dist/tokenizer.js.map +0 -1
  181. package/dist/vector-index.d.ts.map +0 -1
  182. package/dist/vector-index.js.map +0 -1
@@ -0,0 +1,719 @@
1
+ /**
2
+ * Factory functions for creating text-specific search and ingestion instances
3
+ * Handles complex initialization logic while providing clean API for common use cases
4
+ *
5
+ * FACTORY PATTERN BENEFITS:
6
+ * - Abstracts complex initialization (model loading, database setup, index initialization)
7
+ * - Provides simple API for common use cases while preserving access to dependency injection
8
+ * - Handles error recovery and validation
9
+ * - Supports different embedding models and configurations
10
+ * - Enables clean separation between simple usage and advanced customization
11
+ *
12
+ * USAGE PATTERNS:
13
+ *
14
+ * 1. Simple Search Setup:
15
+ * ```typescript
16
+ * // Create search engine with defaults
17
+ * const search = await TextSearchFactory.create('./index.bin', './db.sqlite');
18
+ * const results = await search.search('query');
19
+ * ```
20
+ *
21
+ * 2. Custom Configuration:
22
+ * ```typescript
23
+ * // Create with custom options
24
+ * const search = await TextSearchFactory.create('./index.bin', './db.sqlite', {
25
+ * embeddingModel: 'all-MiniLM-L6-v2',
26
+ * enableReranking: true,
27
+ * topK: 20
28
+ * });
29
+ * ```
30
+ *
31
+ * 3. Complete RAG System:
32
+ * ```typescript
33
+ * // Create both ingestion and search
34
+ * const { searchEngine, ingestionPipeline } = await TextRAGFactory.createBoth(
35
+ * './index.bin',
36
+ * './db.sqlite'
37
+ * );
38
+ *
39
+ * // Ingest documents
40
+ * await ingestionPipeline.ingestDirectory('./docs');
41
+ *
42
+ * // Search documents
43
+ * const results = await searchEngine.search('query');
44
+ * ```
45
+ *
46
+ * 4. Error Recovery:
47
+ * ```typescript
48
+ * // Create with automatic fallback options
49
+ * const search = await TextFactoryHelpers.createSearchWithFallback(
50
+ * './index.bin',
51
+ * './db.sqlite',
52
+ * { enableReranking: true } // Will fallback to disabled if reranking fails
53
+ * );
54
+ * ```
55
+ */
56
+ import { SearchEngine } from '../core/search.js';
57
+ import { IngestionPipeline } from '../core/ingestion.js';
58
+ import { IndexManager } from '../index-manager.js';
59
+ import { openDatabase } from '../core/db.js';
60
+ import { createTextEmbedFunction } from '../text/embedder.js';
61
+ import { createTextRerankFunction } from '../text/reranker.js';
62
+ import { config, getModelDefaults } from '../core/config.js';
63
+ import { existsSync } from 'fs';
64
+ import { dirname } from 'path';
65
+ import { mkdirSync } from 'fs';
66
+ /**
67
+ * Factory for creating text-based SearchEngine instances
68
+ * Handles model loading, database initialization, and index setup
69
+ *
70
+ * This factory abstracts the complex initialization process required for text search:
71
+ * 1. Loads and validates text embedding models
72
+ * 2. Optionally loads reranking models with fallback handling
73
+ * 3. Establishes database connections and initializes schema
74
+ * 4. Loads vector indexes with proper model compatibility checking
75
+ * 5. Creates SearchEngine with proper dependency injection
76
+ *
77
+ * @example
78
+ * ```typescript
79
+ * // Basic usage
80
+ * const search = await TextSearchFactory.create('./index.bin', './db.sqlite');
81
+ * const results = await search.search('What is machine learning?');
82
+ *
83
+ * // With custom configuration
84
+ * const search = await TextSearchFactory.create('./index.bin', './db.sqlite', {
85
+ * embeddingModel: 'all-MiniLM-L6-v2',
86
+ * enableReranking: true,
87
+ * topK: 15
88
+ * });
89
+ *
90
+ * // With defaults (uses config file paths)
91
+ * const search = await TextSearchFactory.createWithDefaults({
92
+ * enableReranking: false // Faster search
93
+ * });
94
+ * ```
95
+ */
96
export class TextSearchFactory {
    /**
     * Create a SearchEngine configured for text search.
     *
     * Initialization sequence:
     * 1. Validate that both paths are provided and both files exist.
     * 2. Resolve the embedding model: the explicit option wins; otherwise the
     *    model recorded in the database is used; otherwise the configured default.
     * 3. Create the embedding function (the model itself is not exercised here).
     * 4. Optionally create and smoke-test the reranking function; a failure is
     *    logged and search continues without reranking.
     * 5. Open the database and initialize its schema.
     * 6. Load the vector index.
     * 7. Build the SearchEngine and read its stats as a final sanity check.
     *
     * If any step after the database has been opened fails, the connection is
     * closed before the error is rethrown so that no handle is leaked.
     *
     * @param indexPath - Path to the vector index file (must exist)
     * @param dbPath - Path to the SQLite database file (must exist)
     * @param options - Optional configuration overrides
     * @param options.embeddingModel - Override embedding model (default: model stored in the database, then config)
     * @param options.batchSize - Override embedding batch size (passed through to the embedder)
     * @param options.rerankingModel - Override reranking model (only used when reranking is enabled)
     * @param options.enableReranking - Enable reranking; only an explicit `true` enables it (default: disabled)
     * @param options.topK - Accepted for API compatibility; not read by this method
     * @returns Promise resolving to configured SearchEngine
     * @throws {Error} If required files don't exist or initialization fails.
     *   The original error is attached as `cause`.
     *
     * @example
     * ```typescript
     * const search = await TextSearchFactory.create('./my-index.bin', './my-db.sqlite');
     * const results = await search.search('artificial intelligence');
     * console.log(`Found ${results.length} results`);
     * await search.cleanup();
     * ```
     */
    static async create(indexPath, dbPath, options = {}) {
        // Declared outside the try block so the catch handler can release the
        // connection when a later initialization step throws.
        let db;
        try {
            console.log('🏭 TextSearchFactory: Initializing text search engine...');
            // Validate input paths
            if (!indexPath || !dbPath) {
                throw new Error('Both indexPath and dbPath are required');
            }
            // Check if required files exist
            if (!existsSync(indexPath)) {
                throw new Error(`Vector index not found at: ${indexPath}\n` +
                    'Run ingestion first to create the index, or check the path.\n' +
                    'Example: const ingestion = await IngestionFactory.create(dbPath, indexPath);');
            }
            if (!existsSync(dbPath)) {
                throw new Error(`Database not found at: ${dbPath}\n` +
                    'Run ingestion first to create the database, or check the path.\n' +
                    'Example: const ingestion = await IngestionFactory.create(dbPath, indexPath);');
            }
            // Step 1: Resolve the embedding model and its dimensions
            let embeddingModel = options.embeddingModel;
            let modelDimensions;
            if (!embeddingModel) {
                // Auto-detect the model from the database using a short-lived
                // connection that is always closed, even if the lookup throws.
                const { getStoredModelInfo } = await import('../core/db.js');
                const probeDb = await openDatabase(dbPath);
                try {
                    const storedModelInfo = await getStoredModelInfo(probeDb);
                    if (storedModelInfo) {
                        embeddingModel = storedModelInfo.modelName;
                        modelDimensions = storedModelInfo.dimensions;
                        console.log(`📊 Auto-detected embedding model: ${embeddingModel} (${modelDimensions} dimensions)`);
                    }
                    else {
                        // Nothing stored yet: fall back to the configured default
                        embeddingModel = config.embedding_model;
                        modelDimensions = getModelDefaults(embeddingModel).dimensions;
                        console.log(`📊 Using default embedding model: ${embeddingModel} (${modelDimensions} dimensions)`);
                    }
                }
                finally {
                    await probeDb.close();
                }
            }
            else {
                // Use the model supplied by the caller
                modelDimensions = getModelDefaults(embeddingModel).dimensions;
                console.log(`📊 Using specified embedding model: ${embeddingModel} (${modelDimensions} dimensions)`);
            }
            // Step 2: Create the embedding function (it will be tested on first use)
            console.log('📊 Loading text embedding model...');
            const embedFn = createTextEmbedFunction(embeddingModel, options.batchSize);
            console.log('✓ Text embedding function created successfully');
            // Step 3: Create the reranking function (opt-in only)
            let rerankFn;
            if (options.enableReranking === true) {
                try {
                    console.log('🔄 Loading text reranking model...');
                    rerankFn = createTextRerankFunction(options.rerankingModel);
                    // Smoke-test with an empty candidate list so a broken model
                    // is detected here rather than on the first real search.
                    await rerankFn('test query', []);
                    console.log('✓ Text reranking model loaded successfully');
                }
                catch (error) {
                    // Reranking is an enhancement; degrade gracefully without it.
                    console.warn(`Failed to load reranking model, continuing without reranking: ${error instanceof Error ? error.message : 'Unknown error'}`);
                    rerankFn = undefined;
                }
            }
            else {
                console.log('🔄 Reranking disabled by default (local-first, fast mode)');
            }
            // Step 4: Open the database connection and make sure the schema exists
            console.log('💾 Opening database connection...');
            db = await openDatabase(dbPath);
            const { initializeSchema } = await import('../core/db.js');
            await initializeSchema(db);
            console.log('✓ Database connection established');
            // Step 5: Load the vector index
            console.log('📇 Loading vector index...');
            const indexManager = new IndexManager(indexPath, dbPath, modelDimensions, embeddingModel);
            await indexManager.initialize();
            console.log('✓ Vector index loaded successfully');
            // Step 6: Create SearchEngine with dependency injection
            const searchEngine = new SearchEngine(embedFn, indexManager, db, rerankFn);
            // Step 7: Validate the setup by reading engine statistics
            const stats = await searchEngine.getStats();
            console.log(`✓ Search engine ready: ${stats.totalChunks} chunks indexed, reranking ${stats.rerankingEnabled ? 'enabled' : 'disabled'}`);
            console.log('🎉 TextSearchFactory: Search engine initialized successfully');
            return searchEngine;
        }
        catch (error) {
            // The engine is never handed to the caller on this path, so the
            // connection opened above would otherwise stay open forever.
            if (db) {
                try {
                    await db.close();
                }
                catch (closeError) {
                    console.warn(`Failed to close database after initialization error: ${closeError instanceof Error ? closeError.message : 'Unknown error'}`);
                }
            }
            console.error('❌ TextSearchFactory: Failed to create search engine');
            // Keep the original error (and its stack) reachable via `cause`.
            throw new Error(`TextSearchFactory.create failed: ${error instanceof Error ? error.message : 'Unknown error'}`, { cause: error });
        }
    }
    /**
     * Create a SearchEngine using the paths from configuration.
     *
     * Reads `config.index_file` and `config.db_file`, falling back to
     * './index.bin' and './database.sqlite' when they are unset, and then
     * delegates to {@link TextSearchFactory.create}.
     *
     * @param options - Optional configuration overrides (forwarded unchanged to `create`)
     * @returns Promise resolving to configured SearchEngine
     * @throws {Error} If default files don't exist or initialization fails
     *
     * @example
     * ```typescript
     * const search = await TextSearchFactory.createWithDefaults({ enableReranking: true });
     * ```
     */
    static async createWithDefaults(options = {}) {
        const indexPath = config.index_file || './index.bin';
        const dbPath = config.db_file || './database.sqlite';
        return this.create(indexPath, dbPath, options);
    }
}
263
+ /**
264
+ * Factory for creating text-based IngestionPipeline instances
265
+ * Handles model loading, database initialization, and index setup
266
+ *
267
+ * This factory abstracts the complex initialization process required for text ingestion:
268
+ * 1. Creates necessary directories if they don't exist
269
+ * 2. Loads and validates text embedding models
270
+ * 3. Establishes database connections and initializes schema
271
+ * 4. Creates or loads vector indexes with proper configuration
272
+ * 5. Creates IngestionPipeline with proper dependency injection
273
+ *
274
+ * @example
275
+ * ```typescript
276
+ * // Basic usage
277
+ * const ingestion = await TextIngestionFactory.create('./db.sqlite', './index.bin');
278
+ * await ingestion.ingestDirectory('./documents');
279
+ *
280
+ * // With custom configuration
281
+ * const ingestion = await TextIngestionFactory.create('./db.sqlite', './index.bin', {
282
+ * embeddingModel: 'all-MiniLM-L6-v2',
283
+ * chunkSize: 512,
284
+ * chunkOverlap: 50,
285
+ * forceRebuild: true
286
+ * });
287
+ *
288
+ * // With defaults
289
+ * const ingestion = await TextIngestionFactory.createWithDefaults({
290
+ * batchSize: 32 // Faster processing
291
+ * });
292
+ * ```
293
+ */
294
export class TextIngestionFactory {
    /**
     * Create an IngestionPipeline configured for text ingestion.
     *
     * Initialization sequence:
     * 1. Validate the paths and create their parent directories if missing.
     * 2. Resolve the embedding model once (explicit option, else config) and
     *    derive batch size, chunk size and chunk overlap from its defaults.
     * 3. Create the embedding function (the model itself is not exercised here).
     * 4. Open the database and initialize its schema.
     * 5. Create, rebuild, or load the vector index.
     * 6. Build the IngestionPipeline with the chunk configuration.
     *
     * If any step after the database has been opened fails, the connection is
     * closed before the error is rethrown so that no handle is leaked.
     *
     * @param dbPath - Path to the SQLite database file (parent directory is created if missing)
     * @param indexPath - Path to the vector index file (parent directory is created if missing)
     * @param options - Optional configuration overrides
     * @param options.embeddingModel - Override embedding model (default: from config)
     * @param options.batchSize - Override embedding batch size (default: model default)
     * @param options.chunkSize - Override chunk size (default: model default)
     * @param options.chunkOverlap - Override chunk overlap (default: model default)
     * @param options.forceRebuild - Force rebuild of existing index (default: false)
     * @returns Promise resolving to configured IngestionPipeline
     * @throws {Error} If initialization fails. The original error is attached as `cause`.
     *
     * @example
     * ```typescript
     * const ingestion = await TextIngestionFactory.create('./my-db.sqlite', './my-index.bin');
     * const result = await ingestion.ingestDirectory('./documents');
     * console.log(`Processed ${result.documentsProcessed} documents`);
     * await ingestion.cleanup();
     * ```
     */
    static async create(dbPath, indexPath, options = {}) {
        // Declared outside the try block so the catch handler can release the
        // connection when a later initialization step throws.
        let db;
        try {
            console.log('🏭 TextIngestionFactory: Initializing text ingestion pipeline...');
            // Validate input paths
            if (!dbPath || !indexPath) {
                throw new Error('Both dbPath and indexPath are required');
            }
            // Ensure directories exist
            const dbDir = dirname(dbPath);
            const indexDir = dirname(indexPath);
            if (!existsSync(dbDir)) {
                console.log(`📁 Creating database directory: ${dbDir}`);
                mkdirSync(dbDir, { recursive: true });
            }
            if (!existsSync(indexDir)) {
                console.log(`📁 Creating index directory: ${indexDir}`);
                mkdirSync(indexDir, { recursive: true });
            }
            // Step 1: Resolve the model once so that the embedder, the index
            // manager and the stored model info can never disagree about it.
            const currentModel = options.embeddingModel || config.embedding_model;
            const modelDefaults = getModelDefaults(currentModel);
            const effectiveBatchSize = options.batchSize ?? modelDefaults.batch_size;
            const effectiveChunkSize = options.chunkSize ?? modelDefaults.chunk_size;
            const effectiveChunkOverlap = options.chunkOverlap ?? modelDefaults.chunk_overlap;
            // Step 2: Create the embedding function (it will be tested on first use)
            console.log('📊 Loading text embedding model...');
            const embedFn = createTextEmbedFunction(currentModel, effectiveBatchSize);
            console.log('✓ Text embedding function created successfully');
            // Step 3: Open the database connection and make sure the schema exists
            console.log('💾 Opening database connection...');
            db = await openDatabase(dbPath);
            const { initializeSchema, getStoredModelInfo, setStoredModelInfo } = await import('../core/db.js');
            await initializeSchema(db);
            console.log('✓ Database connection established');
            // Step 4: Initialize index manager
            console.log('📇 Initializing vector index...');
            const indexManager = new IndexManager(indexPath, dbPath, modelDefaults.dimensions, currentModel);
            const indexExists = existsSync(indexPath);
            // On a forced rebuild of an existing index, detect a model change by
            // comparing against the model recorded in the already-open database.
            let forceRecreate = false;
            if (options.forceRebuild && indexExists && existsSync(dbPath)) {
                const storedModel = await getStoredModelInfo(db);
                if (storedModel && storedModel.modelName !== currentModel) {
                    console.log(`🔄 Model change detected: ${storedModel.modelName} → ${currentModel}`);
                    console.log(`🔄 Dimensions change: ${storedModel.dimensions} → ${modelDefaults.dimensions}`);
                    forceRecreate = true;
                }
            }
            // Handle force rebuild or create new index
            if (options.forceRebuild || !indexExists) {
                if (options.forceRebuild && indexExists) {
                    console.log('🔄 Force rebuild requested, recreating index...');
                }
                else {
                    console.log('📇 Creating new vector index...');
                }
                // NOTE(review): the two arguments appear to be (skipModelCheck,
                // forceRecreate) — confirm against IndexManager.initialize.
                await indexManager.initialize(options.forceRebuild, forceRecreate);
                // Record the model only on a forced rebuild. NOTE(review): a
                // brand-new index without forceRebuild does not store model info
                // here — presumably that happens elsewhere; verify.
                if (options.forceRebuild || forceRecreate) {
                    await setStoredModelInfo(db, currentModel, modelDefaults.dimensions);
                    console.log(`✓ Updated stored model info: ${currentModel} (${modelDefaults.dimensions} dimensions)`);
                }
            }
            else {
                // Load existing index
                await indexManager.initialize();
            }
            console.log('✓ Vector index ready');
            // Step 5: Create IngestionPipeline with dependency injection and chunk configuration
            const chunkConfig = {
                chunkSize: effectiveChunkSize,
                chunkOverlap: effectiveChunkOverlap
            };
            const ingestionPipeline = new IngestionPipeline(embedFn, indexManager, db, chunkConfig);
            console.log('🎉 TextIngestionFactory: Ingestion pipeline initialized successfully');
            return ingestionPipeline;
        }
        catch (error) {
            // The pipeline is never handed to the caller on this path, so the
            // connection opened above would otherwise stay open forever.
            if (db) {
                try {
                    await db.close();
                }
                catch (closeError) {
                    console.warn(`Failed to close database after initialization error: ${closeError instanceof Error ? closeError.message : 'Unknown error'}`);
                }
            }
            console.error('❌ TextIngestionFactory: Failed to create ingestion pipeline');
            // Keep the original error (and its stack) reachable via `cause`.
            throw new Error(`TextIngestionFactory.create failed: ${error instanceof Error ? error.message : 'Unknown error'}`, { cause: error });
        }
    }
    /**
     * Create an IngestionPipeline using the paths from configuration.
     *
     * Reads `config.db_file` and `config.index_file`, falling back to
     * './database.sqlite' and './index.bin' when they are unset, and then
     * delegates to {@link TextIngestionFactory.create}.
     *
     * @param options - Optional configuration overrides (forwarded unchanged to `create`)
     * @returns Promise resolving to configured IngestionPipeline
     */
    static async createWithDefaults(options = {}) {
        const dbPath = config.db_file || './database.sqlite';
        const indexPath = config.index_file || './index.bin';
        return this.create(dbPath, indexPath, options);
    }
}
440
+ /**
441
+ * Convenience factory to create both search and ingestion instances
442
+ * Useful for applications that need both capabilities with shared configuration
443
+ *
444
+ * This factory creates a complete RAG (Retrieval-Augmented Generation) system
445
+ * by initializing both ingestion and search capabilities with shared resources.
446
+ * The ingestion pipeline is created first to handle directory creation and
447
+ * initial setup, then the search engine is created to use the same resources.
448
+ *
449
+ * @example
450
+ * ```typescript
451
+ * // Create complete RAG system
452
+ * const { searchEngine, ingestionPipeline } = await TextRAGFactory.createBoth(
453
+ * './index.bin',
454
+ * './db.sqlite'
455
+ * );
456
+ *
457
+ * // First, ingest some documents
458
+ * await ingestionPipeline.ingestDirectory('./knowledge-base');
459
+ *
460
+ * // Then search the ingested content
461
+ * const results = await searchEngine.search('What is the main topic?');
462
+ *
463
+ * // Clean up both instances
464
+ * await Promise.all([
465
+ * searchEngine.cleanup(),
466
+ * ingestionPipeline.cleanup()
467
+ * ]);
468
+ * ```
469
+ */
470
export class TextRAGFactory {
    /**
     * Create both SearchEngine and IngestionPipeline instances.
     *
     * The ingestion pipeline is created first because it creates missing
     * directories, while the search engine factory requires the index and
     * database files to already exist. If creating the search engine fails,
     * the already-created ingestion pipeline is cleaned up before the error
     * is rethrown, so its resources are not leaked.
     *
     * NOTE(review): on a completely fresh setup the search factory still needs
     * the index file on disk — confirm that creating the ingestion pipeline
     * persists an (empty) index before relying on this for first-time use.
     *
     * @param indexPath - Path to the vector index file
     * @param dbPath - Path to the SQLite database file
     * @param searchOptions - Optional search configuration (forwarded to TextSearchFactory.create)
     * @param ingestionOptions - Optional ingestion configuration (forwarded to TextIngestionFactory.create)
     * @returns Promise resolving to `{ searchEngine, ingestionPipeline }`
     * @throws {Error} If initialization of either component fails
     *
     * @example
     * ```typescript
     * const { searchEngine, ingestionPipeline } = await TextRAGFactory.createBoth(
     *     './index.bin',
     *     './db.sqlite',
     *     { enableReranking: true },
     *     { chunkSize: 512, forceRebuild: true }
     * );
     * await ingestionPipeline.ingestDirectory('./docs');
     * const results = await searchEngine.search('machine learning');
     * ```
     */
    static async createBoth(indexPath, dbPath, searchOptions = {}, ingestionOptions = {}) {
        console.log('🏭 TextRAGFactory: Creating complete RAG system...');
        // Create ingestion pipeline first (handles directory creation)
        const ingestionPipeline = await TextIngestionFactory.create(dbPath, indexPath, ingestionOptions);
        // Create search engine (requires existing files)
        let searchEngine;
        try {
            searchEngine = await TextSearchFactory.create(indexPath, dbPath, searchOptions);
        }
        catch (error) {
            // The caller never receives the pipeline on this path, so release
            // it here. A cleanup failure must not mask the original error.
            try {
                await ingestionPipeline.cleanup();
            }
            catch (cleanupError) {
                console.warn(`Failed to clean up ingestion pipeline after search engine error: ${cleanupError instanceof Error ? cleanupError.message : 'Unknown error'}`);
            }
            throw error;
        }
        console.log('🎉 TextRAGFactory: Complete RAG system ready');
        return { searchEngine, ingestionPipeline };
    }
    /**
     * Create both instances using the paths from configuration.
     *
     * Reads `config.index_file` and `config.db_file`, falling back to
     * './index.bin' and './database.sqlite' when they are unset, and then
     * delegates to {@link TextRAGFactory.createBoth}.
     *
     * @param searchOptions - Optional search configuration
     * @param ingestionOptions - Optional ingestion configuration
     * @returns Promise resolving to both instances
     */
    static async createBothWithDefaults(searchOptions = {}, ingestionOptions = {}) {
        const indexPath = config.index_file || './index.bin';
        const dbPath = config.db_file || './database.sqlite';
        return this.createBoth(indexPath, dbPath, searchOptions, ingestionOptions);
    }
}
530
+ /**
531
+ * Helper functions for common factory patterns and error recovery
532
+ *
533
+ * This class provides utility functions that support the main factory classes
534
+ * with validation, configuration recommendations, and error recovery patterns.
535
+ * These helpers enable more robust factory usage and better error handling.
536
+ *
537
+ * @example
538
+ * ```typescript
539
+ * // Validate files before creating search engine
540
+ * try {
541
+ * TextFactoryHelpers.validateSearchFiles('./index.bin', './db.sqlite');
542
+ * const search = await TextSearchFactory.create('./index.bin', './db.sqlite');
543
+ * } catch (error) {
544
+ * console.error('Files not ready for search:', error.message);
545
+ * }
546
+ *
547
+ * // Get recommended configuration for different use cases
548
+ * const { searchOptions, ingestionOptions } = TextFactoryHelpers.getRecommendedConfig('quality');
549
+ * const search = await TextSearchFactory.create('./index.bin', './db.sqlite', searchOptions);
550
+ *
551
+ * // Create with automatic error recovery
552
+ * const resilientSearch = await TextFactoryHelpers.createSearchWithFallback('./index.bin', './db.sqlite', {
553
+ * enableReranking: true // Retried with reranking disabled if creation fails
554
+ * });
555
+ * ```
556
+ */
557
export class TextFactoryHelpers {
    /**
     * Validate that the files required for search exist.
     *
     * Checks the vector index first, then the database, and throws on the
     * first one that is missing. The error message names the missing path and
     * lists suggested next steps.
     *
     * @param indexPath - Path to vector index file
     * @param dbPath - Path to database file
     * @throws {Error} If either file doesn't exist, with resolution hints
     *
     * @example
     * ```typescript
     * try {
     *   TextFactoryHelpers.validateSearchFiles('./index.bin', './db.sqlite');
     *   console.log('Files are ready for search');
     * } catch (error) {
     *   console.error('Search files not ready:', error.message);
     * }
     * ```
     */
    static validateSearchFiles(indexPath, dbPath) {
        // Both failures share the same follow-up hints; only the first line differs.
        const resolutionHints = 'Run ingestion first: raglite ingest <directory>\n' +
            'Or use: const ingestion = await IngestionFactory.create(dbPath, indexPath);\n' +
            'Or check if the path is correct.';
        if (!existsSync(indexPath)) {
            throw new Error(`Vector index not found: ${indexPath}\n` + resolutionHints);
        }
        if (!existsSync(dbPath)) {
            throw new Error(`Database not found: ${dbPath}\n` + resolutionHints);
        }
    }
    /**
     * Get a recommended configuration preset for a use case.
     *
     * Presets (a fresh object is returned on every call, so callers may
     * modify the result freely):
     * - 'fast': reranking off, topK 5, batchSize 32, chunkSize 512
     * - 'balanced': reranking on, topK 10, batchSize 16, chunkSize 1024
     * - 'quality': reranking on, topK 20, batchSize 8, chunkSize 2048
     *
     * Any other value (including undefined) yields the 'balanced' preset.
     * The method does not depend on `this`, so it can be destructured or
     * passed as a callback.
     *
     * @param useCase - One of 'fast', 'balanced' or 'quality'
     * @returns `{ searchOptions, ingestionOptions }` for the chosen preset
     *
     * @example
     * ```typescript
     * const { searchOptions, ingestionOptions } = TextFactoryHelpers.getRecommendedConfig('quality');
     * const ingestion = await TextIngestionFactory.create('./db.sqlite', './index.bin', ingestionOptions);
     * const search = await TextSearchFactory.create('./index.bin', './db.sqlite', searchOptions);
     * ```
     */
    static getRecommendedConfig(useCase) {
        switch (useCase) {
            case 'fast':
                return {
                    searchOptions: {
                        enableReranking: false,
                        topK: 5
                    },
                    ingestionOptions: {
                        batchSize: 32,
                        chunkSize: 512
                    }
                };
            case 'quality':
                return {
                    searchOptions: {
                        enableReranking: true,
                        topK: 20
                    },
                    ingestionOptions: {
                        batchSize: 8,
                        chunkSize: 2048
                    }
                };
            case 'balanced':
            default:
                // Unknown use cases share the balanced preset directly instead of
                // recursing through `this`, which is undefined on a detached call.
                return {
                    searchOptions: {
                        enableReranking: true,
                        topK: 10
                    },
                    ingestionOptions: {
                        batchSize: 16,
                        chunkSize: 1024
                    }
                };
        }
    }
    /**
     * Create a search engine, retrying once with reranking disabled on failure.
     *
     * The first attempt uses `options` as given. If it rejects for any reason,
     * a warning is logged and a second attempt is made with a copy of
     * `options` in which `enableReranking` is forced to `false` (the caller's
     * object is not modified). If the second attempt also rejects, an Error
     * describing both failures is thrown, with the fallback error attached as
     * its `cause`.
     *
     * @param indexPath - Path to vector index file
     * @param dbPath - Path to database file
     * @param options - Initial options to try
     * @returns Promise resolving to the search engine from whichever attempt succeeded
     * @throws {Error} If both the original and the fallback attempt fail
     *
     * @example
     * ```typescript
     * const search = await TextFactoryHelpers.createSearchWithFallback(
     *   './index.bin',
     *   './db.sqlite',
     *   { enableReranking: true, topK: 20 }
     * );
     * const results = await search.search('query');
     * ```
     */
    static async createSearchWithFallback(indexPath, dbPath, options = {}) {
        try {
            // Try with original options
            return await TextSearchFactory.create(indexPath, dbPath, options);
        }
        catch (error) {
            console.warn(`Initial search creation failed, trying fallback options: ${error instanceof Error ? error.message : 'Unknown error'}`);
            // Try with reranking disabled as fallback
            const fallbackOptions = {
                ...options,
                enableReranking: false
            };
            try {
                return await TextSearchFactory.create(indexPath, dbPath, fallbackOptions);
            }
            catch (fallbackError) {
                console.error('Fallback search creation also failed');
                // Keep the underlying error reachable via `cause` so its stack is not lost.
                throw new Error(`Failed to create search engine with both original and fallback options:\n` +
                    `Original error: ${error instanceof Error ? error.message : 'Unknown error'}\n` +
                    `Fallback error: ${fallbackError instanceof Error ? fallbackError.message : 'Unknown error'}`, { cause: fallbackError });
            }
        }
    }
}
719
+ //# sourceMappingURL=text-factory.js.map