rag-lite-ts 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +240 -0
  3. package/dist/api-errors.d.ts +90 -0
  4. package/dist/api-errors.d.ts.map +1 -0
  5. package/dist/api-errors.js +320 -0
  6. package/dist/api-errors.js.map +1 -0
  7. package/dist/chunker.d.ts +47 -0
  8. package/dist/chunker.d.ts.map +1 -0
  9. package/dist/chunker.js +256 -0
  10. package/dist/chunker.js.map +1 -0
  11. package/dist/cli/indexer.d.ts +11 -0
  12. package/dist/cli/indexer.d.ts.map +1 -0
  13. package/dist/cli/indexer.js +272 -0
  14. package/dist/cli/indexer.js.map +1 -0
  15. package/dist/cli/search.d.ts +7 -0
  16. package/dist/cli/search.d.ts.map +1 -0
  17. package/dist/cli/search.js +206 -0
  18. package/dist/cli/search.js.map +1 -0
  19. package/dist/cli.d.ts +3 -0
  20. package/dist/cli.d.ts.map +1 -0
  21. package/dist/cli.js +362 -0
  22. package/dist/cli.js.map +1 -0
  23. package/dist/config.d.ts +90 -0
  24. package/dist/config.d.ts.map +1 -0
  25. package/dist/config.js +281 -0
  26. package/dist/config.js.map +1 -0
  27. package/dist/db.d.ts +90 -0
  28. package/dist/db.d.ts.map +1 -0
  29. package/dist/db.js +340 -0
  30. package/dist/db.js.map +1 -0
  31. package/dist/embedder.d.ts +101 -0
  32. package/dist/embedder.d.ts.map +1 -0
  33. package/dist/embedder.js +323 -0
  34. package/dist/embedder.js.map +1 -0
  35. package/dist/error-handler.d.ts +91 -0
  36. package/dist/error-handler.d.ts.map +1 -0
  37. package/dist/error-handler.js +196 -0
  38. package/dist/error-handler.js.map +1 -0
  39. package/dist/file-processor.d.ts +59 -0
  40. package/dist/file-processor.d.ts.map +1 -0
  41. package/dist/file-processor.js +312 -0
  42. package/dist/file-processor.js.map +1 -0
  43. package/dist/index-manager.d.ts +99 -0
  44. package/dist/index-manager.d.ts.map +1 -0
  45. package/dist/index-manager.js +444 -0
  46. package/dist/index-manager.js.map +1 -0
  47. package/dist/index.d.ts +13 -0
  48. package/dist/index.d.ts.map +1 -0
  49. package/dist/index.js +21 -0
  50. package/dist/index.js.map +1 -0
  51. package/dist/indexer.d.ts +7 -0
  52. package/dist/indexer.d.ts.map +1 -0
  53. package/dist/indexer.js +51 -0
  54. package/dist/indexer.js.map +1 -0
  55. package/dist/ingestion.d.ts +175 -0
  56. package/dist/ingestion.d.ts.map +1 -0
  57. package/dist/ingestion.js +705 -0
  58. package/dist/ingestion.js.map +1 -0
  59. package/dist/mcp-server.d.ts +14 -0
  60. package/dist/mcp-server.d.ts.map +1 -0
  61. package/dist/mcp-server.js +680 -0
  62. package/dist/mcp-server.js.map +1 -0
  63. package/dist/path-manager.d.ts +42 -0
  64. package/dist/path-manager.d.ts.map +1 -0
  65. package/dist/path-manager.js +66 -0
  66. package/dist/path-manager.js.map +1 -0
  67. package/dist/preprocess.d.ts +19 -0
  68. package/dist/preprocess.d.ts.map +1 -0
  69. package/dist/preprocess.js +203 -0
  70. package/dist/preprocess.js.map +1 -0
  71. package/dist/preprocessors/index.d.ts +17 -0
  72. package/dist/preprocessors/index.d.ts.map +1 -0
  73. package/dist/preprocessors/index.js +38 -0
  74. package/dist/preprocessors/index.js.map +1 -0
  75. package/dist/preprocessors/mdx.d.ts +25 -0
  76. package/dist/preprocessors/mdx.d.ts.map +1 -0
  77. package/dist/preprocessors/mdx.js +101 -0
  78. package/dist/preprocessors/mdx.js.map +1 -0
  79. package/dist/preprocessors/mermaid.d.ts +68 -0
  80. package/dist/preprocessors/mermaid.d.ts.map +1 -0
  81. package/dist/preprocessors/mermaid.js +329 -0
  82. package/dist/preprocessors/mermaid.js.map +1 -0
  83. package/dist/preprocessors/registry.d.ts +56 -0
  84. package/dist/preprocessors/registry.d.ts.map +1 -0
  85. package/dist/preprocessors/registry.js +179 -0
  86. package/dist/preprocessors/registry.js.map +1 -0
  87. package/dist/reranker.d.ts +40 -0
  88. package/dist/reranker.d.ts.map +1 -0
  89. package/dist/reranker.js +212 -0
  90. package/dist/reranker.js.map +1 -0
  91. package/dist/resource-manager-demo.d.ts +7 -0
  92. package/dist/resource-manager-demo.d.ts.map +1 -0
  93. package/dist/resource-manager-demo.js +52 -0
  94. package/dist/resource-manager-demo.js.map +1 -0
  95. package/dist/resource-manager.d.ts +129 -0
  96. package/dist/resource-manager.d.ts.map +1 -0
  97. package/dist/resource-manager.js +389 -0
  98. package/dist/resource-manager.js.map +1 -0
  99. package/dist/search-standalone.d.ts +7 -0
  100. package/dist/search-standalone.d.ts.map +1 -0
  101. package/dist/search-standalone.js +117 -0
  102. package/dist/search-standalone.js.map +1 -0
  103. package/dist/search.d.ts +92 -0
  104. package/dist/search.d.ts.map +1 -0
  105. package/dist/search.js +454 -0
  106. package/dist/search.js.map +1 -0
  107. package/dist/test-utils.d.ts +36 -0
  108. package/dist/test-utils.d.ts.map +1 -0
  109. package/dist/test-utils.js +27 -0
  110. package/dist/test-utils.js.map +1 -0
  111. package/dist/tokenizer.d.ts +21 -0
  112. package/dist/tokenizer.d.ts.map +1 -0
  113. package/dist/tokenizer.js +59 -0
  114. package/dist/tokenizer.js.map +1 -0
  115. package/dist/types.d.ts +44 -0
  116. package/dist/types.d.ts.map +1 -0
  117. package/dist/types.js +3 -0
  118. package/dist/types.js.map +1 -0
  119. package/dist/vector-index.d.ts +64 -0
  120. package/dist/vector-index.d.ts.map +1 -0
  121. package/dist/vector-index.js +308 -0
  122. package/dist/vector-index.js.map +1 -0
  123. package/package.json +80 -0
@@ -0,0 +1,705 @@
1
+ import { discoverAndProcessFiles } from './file-processor.js';
2
+ import { chunkDocument } from './chunker.js';
3
+ import { IndexManager } from './index-manager.js';
4
+ import { openDatabase, initializeSchema, insertChunk, upsertDocument } from './db.js';
5
+ import { config, validateConfig, getModelDefaults } from './config.js';
6
+ import { DocumentPathManager } from './path-manager.js';
7
+ import { join, resolve } from 'path';
8
+ import { existsSync } from 'fs';
9
/**
 * User-friendly error raised by the ingestion pipeline.
 * Carries a machine-readable code and a list of actionable suggestions.
 */
export class IngestionError extends Error {
    code;
    suggestions;
    /**
     * @param message - Human-readable description of the failure
     * @param code - Machine-readable error code (for example 'PATH_NOT_FOUND')
     * @param suggestions - Next steps the user can try
     * @param options - Optional settings; `options.cause` is the underlying error, if any
     */
    constructor(message, code, suggestions, options) {
        // Forwarding `options` lets callers preserve the root cause via the
        // standard `cause` property; omitting it behaves exactly as before.
        super(message, options);
        this.code = code;
        this.suggestions = suggestions;
        this.name = 'IngestionError';
    }
}
22
/**
 * Computes the file locations used by the ingestion pipeline.
 * @param basePath - Directory holding the database and index files; the
 *   current working directory is used when it is omitted or empty
 * @returns Object with the resolved `basePath`, `dbPath` and `indexPath`
 */
function resolveIngestionPaths(basePath) {
    const root = !basePath ? process.cwd() : resolve(basePath);
    const locate = (fileName) => join(root, fileName);
    return {
        basePath: root,
        dbPath: locate('db.sqlite'),
        indexPath: locate('vector-index.bin')
    };
}
35
/**
 * Main ingestion pipeline class.
 * Coordinates the whole process: file discovery, chunking, embedding
 * generation, database storage and vector index updates.
 */
export class IngestionPipeline {
    // Live instances, tracked so the process-level handlers can release them (Requirement 5.4, 5.5)
    static instances = new Set();
    // Ensures the process-level handlers are registered only once
    static cleanupHandlersSet = false;
    db = null;
    indexManager = null;
    embeddingEngine = null;
    pathManager = null;
    isInitialized = false;
    dbPath;
    indexPath;
    basePath;
    configOverrides = {};
    /**
     * Creates a new IngestionPipeline.
     * Heavy resources (database, index, model) are created lazily on first use (Requirement 1.5).
     * @param basePath - Base directory for database and index files (defaults to current directory)
     * @param embedder - Pre-initialized embedding engine (optional, a default one is loaded if omitted)
     * @throws Error when `basePath` is not a non-empty string or `embedder` is not an object
     */
    constructor(basePath, embedder) {
        if (basePath !== undefined && (typeof basePath !== 'string' || basePath.trim() === '')) {
            throw new Error('basePath must be a non-empty string when provided');
        }
        if (embedder !== undefined && (typeof embedder !== 'object' || embedder === null)) {
            throw new Error('embedder must be a valid EmbeddingEngine instance when provided');
        }
        const pathConfig = resolveIngestionPaths(basePath);
        this.basePath = pathConfig.basePath;
        this.dbPath = pathConfig.dbPath;
        this.indexPath = pathConfig.indexPath;
        if (embedder) {
            this.embeddingEngine = embedder;
        }
        const effectiveConfig = this.getEffectiveConfig();
        this.pathManager = new DocumentPathManager(effectiveConfig.path_storage_strategy, this.basePath);
        // Set up automatic cleanup on process exit (Requirement 5.5)
        this.setupAutomaticCleanup();
    }
    /**
     * Set configuration overrides (for internal use).
     * @param overrides - Configuration overrides to apply; a nullish value clears all overrides
     */
    setConfigOverrides(overrides) {
        // Later reads dereference this.configOverrides, so never store null/undefined.
        this.configOverrides = overrides ?? {};
    }
    /**
     * Set path storage strategy.
     * @param strategy - Path storage strategy ('absolute' or 'relative')
     * @param basePath - Base path for relative paths (optional, defaults to current base path)
     */
    setPathStorageStrategy(strategy, basePath) {
        const effectiveBasePath = basePath || this.basePath;
        this.pathManager = new DocumentPathManager(strategy, effectiveBasePath);
    }
    /**
     * Get the effective configuration: module config with overrides applied.
     * When the embedding model is overridden, the model-specific defaults for
     * chunk_size, chunk_overlap and batch_size are used unless those keys are
     * themselves overridden.
     * @returns A new configuration object
     */
    getEffectiveConfig() {
        const overrides = this.configOverrides;
        const effective = { ...config, ...overrides };
        if (overrides.embedding_model && overrides.embedding_model !== config.embedding_model) {
            const modelDefaults = getModelDefaults(overrides.embedding_model);
            for (const key of ['chunk_size', 'chunk_overlap', 'batch_size']) {
                // Nullish check, not truthiness: an explicit 0 (e.g. no overlap) is a real override.
                if (overrides[key] == null) {
                    effective[key] = modelDefaults[key];
                }
            }
        }
        return effective;
    }
    /**
     * Shared initialization used by ensureInitialized() and initializeForRebuild().
     * Opens the database, creates the index manager and makes sure an embedding
     * engine is available. On failure all resources are released and a
     * user-friendly IngestionError is thrown.
     * @param skipModelCheck - When true, the index manager is initialized with
     *   its skip flag and the model version validation step is not run
     */
    async initializeResources(skipModelCheck) {
        if (this.isInitialized) {
            return;
        }
        // cleanup() clears embeddingEngine; remember a caller-supplied engine so
        // that a retry after a failed initialization still uses it.
        const providedEmbedder = this.embeddingEngine;
        try {
            console.log('Initializing ingestion pipeline...');
            const effectiveConfig = this.getEffectiveConfig();
            validateConfig(effectiveConfig);
            console.log('Opening database connection...');
            this.db = await openDatabase(this.dbPath);
            await initializeSchema(this.db);
            console.log('Initializing index manager...');
            const modelDefaults = getModelDefaults(effectiveConfig.embedding_model);
            this.indexManager = new IndexManager(this.indexPath, this.dbPath, modelDefaults.dimensions, effectiveConfig.embedding_model);
            if (skipModelCheck) {
                await this.indexManager.initialize(true); // Skip model check
            }
            else {
                await this.indexManager.initialize();
            }
            if (!this.embeddingEngine) {
                console.log('Loading embedding model...');
                const { initializeEmbeddingEngine } = await import('./embedder.js');
                this.embeddingEngine = await initializeEmbeddingEngine(effectiveConfig.embedding_model, effectiveConfig.batch_size);
            }
            else {
                console.log('Using provided embedding engine...');
            }
            if (!skipModelCheck) {
                const currentModelVersion = this.embeddingEngine.getModelVersion();
                await this.indexManager.validateModelVersionOrExit(currentModelVersion);
            }
            this.isInitialized = true;
            // cleanup() removes the instance from tracking; re-register so a
            // pipeline that is reused after cleanup is still released on exit.
            IngestionPipeline.instances.add(this);
            console.log('Ingestion pipeline initialized successfully');
        }
        catch (error) {
            await this.cleanup();
            this.embeddingEngine = providedEmbedder;
            throw this.createUserFriendlyError(error, 'initialization');
        }
    }
    /**
     * Automatically initialize resources on first use (lazy initialization, Requirement 5.2).
     * @throws IngestionError when any initialization step fails
     */
    async ensureInitialized() {
        await this.initializeResources(false);
    }
    /**
     * Create user-friendly error messages with actionable suggestions
     * (Requirement 5.3). The original error is attached as `cause`.
     * @param error - The underlying error or thrown value
     * @param context - Short label of the failing operation (e.g. 'initialization', 'path_validation')
     * @returns An IngestionError describing the failure
     */
    createUserFriendlyError(error, context) {
        const errorMessage = error instanceof Error ? error.message : String(error);
        const mentions = (...needles) => needles.some((needle) => errorMessage.includes(needle));
        let friendly;
        if (context === 'path_validation') {
            // Checked before any message matching: path-validation messages embed
            // the user's path, which may itself contain words such as "model".
            friendly = new IngestionError(`Directory or file path does not exist: ${errorMessage}`, 'PATH_NOT_FOUND', [
                'Check that the path exists and is accessible',
                'Ensure you have read permissions for the directory',
                'Use an absolute path if the relative path is not working'
            ]);
        }
        else if (mentions('ENOENT', 'no such file')) {
            friendly = new IngestionError(`Required files not found during ${context}`, 'FILES_NOT_FOUND', [
                'Ensure the base directory exists and is writable',
                'Check file permissions in the target directory',
                'Try using an absolute path instead of a relative path'
            ]);
        }
        else if (mentions('EACCES', 'permission denied')) {
            friendly = new IngestionError(`Permission denied during ${context}`, 'PERMISSION_DENIED', [
                'Check that you have write permissions to the directory',
                'Try running with appropriate permissions',
                'Ensure the directory is not read-only'
            ]);
        }
        else if (mentions('ENOSPC', 'no space left')) {
            friendly = new IngestionError(`Insufficient disk space during ${context}`, 'DISK_SPACE_FULL', [
                'Free up disk space in the target directory',
                'Choose a different location with more available space',
                'Check disk usage with your system tools'
            ]);
        }
        else if (mentions('model') && mentions('version')) {
            friendly = new IngestionError(`Embedding model compatibility issue: ${errorMessage}`, 'MODEL_COMPATIBILITY', [
                'Run pipeline.rebuildIndex() to rebuild with the current model',
                'Or specify the same model that was used during original ingestion',
                'Check the model configuration in your setup'
            ]);
        }
        else if (mentions('embedding', 'model')) {
            friendly = new IngestionError(`Embedding model initialization failed: ${errorMessage}`, 'MODEL_INIT_FAILED', [
                'Check your internet connection for model downloads',
                'Ensure you have sufficient memory available',
                'Try specifying a different embedding model',
                'Check that the model name is correct and supported'
            ]);
        }
        else if (mentions('database', 'sqlite')) {
            friendly = new IngestionError(`Database initialization failed: ${errorMessage}`, 'DATABASE_ERROR', [
                'Check that the database file is not corrupted',
                'Ensure the directory is writable',
                'Try deleting the database file to start fresh',
                'Check for sufficient disk space'
            ]);
        }
        else {
            // Generic error with basic suggestions
            friendly = new IngestionError(`${context} failed: ${errorMessage}`, 'GENERAL_ERROR', [
                'Check the error message above for specific details',
                'Ensure all file paths are correct and accessible',
                'Verify you have necessary permissions',
                'Try the operation again or contact support if the issue persists'
            ]);
        }
        if (error !== undefined && error !== null) {
            // Same attributes as the built-in `cause` property (non-enumerable).
            Object.defineProperty(friendly, 'cause', {
                value: error,
                writable: true,
                configurable: true,
                enumerable: false
            });
        }
        return friendly;
    }
    /**
     * Initialize the ingestion pipeline (public method for backward compatibility).
     * Sets up database, index manager, and embedding engine.
     */
    async initialize() {
        await this.ensureInitialized();
    }
    /**
     * Ingest documents from a directory (matches README API).
     * Automatically initializes resources on first use (Requirements 2.1, 2.3, 5.2).
     * @param directoryPath - Path to directory containing documents
     * @param options - Optional ingestion configuration
     * @returns Promise resolving to ingestion results
     * @throws IngestionError when the path does not exist or ingestion fails
     */
    async ingestDirectory(directoryPath, options = {}) {
        // Validate the path before paying for initialization
        if (!existsSync(directoryPath)) {
            throw this.createUserFriendlyError(new Error(`Directory not found: ${directoryPath}`), 'path_validation');
        }
        await this.ensureInitialized();
        return this.ingestPath(directoryPath, options);
    }
    /**
     * Ingest a single file (matches README API).
     * Automatically initializes resources on first use (Requirements 2.2, 2.3, 5.2).
     * @param filePath - Path to the file to ingest
     * @param options - Optional ingestion configuration
     * @returns Promise resolving to ingestion results
     * @throws IngestionError when the path does not exist or ingestion fails
     */
    async ingestFile(filePath, options = {}) {
        // Validate the path before paying for initialization
        if (!existsSync(filePath)) {
            throw this.createUserFriendlyError(new Error(`File not found: ${filePath}`), 'path_validation');
        }
        await this.ensureInitialized();
        return this.ingestPath(filePath, options);
    }
    /**
     * Ingest documents from a path (file or directory).
     * Runs the complete pipeline: file processing → chunking → embedding → storage → index update.
     *
     * Requirements addressed:
     * - 7.5: Single-threaded write processing to avoid SQLite lock contention
     * - 3.3: Graceful handling of embedding failures without stopping ingestion
     * - 10.1: Progress logging and error reporting during batch ingestion
     * - 2.3: Automatic creation of database and index files in appropriate locations
     * @param path - File or directory to ingest
     * @param options - Optional `fileOptions` and `chunkConfig`
     * @returns Counters for documents, chunks, embeddings and errors, plus elapsed time in ms
     * @throws IngestionError when any phase fails
     */
    async ingestPath(path, options = {}) {
        await this.ensureInitialized();
        const startTime = Date.now();
        console.log(`\n=== Starting ingestion from: ${path} ===`);
        try {
            // Phase 1: File Discovery and Processing
            console.log('\n--- Phase 1: File Discovery and Processing ---');
            const fileResult = await discoverAndProcessFiles(path, options.fileOptions, this.pathManager);
            if (fileResult.documents.length === 0) {
                console.log('No documents found to process');
                return {
                    documentsProcessed: 0,
                    chunksCreated: 0,
                    embeddingsGenerated: 0,
                    documentErrors: fileResult.processingResult.errors.length,
                    embeddingErrors: 0,
                    processingTimeMs: Date.now() - startTime
                };
            }
            // Phase 2: Document Chunking
            console.log('\n--- Phase 2: Document Chunking ---');
            const effectiveConfig = this.getEffectiveConfig();
            const effectiveChunkConfig = options.chunkConfig || {
                chunkSize: effectiveConfig.chunk_size,
                chunkOverlap: effectiveConfig.chunk_overlap
            };
            const chunkingResult = await this.chunkDocuments(fileResult.documents, effectiveChunkConfig);
            if (chunkingResult.totalChunks === 0) {
                console.log('No chunks created from documents');
                return {
                    documentsProcessed: fileResult.documents.length,
                    chunksCreated: 0,
                    embeddingsGenerated: 0,
                    documentErrors: fileResult.processingResult.errors.length,
                    embeddingErrors: 0,
                    processingTimeMs: Date.now() - startTime
                };
            }
            // Phase 3: Embedding Generation
            console.log('\n--- Phase 3: Embedding Generation ---');
            const embeddingResult = await this.generateEmbeddings(chunkingResult.allChunks);
            // Phase 4: Database and Index Storage (Single-threaded writes)
            console.log('\n--- Phase 4: Storage Operations ---');
            await this.storeDocumentsAndChunks(chunkingResult.documentChunks, embeddingResult.embeddings);
            // Phase 5: Vector Index Updates
            console.log('\n--- Phase 5: Vector Index Updates ---');
            await this.updateVectorIndex(embeddingResult.embeddings);
            const processingTimeMs = Date.now() - startTime;
            const result = {
                documentsProcessed: fileResult.documents.length,
                chunksCreated: chunkingResult.totalChunks,
                embeddingsGenerated: embeddingResult.embeddings.length,
                documentErrors: fileResult.processingResult.errors.length,
                embeddingErrors: embeddingResult.errors,
                processingTimeMs
            };
            console.log('\n=== Ingestion Complete ===');
            console.log(`Documents processed: ${result.documentsProcessed}`);
            console.log(`Chunks created: ${result.chunksCreated}`);
            console.log(`Embeddings generated: ${result.embeddingsGenerated}`);
            console.log(`Document errors: ${result.documentErrors}`);
            console.log(`Embedding errors: ${result.embeddingErrors}`);
            console.log(`Total time: ${(processingTimeMs / 1000).toFixed(2)}s`);
            return result;
        }
        catch (error) {
            console.error('\n=== Ingestion Failed ===');
            console.error(`Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
            // Convert to user-friendly error if not already one (Requirement 2.4)
            if (error instanceof IngestionError) {
                throw error;
            }
            throw this.createUserFriendlyError(error, 'ingestion');
        }
    }
    /**
     * Chunk all documents and organize results.
     * A document whose chunking throws is logged and skipped.
     * @param documents - Documents to split
     * @param chunkConfig - Chunking configuration passed to chunkDocument
     * @returns `documentChunks` (per-document chunk lists), `allChunks`
     *   (chunk texts in processing order) and `totalChunks`
     */
    async chunkDocuments(documents, chunkConfig) {
        const documentChunks = [];
        const allChunks = [];
        let totalChunks = 0;
        console.log(`Processing ${documents.length} document${documents.length === 1 ? '' : 's'} for chunking...`);
        for (let i = 0; i < documents.length; i++) {
            const document = documents[i];
            try {
                const chunks = await chunkDocument(document, chunkConfig);
                documentChunks.push({ document, chunks });
                // Collect chunk texts in the same order the chunks are stored above
                allChunks.push(...chunks.map((chunk) => chunk.text));
                totalChunks += chunks.length;
                // Log every document for small batches, otherwise roughly every 10% and at the end
                if (documents.length <= 10 || (i + 1) % Math.max(1, Math.floor(documents.length / 10)) === 0 || i === documents.length - 1) {
                    const percentage = Math.round(((i + 1) / documents.length) * 100);
                    console.log(`Processed ${i + 1} of ${documents.length} documents (${percentage}%) - ${totalChunks} chunks created`);
                }
            }
            catch (error) {
                console.error(`Failed to chunk document ${document.source}:`, error instanceof Error ? error.message : String(error));
                // Continue with other documents
            }
        }
        console.log(`✓ Chunking complete: Created ${totalChunks} chunks from ${documentChunks.length} documents`);
        return { documentChunks, allChunks, totalChunks };
    }
    /**
     * Generate embeddings for all chunks with error handling.
     * Requirement 3.3: Graceful handling of embedding failures without stopping ingestion.
     * @param chunkTexts - Chunk texts to embed
     * @returns `embeddings` and `errors` (number of texts without an embedding)
     * @throws Error when the engine is missing or the batch call itself fails
     */
    async generateEmbeddings(chunkTexts) {
        if (!this.embeddingEngine) {
            throw new Error('Embedding engine not initialized');
        }
        console.log(`Generating embeddings for ${chunkTexts.length} chunk${chunkTexts.length === 1 ? '' : 's'}...`);
        console.log('This may take a few minutes depending on the number of chunks...');
        try {
            const embeddings = await this.embeddingEngine.embedDocumentBatch(chunkTexts);
            // Fewer embeddings than inputs means some chunks were skipped by the engine
            const errors = chunkTexts.length - embeddings.length;
            if (errors > 0) {
                console.warn(`⚠ Warning: ${errors} chunk${errors === 1 ? '' : 's'} failed embedding and ${errors === 1 ? 'was' : 'were'} skipped`);
            }
            console.log(`✓ Generated ${embeddings.length} embeddings successfully`);
            return { embeddings, errors };
        }
        catch (error) {
            console.error('Critical embedding failure:', error instanceof Error ? error.message : String(error));
            throw new Error(`Embedding generation failed: ${error instanceof Error ? error.message : 'Unknown error'}`, { cause: error });
        }
    }
    /**
     * Store documents and chunks in database with single-threaded writes.
     * Requirement 7.5: Single-threaded write processing to avoid SQLite lock contention.
     * Failures for a single chunk or document are logged and do not stop the run.
     * @param documentChunks - Per-document chunk lists as produced by chunkDocuments
     * @param embeddings - Embeddings in chunk processing order
     * @throws Error when the database is not initialized
     */
    async storeDocumentsAndChunks(documentChunks, embeddings) {
        if (!this.db) {
            throw new Error('Database not initialized');
        }
        console.log(`Storing ${documentChunks.length} document${documentChunks.length === 1 ? '' : 's'} and chunks in database...`);
        // Pair each chunk with its embedding by position. The map is keyed by the
        // chunk object rather than its text so that chunks with identical text
        // (boilerplate, repeated headings) keep their own embedding.
        // NOTE(review): this still assumes embeddings[i] belongs to the i-th chunk;
        // if the engine drops failed chunks from the middle, the pairing shifts — confirm.
        const embeddingByChunk = new Map();
        let embeddingIndex = 0;
        for (const { chunks } of documentChunks) {
            for (const chunk of chunks) {
                if (embeddingIndex >= embeddings.length) {
                    break;
                }
                embeddingByChunk.set(chunk, embeddings[embeddingIndex]);
                embeddingIndex++;
            }
        }
        let totalChunksStored = 0;
        let documentsStored = 0;
        // Process each document sequentially (single-threaded writes)
        for (const { document, chunks } of documentChunks) {
            try {
                const documentId = await upsertDocument(this.db, document.source, document.title);
                documentsStored++;
                for (const chunk of chunks) {
                    const embedding = embeddingByChunk.get(chunk);
                    if (!embedding) {
                        console.warn(`No embedding found for chunk ${chunk.chunkIndex} in document ${document.source}`);
                        continue;
                    }
                    try {
                        await insertChunk(this.db, embedding.embedding_id, documentId, chunk.text, chunk.chunkIndex);
                        totalChunksStored++;
                    }
                    catch (chunkError) {
                        console.error(`Failed to store chunk ${chunk.chunkIndex} for document ${document.source}:`, chunkError instanceof Error ? chunkError.message : String(chunkError));
                        // Continue with other chunks
                    }
                }
                // Log every document for small batches, otherwise roughly every 10% and at the end
                if (documentChunks.length <= 20 || documentsStored % Math.max(1, Math.floor(documentChunks.length / 10)) === 0 || documentsStored === documentChunks.length) {
                    const percentage = Math.round((documentsStored / documentChunks.length) * 100);
                    console.log(`Stored ${documentsStored} of ${documentChunks.length} documents (${percentage}%) - ${totalChunksStored} chunks total`);
                }
            }
            catch (docError) {
                console.error(`Failed to store document ${document.source}:`, docError instanceof Error ? docError.message : String(docError));
                // Continue with other documents
            }
        }
        console.log(`✓ Storage complete: ${documentsStored} documents, ${totalChunksStored} chunks saved to database`);
    }
    /**
     * Update vector index with new embeddings.
     * @param embeddings - Embeddings to add; an empty list is a no-op
     * @throws Error when the index manager is missing; rethrows index failures
     */
    async updateVectorIndex(embeddings) {
        if (!this.indexManager) {
            throw new Error('Index manager not initialized');
        }
        if (embeddings.length === 0) {
            console.log('No embeddings to add to vector index');
            return;
        }
        console.log(`Adding ${embeddings.length} vector${embeddings.length === 1 ? '' : 's'} to search index...`);
        try {
            await this.indexManager.addVectors(embeddings);
            console.log(`✓ Vector index updated successfully with ${embeddings.length} new vectors`);
        }
        catch (error) {
            console.error('Failed to update vector index:', error instanceof Error ? error.message : String(error));
            throw error;
        }
    }
    /**
     * Initialize the pipeline for rebuild (skips model compatibility check).
     * @throws IngestionError when any initialization step fails
     */
    async initializeForRebuild() {
        await this.initializeResources(true);
    }
    /**
     * Rebuild the entire index from scratch.
     * Useful when model version changes or for maintenance.
     * Automatically initializes resources if needed (Requirement 5.2).
     * @throws IngestionError when initialization or the rebuild fails
     */
    async rebuildIndex() {
        // Use the rebuild-specific initialization, which skips the model compatibility check
        if (!this.isInitialized) {
            await this.initializeForRebuild();
        }
        if (!this.indexManager || !this.embeddingEngine) {
            throw this.createUserFriendlyError(new Error('Pipeline not properly initialized'), 'rebuild');
        }
        console.log('\n=== Starting Index Rebuild ===');
        try {
            await this.indexManager.rebuildWithEmbeddings(this.embeddingEngine);
            console.log('Index rebuild completed successfully');
        }
        catch (error) {
            throw this.createUserFriendlyError(error, 'rebuild');
        }
    }
    /**
     * Get pipeline statistics.
     * @returns `isInitialized` and `indexStats` (null when no index manager
     *   exists or its statistics could not be read)
     */
    async getStats() {
        const stats = {
            isInitialized: this.isInitialized,
            indexStats: null
        };
        if (this.indexManager) {
            try {
                stats.indexStats = await this.indexManager.getStats();
            }
            catch (error) {
                console.error('Failed to get index stats:', error instanceof Error ? error.message : String(error));
            }
        }
        return stats;
    }
    /**
     * Set up automatic cleanup on process exit (Requirement 5.5).
     * Registers this instance for tracking and installs the process-level
     * handlers the first time any pipeline is created.
     */
    setupAutomaticCleanup() {
        IngestionPipeline.instances.add(this);
        if (IngestionPipeline.cleanupHandlersSet) {
            return;
        }
        IngestionPipeline.cleanupHandlersSet = true;
        const cleanupAll = async () => {
            const instances = Array.from(IngestionPipeline.instances);
            await Promise.all(instances.map((instance) => instance.cleanup()));
        };
        // 'exit' listeners must be synchronous, so only references are dropped here
        process.on('exit', () => {
            for (const instance of IngestionPipeline.instances) {
                instance.db = null;
                instance.indexManager = null;
                instance.embeddingEngine = null;
                instance.isInitialized = false;
            }
        });
        process.on('SIGINT', async () => {
            await cleanupAll();
            process.exit(0);
        });
        process.on('SIGTERM', async () => {
            await cleanupAll();
            process.exit(0);
        });
        process.on('uncaughtException', async (error) => {
            console.error('Uncaught exception:', error);
            await cleanupAll();
            process.exit(1);
        });
        process.on('unhandledRejection', async (reason) => {
            console.error('Unhandled rejection:', reason);
            await cleanupAll();
            process.exit(1);
        });
    }
    /**
     * Clean up resources.
     * Each resource is closed independently so that a failure closing one does
     * not leave the others open; failures are logged, never thrown.
     */
    async cleanup() {
        let failed = false;
        const release = async (close) => {
            try {
                await close();
            }
            catch (error) {
                failed = true;
                console.error('Error during cleanup:', error instanceof Error ? error.message : String(error));
            }
        };
        const indexManager = this.indexManager;
        const db = this.db;
        this.indexManager = null;
        this.db = null;
        if (indexManager) {
            await release(() => indexManager.close());
        }
        if (db) {
            await release(() => db.close());
        }
        this.embeddingEngine = null;
        this.isInitialized = false;
        // Remove from instances tracking
        IngestionPipeline.instances.delete(this);
        if (!failed) {
            console.log('Pipeline cleanup completed');
        }
    }
}
653
/**
 * One-shot helper: ingest documents from a path.
 * Builds a pipeline with default settings, runs the ingestion and always
 * releases the pipeline afterwards, whether or not ingestion succeeded.
 * @param path - File or directory to ingest
 * @param options - Optional ingestion configuration forwarded to ingestPath
 * @returns Promise resolving to the ingestion results
 */
export async function ingestDocuments(path, options = {}) {
    const pipeline = new IngestionPipeline();
    try {
        await pipeline.initialize();
        // `await` is required here so cleanup runs only after ingestion settles
        return await pipeline.ingestPath(path, options);
    }
    finally {
        await pipeline.cleanup();
    }
}
668
/**
 * Convenience function to rebuild the index.
 * Tries to read the model recorded in the existing database so the rebuild
 * uses that model and its defaults, then creates a pipeline, rebuilds the
 * index and cleans up. If the stored model cannot be read, the default
 * configuration is used.
 */
export async function rebuildIndex() {
    let configOverrides = {};
    try {
        const { openDatabase, getStoredModelInfo } = await import('./db.js');
        // NOTE(review): this reads config.db_file, while the pipeline created
        // below resolves its database under the current directory — confirm
        // both refer to the same file.
        const db = await openDatabase(config.db_file);
        let storedModel;
        try {
            storedModel = await getStoredModelInfo(db);
        }
        finally {
            // Close the temporary handle even when the lookup throws
            await db.close();
        }
        if (storedModel) {
            console.log(`Detected stored model: ${storedModel.modelName}`);
            const modelDefaults = getModelDefaults(storedModel.modelName);
            configOverrides = {
                embedding_model: storedModel.modelName,
                chunk_size: modelDefaults.chunk_size,
                chunk_overlap: modelDefaults.chunk_overlap,
                batch_size: modelDefaults.batch_size
            };
        }
    }
    catch (error) {
        // Best effort: detection failures fall back to the default configuration
        console.log('Could not detect stored model, using default configuration');
    }
    const pipeline = new IngestionPipeline();
    pipeline.setConfigOverrides(configOverrides);
    try {
        // NOTE(review): initialize() runs the regular initialization, so the
        // rebuild-specific initialization inside pipeline.rebuildIndex() is
        // never reached from here — confirm this is intended.
        await pipeline.initialize();
        await pipeline.rebuildIndex();
    }
    finally {
        await pipeline.cleanup();
    }
}
705
+ //# sourceMappingURL=ingestion.js.map