rag-lite-ts 1.0.1
This diff shows the content of publicly released package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +240 -0
- package/dist/api-errors.d.ts +90 -0
- package/dist/api-errors.d.ts.map +1 -0
- package/dist/api-errors.js +320 -0
- package/dist/api-errors.js.map +1 -0
- package/dist/chunker.d.ts +47 -0
- package/dist/chunker.d.ts.map +1 -0
- package/dist/chunker.js +256 -0
- package/dist/chunker.js.map +1 -0
- package/dist/cli/indexer.d.ts +11 -0
- package/dist/cli/indexer.d.ts.map +1 -0
- package/dist/cli/indexer.js +272 -0
- package/dist/cli/indexer.js.map +1 -0
- package/dist/cli/search.d.ts +7 -0
- package/dist/cli/search.d.ts.map +1 -0
- package/dist/cli/search.js +206 -0
- package/dist/cli/search.js.map +1 -0
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +362 -0
- package/dist/cli.js.map +1 -0
- package/dist/config.d.ts +90 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +281 -0
- package/dist/config.js.map +1 -0
- package/dist/db.d.ts +90 -0
- package/dist/db.d.ts.map +1 -0
- package/dist/db.js +340 -0
- package/dist/db.js.map +1 -0
- package/dist/embedder.d.ts +101 -0
- package/dist/embedder.d.ts.map +1 -0
- package/dist/embedder.js +323 -0
- package/dist/embedder.js.map +1 -0
- package/dist/error-handler.d.ts +91 -0
- package/dist/error-handler.d.ts.map +1 -0
- package/dist/error-handler.js +196 -0
- package/dist/error-handler.js.map +1 -0
- package/dist/file-processor.d.ts +59 -0
- package/dist/file-processor.d.ts.map +1 -0
- package/dist/file-processor.js +312 -0
- package/dist/file-processor.js.map +1 -0
- package/dist/index-manager.d.ts +99 -0
- package/dist/index-manager.d.ts.map +1 -0
- package/dist/index-manager.js +444 -0
- package/dist/index-manager.js.map +1 -0
- package/dist/index.d.ts +13 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +21 -0
- package/dist/index.js.map +1 -0
- package/dist/indexer.d.ts +7 -0
- package/dist/indexer.d.ts.map +1 -0
- package/dist/indexer.js +51 -0
- package/dist/indexer.js.map +1 -0
- package/dist/ingestion.d.ts +175 -0
- package/dist/ingestion.d.ts.map +1 -0
- package/dist/ingestion.js +705 -0
- package/dist/ingestion.js.map +1 -0
- package/dist/mcp-server.d.ts +14 -0
- package/dist/mcp-server.d.ts.map +1 -0
- package/dist/mcp-server.js +680 -0
- package/dist/mcp-server.js.map +1 -0
- package/dist/path-manager.d.ts +42 -0
- package/dist/path-manager.d.ts.map +1 -0
- package/dist/path-manager.js +66 -0
- package/dist/path-manager.js.map +1 -0
- package/dist/preprocess.d.ts +19 -0
- package/dist/preprocess.d.ts.map +1 -0
- package/dist/preprocess.js +203 -0
- package/dist/preprocess.js.map +1 -0
- package/dist/preprocessors/index.d.ts +17 -0
- package/dist/preprocessors/index.d.ts.map +1 -0
- package/dist/preprocessors/index.js +38 -0
- package/dist/preprocessors/index.js.map +1 -0
- package/dist/preprocessors/mdx.d.ts +25 -0
- package/dist/preprocessors/mdx.d.ts.map +1 -0
- package/dist/preprocessors/mdx.js +101 -0
- package/dist/preprocessors/mdx.js.map +1 -0
- package/dist/preprocessors/mermaid.d.ts +68 -0
- package/dist/preprocessors/mermaid.d.ts.map +1 -0
- package/dist/preprocessors/mermaid.js +329 -0
- package/dist/preprocessors/mermaid.js.map +1 -0
- package/dist/preprocessors/registry.d.ts +56 -0
- package/dist/preprocessors/registry.d.ts.map +1 -0
- package/dist/preprocessors/registry.js +179 -0
- package/dist/preprocessors/registry.js.map +1 -0
- package/dist/reranker.d.ts +40 -0
- package/dist/reranker.d.ts.map +1 -0
- package/dist/reranker.js +212 -0
- package/dist/reranker.js.map +1 -0
- package/dist/resource-manager-demo.d.ts +7 -0
- package/dist/resource-manager-demo.d.ts.map +1 -0
- package/dist/resource-manager-demo.js +52 -0
- package/dist/resource-manager-demo.js.map +1 -0
- package/dist/resource-manager.d.ts +129 -0
- package/dist/resource-manager.d.ts.map +1 -0
- package/dist/resource-manager.js +389 -0
- package/dist/resource-manager.js.map +1 -0
- package/dist/search-standalone.d.ts +7 -0
- package/dist/search-standalone.d.ts.map +1 -0
- package/dist/search-standalone.js +117 -0
- package/dist/search-standalone.js.map +1 -0
- package/dist/search.d.ts +92 -0
- package/dist/search.d.ts.map +1 -0
- package/dist/search.js +454 -0
- package/dist/search.js.map +1 -0
- package/dist/test-utils.d.ts +36 -0
- package/dist/test-utils.d.ts.map +1 -0
- package/dist/test-utils.js +27 -0
- package/dist/test-utils.js.map +1 -0
- package/dist/tokenizer.d.ts +21 -0
- package/dist/tokenizer.d.ts.map +1 -0
- package/dist/tokenizer.js +59 -0
- package/dist/tokenizer.js.map +1 -0
- package/dist/types.d.ts +44 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/dist/vector-index.d.ts +64 -0
- package/dist/vector-index.d.ts.map +1 -0
- package/dist/vector-index.js +308 -0
- package/dist/vector-index.js.map +1 -0
- package/package.json +80 -0
@@ -0,0 +1,705 @@
+import { discoverAndProcessFiles } from './file-processor.js';
+import { chunkDocument } from './chunker.js';
+import { IndexManager } from './index-manager.js';
+import { openDatabase, initializeSchema, insertChunk, upsertDocument } from './db.js';
+import { config, validateConfig, getModelDefaults } from './config.js';
+import { DocumentPathManager } from './path-manager.js';
+import { join, resolve } from 'path';
+import { existsSync } from 'fs';
+/**
+ * User-friendly error class with actionable suggestions
+ */
+export class IngestionError extends Error {
+    code;
+    suggestions;
+    constructor(message, code, suggestions) {
+        super(message);
+        this.code = code;
+        this.suggestions = suggestions;
+        this.name = 'IngestionError';
+    }
+}
+/**
+ * Resolves paths for the ingestion pipeline based on basePath
+ * @param basePath - Base directory path (defaults to current directory)
+ * @returns Resolved paths for database and index files
+ */
+function resolveIngestionPaths(basePath) {
+    const resolvedBasePath = basePath ? resolve(basePath) : process.cwd();
+    return {
+        basePath: resolvedBasePath,
+        dbPath: join(resolvedBasePath, 'db.sqlite'),
+        indexPath: join(resolvedBasePath, 'vector-index.bin')
+    };
+}
+/**
+ * Main ingestion pipeline class
+ * Coordinates the entire process from file discovery to vector storage
+ */
+export class IngestionPipeline {
+    // Static properties for automatic resource management (Requirement 5.4, 5.5)
+    static instances = new Set();
+    static cleanupHandlersSet = false;
+    db = null;
+    indexManager = null;
+    embeddingEngine = null;
+    pathManager = null;
+    isInitialized = false;
+    dbPath;
+    indexPath;
+    basePath;
+    configOverrides = {};
+    /**
+     * Creates a new IngestionPipeline with simplified constructor
+     * Pipeline is ready to use immediately without requiring initialization calls (Requirement 1.5)
+     * @param basePath - Base directory path for database and index files (defaults to current directory)
+     * @param embedder - Pre-initialized embedding engine (optional, will use default if not provided)
+     */
+    constructor(basePath, embedder) {
+        // Validate parameters
+        if (basePath !== undefined && (typeof basePath !== 'string' || basePath.trim() === '')) {
+            throw new Error('basePath must be a non-empty string when provided');
+        }
+        if (embedder !== undefined && (typeof embedder !== 'object' || embedder === null)) {
+            throw new Error('embedder must be a valid EmbeddingEngine instance when provided');
+        }
+        // Resolve paths automatically
+        const pathConfig = resolveIngestionPaths(basePath);
+        this.basePath = pathConfig.basePath;
+        this.dbPath = pathConfig.dbPath;
+        this.indexPath = pathConfig.indexPath;
+        // Store the provided embedder for later use
+        if (embedder) {
+            this.embeddingEngine = embedder;
+        }
+        // Initialize path manager with default configuration
+        const effectiveConfig = this.getEffectiveConfig();
+        this.pathManager = new DocumentPathManager(effectiveConfig.path_storage_strategy, this.basePath);
+        // Set up automatic cleanup on process exit (Requirement 5.5)
+        this.setupAutomaticCleanup();
+    }
+    /**
+     * Set configuration overrides (for internal use)
+     * @param overrides - Configuration overrides to apply
+     */
+    setConfigOverrides(overrides) {
+        this.configOverrides = overrides;
+    }
+    /**
+     * Set path storage strategy
+     * @param strategy - Path storage strategy ('absolute' or 'relative')
+     * @param basePath - Base path for relative paths (optional, defaults to current base path)
+     */
+    setPathStorageStrategy(strategy, basePath) {
+        const effectiveBasePath = basePath || this.basePath;
+        this.pathManager = new DocumentPathManager(strategy, effectiveBasePath);
+    }
+    /**
+     * Get effective configuration with overrides applied
+     */
+    getEffectiveConfig() {
+        const baseConfig = { ...config, ...this.configOverrides };
+        // If model is overridden, apply model-specific defaults for chunk_size, chunk_overlap, and batch_size
+        // unless they are explicitly overridden
+        if (this.configOverrides.embedding_model && this.configOverrides.embedding_model !== config.embedding_model) {
+            const modelDefaults = getModelDefaults(this.configOverrides.embedding_model);
+            // Apply model-specific defaults only if not explicitly overridden
+            if (!this.configOverrides.chunk_size) {
+                baseConfig.chunk_size = modelDefaults.chunk_size;
+            }
+            if (!this.configOverrides.chunk_overlap) {
+                baseConfig.chunk_overlap = modelDefaults.chunk_overlap;
+            }
+            if (!this.configOverrides.batch_size) {
+                baseConfig.batch_size = modelDefaults.batch_size;
+            }
+        }
+        return baseConfig;
+    }
+    /**
+     * Automatically initialize resources on first use with user-friendly error handling
+     * Implements lazy initialization as required by 5.2
+     */
+    async ensureInitialized() {
+        if (this.isInitialized) {
+            return;
+        }
+        try {
+            console.log('Initializing ingestion pipeline...');
+            const effectiveConfig = this.getEffectiveConfig();
+            // Validate configuration
+            validateConfig(effectiveConfig);
+            // Initialize database
+            console.log('Opening database connection...');
+            this.db = await openDatabase(this.dbPath);
+            await initializeSchema(this.db);
+            // Initialize index manager
+            console.log('Initializing index manager...');
+            const { getModelDefaults } = await import('./config.js');
+            const modelDefaults = getModelDefaults(effectiveConfig.embedding_model);
+            this.indexManager = new IndexManager(this.indexPath, this.dbPath, modelDefaults.dimensions, effectiveConfig.embedding_model);
+            await this.indexManager.initialize();
+            // Initialize embedding engine (use provided one or create new)
+            if (!this.embeddingEngine) {
+                console.log('Loading embedding model...');
+                const { initializeEmbeddingEngine } = await import('./embedder.js');
+                this.embeddingEngine = await initializeEmbeddingEngine(effectiveConfig.embedding_model, effectiveConfig.batch_size);
+            }
+            else {
+                console.log('Using provided embedding engine...');
+            }
+            // Check model version compatibility
+            const currentModelVersion = this.embeddingEngine.getModelVersion();
+            await this.indexManager.validateModelVersionOrExit(currentModelVersion);
+            this.isInitialized = true;
+            console.log('Ingestion pipeline initialized successfully');
+        }
+        catch (error) {
+            await this.cleanup();
+            throw this.createUserFriendlyError(error, 'initialization');
+        }
+    }
+    /**
+     * Create user-friendly error messages with actionable suggestions
+     * Implements requirement 5.3: Clear, actionable error messages with specific next steps
+     */
+    createUserFriendlyError(error, context) {
+        const errorMessage = error instanceof Error ? error.message : String(error);
+        // Handle common error scenarios with specific guidance
+        if (errorMessage.includes('ENOENT') || errorMessage.includes('no such file')) {
+            if (context === 'path_validation') {
+                return new IngestionError(`Directory or file path does not exist: ${errorMessage}`, 'PATH_NOT_FOUND', [
+                    'Check that the path exists and is accessible',
+                    'Ensure you have read permissions for the directory',
+                    'Use an absolute path if the relative path is not working'
+                ]);
+            }
+            else {
+                return new IngestionError(`Required files not found during ${context}`, 'FILES_NOT_FOUND', [
+                    'Ensure the base directory exists and is writable',
+                    'Check file permissions in the target directory',
+                    'Try using an absolute path instead of a relative path'
+                ]);
+            }
+        }
+        if (errorMessage.includes('EACCES') || errorMessage.includes('permission denied')) {
+            return new IngestionError(`Permission denied during ${context}`, 'PERMISSION_DENIED', [
+                'Check that you have write permissions to the directory',
+                'Try running with appropriate permissions',
+                'Ensure the directory is not read-only'
+            ]);
+        }
+        if (errorMessage.includes('ENOSPC') || errorMessage.includes('no space left')) {
+            return new IngestionError(`Insufficient disk space during ${context}`, 'DISK_SPACE_FULL', [
+                'Free up disk space in the target directory',
+                'Choose a different location with more available space',
+                'Check disk usage with your system tools'
+            ]);
+        }
+        if (errorMessage.includes('model') && errorMessage.includes('version')) {
+            return new IngestionError(`Embedding model compatibility issue: ${errorMessage}`, 'MODEL_COMPATIBILITY', [
+                'Run pipeline.rebuildIndex() to rebuild with the current model',
+                'Or specify the same model that was used during original ingestion',
+                'Check the model configuration in your setup'
+            ]);
+        }
+        if (errorMessage.includes('embedding') || errorMessage.includes('model')) {
+            return new IngestionError(`Embedding model initialization failed: ${errorMessage}`, 'MODEL_INIT_FAILED', [
+                'Check your internet connection for model downloads',
+                'Ensure you have sufficient memory available',
+                'Try specifying a different embedding model',
+                'Check that the model name is correct and supported'
+            ]);
+        }
+        if (errorMessage.includes('database') || errorMessage.includes('sqlite')) {
+            return new IngestionError(`Database initialization failed: ${errorMessage}`, 'DATABASE_ERROR', [
+                'Check that the database file is not corrupted',
+                'Ensure the directory is writable',
+                'Try deleting the database file to start fresh',
+                'Check for sufficient disk space'
+            ]);
+        }
+        // Generic error with basic suggestions
+        return new IngestionError(`${context} failed: ${errorMessage}`, 'GENERAL_ERROR', [
+            'Check the error message above for specific details',
+            'Ensure all file paths are correct and accessible',
+            'Verify you have necessary permissions',
+            'Try the operation again or contact support if the issue persists'
+        ]);
+    }
+    /**
+     * Initialize the ingestion pipeline (public method for backward compatibility)
+     * Sets up database, index manager, and embedding engine
+     */
+    async initialize() {
+        await this.ensureInitialized();
+    }
+    /**
+     * Ingest documents from a directory (matches README API)
+     * Automatically initializes resources on first use (Requirements 2.1, 2.3, 5.2)
+     * @param directoryPath - Path to directory containing documents
+     * @param options - Optional ingestion configuration
+     * @returns Promise resolving to ingestion results
+     */
+    async ingestDirectory(directoryPath, options = {}) {
+        // Validate path exists before initialization
+        if (!existsSync(directoryPath)) {
+            throw this.createUserFriendlyError(new Error(`Directory not found: ${directoryPath}`), 'path_validation');
+        }
+        // Automatic initialization on first use (Requirement 5.2)
+        await this.ensureInitialized();
+        return this.ingestPath(directoryPath, options);
+    }
+    /**
+     * Ingest a single file (matches README API)
+     * Automatically initializes resources on first use (Requirements 2.2, 2.3, 5.2)
+     * @param filePath - Path to the file to ingest
+     * @param options - Optional ingestion configuration
+     * @returns Promise resolving to ingestion results
+     */
+    async ingestFile(filePath, options = {}) {
+        // Validate path exists before initialization
+        if (!existsSync(filePath)) {
+            throw this.createUserFriendlyError(new Error(`File not found: ${filePath}`), 'path_validation');
+        }
+        // Automatic initialization on first use (Requirement 5.2)
+        await this.ensureInitialized();
+        return this.ingestPath(filePath, options);
+    }
+    /**
+     * Ingest documents from a path (file or directory)
+     * Implements the complete pipeline: file processing → chunking → embedding → storage
+     *
+     * Requirements addressed:
+     * - 7.5: Single-threaded write processing to avoid SQLite lock contention
+     * - 3.3: Graceful handling of embedding failures without stopping ingestion
+     * - 10.1: Progress logging and error reporting during batch ingestion
+     * - 2.3: Automatic creation of database and index files in appropriate locations
+     */
+    async ingestPath(path, options = {}) {
+        // Automatic initialization on first use (Requirement 5.2)
+        await this.ensureInitialized();
+        const startTime = Date.now();
+        console.log(`\n=== Starting ingestion from: ${path} ===`);
+        try {
+            // Phase 1: File Discovery and Processing
+            console.log('\n--- Phase 1: File Discovery and Processing ---');
+            const fileResult = await discoverAndProcessFiles(path, options.fileOptions, this.pathManager);
+            if (fileResult.documents.length === 0) {
+                console.log('No documents found to process');
+                return {
+                    documentsProcessed: 0,
+                    chunksCreated: 0,
+                    embeddingsGenerated: 0,
+                    documentErrors: fileResult.processingResult.errors.length,
+                    embeddingErrors: 0,
+                    processingTimeMs: Date.now() - startTime
+                };
+            }
+            // Phase 2: Document Chunking
+            console.log('\n--- Phase 2: Document Chunking ---');
+            const effectiveConfig = this.getEffectiveConfig();
+            const effectiveChunkConfig = options.chunkConfig || {
+                chunkSize: effectiveConfig.chunk_size,
+                chunkOverlap: effectiveConfig.chunk_overlap
+            };
+            const chunkingResult = await this.chunkDocuments(fileResult.documents, effectiveChunkConfig);
+            if (chunkingResult.totalChunks === 0) {
+                console.log('No chunks created from documents');
+                return {
+                    documentsProcessed: fileResult.documents.length,
+                    chunksCreated: 0,
+                    embeddingsGenerated: 0,
+                    documentErrors: fileResult.processingResult.errors.length,
+                    embeddingErrors: 0,
+                    processingTimeMs: Date.now() - startTime
+                };
+            }
+            // Phase 3: Embedding Generation
+            console.log('\n--- Phase 3: Embedding Generation ---');
+            const embeddingResult = await this.generateEmbeddings(chunkingResult.allChunks);
+            // Phase 4: Database and Index Storage (Single-threaded writes)
+            console.log('\n--- Phase 4: Storage Operations ---');
+            await this.storeDocumentsAndChunks(chunkingResult.documentChunks, embeddingResult.embeddings);
+            // Phase 5: Vector Index Updates
+            console.log('\n--- Phase 5: Vector Index Updates ---');
+            await this.updateVectorIndex(embeddingResult.embeddings);
+            const endTime = Date.now();
+            const processingTimeMs = endTime - startTime;
+            const result = {
+                documentsProcessed: fileResult.documents.length,
+                chunksCreated: chunkingResult.totalChunks,
+                embeddingsGenerated: embeddingResult.embeddings.length,
+                documentErrors: fileResult.processingResult.errors.length,
+                embeddingErrors: embeddingResult.errors,
+                processingTimeMs
+            };
+            console.log('\n=== Ingestion Complete ===');
+            console.log(`Documents processed: ${result.documentsProcessed}`);
+            console.log(`Chunks created: ${result.chunksCreated}`);
+            console.log(`Embeddings generated: ${result.embeddingsGenerated}`);
+            console.log(`Document errors: ${result.documentErrors}`);
+            console.log(`Embedding errors: ${result.embeddingErrors}`);
+            console.log(`Total time: ${(processingTimeMs / 1000).toFixed(2)}s`);
+            return result;
+        }
+        catch (error) {
+            console.error('\n=== Ingestion Failed ===');
+            console.error(`Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
+            // Convert to user-friendly error if not already one (Requirement 2.4)
+            if (error instanceof IngestionError) {
+                throw error;
+            }
+            else {
+                throw this.createUserFriendlyError(error, 'ingestion');
+            }
+        }
+    }
+    /**
+     * Chunk all documents and organize results
+     */
+    async chunkDocuments(documents, chunkConfig) {
+        const documentChunks = [];
+        const allChunks = [];
+        let totalChunks = 0;
+        console.log(`Processing ${documents.length} document${documents.length === 1 ? '' : 's'} for chunking...`);
+        for (let i = 0; i < documents.length; i++) {
+            const document = documents[i];
+            try {
+                const chunks = await chunkDocument(document, chunkConfig);
+                documentChunks.push({ document, chunks });
+                // Collect all chunk texts for embedding
+                const chunkTexts = chunks.map(chunk => chunk.text);
+                allChunks.push(...chunkTexts);
+                totalChunks += chunks.length;
+                // Progress logging - more frequent for better user experience
+                if (documents.length <= 10 || (i + 1) % Math.max(1, Math.floor(documents.length / 10)) === 0 || i === documents.length - 1) {
+                    const percentage = Math.round(((i + 1) / documents.length) * 100);
+                    console.log(`Processed ${i + 1} of ${documents.length} documents (${percentage}%) - ${totalChunks} chunks created`);
+                }
+            }
+            catch (error) {
+                console.error(`Failed to chunk document ${document.source}:`, error instanceof Error ? error.message : String(error));
+                // Continue with other documents
+                continue;
+            }
+        }
+        console.log(`✓ Chunking complete: Created ${totalChunks} chunks from ${documentChunks.length} documents`);
+        return { documentChunks, allChunks, totalChunks };
+    }
+    /**
+     * Generate embeddings for all chunks with error handling
+     * Requirement 3.3: Graceful handling of embedding failures without stopping ingestion
+     */
+    async generateEmbeddings(chunkTexts) {
+        if (!this.embeddingEngine) {
+            throw new Error('Embedding engine not initialized');
+        }
+        console.log(`Generating embeddings for ${chunkTexts.length} chunk${chunkTexts.length === 1 ? '' : 's'}...`);
+        console.log('This may take a few minutes depending on the number of chunks...');
+        try {
+            // Use the embedDocumentBatch method which has built-in error handling
+            const embeddings = await this.embeddingEngine.embedDocumentBatch(chunkTexts);
+            const errors = chunkTexts.length - embeddings.length;
+            if (errors > 0) {
+                console.warn(`⚠ Warning: ${errors} chunk${errors === 1 ? '' : 's'} failed embedding and ${errors === 1 ? 'was' : 'were'} skipped`);
+            }
+            console.log(`✓ Generated ${embeddings.length} embeddings successfully`);
+            return { embeddings, errors };
+        }
+        catch (error) {
+            console.error('Critical embedding failure:', error instanceof Error ? error.message : String(error));
+            throw new Error(`Embedding generation failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
+        }
+    }
+    /**
+     * Store documents and chunks in database with single-threaded writes
+     * Requirement 7.5: Single-threaded write processing to avoid SQLite lock contention
+     */
+    async storeDocumentsAndChunks(documentChunks, embeddings) {
+        if (!this.db) {
+            throw new Error('Database not initialized');
+        }
+        console.log(`Storing ${documentChunks.length} document${documentChunks.length === 1 ? '' : 's'} and chunks in database...`);
+        // Create a mapping of chunk text to embedding for efficient lookup
+        const embeddingMap = new Map();
+        let embeddingIndex = 0;
+        // Build mapping - this assumes embeddings are in the same order as chunks were processed
+        for (const { chunks } of documentChunks) {
+            for (const chunk of chunks) {
+                if (embeddingIndex < embeddings.length) {
+                    embeddingMap.set(chunk.text, embeddings[embeddingIndex]);
+                    embeddingIndex++;
+                }
+            }
+        }
+        let totalChunksStored = 0;
+        let documentsStored = 0;
+        // Process each document sequentially (single-threaded writes)
+        for (const { document, chunks } of documentChunks) {
+            try {
+                // Insert or get existing document
+                const documentId = await upsertDocument(this.db, document.source, document.title);
+                documentsStored++;
+                // Insert all chunks for this document
+                let chunksStoredForDoc = 0;
+                for (const chunk of chunks) {
+                    const embedding = embeddingMap.get(chunk.text);
+                    if (embedding) {
+                        try {
+                            await insertChunk(this.db, embedding.embedding_id, documentId, chunk.text, chunk.chunkIndex);
+                            chunksStoredForDoc++;
+                            totalChunksStored++;
+                        }
+                        catch (chunkError) {
+                            console.error(`Failed to store chunk ${chunk.chunkIndex} for document ${document.source}:`, chunkError instanceof Error ? chunkError.message : String(chunkError));
+                            // Continue with other chunks
+                        }
+                    }
+                    else {
+                        console.warn(`No embedding found for chunk ${chunk.chunkIndex} in document ${document.source}`);
+                    }
+                }
+                // Progress logging for storage
+                if (documentChunks.length <= 20 || documentsStored % Math.max(1, Math.floor(documentChunks.length / 10)) === 0 || documentsStored === documentChunks.length) {
+                    const percentage = Math.round((documentsStored / documentChunks.length) * 100);
+                    console.log(`Stored ${documentsStored} of ${documentChunks.length} documents (${percentage}%) - ${totalChunksStored} chunks total`);
+                }
+            }
+            catch (docError) {
+                console.error(`Failed to store document ${document.source}:`, docError instanceof Error ? docError.message : String(docError));
+                // Continue with other documents
+            }
+        }
+        console.log(`✓ Storage complete: ${documentsStored} documents, ${totalChunksStored} chunks saved to database`);
+    }
+    /**
+     * Update vector index with new embeddings
+     */
+    async updateVectorIndex(embeddings) {
+        if (!this.indexManager) {
+            throw new Error('Index manager not initialized');
+        }
+        if (embeddings.length === 0) {
+            console.log('No embeddings to add to vector index');
+            return;
+        }
+        console.log(`Adding ${embeddings.length} vector${embeddings.length === 1 ? '' : 's'} to search index...`);
+        try {
+            await this.indexManager.addVectors(embeddings);
+            console.log(`✓ Vector index updated successfully with ${embeddings.length} new vectors`);
+        }
+        catch (error) {
+            console.error('Failed to update vector index:', error instanceof Error ? error.message : String(error));
+            throw error;
+        }
+    }
+    /**
+     * Initialize the pipeline for rebuild (skips model compatibility check)
+     */
+    async initializeForRebuild() {
+        if (this.isInitialized) {
+            return;
+        }
+        try {
+            console.log('Initializing ingestion pipeline...');
+            const effectiveConfig = this.getEffectiveConfig();
+            // Validate configuration
+            validateConfig(effectiveConfig);
+            // Initialize database
+            console.log('Opening database connection...');
+            this.db = await openDatabase(this.dbPath);
+            await initializeSchema(this.db);
+            // Initialize index manager (skip model compatibility check for rebuild)
+            console.log('Initializing index manager...');
+            const { getModelDefaults } = await import('./config.js');
+            const modelDefaults = getModelDefaults(effectiveConfig.embedding_model);
+            this.indexManager = new IndexManager(this.indexPath, this.dbPath, modelDefaults.dimensions, effectiveConfig.embedding_model);
+            await this.indexManager.initialize(true); // Skip model check
+            // Initialize embedding engine (use provided one or create new)
+            if (!this.embeddingEngine) {
+                console.log('Loading embedding model...');
+                const { initializeEmbeddingEngine } = await import('./embedder.js');
+                this.embeddingEngine = await initializeEmbeddingEngine(effectiveConfig.embedding_model, effectiveConfig.batch_size);
+            }
+            else {
+                console.log('Using provided embedding engine...');
+            }
+            this.isInitialized = true;
+            console.log('Ingestion pipeline initialized successfully');
+        }
+        catch (error) {
+            await this.cleanup();
+            throw this.createUserFriendlyError(error, 'initialization');
+        }
+    }
+    /**
+     * Rebuild the entire index from scratch
+     * Useful when model version changes or for maintenance
+     * Automatically initializes resources if needed (Requirement 5.2)
+     */
+    async rebuildIndex() {
+        // Use special initialization for rebuild that skips model compatibility check
+        if (!this.isInitialized) {
+            await this.initializeForRebuild();
+        }
+        if (!this.indexManager || !this.embeddingEngine) {
+            throw this.createUserFriendlyError(new Error('Pipeline not properly initialized'), 'rebuild');
+        }
+        console.log('\n=== Starting Index Rebuild ===');
+        try {
+            await this.indexManager.rebuildWithEmbeddings(this.embeddingEngine);
+            console.log('Index rebuild completed successfully');
+        }
+        catch (error) {
+            throw this.createUserFriendlyError(error, 'rebuild');
+        }
+    }
+    /**
+     * Get pipeline statistics
+     */
+    async getStats() {
+        const stats = {
+            isInitialized: this.isInitialized,
+            indexStats: null
+        };
+        if (this.indexManager) {
+            try {
+                stats.indexStats = await this.indexManager.getStats();
+            }
+            catch (error) {
+                console.error('Failed to get index stats:', error instanceof Error ? error.message : String(error));
+            }
+        }
+        return stats;
+    }
+    /**
+     * Set up automatic cleanup on process exit (Requirement 5.5)
+     */
+    setupAutomaticCleanup() {
+        // Track this instance for cleanup
+        IngestionPipeline.instances.add(this);
+        // Set up process exit handlers only once
+        if (!IngestionPipeline.cleanupHandlersSet) {
+            IngestionPipeline.cleanupHandlersSet = true;
+            const cleanupAll = async () => {
+                const instances = Array.from(IngestionPipeline.instances);
+                await Promise.all(instances.map(instance => instance.cleanup()));
+            };
+            // Handle various exit scenarios
+            process.on('exit', () => {
+                // Synchronous cleanup for exit event
+                for (const instance of IngestionPipeline.instances) {
+                    try {
+                        if (instance.db) {
+                            // Synchronous close for exit handler
+                            instance.db = null;
+                        }
+                        if (instance.indexManager) {
+                            instance.indexManager = null;
+                        }
+                        instance.embeddingEngine = null;
+                        instance.isInitialized = false;
+                    }
+                    catch (error) {
+                        // Silent cleanup on exit
+                    }
+                }
+            });
+            process.on('SIGINT', async () => {
+                await cleanupAll();
+                process.exit(0);
+            });
+            process.on('SIGTERM', async () => {
+                await cleanupAll();
+                process.exit(0);
+            });
+            process.on('uncaughtException', async (error) => {
+                console.error('Uncaught exception:', error);
+                await cleanupAll();
+                process.exit(1);
+            });
+            process.on('unhandledRejection', async (reason) => {
+                console.error('Unhandled rejection:', reason);
+                await cleanupAll();
+                process.exit(1);
+            });
+        }
+    }
+    /**
+     * Clean up resources
+     */
+    async cleanup() {
+        try {
+            if (this.indexManager) {
+                await this.indexManager.close();
+                this.indexManager = null;
+            }
+            if (this.db) {
+                await this.db.close();
+                this.db = null;
+            }
+            this.embeddingEngine = null;
+            this.isInitialized = false;
+            // Remove from instances tracking
+            IngestionPipeline.instances.delete(this);
+            console.log('Pipeline cleanup completed');
+        }
+        catch (error) {
+            console.error('Error during cleanup:', error instanceof Error ? error.message : String(error));
+        }
+    }
+}
+/**
+ * Convenience function to ingest documents from a path
+ * Creates a pipeline instance, runs ingestion, and cleans up
+ */
+export async function ingestDocuments(path, options = {}) {
+    const pipeline = new IngestionPipeline();
+    try {
+        await pipeline.initialize();
+        const result = await pipeline.ingestPath(path, options);
+        return result;
+    }
+    finally {
+        await pipeline.cleanup();
+    }
+}
+/**
+ * Convenience function to rebuild the index
+ * Creates a pipeline instance, rebuilds index, and cleans up
+ */
+export async function rebuildIndex() {
+    // First, try to detect the stored model from the existing database
+    let configOverrides = {};
+    try {
+        const { openDatabase, getStoredModelInfo } = await import('./db.js');
+        const db = await openDatabase(config.db_file);
+        const storedModel = await getStoredModelInfo(db);
+        await db.close();
+        if (storedModel) {
+            console.log(`Detected stored model: ${storedModel.modelName}`);
+            const { getModelDefaults } = await import('./config.js');
+            const modelDefaults = getModelDefaults(storedModel.modelName);
+            configOverrides = {
+                embedding_model: storedModel.modelName,
+                chunk_size: modelDefaults.chunk_size,
+                chunk_overlap: modelDefaults.chunk_overlap,
+                batch_size: modelDefaults.batch_size
+            };
+        }
+    }
+    catch (error) {
+        console.log('Could not detect stored model, using default configuration');
+    }
+    const pipeline = new IngestionPipeline();
+    pipeline.setConfigOverrides(configOverrides);
+    try {
+        await pipeline.initialize();
+        await pipeline.rebuildIndex();
+    }
+    finally {
+        await pipeline.cleanup();
+    }
+}
+//# sourceMappingURL=ingestion.js.map
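
For orientation, a minimal usage sketch of the ingestion API added above. It assumes the package's main entry re-exports IngestionPipeline and ingestDocuments (they are defined in dist/ingestion.js); the './docs' and './my-index' paths are placeholders, not paths from the package.

// Hypothetical usage sketch; assumes 'rag-lite-ts' re-exports the symbols
// defined in dist/ingestion.js shown in the diff above.
import { IngestionPipeline, ingestDocuments } from 'rag-lite-ts';

async function main(): Promise<void> {
    // One-shot helper: creates a pipeline, ingests a path, and cleans up.
    const summary = await ingestDocuments('./docs');
    console.log(`Chunks created: ${summary.chunksCreated}`);

    // Or manage the pipeline explicitly. Initialization is lazy, so the first
    // ingest call opens db.sqlite and vector-index.bin under the base path.
    const pipeline = new IngestionPipeline('./my-index');
    try {
        const result = await pipeline.ingestDirectory('./docs');
        console.log(`Documents processed: ${result.documentsProcessed}, errors: ${result.documentErrors}`);
    }
    finally {
        await pipeline.cleanup();
    }
}

main().catch(console.error);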