rag-lite-ts 1.0.1 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +651 -109
- package/dist/cli/indexer.js +262 -46
- package/dist/cli/search.js +54 -32
- package/dist/cli.js +185 -28
- package/dist/config.d.ts +34 -73
- package/dist/config.js +50 -255
- package/dist/core/abstract-embedder.d.ts +125 -0
- package/dist/core/abstract-embedder.js +264 -0
- package/dist/core/actionable-error-messages.d.ts +60 -0
- package/dist/core/actionable-error-messages.js +397 -0
- package/dist/core/adapters.d.ts +93 -0
- package/dist/core/adapters.js +139 -0
- package/dist/core/batch-processing-optimizer.d.ts +155 -0
- package/dist/core/batch-processing-optimizer.js +541 -0
- package/dist/core/chunker.d.ts +119 -0
- package/dist/core/chunker.js +73 -0
- package/dist/core/cli-database-utils.d.ts +53 -0
- package/dist/core/cli-database-utils.js +239 -0
- package/dist/core/config.d.ts +102 -0
- package/dist/core/config.js +247 -0
- package/dist/core/content-errors.d.ts +111 -0
- package/dist/core/content-errors.js +362 -0
- package/dist/core/content-manager.d.ts +343 -0
- package/dist/core/content-manager.js +1504 -0
- package/dist/core/content-performance-optimizer.d.ts +150 -0
- package/dist/core/content-performance-optimizer.js +516 -0
- package/dist/core/content-resolver.d.ts +104 -0
- package/dist/core/content-resolver.js +285 -0
- package/dist/core/cross-modal-search.d.ts +164 -0
- package/dist/core/cross-modal-search.js +342 -0
- package/dist/core/database-connection-manager.d.ts +109 -0
- package/dist/core/database-connection-manager.js +304 -0
- package/dist/core/db.d.ts +245 -0
- package/dist/core/db.js +952 -0
- package/dist/core/embedder-factory.d.ts +176 -0
- package/dist/core/embedder-factory.js +338 -0
- package/dist/{error-handler.d.ts → core/error-handler.d.ts} +23 -2
- package/dist/{error-handler.js → core/error-handler.js} +51 -8
- package/dist/core/index.d.ts +59 -0
- package/dist/core/index.js +69 -0
- package/dist/core/ingestion.d.ts +213 -0
- package/dist/core/ingestion.js +812 -0
- package/dist/core/interfaces.d.ts +408 -0
- package/dist/core/interfaces.js +106 -0
- package/dist/core/lazy-dependency-loader.d.ts +152 -0
- package/dist/core/lazy-dependency-loader.js +453 -0
- package/dist/core/mode-detection-service.d.ts +150 -0
- package/dist/core/mode-detection-service.js +565 -0
- package/dist/core/mode-model-validator.d.ts +92 -0
- package/dist/core/mode-model-validator.js +203 -0
- package/dist/core/model-registry.d.ts +120 -0
- package/dist/core/model-registry.js +415 -0
- package/dist/core/model-validator.d.ts +217 -0
- package/dist/core/model-validator.js +782 -0
- package/dist/{path-manager.d.ts → core/path-manager.d.ts} +5 -0
- package/dist/{path-manager.js → core/path-manager.js} +5 -0
- package/dist/core/polymorphic-search-factory.d.ts +154 -0
- package/dist/core/polymorphic-search-factory.js +344 -0
- package/dist/core/raglite-paths.d.ts +121 -0
- package/dist/core/raglite-paths.js +145 -0
- package/dist/core/reranking-config.d.ts +42 -0
- package/dist/core/reranking-config.js +156 -0
- package/dist/core/reranking-factory.d.ts +92 -0
- package/dist/core/reranking-factory.js +591 -0
- package/dist/core/reranking-strategies.d.ts +325 -0
- package/dist/core/reranking-strategies.js +720 -0
- package/dist/core/resource-cleanup.d.ts +163 -0
- package/dist/core/resource-cleanup.js +371 -0
- package/dist/core/resource-manager.d.ts +212 -0
- package/dist/core/resource-manager.js +564 -0
- package/dist/core/search-pipeline.d.ts +111 -0
- package/dist/core/search-pipeline.js +287 -0
- package/dist/core/search.d.ts +131 -0
- package/dist/core/search.js +296 -0
- package/dist/core/streaming-operations.d.ts +145 -0
- package/dist/core/streaming-operations.js +409 -0
- package/dist/core/types.d.ts +66 -0
- package/dist/core/types.js +6 -0
- package/dist/core/universal-embedder.d.ts +177 -0
- package/dist/core/universal-embedder.js +139 -0
- package/dist/core/validation-messages.d.ts +99 -0
- package/dist/core/validation-messages.js +334 -0
- package/dist/{vector-index.d.ts → core/vector-index.d.ts} +4 -0
- package/dist/{vector-index.js → core/vector-index.js} +21 -3
- package/dist/dom-polyfills.d.ts +6 -0
- package/dist/dom-polyfills.js +40 -0
- package/dist/factories/index.d.ts +43 -0
- package/dist/factories/index.js +44 -0
- package/dist/factories/text-factory.d.ts +560 -0
- package/dist/factories/text-factory.js +968 -0
- package/dist/file-processor.d.ts +90 -4
- package/dist/file-processor.js +723 -20
- package/dist/index-manager.d.ts +3 -2
- package/dist/index-manager.js +13 -11
- package/dist/index.d.ts +72 -8
- package/dist/index.js +102 -16
- package/dist/indexer.js +1 -1
- package/dist/ingestion.d.ts +44 -154
- package/dist/ingestion.js +75 -671
- package/dist/mcp-server.d.ts +35 -3
- package/dist/mcp-server.js +1186 -79
- package/dist/multimodal/clip-embedder.d.ts +314 -0
- package/dist/multimodal/clip-embedder.js +945 -0
- package/dist/multimodal/index.d.ts +6 -0
- package/dist/multimodal/index.js +6 -0
- package/dist/preprocess.js +1 -1
- package/dist/run-error-recovery-tests.d.ts +7 -0
- package/dist/run-error-recovery-tests.js +101 -0
- package/dist/search-standalone.js +1 -1
- package/dist/search.d.ts +51 -69
- package/dist/search.js +117 -412
- package/dist/test-utils.d.ts +8 -26
- package/dist/text/chunker.d.ts +33 -0
- package/dist/{chunker.js → text/chunker.js} +98 -75
- package/dist/{embedder.d.ts → text/embedder.d.ts} +22 -1
- package/dist/{embedder.js → text/embedder.js} +84 -10
- package/dist/text/index.d.ts +8 -0
- package/dist/text/index.js +9 -0
- package/dist/text/preprocessors/index.d.ts +17 -0
- package/dist/text/preprocessors/index.js +38 -0
- package/dist/text/preprocessors/mdx.d.ts +25 -0
- package/dist/text/preprocessors/mdx.js +101 -0
- package/dist/text/preprocessors/mermaid.d.ts +68 -0
- package/dist/text/preprocessors/mermaid.js +330 -0
- package/dist/text/preprocessors/registry.d.ts +56 -0
- package/dist/text/preprocessors/registry.js +180 -0
- package/dist/text/reranker.d.ts +59 -0
- package/dist/{reranker.js → text/reranker.js} +138 -53
- package/dist/text/sentence-transformer-embedder.d.ts +96 -0
- package/dist/text/sentence-transformer-embedder.js +340 -0
- package/dist/{tokenizer.d.ts → text/tokenizer.d.ts} +1 -0
- package/dist/{tokenizer.js → text/tokenizer.js} +7 -2
- package/dist/types.d.ts +40 -1
- package/dist/utils/vector-math.d.ts +31 -0
- package/dist/utils/vector-math.js +70 -0
- package/package.json +16 -4
- package/dist/api-errors.d.ts.map +0 -1
- package/dist/api-errors.js.map +0 -1
- package/dist/chunker.d.ts +0 -47
- package/dist/chunker.d.ts.map +0 -1
- package/dist/chunker.js.map +0 -1
- package/dist/cli/indexer.d.ts.map +0 -1
- package/dist/cli/indexer.js.map +0 -1
- package/dist/cli/search.d.ts.map +0 -1
- package/dist/cli/search.js.map +0 -1
- package/dist/cli.d.ts.map +0 -1
- package/dist/cli.js.map +0 -1
- package/dist/config.d.ts.map +0 -1
- package/dist/config.js.map +0 -1
- package/dist/db.d.ts +0 -90
- package/dist/db.d.ts.map +0 -1
- package/dist/db.js +0 -340
- package/dist/db.js.map +0 -1
- package/dist/embedder.d.ts.map +0 -1
- package/dist/embedder.js.map +0 -1
- package/dist/error-handler.d.ts.map +0 -1
- package/dist/error-handler.js.map +0 -1
- package/dist/file-processor.d.ts.map +0 -1
- package/dist/file-processor.js.map +0 -1
- package/dist/index-manager.d.ts.map +0 -1
- package/dist/index-manager.js.map +0 -1
- package/dist/index.d.ts.map +0 -1
- package/dist/index.js.map +0 -1
- package/dist/indexer.d.ts.map +0 -1
- package/dist/indexer.js.map +0 -1
- package/dist/ingestion.d.ts.map +0 -1
- package/dist/ingestion.js.map +0 -1
- package/dist/mcp-server.d.ts.map +0 -1
- package/dist/mcp-server.js.map +0 -1
- package/dist/path-manager.d.ts.map +0 -1
- package/dist/path-manager.js.map +0 -1
- package/dist/preprocess.d.ts.map +0 -1
- package/dist/preprocess.js.map +0 -1
- package/dist/preprocessors/index.d.ts.map +0 -1
- package/dist/preprocessors/index.js.map +0 -1
- package/dist/preprocessors/mdx.d.ts.map +0 -1
- package/dist/preprocessors/mdx.js.map +0 -1
- package/dist/preprocessors/mermaid.d.ts.map +0 -1
- package/dist/preprocessors/mermaid.js.map +0 -1
- package/dist/preprocessors/registry.d.ts.map +0 -1
- package/dist/preprocessors/registry.js.map +0 -1
- package/dist/reranker.d.ts +0 -40
- package/dist/reranker.d.ts.map +0 -1
- package/dist/reranker.js.map +0 -1
- package/dist/resource-manager-demo.d.ts +0 -7
- package/dist/resource-manager-demo.d.ts.map +0 -1
- package/dist/resource-manager-demo.js +0 -52
- package/dist/resource-manager-demo.js.map +0 -1
- package/dist/resource-manager.d.ts +0 -129
- package/dist/resource-manager.d.ts.map +0 -1
- package/dist/resource-manager.js +0 -389
- package/dist/resource-manager.js.map +0 -1
- package/dist/search-standalone.d.ts.map +0 -1
- package/dist/search-standalone.js.map +0 -1
- package/dist/search.d.ts.map +0 -1
- package/dist/search.js.map +0 -1
- package/dist/test-utils.d.ts.map +0 -1
- package/dist/test-utils.js.map +0 -1
- package/dist/tokenizer.d.ts.map +0 -1
- package/dist/tokenizer.js.map +0 -1
- package/dist/types.d.ts.map +0 -1
- package/dist/types.js.map +0 -1
- package/dist/vector-index.d.ts.map +0 -1
- package/dist/vector-index.js.map +0 -1
package/dist/ingestion.js
CHANGED
|
@@ -1,705 +1,109 @@
|
|
|
1
|
-
import { discoverAndProcessFiles } from './file-processor.js';
|
|
2
|
-
import { chunkDocument } from './chunker.js';
|
|
3
|
-
import { IndexManager } from './index-manager.js';
|
|
4
|
-
import { openDatabase, initializeSchema, insertChunk, upsertDocument } from './db.js';
|
|
5
|
-
import { config, validateConfig, getModelDefaults } from './config.js';
|
|
6
|
-
import { DocumentPathManager } from './path-manager.js';
|
|
7
|
-
import { join, resolve } from 'path';
|
|
8
|
-
import { existsSync } from 'fs';
|
|
9
1
|
/**
|
|
10
|
-
*
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
*
|
|
24
|
-
*
|
|
25
|
-
*
|
|
26
|
-
|
|
27
|
-
function resolveIngestionPaths(basePath) {
|
|
28
|
-
const resolvedBasePath = basePath ? resolve(basePath) : process.cwd();
|
|
29
|
-
return {
|
|
30
|
-
basePath: resolvedBasePath,
|
|
31
|
-
dbPath: join(resolvedBasePath, 'db.sqlite'),
|
|
32
|
-
indexPath: join(resolvedBasePath, 'vector-index.bin')
|
|
33
|
-
};
|
|
34
|
-
}
|
|
35
|
-
/**
|
|
36
|
-
* Main ingestion pipeline class
|
|
37
|
-
* Coordinates the entire process from file discovery to vector storage
|
|
2
|
+
* Public API IngestionPipeline - Simple constructor interface with internal factory usage
|
|
3
|
+
*
|
|
4
|
+
* This class provides a clean, simple API while using the new core architecture
|
|
5
|
+
* internally. It handles dependency injection automatically.
|
|
6
|
+
*
|
|
7
|
+
* @example
|
|
8
|
+
* ```typescript
|
|
9
|
+
* // Simple usage
|
|
10
|
+
* const pipeline = new IngestionPipeline('./db.sqlite', './index.bin');
|
|
11
|
+
* await pipeline.ingestDirectory('./documents');
|
|
12
|
+
*
|
|
13
|
+
* // With options
|
|
14
|
+
* const pipeline = new IngestionPipeline('./db.sqlite', './index.bin', {
|
|
15
|
+
* embeddingModel: 'all-MiniLM-L6-v2',
|
|
16
|
+
* chunkSize: 512
|
|
17
|
+
* });
|
|
18
|
+
* ```
|
|
38
19
|
*/
|
|
20
|
+
import { TextIngestionFactory } from './factories/index.js';
|
|
39
21
|
export class IngestionPipeline {
|
|
40
|
-
// Static properties for automatic resource management (Requirement 5.4, 5.5)
|
|
41
|
-
static instances = new Set();
|
|
42
|
-
static cleanupHandlersSet = false;
|
|
43
|
-
db = null;
|
|
44
|
-
indexManager = null;
|
|
45
|
-
embeddingEngine = null;
|
|
46
|
-
pathManager = null;
|
|
47
|
-
isInitialized = false;
|
|
48
22
|
dbPath;
|
|
49
23
|
indexPath;
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
24
|
+
options;
|
|
25
|
+
corePipeline = null;
|
|
26
|
+
initPromise = null;
|
|
27
|
+
defaultChunkConfig = null;
|
|
28
|
+
constructor(dbPath, indexPath, options = {}) {
|
|
29
|
+
this.dbPath = dbPath;
|
|
30
|
+
this.indexPath = indexPath;
|
|
31
|
+
this.options = options;
|
|
32
|
+
// Validate required parameters
|
|
33
|
+
if (!dbPath || typeof dbPath !== 'string' || dbPath.trim() === '') {
|
|
34
|
+
throw new Error('Both dbPath and indexPath are required.\n' +
|
|
35
|
+
'Example: const ingestion = new IngestionPipeline("./db.sqlite", "./index.bin");\n' +
|
|
36
|
+
'Or use: const ingestion = await IngestionFactory.create("./db.sqlite", "./index.bin");');
|
|
62
37
|
}
|
|
63
|
-
if (
|
|
64
|
-
throw new Error('
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
const pathConfig = resolveIngestionPaths(basePath);
|
|
68
|
-
this.basePath = pathConfig.basePath;
|
|
69
|
-
this.dbPath = pathConfig.dbPath;
|
|
70
|
-
this.indexPath = pathConfig.indexPath;
|
|
71
|
-
// Store the provided embedder for later use
|
|
72
|
-
if (embedder) {
|
|
73
|
-
this.embeddingEngine = embedder;
|
|
74
|
-
}
|
|
75
|
-
// Initialize path manager with default configuration
|
|
76
|
-
const effectiveConfig = this.getEffectiveConfig();
|
|
77
|
-
this.pathManager = new DocumentPathManager(effectiveConfig.path_storage_strategy, this.basePath);
|
|
78
|
-
// Set up automatic cleanup on process exit (Requirement 5.5)
|
|
79
|
-
this.setupAutomaticCleanup();
|
|
80
|
-
}
|
|
81
|
-
/**
|
|
82
|
-
* Set configuration overrides (for internal use)
|
|
83
|
-
* @param overrides - Configuration overrides to apply
|
|
84
|
-
*/
|
|
85
|
-
setConfigOverrides(overrides) {
|
|
86
|
-
this.configOverrides = overrides;
|
|
87
|
-
}
|
|
88
|
-
/**
|
|
89
|
-
* Set path storage strategy
|
|
90
|
-
* @param strategy - Path storage strategy ('absolute' or 'relative')
|
|
91
|
-
* @param basePath - Base path for relative paths (optional, defaults to current base path)
|
|
92
|
-
*/
|
|
93
|
-
setPathStorageStrategy(strategy, basePath) {
|
|
94
|
-
const effectiveBasePath = basePath || this.basePath;
|
|
95
|
-
this.pathManager = new DocumentPathManager(strategy, effectiveBasePath);
|
|
96
|
-
}
|
|
97
|
-
/**
|
|
98
|
-
* Get effective configuration with overrides applied
|
|
99
|
-
*/
|
|
100
|
-
getEffectiveConfig() {
|
|
101
|
-
const baseConfig = { ...config, ...this.configOverrides };
|
|
102
|
-
// If model is overridden, apply model-specific defaults for chunk_size, chunk_overlap, and batch_size
|
|
103
|
-
// unless they are explicitly overridden
|
|
104
|
-
if (this.configOverrides.embedding_model && this.configOverrides.embedding_model !== config.embedding_model) {
|
|
105
|
-
const modelDefaults = getModelDefaults(this.configOverrides.embedding_model);
|
|
106
|
-
// Apply model-specific defaults only if not explicitly overridden
|
|
107
|
-
if (!this.configOverrides.chunk_size) {
|
|
108
|
-
baseConfig.chunk_size = modelDefaults.chunk_size;
|
|
109
|
-
}
|
|
110
|
-
if (!this.configOverrides.chunk_overlap) {
|
|
111
|
-
baseConfig.chunk_overlap = modelDefaults.chunk_overlap;
|
|
112
|
-
}
|
|
113
|
-
if (!this.configOverrides.batch_size) {
|
|
114
|
-
baseConfig.batch_size = modelDefaults.batch_size;
|
|
115
|
-
}
|
|
38
|
+
if (!indexPath || typeof indexPath !== 'string' || indexPath.trim() === '') {
|
|
39
|
+
throw new Error('Both dbPath and indexPath are required.\n' +
|
|
40
|
+
'Example: const ingestion = new IngestionPipeline("./db.sqlite", "./index.bin");\n' +
|
|
41
|
+
'Or use: const ingestion = await IngestionFactory.create("./db.sqlite", "./index.bin");');
|
|
116
42
|
}
|
|
117
|
-
return baseConfig;
|
|
118
43
|
}
|
|
119
44
|
/**
|
|
120
|
-
*
|
|
121
|
-
* Implements lazy initialization as required by 5.2
|
|
122
|
-
*/
|
|
123
|
-
async ensureInitialized() {
|
|
124
|
-
if (this.isInitialized) {
|
|
125
|
-
return;
|
|
126
|
-
}
|
|
127
|
-
try {
|
|
128
|
-
console.log('Initializing ingestion pipeline...');
|
|
129
|
-
const effectiveConfig = this.getEffectiveConfig();
|
|
130
|
-
// Validate configuration
|
|
131
|
-
validateConfig(effectiveConfig);
|
|
132
|
-
// Initialize database
|
|
133
|
-
console.log('Opening database connection...');
|
|
134
|
-
this.db = await openDatabase(this.dbPath);
|
|
135
|
-
await initializeSchema(this.db);
|
|
136
|
-
// Initialize index manager
|
|
137
|
-
console.log('Initializing index manager...');
|
|
138
|
-
const { getModelDefaults } = await import('./config.js');
|
|
139
|
-
const modelDefaults = getModelDefaults(effectiveConfig.embedding_model);
|
|
140
|
-
this.indexManager = new IndexManager(this.indexPath, this.dbPath, modelDefaults.dimensions, effectiveConfig.embedding_model);
|
|
141
|
-
await this.indexManager.initialize();
|
|
142
|
-
// Initialize embedding engine (use provided one or create new)
|
|
143
|
-
if (!this.embeddingEngine) {
|
|
144
|
-
console.log('Loading embedding model...');
|
|
145
|
-
const { initializeEmbeddingEngine } = await import('./embedder.js');
|
|
146
|
-
this.embeddingEngine = await initializeEmbeddingEngine(effectiveConfig.embedding_model, effectiveConfig.batch_size);
|
|
147
|
-
}
|
|
148
|
-
else {
|
|
149
|
-
console.log('Using provided embedding engine...');
|
|
150
|
-
}
|
|
151
|
-
// Check model version compatibility
|
|
152
|
-
const currentModelVersion = this.embeddingEngine.getModelVersion();
|
|
153
|
-
await this.indexManager.validateModelVersionOrExit(currentModelVersion);
|
|
154
|
-
this.isInitialized = true;
|
|
155
|
-
console.log('Ingestion pipeline initialized successfully');
|
|
156
|
-
}
|
|
157
|
-
catch (error) {
|
|
158
|
-
await this.cleanup();
|
|
159
|
-
throw this.createUserFriendlyError(error, 'initialization');
|
|
160
|
-
}
|
|
161
|
-
}
|
|
162
|
-
/**
|
|
163
|
-
* Create user-friendly error messages with actionable suggestions
|
|
164
|
-
* Implements requirement 5.3: Clear, actionable error messages with specific next steps
|
|
165
|
-
*/
|
|
166
|
-
createUserFriendlyError(error, context) {
|
|
167
|
-
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
168
|
-
// Handle common error scenarios with specific guidance
|
|
169
|
-
if (errorMessage.includes('ENOENT') || errorMessage.includes('no such file')) {
|
|
170
|
-
if (context === 'path_validation') {
|
|
171
|
-
return new IngestionError(`Directory or file path does not exist: ${errorMessage}`, 'PATH_NOT_FOUND', [
|
|
172
|
-
'Check that the path exists and is accessible',
|
|
173
|
-
'Ensure you have read permissions for the directory',
|
|
174
|
-
'Use an absolute path if the relative path is not working'
|
|
175
|
-
]);
|
|
176
|
-
}
|
|
177
|
-
else {
|
|
178
|
-
return new IngestionError(`Required files not found during ${context}`, 'FILES_NOT_FOUND', [
|
|
179
|
-
'Ensure the base directory exists and is writable',
|
|
180
|
-
'Check file permissions in the target directory',
|
|
181
|
-
'Try using an absolute path instead of a relative path'
|
|
182
|
-
]);
|
|
183
|
-
}
|
|
184
|
-
}
|
|
185
|
-
if (errorMessage.includes('EACCES') || errorMessage.includes('permission denied')) {
|
|
186
|
-
return new IngestionError(`Permission denied during ${context}`, 'PERMISSION_DENIED', [
|
|
187
|
-
'Check that you have write permissions to the directory',
|
|
188
|
-
'Try running with appropriate permissions',
|
|
189
|
-
'Ensure the directory is not read-only'
|
|
190
|
-
]);
|
|
191
|
-
}
|
|
192
|
-
if (errorMessage.includes('ENOSPC') || errorMessage.includes('no space left')) {
|
|
193
|
-
return new IngestionError(`Insufficient disk space during ${context}`, 'DISK_SPACE_FULL', [
|
|
194
|
-
'Free up disk space in the target directory',
|
|
195
|
-
'Choose a different location with more available space',
|
|
196
|
-
'Check disk usage with your system tools'
|
|
197
|
-
]);
|
|
198
|
-
}
|
|
199
|
-
if (errorMessage.includes('model') && errorMessage.includes('version')) {
|
|
200
|
-
return new IngestionError(`Embedding model compatibility issue: ${errorMessage}`, 'MODEL_COMPATIBILITY', [
|
|
201
|
-
'Run pipeline.rebuildIndex() to rebuild with the current model',
|
|
202
|
-
'Or specify the same model that was used during original ingestion',
|
|
203
|
-
'Check the model configuration in your setup'
|
|
204
|
-
]);
|
|
205
|
-
}
|
|
206
|
-
if (errorMessage.includes('embedding') || errorMessage.includes('model')) {
|
|
207
|
-
return new IngestionError(`Embedding model initialization failed: ${errorMessage}`, 'MODEL_INIT_FAILED', [
|
|
208
|
-
'Check your internet connection for model downloads',
|
|
209
|
-
'Ensure you have sufficient memory available',
|
|
210
|
-
'Try specifying a different embedding model',
|
|
211
|
-
'Check that the model name is correct and supported'
|
|
212
|
-
]);
|
|
213
|
-
}
|
|
214
|
-
if (errorMessage.includes('database') || errorMessage.includes('sqlite')) {
|
|
215
|
-
return new IngestionError(`Database initialization failed: ${errorMessage}`, 'DATABASE_ERROR', [
|
|
216
|
-
'Check that the database file is not corrupted',
|
|
217
|
-
'Ensure the directory is writable',
|
|
218
|
-
'Try deleting the database file to start fresh',
|
|
219
|
-
'Check for sufficient disk space'
|
|
220
|
-
]);
|
|
221
|
-
}
|
|
222
|
-
// Generic error with basic suggestions
|
|
223
|
-
return new IngestionError(`${context} failed: ${errorMessage}`, 'GENERAL_ERROR', [
|
|
224
|
-
'Check the error message above for specific details',
|
|
225
|
-
'Ensure all file paths are correct and accessible',
|
|
226
|
-
'Verify you have necessary permissions',
|
|
227
|
-
'Try the operation again or contact support if the issue persists'
|
|
228
|
-
]);
|
|
229
|
-
}
|
|
230
|
-
/**
|
|
231
|
-
* Initialize the ingestion pipeline (public method for backward compatibility)
|
|
232
|
-
* Sets up database, index manager, and embedding engine
|
|
45
|
+
* Initialize the ingestion pipeline using the factory
|
|
233
46
|
*/
|
|
234
47
|
async initialize() {
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
/**
|
|
238
|
-
* Ingest documents from a directory (matches README API)
|
|
239
|
-
* Automatically initializes resources on first use (Requirements 2.1, 2.3, 5.2)
|
|
240
|
-
* @param directoryPath - Path to directory containing documents
|
|
241
|
-
* @param options - Optional ingestion configuration
|
|
242
|
-
* @returns Promise resolving to ingestion results
|
|
243
|
-
*/
|
|
244
|
-
async ingestDirectory(directoryPath, options = {}) {
|
|
245
|
-
// Validate path exists before initialization
|
|
246
|
-
if (!existsSync(directoryPath)) {
|
|
247
|
-
throw this.createUserFriendlyError(new Error(`Directory not found: ${directoryPath}`), 'path_validation');
|
|
248
|
-
}
|
|
249
|
-
// Automatic initialization on first use (Requirement 5.2)
|
|
250
|
-
await this.ensureInitialized();
|
|
251
|
-
return this.ingestPath(directoryPath, options);
|
|
252
|
-
}
|
|
253
|
-
/**
|
|
254
|
-
* Ingest a single file (matches README API)
|
|
255
|
-
* Automatically initializes resources on first use (Requirements 2.2, 2.3, 5.2)
|
|
256
|
-
* @param filePath - Path to the file to ingest
|
|
257
|
-
* @param options - Optional ingestion configuration
|
|
258
|
-
* @returns Promise resolving to ingestion results
|
|
259
|
-
*/
|
|
260
|
-
async ingestFile(filePath, options = {}) {
|
|
261
|
-
// Validate path exists before initialization
|
|
262
|
-
if (!existsSync(filePath)) {
|
|
263
|
-
throw this.createUserFriendlyError(new Error(`File not found: ${filePath}`), 'path_validation');
|
|
264
|
-
}
|
|
265
|
-
// Automatic initialization on first use (Requirement 5.2)
|
|
266
|
-
await this.ensureInitialized();
|
|
267
|
-
return this.ingestPath(filePath, options);
|
|
268
|
-
}
|
|
269
|
-
/**
|
|
270
|
-
* Ingest documents from a path (file or directory)
|
|
271
|
-
* Implements the complete pipeline: file processing → chunking → embedding → storage
|
|
272
|
-
*
|
|
273
|
-
* Requirements addressed:
|
|
274
|
-
* - 7.5: Single-threaded write processing to avoid SQLite lock contention
|
|
275
|
-
* - 3.3: Graceful handling of embedding failures without stopping ingestion
|
|
276
|
-
* - 10.1: Progress logging and error reporting during batch ingestion
|
|
277
|
-
* - 2.3: Automatic creation of database and index files in appropriate locations
|
|
278
|
-
*/
|
|
279
|
-
async ingestPath(path, options = {}) {
|
|
280
|
-
// Automatic initialization on first use (Requirement 5.2)
|
|
281
|
-
await this.ensureInitialized();
|
|
282
|
-
const startTime = Date.now();
|
|
283
|
-
console.log(`\n=== Starting ingestion from: ${path} ===`);
|
|
284
|
-
try {
|
|
285
|
-
// Phase 1: File Discovery and Processing
|
|
286
|
-
console.log('\n--- Phase 1: File Discovery and Processing ---');
|
|
287
|
-
const fileResult = await discoverAndProcessFiles(path, options.fileOptions, this.pathManager);
|
|
288
|
-
if (fileResult.documents.length === 0) {
|
|
289
|
-
console.log('No documents found to process');
|
|
290
|
-
return {
|
|
291
|
-
documentsProcessed: 0,
|
|
292
|
-
chunksCreated: 0,
|
|
293
|
-
embeddingsGenerated: 0,
|
|
294
|
-
documentErrors: fileResult.processingResult.errors.length,
|
|
295
|
-
embeddingErrors: 0,
|
|
296
|
-
processingTimeMs: Date.now() - startTime
|
|
297
|
-
};
|
|
298
|
-
}
|
|
299
|
-
// Phase 2: Document Chunking
|
|
300
|
-
console.log('\n--- Phase 2: Document Chunking ---');
|
|
301
|
-
const effectiveConfig = this.getEffectiveConfig();
|
|
302
|
-
const effectiveChunkConfig = options.chunkConfig || {
|
|
303
|
-
chunkSize: effectiveConfig.chunk_size,
|
|
304
|
-
chunkOverlap: effectiveConfig.chunk_overlap
|
|
305
|
-
};
|
|
306
|
-
const chunkingResult = await this.chunkDocuments(fileResult.documents, effectiveChunkConfig);
|
|
307
|
-
if (chunkingResult.totalChunks === 0) {
|
|
308
|
-
console.log('No chunks created from documents');
|
|
309
|
-
return {
|
|
310
|
-
documentsProcessed: fileResult.documents.length,
|
|
311
|
-
chunksCreated: 0,
|
|
312
|
-
embeddingsGenerated: 0,
|
|
313
|
-
documentErrors: fileResult.processingResult.errors.length,
|
|
314
|
-
embeddingErrors: 0,
|
|
315
|
-
processingTimeMs: Date.now() - startTime
|
|
316
|
-
};
|
|
317
|
-
}
|
|
318
|
-
// Phase 3: Embedding Generation
|
|
319
|
-
console.log('\n--- Phase 3: Embedding Generation ---');
|
|
320
|
-
const embeddingResult = await this.generateEmbeddings(chunkingResult.allChunks);
|
|
321
|
-
// Phase 4: Database and Index Storage (Single-threaded writes)
|
|
322
|
-
console.log('\n--- Phase 4: Storage Operations ---');
|
|
323
|
-
await this.storeDocumentsAndChunks(chunkingResult.documentChunks, embeddingResult.embeddings);
|
|
324
|
-
// Phase 5: Vector Index Updates
|
|
325
|
-
console.log('\n--- Phase 5: Vector Index Updates ---');
|
|
326
|
-
await this.updateVectorIndex(embeddingResult.embeddings);
|
|
327
|
-
const endTime = Date.now();
|
|
328
|
-
const processingTimeMs = endTime - startTime;
|
|
329
|
-
const result = {
|
|
330
|
-
documentsProcessed: fileResult.documents.length,
|
|
331
|
-
chunksCreated: chunkingResult.totalChunks,
|
|
332
|
-
embeddingsGenerated: embeddingResult.embeddings.length,
|
|
333
|
-
documentErrors: fileResult.processingResult.errors.length,
|
|
334
|
-
embeddingErrors: embeddingResult.errors,
|
|
335
|
-
processingTimeMs
|
|
336
|
-
};
|
|
337
|
-
console.log('\n=== Ingestion Complete ===');
|
|
338
|
-
console.log(`Documents processed: ${result.documentsProcessed}`);
|
|
339
|
-
console.log(`Chunks created: ${result.chunksCreated}`);
|
|
340
|
-
console.log(`Embeddings generated: ${result.embeddingsGenerated}`);
|
|
341
|
-
console.log(`Document errors: ${result.documentErrors}`);
|
|
342
|
-
console.log(`Embedding errors: ${result.embeddingErrors}`);
|
|
343
|
-
console.log(`Total time: ${(processingTimeMs / 1000).toFixed(2)}s`);
|
|
344
|
-
return result;
|
|
48
|
+
if (this.corePipeline) {
|
|
49
|
+
return; // Already initialized
|
|
345
50
|
}
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
console.error(`Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
|
349
|
-
// Convert to user-friendly error if not already one (Requirement 2.4)
|
|
350
|
-
if (error instanceof IngestionError) {
|
|
351
|
-
throw error;
|
|
352
|
-
}
|
|
353
|
-
else {
|
|
354
|
-
throw this.createUserFriendlyError(error, 'ingestion');
|
|
355
|
-
}
|
|
51
|
+
if (this.initPromise) {
|
|
52
|
+
return this.initPromise; // Initialization in progress
|
|
356
53
|
}
|
|
54
|
+
this.initPromise = (async () => {
|
|
55
|
+
this.corePipeline = await TextIngestionFactory.create(this.dbPath, this.indexPath, this.options);
|
|
56
|
+
})();
|
|
57
|
+
return this.initPromise;
|
|
357
58
|
}
|
|
358
59
|
/**
|
|
359
|
-
*
|
|
60
|
+
* Ingest a single document
|
|
360
61
|
*/
|
|
361
|
-
async
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
console.log(`Processing ${documents.length} document${documents.length === 1 ? '' : 's'} for chunking...`);
|
|
366
|
-
for (let i = 0; i < documents.length; i++) {
|
|
367
|
-
const document = documents[i];
|
|
368
|
-
try {
|
|
369
|
-
const chunks = await chunkDocument(document, chunkConfig);
|
|
370
|
-
documentChunks.push({ document, chunks });
|
|
371
|
-
// Collect all chunk texts for embedding
|
|
372
|
-
const chunkTexts = chunks.map(chunk => chunk.text);
|
|
373
|
-
allChunks.push(...chunkTexts);
|
|
374
|
-
totalChunks += chunks.length;
|
|
375
|
-
// Progress logging - more frequent for better user experience
|
|
376
|
-
if (documents.length <= 10 || (i + 1) % Math.max(1, Math.floor(documents.length / 10)) === 0 || i === documents.length - 1) {
|
|
377
|
-
const percentage = Math.round(((i + 1) / documents.length) * 100);
|
|
378
|
-
console.log(`Processed ${i + 1} of ${documents.length} documents (${percentage}%) - ${totalChunks} chunks created`);
|
|
379
|
-
}
|
|
380
|
-
}
|
|
381
|
-
catch (error) {
|
|
382
|
-
console.error(`Failed to chunk document ${document.source}:`, error instanceof Error ? error.message : String(error));
|
|
383
|
-
// Continue with other documents
|
|
384
|
-
continue;
|
|
385
|
-
}
|
|
62
|
+
async ingestDocument(filePath, options) {
|
|
63
|
+
await this.initialize();
|
|
64
|
+
if (!this.corePipeline) {
|
|
65
|
+
throw new Error('IngestionPipeline failed to initialize');
|
|
386
66
|
}
|
|
387
|
-
|
|
388
|
-
return { documentChunks, allChunks, totalChunks };
|
|
67
|
+
return this.corePipeline.ingestFile(filePath, options);
|
|
389
68
|
}
|
|
390
69
|
/**
|
|
391
|
-
*
|
|
392
|
-
* Requirement 3.3: Graceful handling of embedding failures without stopping ingestion
|
|
70
|
+
* Ingest all documents in a directory
|
|
393
71
|
*/
|
|
394
|
-
async
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
console.log(`Generating embeddings for ${chunkTexts.length} chunk${chunkTexts.length === 1 ? '' : 's'}...`);
|
|
399
|
-
console.log('This may take a few minutes depending on the number of chunks...');
|
|
400
|
-
try {
|
|
401
|
-
// Use the embedDocumentBatch method which has built-in error handling
|
|
402
|
-
const embeddings = await this.embeddingEngine.embedDocumentBatch(chunkTexts);
|
|
403
|
-
const errors = chunkTexts.length - embeddings.length;
|
|
404
|
-
if (errors > 0) {
|
|
405
|
-
console.warn(`⚠ Warning: ${errors} chunk${errors === 1 ? '' : 's'} failed embedding and ${errors === 1 ? 'was' : 'were'} skipped`);
|
|
406
|
-
}
|
|
407
|
-
console.log(`✓ Generated ${embeddings.length} embeddings successfully`);
|
|
408
|
-
return { embeddings, errors };
|
|
409
|
-
}
|
|
410
|
-
catch (error) {
|
|
411
|
-
console.error('Critical embedding failure:', error instanceof Error ? error.message : String(error));
|
|
412
|
-
throw new Error(`Embedding generation failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
|
413
|
-
}
|
|
414
|
-
}
|
|
415
|
-
/**
|
|
416
|
-
* Store documents and chunks in database with single-threaded writes
|
|
417
|
-
* Requirement 7.5: Single-threaded write processing to avoid SQLite lock contention
|
|
418
|
-
*/
|
|
419
|
-
async storeDocumentsAndChunks(documentChunks, embeddings) {
|
|
420
|
-
if (!this.db) {
|
|
421
|
-
throw new Error('Database not initialized');
|
|
422
|
-
}
|
|
423
|
-
console.log(`Storing ${documentChunks.length} document${documentChunks.length === 1 ? '' : 's'} and chunks in database...`);
|
|
424
|
-
// Create a mapping of chunk text to embedding for efficient lookup
|
|
425
|
-
const embeddingMap = new Map();
|
|
426
|
-
let embeddingIndex = 0;
|
|
427
|
-
// Build mapping - this assumes embeddings are in the same order as chunks were processed
|
|
428
|
-
for (const { chunks } of documentChunks) {
|
|
429
|
-
for (const chunk of chunks) {
|
|
430
|
-
if (embeddingIndex < embeddings.length) {
|
|
431
|
-
embeddingMap.set(chunk.text, embeddings[embeddingIndex]);
|
|
432
|
-
embeddingIndex++;
|
|
433
|
-
}
|
|
434
|
-
}
|
|
435
|
-
}
|
|
436
|
-
let totalChunksStored = 0;
|
|
437
|
-
let documentsStored = 0;
|
|
438
|
-
// Process each document sequentially (single-threaded writes)
|
|
439
|
-
for (const { document, chunks } of documentChunks) {
|
|
440
|
-
try {
|
|
441
|
-
// Insert or get existing document
|
|
442
|
-
const documentId = await upsertDocument(this.db, document.source, document.title);
|
|
443
|
-
documentsStored++;
|
|
444
|
-
// Insert all chunks for this document
|
|
445
|
-
let chunksStoredForDoc = 0;
|
|
446
|
-
for (const chunk of chunks) {
|
|
447
|
-
const embedding = embeddingMap.get(chunk.text);
|
|
448
|
-
if (embedding) {
|
|
449
|
-
try {
|
|
450
|
-
await insertChunk(this.db, embedding.embedding_id, documentId, chunk.text, chunk.chunkIndex);
|
|
451
|
-
chunksStoredForDoc++;
|
|
452
|
-
totalChunksStored++;
|
|
453
|
-
}
|
|
454
|
-
catch (chunkError) {
|
|
455
|
-
console.error(`Failed to store chunk ${chunk.chunkIndex} for document ${document.source}:`, chunkError instanceof Error ? chunkError.message : String(chunkError));
|
|
456
|
-
// Continue with other chunks
|
|
457
|
-
}
|
|
458
|
-
}
|
|
459
|
-
else {
|
|
460
|
-
console.warn(`No embedding found for chunk ${chunk.chunkIndex} in document ${document.source}`);
|
|
461
|
-
}
|
|
462
|
-
}
|
|
463
|
-
// Progress logging for storage
|
|
464
|
-
if (documentChunks.length <= 20 || documentsStored % Math.max(1, Math.floor(documentChunks.length / 10)) === 0 || documentsStored === documentChunks.length) {
|
|
465
|
-
const percentage = Math.round((documentsStored / documentChunks.length) * 100);
|
|
466
|
-
console.log(`Stored ${documentsStored} of ${documentChunks.length} documents (${percentage}%) - ${totalChunksStored} chunks total`);
|
|
467
|
-
}
|
|
468
|
-
}
|
|
469
|
-
catch (docError) {
|
|
470
|
-
console.error(`Failed to store document ${document.source}:`, docError instanceof Error ? docError.message : String(docError));
|
|
471
|
-
// Continue with other documents
|
|
472
|
-
}
|
|
473
|
-
}
|
|
474
|
-
console.log(`✓ Storage complete: ${documentsStored} documents, ${totalChunksStored} chunks saved to database`);
|
|
475
|
-
}
|
|
476
|
-
/**
|
|
477
|
-
* Update vector index with new embeddings
|
|
478
|
-
*/
|
|
479
|
-
async updateVectorIndex(embeddings) {
|
|
480
|
-
if (!this.indexManager) {
|
|
481
|
-
throw new Error('Index manager not initialized');
|
|
482
|
-
}
|
|
483
|
-
if (embeddings.length === 0) {
|
|
484
|
-
console.log('No embeddings to add to vector index');
|
|
485
|
-
return;
|
|
486
|
-
}
|
|
487
|
-
console.log(`Adding ${embeddings.length} vector${embeddings.length === 1 ? '' : 's'} to search index...`);
|
|
488
|
-
try {
|
|
489
|
-
await this.indexManager.addVectors(embeddings);
|
|
490
|
-
console.log(`✓ Vector index updated successfully with ${embeddings.length} new vectors`);
|
|
491
|
-
}
|
|
492
|
-
catch (error) {
|
|
493
|
-
console.error('Failed to update vector index:', error instanceof Error ? error.message : String(error));
|
|
494
|
-
throw error;
|
|
495
|
-
}
|
|
496
|
-
}
|
|
497
|
-
/**
|
|
498
|
-
* Initialize the pipeline for rebuild (skips model compatibility check)
|
|
499
|
-
*/
|
|
500
|
-
async initializeForRebuild() {
|
|
501
|
-
if (this.isInitialized) {
|
|
502
|
-
return;
|
|
503
|
-
}
|
|
504
|
-
try {
|
|
505
|
-
console.log('Initializing ingestion pipeline...');
|
|
506
|
-
const effectiveConfig = this.getEffectiveConfig();
|
|
507
|
-
// Validate configuration
|
|
508
|
-
validateConfig(effectiveConfig);
|
|
509
|
-
// Initialize database
|
|
510
|
-
console.log('Opening database connection...');
|
|
511
|
-
this.db = await openDatabase(this.dbPath);
|
|
512
|
-
await initializeSchema(this.db);
|
|
513
|
-
// Initialize index manager (skip model compatibility check for rebuild)
|
|
514
|
-
console.log('Initializing index manager...');
|
|
515
|
-
const { getModelDefaults } = await import('./config.js');
|
|
516
|
-
const modelDefaults = getModelDefaults(effectiveConfig.embedding_model);
|
|
517
|
-
this.indexManager = new IndexManager(this.indexPath, this.dbPath, modelDefaults.dimensions, effectiveConfig.embedding_model);
|
|
518
|
-
await this.indexManager.initialize(true); // Skip model check
|
|
519
|
-
// Initialize embedding engine (use provided one or create new)
|
|
520
|
-
if (!this.embeddingEngine) {
|
|
521
|
-
console.log('Loading embedding model...');
|
|
522
|
-
const { initializeEmbeddingEngine } = await import('./embedder.js');
|
|
523
|
-
this.embeddingEngine = await initializeEmbeddingEngine(effectiveConfig.embedding_model, effectiveConfig.batch_size);
|
|
524
|
-
}
|
|
525
|
-
else {
|
|
526
|
-
console.log('Using provided embedding engine...');
|
|
527
|
-
}
|
|
528
|
-
this.isInitialized = true;
|
|
529
|
-
console.log('Ingestion pipeline initialized successfully');
|
|
530
|
-
}
|
|
531
|
-
catch (error) {
|
|
532
|
-
await this.cleanup();
|
|
533
|
-
throw this.createUserFriendlyError(error, 'initialization');
|
|
72
|
+
async ingestDirectory(directoryPath, options) {
|
|
73
|
+
await this.initialize();
|
|
74
|
+
if (!this.corePipeline) {
|
|
75
|
+
throw new Error('IngestionPipeline failed to initialize');
|
|
534
76
|
}
|
|
77
|
+
return this.corePipeline.ingestDirectory(directoryPath, options);
|
|
535
78
|
}
|
|
536
79
|
/**
|
|
537
|
-
*
|
|
538
|
-
*
|
|
539
|
-
*
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
console.log('\n=== Starting Index Rebuild ===');
|
|
550
|
-
try {
|
|
551
|
-
await this.indexManager.rebuildWithEmbeddings(this.embeddingEngine);
|
|
552
|
-
console.log('Index rebuild completed successfully');
|
|
553
|
-
}
|
|
554
|
-
catch (error) {
|
|
555
|
-
throw this.createUserFriendlyError(error, 'rebuild');
|
|
556
|
-
}
|
|
557
|
-
}
|
|
558
|
-
/**
|
|
559
|
-
* Get pipeline statistics
|
|
560
|
-
*/
|
|
561
|
-
async getStats() {
|
|
562
|
-
const stats = {
|
|
563
|
-
isInitialized: this.isInitialized,
|
|
564
|
-
indexStats: null
|
|
565
|
-
};
|
|
566
|
-
if (this.indexManager) {
|
|
567
|
-
try {
|
|
568
|
-
stats.indexStats = await this.indexManager.getStats();
|
|
569
|
-
}
|
|
570
|
-
catch (error) {
|
|
571
|
-
console.error('Failed to get index stats:', error instanceof Error ? error.message : String(error));
|
|
572
|
-
}
|
|
573
|
-
}
|
|
574
|
-
return stats;
|
|
575
|
-
}
|
|
576
|
-
/**
|
|
577
|
-
* Set up automatic cleanup on process exit (Requirement 5.5)
|
|
80
|
+
* Ingest content from memory buffer
|
|
81
|
+
* Enables MCP integration and real-time content processing
|
|
82
|
+
*
|
|
83
|
+
* @example
|
|
84
|
+
* ```typescript
|
|
85
|
+
* const pipeline = new IngestionPipeline('./db.sqlite', './index.bin');
|
|
86
|
+
* const contentId = await pipeline.ingestFromMemory(buffer, {
|
|
87
|
+
* displayName: 'uploaded-file.txt',
|
|
88
|
+
* contentType: 'text/plain'
|
|
89
|
+
* });
|
|
90
|
+
* console.log('Content ingested with ID:', contentId);
|
|
91
|
+
* ```
|
|
578
92
|
*/
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
if (!IngestionPipeline.cleanupHandlersSet) {
|
|
584
|
-
IngestionPipeline.cleanupHandlersSet = true;
|
|
585
|
-
const cleanupAll = async () => {
|
|
586
|
-
const instances = Array.from(IngestionPipeline.instances);
|
|
587
|
-
await Promise.all(instances.map(instance => instance.cleanup()));
|
|
588
|
-
};
|
|
589
|
-
// Handle various exit scenarios
|
|
590
|
-
process.on('exit', () => {
|
|
591
|
-
// Synchronous cleanup for exit event
|
|
592
|
-
for (const instance of IngestionPipeline.instances) {
|
|
593
|
-
try {
|
|
594
|
-
if (instance.db) {
|
|
595
|
-
// Synchronous close for exit handler
|
|
596
|
-
instance.db = null;
|
|
597
|
-
}
|
|
598
|
-
if (instance.indexManager) {
|
|
599
|
-
instance.indexManager = null;
|
|
600
|
-
}
|
|
601
|
-
instance.embeddingEngine = null;
|
|
602
|
-
instance.isInitialized = false;
|
|
603
|
-
}
|
|
604
|
-
catch (error) {
|
|
605
|
-
// Silent cleanup on exit
|
|
606
|
-
}
|
|
607
|
-
}
|
|
608
|
-
});
|
|
609
|
-
process.on('SIGINT', async () => {
|
|
610
|
-
await cleanupAll();
|
|
611
|
-
process.exit(0);
|
|
612
|
-
});
|
|
613
|
-
process.on('SIGTERM', async () => {
|
|
614
|
-
await cleanupAll();
|
|
615
|
-
process.exit(0);
|
|
616
|
-
});
|
|
617
|
-
process.on('uncaughtException', async (error) => {
|
|
618
|
-
console.error('Uncaught exception:', error);
|
|
619
|
-
await cleanupAll();
|
|
620
|
-
process.exit(1);
|
|
621
|
-
});
|
|
622
|
-
process.on('unhandledRejection', async (reason) => {
|
|
623
|
-
console.error('Unhandled rejection:', reason);
|
|
624
|
-
await cleanupAll();
|
|
625
|
-
process.exit(1);
|
|
626
|
-
});
|
|
93
|
+
async ingestFromMemory(content, metadata, options) {
|
|
94
|
+
await this.initialize();
|
|
95
|
+
if (!this.corePipeline) {
|
|
96
|
+
throw new Error('IngestionPipeline failed to initialize');
|
|
627
97
|
}
|
|
98
|
+
return this.corePipeline.ingestFromMemory(content, metadata, options);
|
|
628
99
|
}
|
|
629
100
|
/**
|
|
630
101
|
* Clean up resources
|
|
631
102
|
*/
|
|
632
103
|
async cleanup() {
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
await this.indexManager.close();
|
|
636
|
-
this.indexManager = null;
|
|
637
|
-
}
|
|
638
|
-
if (this.db) {
|
|
639
|
-
await this.db.close();
|
|
640
|
-
this.db = null;
|
|
641
|
-
}
|
|
642
|
-
this.embeddingEngine = null;
|
|
643
|
-
this.isInitialized = false;
|
|
644
|
-
// Remove from instances tracking
|
|
645
|
-
IngestionPipeline.instances.delete(this);
|
|
646
|
-
console.log('Pipeline cleanup completed');
|
|
104
|
+
if (this.corePipeline) {
|
|
105
|
+
await this.corePipeline.cleanup();
|
|
647
106
|
}
|
|
648
|
-
catch (error) {
|
|
649
|
-
console.error('Error during cleanup:', error instanceof Error ? error.message : String(error));
|
|
650
|
-
}
|
|
651
|
-
}
|
|
652
|
-
}
|
|
653
|
-
/**
|
|
654
|
-
* Convenience function to ingest documents from a path
|
|
655
|
-
* Creates a pipeline instance, runs ingestion, and cleans up
|
|
656
|
-
*/
|
|
657
|
-
export async function ingestDocuments(path, options = {}) {
|
|
658
|
-
const pipeline = new IngestionPipeline();
|
|
659
|
-
try {
|
|
660
|
-
await pipeline.initialize();
|
|
661
|
-
const result = await pipeline.ingestPath(path, options);
|
|
662
|
-
return result;
|
|
663
|
-
}
|
|
664
|
-
finally {
|
|
665
|
-
await pipeline.cleanup();
|
|
666
|
-
}
|
|
667
|
-
}
|
|
668
|
-
/**
|
|
669
|
-
* Convenience function to rebuild the index
|
|
670
|
-
* Creates a pipeline instance, rebuilds index, and cleans up
|
|
671
|
-
*/
|
|
672
|
-
export async function rebuildIndex() {
|
|
673
|
-
// First, try to detect the stored model from the existing database
|
|
674
|
-
let configOverrides = {};
|
|
675
|
-
try {
|
|
676
|
-
const { openDatabase, getStoredModelInfo } = await import('./db.js');
|
|
677
|
-
const db = await openDatabase(config.db_file);
|
|
678
|
-
const storedModel = await getStoredModelInfo(db);
|
|
679
|
-
await db.close();
|
|
680
|
-
if (storedModel) {
|
|
681
|
-
console.log(`Detected stored model: ${storedModel.modelName}`);
|
|
682
|
-
const { getModelDefaults } = await import('./config.js');
|
|
683
|
-
const modelDefaults = getModelDefaults(storedModel.modelName);
|
|
684
|
-
configOverrides = {
|
|
685
|
-
embedding_model: storedModel.modelName,
|
|
686
|
-
chunk_size: modelDefaults.chunk_size,
|
|
687
|
-
chunk_overlap: modelDefaults.chunk_overlap,
|
|
688
|
-
batch_size: modelDefaults.batch_size
|
|
689
|
-
};
|
|
690
|
-
}
|
|
691
|
-
}
|
|
692
|
-
catch (error) {
|
|
693
|
-
console.log('Could not detect stored model, using default configuration');
|
|
694
|
-
}
|
|
695
|
-
const pipeline = new IngestionPipeline();
|
|
696
|
-
pipeline.setConfigOverrides(configOverrides);
|
|
697
|
-
try {
|
|
698
|
-
await pipeline.initialize();
|
|
699
|
-
await pipeline.rebuildIndex();
|
|
700
|
-
}
|
|
701
|
-
finally {
|
|
702
|
-
await pipeline.cleanup();
|
|
703
107
|
}
|
|
704
108
|
}
|
|
705
109
|
//# sourceMappingURL=ingestion.js.map
|