rag-lite-ts 1.0.2 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +605 -93
- package/dist/cli/indexer.js +192 -4
- package/dist/cli/search.js +50 -11
- package/dist/cli.js +183 -26
- package/dist/core/abstract-embedder.d.ts +125 -0
- package/dist/core/abstract-embedder.js +264 -0
- package/dist/core/actionable-error-messages.d.ts +60 -0
- package/dist/core/actionable-error-messages.js +397 -0
- package/dist/core/batch-processing-optimizer.d.ts +155 -0
- package/dist/core/batch-processing-optimizer.js +541 -0
- package/dist/core/binary-index-format.d.ts +52 -0
- package/dist/core/binary-index-format.js +122 -0
- package/dist/core/chunker.d.ts +2 -0
- package/dist/core/cli-database-utils.d.ts +53 -0
- package/dist/core/cli-database-utils.js +239 -0
- package/dist/core/config.js +10 -3
- package/dist/core/content-errors.d.ts +111 -0
- package/dist/core/content-errors.js +362 -0
- package/dist/core/content-manager.d.ts +343 -0
- package/dist/core/content-manager.js +1504 -0
- package/dist/core/content-performance-optimizer.d.ts +150 -0
- package/dist/core/content-performance-optimizer.js +516 -0
- package/dist/core/content-resolver.d.ts +104 -0
- package/dist/core/content-resolver.js +285 -0
- package/dist/core/cross-modal-search.d.ts +164 -0
- package/dist/core/cross-modal-search.js +342 -0
- package/dist/core/database-connection-manager.d.ts +109 -0
- package/dist/core/database-connection-manager.js +304 -0
- package/dist/core/db.d.ts +141 -2
- package/dist/core/db.js +631 -89
- package/dist/core/embedder-factory.d.ts +176 -0
- package/dist/core/embedder-factory.js +338 -0
- package/dist/core/index.d.ts +3 -1
- package/dist/core/index.js +4 -1
- package/dist/core/ingestion.d.ts +85 -15
- package/dist/core/ingestion.js +510 -45
- package/dist/core/lazy-dependency-loader.d.ts +152 -0
- package/dist/core/lazy-dependency-loader.js +453 -0
- package/dist/core/mode-detection-service.d.ts +150 -0
- package/dist/core/mode-detection-service.js +565 -0
- package/dist/core/mode-model-validator.d.ts +92 -0
- package/dist/core/mode-model-validator.js +203 -0
- package/dist/core/model-registry.d.ts +120 -0
- package/dist/core/model-registry.js +415 -0
- package/dist/core/model-validator.d.ts +217 -0
- package/dist/core/model-validator.js +782 -0
- package/dist/core/polymorphic-search-factory.d.ts +154 -0
- package/dist/core/polymorphic-search-factory.js +344 -0
- package/dist/core/raglite-paths.d.ts +121 -0
- package/dist/core/raglite-paths.js +145 -0
- package/dist/core/reranking-config.d.ts +42 -0
- package/dist/core/reranking-config.js +156 -0
- package/dist/core/reranking-factory.d.ts +92 -0
- package/dist/core/reranking-factory.js +591 -0
- package/dist/core/reranking-strategies.d.ts +325 -0
- package/dist/core/reranking-strategies.js +720 -0
- package/dist/core/resource-cleanup.d.ts +163 -0
- package/dist/core/resource-cleanup.js +371 -0
- package/dist/core/resource-manager.d.ts +212 -0
- package/dist/core/resource-manager.js +564 -0
- package/dist/core/search.d.ts +28 -1
- package/dist/core/search.js +83 -5
- package/dist/core/streaming-operations.d.ts +145 -0
- package/dist/core/streaming-operations.js +409 -0
- package/dist/core/types.d.ts +3 -0
- package/dist/core/universal-embedder.d.ts +177 -0
- package/dist/core/universal-embedder.js +139 -0
- package/dist/core/validation-messages.d.ts +99 -0
- package/dist/core/validation-messages.js +334 -0
- package/dist/core/vector-index.d.ts +1 -1
- package/dist/core/vector-index.js +37 -39
- package/dist/factories/index.d.ts +3 -1
- package/dist/factories/index.js +2 -0
- package/dist/factories/polymorphic-factory.d.ts +50 -0
- package/dist/factories/polymorphic-factory.js +159 -0
- package/dist/factories/text-factory.d.ts +128 -34
- package/dist/factories/text-factory.js +346 -97
- package/dist/file-processor.d.ts +88 -2
- package/dist/file-processor.js +720 -17
- package/dist/index.d.ts +32 -0
- package/dist/index.js +29 -0
- package/dist/ingestion.d.ts +16 -0
- package/dist/ingestion.js +21 -0
- package/dist/mcp-server.d.ts +35 -3
- package/dist/mcp-server.js +1107 -31
- package/dist/multimodal/clip-embedder.d.ts +327 -0
- package/dist/multimodal/clip-embedder.js +992 -0
- package/dist/multimodal/index.d.ts +6 -0
- package/dist/multimodal/index.js +6 -0
- package/dist/run-error-recovery-tests.d.ts +7 -0
- package/dist/run-error-recovery-tests.js +101 -0
- package/dist/search.d.ts +60 -9
- package/dist/search.js +82 -11
- package/dist/test-utils.d.ts +8 -26
- package/dist/text/chunker.d.ts +1 -0
- package/dist/text/embedder.js +15 -8
- package/dist/text/index.d.ts +1 -0
- package/dist/text/index.js +1 -0
- package/dist/text/reranker.d.ts +1 -2
- package/dist/text/reranker.js +17 -47
- package/dist/text/sentence-transformer-embedder.d.ts +96 -0
- package/dist/text/sentence-transformer-embedder.js +340 -0
- package/dist/types.d.ts +39 -0
- package/dist/utils/vector-math.d.ts +31 -0
- package/dist/utils/vector-math.js +70 -0
- package/package.json +27 -6
- package/dist/api-errors.d.ts.map +0 -1
- package/dist/api-errors.js.map +0 -1
- package/dist/cli/indexer.d.ts.map +0 -1
- package/dist/cli/indexer.js.map +0 -1
- package/dist/cli/search.d.ts.map +0 -1
- package/dist/cli/search.js.map +0 -1
- package/dist/cli.d.ts.map +0 -1
- package/dist/cli.js.map +0 -1
- package/dist/config.d.ts.map +0 -1
- package/dist/config.js.map +0 -1
- package/dist/core/adapters.d.ts.map +0 -1
- package/dist/core/adapters.js.map +0 -1
- package/dist/core/chunker.d.ts.map +0 -1
- package/dist/core/chunker.js.map +0 -1
- package/dist/core/config.d.ts.map +0 -1
- package/dist/core/config.js.map +0 -1
- package/dist/core/db.d.ts.map +0 -1
- package/dist/core/db.js.map +0 -1
- package/dist/core/error-handler.d.ts.map +0 -1
- package/dist/core/error-handler.js.map +0 -1
- package/dist/core/index.d.ts.map +0 -1
- package/dist/core/index.js.map +0 -1
- package/dist/core/ingestion.d.ts.map +0 -1
- package/dist/core/ingestion.js.map +0 -1
- package/dist/core/interfaces.d.ts.map +0 -1
- package/dist/core/interfaces.js.map +0 -1
- package/dist/core/path-manager.d.ts.map +0 -1
- package/dist/core/path-manager.js.map +0 -1
- package/dist/core/search-example.d.ts +0 -25
- package/dist/core/search-example.d.ts.map +0 -1
- package/dist/core/search-example.js +0 -138
- package/dist/core/search-example.js.map +0 -1
- package/dist/core/search-pipeline-example.d.ts +0 -21
- package/dist/core/search-pipeline-example.d.ts.map +0 -1
- package/dist/core/search-pipeline-example.js +0 -188
- package/dist/core/search-pipeline-example.js.map +0 -1
- package/dist/core/search-pipeline.d.ts.map +0 -1
- package/dist/core/search-pipeline.js.map +0 -1
- package/dist/core/search.d.ts.map +0 -1
- package/dist/core/search.js.map +0 -1
- package/dist/core/types.d.ts.map +0 -1
- package/dist/core/types.js.map +0 -1
- package/dist/core/vector-index.d.ts.map +0 -1
- package/dist/core/vector-index.js.map +0 -1
- package/dist/dom-polyfills.d.ts.map +0 -1
- package/dist/dom-polyfills.js.map +0 -1
- package/dist/examples/clean-api-examples.d.ts +0 -44
- package/dist/examples/clean-api-examples.d.ts.map +0 -1
- package/dist/examples/clean-api-examples.js +0 -206
- package/dist/examples/clean-api-examples.js.map +0 -1
- package/dist/factories/index.d.ts.map +0 -1
- package/dist/factories/index.js.map +0 -1
- package/dist/factories/text-factory.d.ts.map +0 -1
- package/dist/factories/text-factory.js.map +0 -1
- package/dist/file-processor.d.ts.map +0 -1
- package/dist/file-processor.js.map +0 -1
- package/dist/index-manager.d.ts.map +0 -1
- package/dist/index-manager.js.map +0 -1
- package/dist/index.d.ts.map +0 -1
- package/dist/index.js.map +0 -1
- package/dist/indexer.d.ts.map +0 -1
- package/dist/indexer.js.map +0 -1
- package/dist/ingestion.d.ts.map +0 -1
- package/dist/ingestion.js.map +0 -1
- package/dist/mcp-server.d.ts.map +0 -1
- package/dist/mcp-server.js.map +0 -1
- package/dist/preprocess.d.ts.map +0 -1
- package/dist/preprocess.js.map +0 -1
- package/dist/preprocessors/index.d.ts.map +0 -1
- package/dist/preprocessors/index.js.map +0 -1
- package/dist/preprocessors/mdx.d.ts.map +0 -1
- package/dist/preprocessors/mdx.js.map +0 -1
- package/dist/preprocessors/mermaid.d.ts.map +0 -1
- package/dist/preprocessors/mermaid.js.map +0 -1
- package/dist/preprocessors/registry.d.ts.map +0 -1
- package/dist/preprocessors/registry.js.map +0 -1
- package/dist/search-standalone.d.ts.map +0 -1
- package/dist/search-standalone.js.map +0 -1
- package/dist/search.d.ts.map +0 -1
- package/dist/search.js.map +0 -1
- package/dist/test-utils.d.ts.map +0 -1
- package/dist/test-utils.js.map +0 -1
- package/dist/text/chunker.d.ts.map +0 -1
- package/dist/text/chunker.js.map +0 -1
- package/dist/text/embedder.d.ts.map +0 -1
- package/dist/text/embedder.js.map +0 -1
- package/dist/text/index.d.ts.map +0 -1
- package/dist/text/index.js.map +0 -1
- package/dist/text/preprocessors/index.d.ts.map +0 -1
- package/dist/text/preprocessors/index.js.map +0 -1
- package/dist/text/preprocessors/mdx.d.ts.map +0 -1
- package/dist/text/preprocessors/mdx.js.map +0 -1
- package/dist/text/preprocessors/mermaid.d.ts.map +0 -1
- package/dist/text/preprocessors/mermaid.js.map +0 -1
- package/dist/text/preprocessors/registry.d.ts.map +0 -1
- package/dist/text/preprocessors/registry.js.map +0 -1
- package/dist/text/reranker.d.ts.map +0 -1
- package/dist/text/reranker.js.map +0 -1
- package/dist/text/tokenizer.d.ts.map +0 -1
- package/dist/text/tokenizer.js.map +0 -1
- package/dist/types.d.ts.map +0 -1
- package/dist/types.js.map +0 -1
|
@@ -0,0 +1,541 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CORE MODULE — Batch Processing Optimizer
|
|
3
|
+
* Optimizes embedding generation for large multimodal content batches
|
|
4
|
+
* Implements efficient image processing pipelines with progress reporting
|
|
5
|
+
* Creates memory-efficient processing for large image collections
|
|
6
|
+
*/
|
|
7
|
+
import { LazyMultimodalLoader } from './lazy-dependency-loader.js';
|
|
8
|
+
import { createError } from './error-handler.js';
|
|
9
|
+
import { getResourceManager } from './resource-manager.js';
|
|
10
|
+
/**
 * Baseline settings for BatchProcessingOptimizer.
 *
 * The optimizer's constructor spreads this object first and the caller's
 * overrides second, so every key listed here always has a value.
 */
export const DEFAULT_BATCH_CONFIG = {
    // ---- Batch sizing -------------------------------------------------------
    // Number of text items handed to one batch.
    textBatchSize: 16,
    // Number of image items per batch; kept smaller than the text batch size
    // because image processing is the memory-intensive path.
    imageBatchSize: 4,
    // NOTE(review): not read by the optimizer code in this module - confirm
    // where (or whether) this limit is enforced.
    maxConcurrentBatches: 2,
    // ---- Memory management --------------------------------------------------
    // Usage (in MB) above which a garbage collection is requested.
    memoryThresholdMB: 256,
    // When false, the per-batch memory check is skipped entirely.
    enableMemoryMonitoring: true,
    // When false, no garbage collection is requested.
    enableGarbageCollection: true,
    // ---- Progress reporting -------------------------------------------------
    enableProgressReporting: true,
    // Report after every N-th batch.
    progressReportInterval: 5,
    // ---- Error handling -----------------------------------------------------
    // Extra attempts per item while running the fallback path.
    maxRetries: 3,
    // Pause between two attempts, in milliseconds.
    retryDelayMs: 1000,
    // When true, a failed batch is re-run item by item.
    enableFallbackProcessing: true,
    // ---- Performance --------------------------------------------------------
    // When true, the items of a text batch are embedded concurrently.
    enableParallelProcessing: true,
    // When true, preloaded processors are kept after a run instead of being
    // cleaned up and dropped.
    enableResourcePooling: true,
    // false = models are loaded lazily rather than before the run starts.
    preloadModels: false
};
|
|
34
|
+
// =============================================================================
|
|
35
|
+
// MEMORY MONITORING
|
|
36
|
+
// =============================================================================
|
|
37
|
+
/**
 * Tracks the Node.js heap usage of the current process for batch processing.
 *
 * All figures are `process.memoryUsage().heapUsed` rounded to whole megabytes.
 * The baseline and the initial peak are both sampled once, at construction.
 */
class MemoryMonitor {
    initialMemoryMB;
    peakMemoryMB;
    constructor() {
        const baseline = this.getCurrentMemoryUsageMB();
        this.initialMemoryMB = baseline;
        this.peakMemoryMB = baseline;
    }
    /**
     * Sample the heap currently in use.
     * @returns heapUsed in MB, rounded to the nearest integer
     */
    getCurrentMemoryUsageMB() {
        const bytesPerMB = 1024 * 1024;
        const { heapUsed } = process.memoryUsage();
        return Math.round(heapUsed / bytesPerMB);
    }
    /**
     * Raise the recorded peak if the current sample is higher than it.
     */
    updatePeakMemory() {
        this.peakMemoryMB = Math.max(this.peakMemoryMB, this.getCurrentMemoryUsageMB());
    }
    /**
     * @param thresholdMB limit in MB
     * @returns true when the current sample is strictly above the limit
     */
    isMemoryThresholdExceeded(thresholdMB) {
        const nowMB = this.getCurrentMemoryUsageMB();
        return nowMB > thresholdMB;
    }
    /**
     * Request a garbage collection when `global.gc` is available; otherwise
     * this is a no-op.
     */
    forceGarbageCollection() {
        if (!global.gc) {
            return;
        }
        global.gc();
    }
    /**
     * Snapshot of the tracked figures.
     * @returns a fresh sample (`currentMB`) together with the recorded
     *          `peakMB` and `initialMB`
     */
    getStats() {
        const currentMB = this.getCurrentMemoryUsageMB();
        return {
            currentMB,
            peakMB: this.peakMemoryMB,
            initialMB: this.initialMemoryMB
        };
    }
}
|
|
88
|
+
// =============================================================================
|
|
89
|
+
// BATCH PROCESSING OPTIMIZER
|
|
90
|
+
// =============================================================================
|
|
91
|
+
/**
 * Batch processor for collections that mix text and image items.
 *
 * Items are split by `contentType`, cut into batches of the configured size
 * and passed one by one to a caller-supplied embedding function. Per-item
 * failures are recorded in an error list instead of aborting the run, memory
 * usage is checked between batches, and progress can be reported through an
 * optional callback.
 */
export class BatchProcessingOptimizer {
    config;
    memoryMonitor;
    resourcePool = new Map();
    resourceManager = getResourceManager();
    /**
     * @param config partial configuration; missing keys fall back to
     *               DEFAULT_BATCH_CONFIG
     */
    constructor(config = {}) {
        this.config = { ...DEFAULT_BATCH_CONFIG, ...config };
        this.memoryMonitor = new MemoryMonitor();
    }
    // =============================================================================
    // PUBLIC API
    // =============================================================================
    /**
     * Embed every supported item and collect results, statistics and errors.
     *
     * Text items are processed first, then image items, so `results` is not
     * in input order when both kinds are present. Items whose `contentType`
     * is neither 'text' nor 'image' are not embedded; they are counted in
     * `stats.skippedItems`.
     *
     * @param items            objects exposing a `contentType` property
     * @param embedFunction    async function called with a single item
     * @param progressCallback optional; receives statistics snapshots during
     *                         the run and the final statistics at the end
     * @returns `{ results, stats, errors }`
     * @throws an error built by `createError.model` when anything outside the
     *         per-item error handling fails
     */
    async processBatch(items, embedFunction, progressCallback) {
        const startTime = Date.now();
        // Initialize statistics
        const stats = {
            totalItems: items.length,
            processedItems: 0,
            failedItems: 0,
            skippedItems: 0,
            totalBatches: 0,
            completedBatches: 0,
            failedBatches: 0,
            processingTimeMs: 0,
            averageBatchTimeMs: 0,
            itemsPerSecond: 0,
            memoryUsageMB: this.memoryMonitor.getCurrentMemoryUsageMB(),
            peakMemoryUsageMB: this.memoryMonitor.getCurrentMemoryUsageMB(),
            retryCount: 0,
            fallbackCount: 0
        };
        const results = [];
        const errors = [];
        try {
            // Preload models if configured
            if (this.config.preloadModels) {
                await this.preloadRequiredModels(items);
            }
            // Separate items by content type for optimized processing
            const textItems = items.filter(item => item.contentType === 'text');
            const imageItems = items.filter(item => item.contentType === 'image');
            // Anything that is neither text nor image is not embedded at all;
            // record it so the statistics add up to totalItems.
            stats.skippedItems = items.length - textItems.length - imageItems.length;
            // Process text items in optimized batches
            if (textItems.length > 0) {
                const textResults = await this.processTextBatches(textItems, embedFunction, stats, errors, progressCallback);
                results.push(...textResults);
            }
            // Process image items in optimized batches
            if (imageItems.length > 0) {
                const imageResults = await this.processImageBatches(imageItems, embedFunction, stats, errors, progressCallback);
                results.push(...imageResults);
            }
            // Calculate final statistics (this overwrites the running average
            // maintained while the batches were processed)
            const endTime = Date.now();
            stats.processingTimeMs = endTime - startTime;
            stats.averageBatchTimeMs = stats.totalBatches > 0 ? stats.processingTimeMs / stats.totalBatches : 0;
            stats.itemsPerSecond = stats.processingTimeMs > 0 ? (stats.processedItems / stats.processingTimeMs) * 1000 : 0;
            const memoryStats = this.memoryMonitor.getStats();
            stats.memoryUsageMB = memoryStats.currentMB;
            stats.peakMemoryUsageMB = memoryStats.peakMB;
            // Final progress report
            if (progressCallback && this.config.enableProgressReporting) {
                progressCallback(stats);
            }
            return { results, stats, errors };
        }
        catch (error) {
            throw createError.model(`Batch processing failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
        }
        finally {
            // Cleanup resources
            await this.cleanupResources();
        }
    }
    // =============================================================================
    // TEXT BATCH PROCESSING
    // =============================================================================
    /**
     * Cut the text items into batches and process them one after another.
     *
     * @returns the embedding results of all text items that succeeded
     */
    async processTextBatches(textItems, embedFunction, stats, errors, progressCallback) {
        const results = [];
        const batchSize = this.resolveBatchSize(this.config.textBatchSize, DEFAULT_BATCH_CONFIG.textBatchSize);
        const totalBatches = Math.ceil(textItems.length / batchSize);
        console.log(`Processing ${textItems.length} text items in ${totalBatches} batches (batch size: ${batchSize})`);
        for (let i = 0; i < textItems.length; i += batchSize) {
            const batch = textItems.slice(i, i + batchSize);
            const batchIndex = Math.floor(i / batchSize);
            stats.totalBatches++;
            let batchResults;
            try {
                batchResults = await this.processTextBatch(batch, embedFunction, batchIndex, stats, errors);
            }
            catch (error) {
                stats.failedBatches++;
                console.warn(`Text batch ${batchIndex + 1}/${totalBatches} failed: ${error instanceof Error ? error.message : String(error)}`);
                // Try fallback processing if enabled
                if (this.config.enableFallbackProcessing) {
                    const fallbackResults = await this.processBatchWithFallback(batch, embedFunction, batchIndex, stats, errors);
                    results.push(...fallbackResults);
                    stats.fallbackCount++;
                }
                continue;
            }
            results.push(...batchResults);
            stats.completedBatches++;
            // Kept outside the try above: a failure in housekeeping must not
            // make an already completed batch run again through the fallback.
            await this.finishBatch(batchIndex, stats, progressCallback, false);
        }
        return results;
    }
    /**
     * Embed the items of one text batch, concurrently or sequentially
     * depending on `enableParallelProcessing`.
     *
     * A failing item increments `stats.failedItems` and is appended to
     * `errors`; it does not fail the batch.
     *
     * @returns results of the items that succeeded. In the concurrent path a
     *          `null` result is dropped as well, because `null` marks a
     *          failed item there.
     */
    async processTextBatch(batch, embedFunction, batchIndex, stats, errors) {
        const batchStartTime = Date.now();
        try {
            // Process batch items in parallel if enabled
            if (this.config.enableParallelProcessing) {
                const promises = batch.map(async (item, itemIndex) => {
                    try {
                        const result = await embedFunction(item);
                        stats.processedItems++;
                        return result;
                    }
                    catch (error) {
                        stats.failedItems++;
                        errors.push({
                            item,
                            error: error instanceof Error ? error.message : String(error),
                            batchIndex,
                            itemIndex
                        });
                        return null;
                    }
                });
                const results = await Promise.all(promises);
                return results.filter((result) => result !== null);
            }
            else {
                // Sequential processing
                const results = [];
                for (let itemIndex = 0; itemIndex < batch.length; itemIndex++) {
                    const item = batch[itemIndex];
                    try {
                        const result = await embedFunction(item);
                        results.push(result);
                        stats.processedItems++;
                    }
                    catch (error) {
                        stats.failedItems++;
                        errors.push({
                            item,
                            error: error instanceof Error ? error.message : String(error),
                            batchIndex,
                            itemIndex
                        });
                    }
                }
                return results;
            }
        }
        finally {
            // Fold this batch's duration into the running average. The
            // counters have not been incremented for this batch yet, hence
            // the "+ 1" in the divisor.
            const batchTime = Date.now() - batchStartTime;
            stats.averageBatchTimeMs = ((stats.averageBatchTimeMs * (stats.completedBatches + stats.failedBatches)) + batchTime) / (stats.completedBatches + stats.failedBatches + 1);
        }
    }
    // =============================================================================
    // IMAGE BATCH PROCESSING
    // =============================================================================
    /**
     * Cut the image items into batches and process them one after another,
     * requesting memory management after every batch.
     *
     * Batch indices continue after the batches already counted in
     * `stats.totalBatches` (the text batches of the same run).
     *
     * @returns the embedding results of all image items that succeeded
     */
    async processImageBatches(imageItems, embedFunction, stats, errors, progressCallback) {
        const results = [];
        const batchSize = this.resolveBatchSize(this.config.imageBatchSize, DEFAULT_BATCH_CONFIG.imageBatchSize);
        const totalBatches = Math.ceil(imageItems.length / batchSize);
        console.log(`Processing ${imageItems.length} image items in ${totalBatches} batches (batch size: ${batchSize})`);
        // Preload image processing models
        await this.preloadImageProcessingModels();
        // Captured once: stats.totalBatches grows inside the loop, so reading
        // it per iteration would make the index advance by two per batch.
        const batchOffset = stats.totalBatches;
        for (let i = 0; i < imageItems.length; i += batchSize) {
            const batch = imageItems.slice(i, i + batchSize);
            const batchIndex = batchOffset + Math.floor(i / batchSize);
            stats.totalBatches++;
            let batchResults;
            try {
                batchResults = await this.processImageBatch(batch, embedFunction, batchIndex, stats, errors);
            }
            catch (error) {
                stats.failedBatches++;
                console.warn(`Image batch ${batchIndex + 1} failed: ${error instanceof Error ? error.message : String(error)}`);
                // Try fallback processing if enabled
                if (this.config.enableFallbackProcessing) {
                    const fallbackResults = await this.processBatchWithFallback(batch, embedFunction, batchIndex, stats, errors);
                    results.push(...fallbackResults);
                    stats.fallbackCount++;
                }
                continue;
            }
            results.push(...batchResults);
            stats.completedBatches++;
            // Aggressive memory management for images; see processTextBatches
            // for why this sits outside the try.
            await this.finishBatch(batchIndex, stats, progressCallback, true);
        }
        return results;
    }
    /**
     * Embed the items of one image batch strictly one at a time.
     *
     * Before each item the memory threshold is checked and a garbage
     * collection is requested when it is exceeded. A failing item is recorded
     * in `errors` and does not fail the batch.
     *
     * @returns results of the items that succeeded
     */
    async processImageBatch(batch, embedFunction, batchIndex, stats, errors) {
        const batchStartTime = Date.now();
        try {
            // For images, use sequential processing to manage memory better
            const results = [];
            for (let itemIndex = 0; itemIndex < batch.length; itemIndex++) {
                const item = batch[itemIndex];
                try {
                    // Check memory before processing each image
                    if (this.memoryMonitor.isMemoryThresholdExceeded(this.config.memoryThresholdMB)) {
                        console.warn(`Memory threshold exceeded (${this.memoryMonitor.getCurrentMemoryUsageMB()}MB), forcing garbage collection`);
                        this.memoryMonitor.forceGarbageCollection();
                    }
                    const result = await embedFunction(item);
                    results.push(result);
                    stats.processedItems++;
                    // Update memory tracking
                    this.memoryMonitor.updatePeakMemory();
                }
                catch (error) {
                    stats.failedItems++;
                    errors.push({
                        item,
                        error: error instanceof Error ? error.message : String(error),
                        batchIndex,
                        itemIndex
                    });
                }
            }
            return results;
        }
        finally {
            // Same running-average update as in processTextBatch
            const batchTime = Date.now() - batchStartTime;
            stats.averageBatchTimeMs = ((stats.averageBatchTimeMs * (stats.completedBatches + stats.failedBatches)) + batchTime) / (stats.completedBatches + stats.failedBatches + 1);
        }
    }
    // =============================================================================
    // FALLBACK PROCESSING
    // =============================================================================
    /**
     * Re-run a failed batch item by item, retrying each item up to
     * `maxRetries` additional times with `retryDelayMs` between attempts.
     *
     * `stats.retryCount` counts only attempts that are actually repeated;
     * the final failed attempt of an item is counted in `stats.failedItems`.
     *
     * @returns results of the items that eventually succeeded
     */
    async processBatchWithFallback(batch, embedFunction, batchIndex, stats, errors) {
        console.log(`Attempting fallback processing for batch ${batchIndex} (${batch.length} items)`);
        const results = [];
        for (let itemIndex = 0; itemIndex < batch.length; itemIndex++) {
            const item = batch[itemIndex];
            let retryCount = 0;
            while (retryCount <= this.config.maxRetries) {
                try {
                    const result = await embedFunction(item);
                    results.push(result);
                    stats.processedItems++;
                    break;
                }
                catch (error) {
                    retryCount++;
                    if (retryCount <= this.config.maxRetries) {
                        // Another attempt follows, so this one counts as a retry
                        stats.retryCount++;
                        console.warn(`Retry ${retryCount}/${this.config.maxRetries} for item ${itemIndex} in batch ${batchIndex}`);
                        await this.delay(this.config.retryDelayMs);
                    }
                    else {
                        // Attempts exhausted: record the last error and move on
                        stats.failedItems++;
                        errors.push({
                            item,
                            error: error instanceof Error ? error.message : String(error),
                            batchIndex,
                            itemIndex
                        });
                    }
                }
            }
        }
        return results;
    }
    // =============================================================================
    // RESOURCE MANAGEMENT
    // =============================================================================
    /**
     * Preload the image processing models when at least one item is an image.
     */
    async preloadRequiredModels(items) {
        const hasImages = items.some(item => item.contentType === 'image');
        if (hasImages) {
            await this.preloadImageProcessingModels();
        }
    }
    /**
     * Load the image-to-text processor and the image metadata extractor once,
     * store them in the resource pool and register them with the resource
     * manager. A loading failure is logged as a warning and not rethrown.
     */
    async preloadImageProcessingModels() {
        try {
            if (!this.resourcePool.has('imageToText')) {
                console.log('Preloading image-to-text processor...');
                const processor = await LazyMultimodalLoader.loadImageToTextProcessor();
                this.resourcePool.set('imageToText', processor);
                // Register with resource manager
                this.resourceManager.registerImageProcessor(processor, 'image-to-text');
            }
            if (!this.resourcePool.has('metadataExtractor')) {
                console.log('Preloading image metadata extractor...');
                const extractor = await LazyMultimodalLoader.loadImageMetadataExtractor();
                this.resourcePool.set('metadataExtractor', extractor);
                // Register with resource manager
                this.resourceManager.registerImageProcessor(extractor, 'metadata-extractor');
            }
        }
        catch (error) {
            console.warn(`Failed to preload image processing models: ${error instanceof Error ? error.message : String(error)}`);
        }
    }
    /**
     * Request a garbage collection when needed and refresh the peak figure.
     *
     * Does nothing when `enableMemoryMonitoring` is false.
     *
     * @param aggressive when true, collection is requested regardless of the
     *                   current memory usage
     */
    async performMemoryManagement(aggressive = false) {
        if (!this.config.enableMemoryMonitoring) {
            return;
        }
        // Force garbage collection if memory threshold exceeded or aggressive mode
        if (aggressive || this.memoryMonitor.isMemoryThresholdExceeded(this.config.memoryThresholdMB)) {
            if (this.config.enableGarbageCollection) {
                this.memoryMonitor.forceGarbageCollection();
            }
        }
        // Update peak memory tracking
        this.memoryMonitor.updatePeakMemory();
    }
    /**
     * Release resources at the end of a run.
     *
     * With resource pooling disabled, every pooled processor that exposes a
     * `cleanup` function is cleaned up and the pool is emptied. With garbage
     * collection enabled, the resource manager is asked to optimize memory.
     * Failures are logged as warnings and never rethrown.
     */
    async cleanupResources() {
        try {
            // Clear resource pool if not using resource pooling
            if (!this.config.enableResourcePooling) {
                // Clean up registered processors
                for (const [key, processor] of this.resourcePool) {
                    try {
                        if (processor && typeof processor.cleanup === 'function') {
                            await processor.cleanup();
                        }
                    }
                    catch (error) {
                        console.warn(`Failed to cleanup processor ${key}: ${error instanceof Error ? error.message : 'Unknown error'}`);
                    }
                }
                this.resourcePool.clear();
            }
            // Use resource manager for memory optimization
            if (this.config.enableGarbageCollection) {
                await this.resourceManager.optimizeMemory();
            }
        }
        catch (error) {
            console.warn(`Error during batch processing cleanup: ${error instanceof Error ? error.message : 'Unknown error'}`);
        }
    }
    // =============================================================================
    // UTILITY METHODS
    // =============================================================================
    /**
     * Return a usable batch size.
     *
     * A size that is not a finite number >= 1 would either loop forever
     * (0 or negative) or yield empty batches (NaN), so the fallback is used
     * in those cases and a warning is logged.
     *
     * @private
     * @param configured value taken from the configuration
     * @param fallback   value to use when `configured` is unusable
     */
    resolveBatchSize(configured, fallback) {
        if (Number.isFinite(configured) && configured >= 1) {
            return Math.floor(configured);
        }
        console.warn(`Invalid batch size ${String(configured)}, using ${fallback} instead`);
        return fallback;
    }
    /**
     * Housekeeping after a successfully completed batch: memory management
     * followed by an optional progress report with a copy of the statistics.
     *
     * Errors raised here (for example by the caller's progress callback) are
     * logged and swallowed so they cannot affect the batch that just finished.
     *
     * @private
     */
    async finishBatch(batchIndex, stats, progressCallback, aggressive) {
        try {
            await this.performMemoryManagement(aggressive);
            if (progressCallback && this.shouldReportProgress(batchIndex)) {
                progressCallback({ ...stats });
            }
        }
        catch (error) {
            console.warn(`Post-batch housekeeping failed after batch ${batchIndex + 1}: ${error instanceof Error ? error.message : String(error)}`);
        }
    }
    /**
     * @returns true when progress reporting is enabled and the 1-based batch
     *          number is a multiple of `progressReportInterval`
     */
    shouldReportProgress(batchIndex) {
        return this.config.enableProgressReporting &&
            (batchIndex + 1) % this.config.progressReportInterval === 0;
    }
    /**
     * @returns a promise that resolves after `ms` milliseconds
     */
    delay(ms) {
        return new Promise(resolve => setTimeout(resolve, ms));
    }
    /**
     * @returns a shallow copy of the active configuration
     */
    getConfig() {
        return { ...this.config };
    }
    /**
     * Merge `updates` over the active configuration.
     */
    updateConfig(updates) {
        this.config = { ...this.config, ...updates };
    }
    /**
     * @returns the memory monitor's current, peak and initial figures in MB
     */
    getMemoryStats() {
        return this.memoryMonitor.getStats();
    }
}
|
|
506
|
+
// =============================================================================
|
|
507
|
+
// FACTORY FUNCTIONS
|
|
508
|
+
// =============================================================================
|
|
509
|
+
/**
 * Build a BatchProcessingOptimizer.
 *
 * @param config optional overrides, forwarded unchanged to the constructor
 * @returns the new optimizer instance
 */
export function createBatchProcessor(config) {
    const optimizer = new BatchProcessingOptimizer(config);
    return optimizer;
}
|
|
515
|
+
/**
 * Build a BatchProcessingOptimizer preconfigured for large image collections:
 * small batches, a low memory threshold, sequential processing and frequent
 * progress reports.
 *
 * @returns the new optimizer instance
 */
export function createImageBatchProcessor() {
    const imageFocusedConfig = {
        // Very small batches for memory efficiency
        imageBatchSize: 2,
        textBatchSize: 8,
        // Lower threshold for images
        memoryThresholdMB: 128,
        enableMemoryMonitoring: true,
        enableGarbageCollection: true,
        // Sequential for better memory control
        enableParallelProcessing: false,
        // More frequent progress reports
        progressReportInterval: 2
    };
    return new BatchProcessingOptimizer(imageFocusedConfig);
}
|
|
529
|
+
/**
 * Build a BatchProcessingOptimizer preconfigured for text: larger text
 * batches, parallel processing, a higher memory threshold and less frequent
 * progress reports.
 *
 * @returns the new optimizer instance
 */
export function createTextBatchProcessor() {
    const textFocusedConfig = {
        // Larger batches for text
        textBatchSize: 32,
        imageBatchSize: 4,
        // Parallel processing for text
        enableParallelProcessing: true,
        // Higher threshold for text
        memoryThresholdMB: 512,
        progressReportInterval: 10
    };
    return new BatchProcessingOptimizer(textFocusedConfig);
}
|
|
541
|
+
//# sourceMappingURL=batch-processing-optimizer.js.map
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Binary Index Format Module
|
|
3
|
+
*
|
|
4
|
+
* Provides efficient binary serialization for HNSW vector indices.
|
|
5
|
+
*
|
|
6
|
+
* Format Specification:
|
|
7
|
+
* - Header: 24 bytes (6 × uint32)
|
|
8
|
+
* - Vectors: N × (4 + D × 4) bytes
|
|
9
|
+
* - Little-endian encoding for cross-platform compatibility
|
|
10
|
+
* - 4-byte alignment for Float32Array zero-copy views
|
|
11
|
+
*
|
|
12
|
+
* Performance:
|
|
13
|
+
* - 3.66x smaller than JSON format
|
|
14
|
+
* - 3.5x faster loading
|
|
15
|
+
* - Zero-copy Float32Array views
|
|
16
|
+
*/
|
|
17
|
+
/**
 * In-memory form of a binary index file.
 *
 * The six numeric fields are the values the format specification lists for
 * the 24-byte header (dimensions, maxElements, M, efConstruction, seed,
 * currentSize); `vectors` holds the id/vector records that follow it.
 */
export interface BinaryIndexData {
    /** Header field; presumably the length of every vector - TODO confirm. */
    dimensions: number;
    /** Header field; presumably the index capacity - TODO confirm. */
    maxElements: number;
    /** Header field; presumably the HNSW `M` parameter - TODO confirm. */
    M: number;
    /** Header field; presumably the HNSW build parameter - TODO confirm. */
    efConstruction: number;
    /** Header field; presumably the random seed of the index - TODO confirm. */
    seed: number;
    /** Header field; presumably the number of stored vectors - TODO confirm. */
    currentSize: number;
    /** One record per stored vector: its id and its Float32Array data. */
    vectors: {
        id: number;
        vector: Float32Array;
    }[];
}
|
|
29
|
+
/**
 * Static reader/writer for the binary index file format.
 *
 * Only the declarations are visible here; the notes below restate the
 * documented contract of the implementation and should be verified against
 * it where marked.
 */
export declare class BinaryIndexFormat {
    /**
     * Serialize index data to a binary file.
     *
     * Documented layout:
     * - Header (24 bytes): dimensions, maxElements, M, efConstruction, seed,
     *   currentSize
     * - Then, per vector: id (4 bytes) followed by the vector data
     *   (dimensions × 4 bytes)
     *
     * @param indexPath Destination path of the binary index file
     * @param data Index data to write
     * @returns A promise that settles once saving has finished
     */
    static save(indexPath: string, data: BinaryIndexData): Promise<void>;
    /**
     * Deserialize index data from a binary file.
     *
     * Documented behavior: vectors are read through Float32Array views over
     * the file buffer and then copied, so the returned data does not depend
     * on that buffer staying alive.
     * NOTE(review): confirm the copy step against the implementation.
     *
     * @param indexPath Path of the binary index file to read
     * @returns A promise resolving to the deserialized index data
     */
    static load(indexPath: string): Promise<BinaryIndexData>;
}
|
|
52
|
+
//# sourceMappingURL=binary-index-format.d.ts.map
|