rag-lite-ts 1.0.2 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (208)
  1. package/README.md +605 -93
  2. package/dist/cli/indexer.js +192 -4
  3. package/dist/cli/search.js +50 -11
  4. package/dist/cli.js +183 -26
  5. package/dist/core/abstract-embedder.d.ts +125 -0
  6. package/dist/core/abstract-embedder.js +264 -0
  7. package/dist/core/actionable-error-messages.d.ts +60 -0
  8. package/dist/core/actionable-error-messages.js +397 -0
  9. package/dist/core/batch-processing-optimizer.d.ts +155 -0
  10. package/dist/core/batch-processing-optimizer.js +541 -0
  11. package/dist/core/binary-index-format.d.ts +52 -0
  12. package/dist/core/binary-index-format.js +122 -0
  13. package/dist/core/chunker.d.ts +2 -0
  14. package/dist/core/cli-database-utils.d.ts +53 -0
  15. package/dist/core/cli-database-utils.js +239 -0
  16. package/dist/core/config.js +10 -3
  17. package/dist/core/content-errors.d.ts +111 -0
  18. package/dist/core/content-errors.js +362 -0
  19. package/dist/core/content-manager.d.ts +343 -0
  20. package/dist/core/content-manager.js +1504 -0
  21. package/dist/core/content-performance-optimizer.d.ts +150 -0
  22. package/dist/core/content-performance-optimizer.js +516 -0
  23. package/dist/core/content-resolver.d.ts +104 -0
  24. package/dist/core/content-resolver.js +285 -0
  25. package/dist/core/cross-modal-search.d.ts +164 -0
  26. package/dist/core/cross-modal-search.js +342 -0
  27. package/dist/core/database-connection-manager.d.ts +109 -0
  28. package/dist/core/database-connection-manager.js +304 -0
  29. package/dist/core/db.d.ts +141 -2
  30. package/dist/core/db.js +631 -89
  31. package/dist/core/embedder-factory.d.ts +176 -0
  32. package/dist/core/embedder-factory.js +338 -0
  33. package/dist/core/index.d.ts +3 -1
  34. package/dist/core/index.js +4 -1
  35. package/dist/core/ingestion.d.ts +85 -15
  36. package/dist/core/ingestion.js +510 -45
  37. package/dist/core/lazy-dependency-loader.d.ts +152 -0
  38. package/dist/core/lazy-dependency-loader.js +453 -0
  39. package/dist/core/mode-detection-service.d.ts +150 -0
  40. package/dist/core/mode-detection-service.js +565 -0
  41. package/dist/core/mode-model-validator.d.ts +92 -0
  42. package/dist/core/mode-model-validator.js +203 -0
  43. package/dist/core/model-registry.d.ts +120 -0
  44. package/dist/core/model-registry.js +415 -0
  45. package/dist/core/model-validator.d.ts +217 -0
  46. package/dist/core/model-validator.js +782 -0
  47. package/dist/core/polymorphic-search-factory.d.ts +154 -0
  48. package/dist/core/polymorphic-search-factory.js +344 -0
  49. package/dist/core/raglite-paths.d.ts +121 -0
  50. package/dist/core/raglite-paths.js +145 -0
  51. package/dist/core/reranking-config.d.ts +42 -0
  52. package/dist/core/reranking-config.js +156 -0
  53. package/dist/core/reranking-factory.d.ts +92 -0
  54. package/dist/core/reranking-factory.js +591 -0
  55. package/dist/core/reranking-strategies.d.ts +325 -0
  56. package/dist/core/reranking-strategies.js +720 -0
  57. package/dist/core/resource-cleanup.d.ts +163 -0
  58. package/dist/core/resource-cleanup.js +371 -0
  59. package/dist/core/resource-manager.d.ts +212 -0
  60. package/dist/core/resource-manager.js +564 -0
  61. package/dist/core/search.d.ts +28 -1
  62. package/dist/core/search.js +83 -5
  63. package/dist/core/streaming-operations.d.ts +145 -0
  64. package/dist/core/streaming-operations.js +409 -0
  65. package/dist/core/types.d.ts +3 -0
  66. package/dist/core/universal-embedder.d.ts +177 -0
  67. package/dist/core/universal-embedder.js +139 -0
  68. package/dist/core/validation-messages.d.ts +99 -0
  69. package/dist/core/validation-messages.js +334 -0
  70. package/dist/core/vector-index.d.ts +1 -1
  71. package/dist/core/vector-index.js +37 -39
  72. package/dist/factories/index.d.ts +3 -1
  73. package/dist/factories/index.js +2 -0
  74. package/dist/factories/polymorphic-factory.d.ts +50 -0
  75. package/dist/factories/polymorphic-factory.js +159 -0
  76. package/dist/factories/text-factory.d.ts +128 -34
  77. package/dist/factories/text-factory.js +346 -97
  78. package/dist/file-processor.d.ts +88 -2
  79. package/dist/file-processor.js +720 -17
  80. package/dist/index.d.ts +32 -0
  81. package/dist/index.js +29 -0
  82. package/dist/ingestion.d.ts +16 -0
  83. package/dist/ingestion.js +21 -0
  84. package/dist/mcp-server.d.ts +35 -3
  85. package/dist/mcp-server.js +1107 -31
  86. package/dist/multimodal/clip-embedder.d.ts +327 -0
  87. package/dist/multimodal/clip-embedder.js +992 -0
  88. package/dist/multimodal/index.d.ts +6 -0
  89. package/dist/multimodal/index.js +6 -0
  90. package/dist/run-error-recovery-tests.d.ts +7 -0
  91. package/dist/run-error-recovery-tests.js +101 -0
  92. package/dist/search.d.ts +60 -9
  93. package/dist/search.js +82 -11
  94. package/dist/test-utils.d.ts +8 -26
  95. package/dist/text/chunker.d.ts +1 -0
  96. package/dist/text/embedder.js +15 -8
  97. package/dist/text/index.d.ts +1 -0
  98. package/dist/text/index.js +1 -0
  99. package/dist/text/reranker.d.ts +1 -2
  100. package/dist/text/reranker.js +17 -47
  101. package/dist/text/sentence-transformer-embedder.d.ts +96 -0
  102. package/dist/text/sentence-transformer-embedder.js +340 -0
  103. package/dist/types.d.ts +39 -0
  104. package/dist/utils/vector-math.d.ts +31 -0
  105. package/dist/utils/vector-math.js +70 -0
  106. package/package.json +27 -6
  107. package/dist/api-errors.d.ts.map +0 -1
  108. package/dist/api-errors.js.map +0 -1
  109. package/dist/cli/indexer.d.ts.map +0 -1
  110. package/dist/cli/indexer.js.map +0 -1
  111. package/dist/cli/search.d.ts.map +0 -1
  112. package/dist/cli/search.js.map +0 -1
  113. package/dist/cli.d.ts.map +0 -1
  114. package/dist/cli.js.map +0 -1
  115. package/dist/config.d.ts.map +0 -1
  116. package/dist/config.js.map +0 -1
  117. package/dist/core/adapters.d.ts.map +0 -1
  118. package/dist/core/adapters.js.map +0 -1
  119. package/dist/core/chunker.d.ts.map +0 -1
  120. package/dist/core/chunker.js.map +0 -1
  121. package/dist/core/config.d.ts.map +0 -1
  122. package/dist/core/config.js.map +0 -1
  123. package/dist/core/db.d.ts.map +0 -1
  124. package/dist/core/db.js.map +0 -1
  125. package/dist/core/error-handler.d.ts.map +0 -1
  126. package/dist/core/error-handler.js.map +0 -1
  127. package/dist/core/index.d.ts.map +0 -1
  128. package/dist/core/index.js.map +0 -1
  129. package/dist/core/ingestion.d.ts.map +0 -1
  130. package/dist/core/ingestion.js.map +0 -1
  131. package/dist/core/interfaces.d.ts.map +0 -1
  132. package/dist/core/interfaces.js.map +0 -1
  133. package/dist/core/path-manager.d.ts.map +0 -1
  134. package/dist/core/path-manager.js.map +0 -1
  135. package/dist/core/search-example.d.ts +0 -25
  136. package/dist/core/search-example.d.ts.map +0 -1
  137. package/dist/core/search-example.js +0 -138
  138. package/dist/core/search-example.js.map +0 -1
  139. package/dist/core/search-pipeline-example.d.ts +0 -21
  140. package/dist/core/search-pipeline-example.d.ts.map +0 -1
  141. package/dist/core/search-pipeline-example.js +0 -188
  142. package/dist/core/search-pipeline-example.js.map +0 -1
  143. package/dist/core/search-pipeline.d.ts.map +0 -1
  144. package/dist/core/search-pipeline.js.map +0 -1
  145. package/dist/core/search.d.ts.map +0 -1
  146. package/dist/core/search.js.map +0 -1
  147. package/dist/core/types.d.ts.map +0 -1
  148. package/dist/core/types.js.map +0 -1
  149. package/dist/core/vector-index.d.ts.map +0 -1
  150. package/dist/core/vector-index.js.map +0 -1
  151. package/dist/dom-polyfills.d.ts.map +0 -1
  152. package/dist/dom-polyfills.js.map +0 -1
  153. package/dist/examples/clean-api-examples.d.ts +0 -44
  154. package/dist/examples/clean-api-examples.d.ts.map +0 -1
  155. package/dist/examples/clean-api-examples.js +0 -206
  156. package/dist/examples/clean-api-examples.js.map +0 -1
  157. package/dist/factories/index.d.ts.map +0 -1
  158. package/dist/factories/index.js.map +0 -1
  159. package/dist/factories/text-factory.d.ts.map +0 -1
  160. package/dist/factories/text-factory.js.map +0 -1
  161. package/dist/file-processor.d.ts.map +0 -1
  162. package/dist/file-processor.js.map +0 -1
  163. package/dist/index-manager.d.ts.map +0 -1
  164. package/dist/index-manager.js.map +0 -1
  165. package/dist/index.d.ts.map +0 -1
  166. package/dist/index.js.map +0 -1
  167. package/dist/indexer.d.ts.map +0 -1
  168. package/dist/indexer.js.map +0 -1
  169. package/dist/ingestion.d.ts.map +0 -1
  170. package/dist/ingestion.js.map +0 -1
  171. package/dist/mcp-server.d.ts.map +0 -1
  172. package/dist/mcp-server.js.map +0 -1
  173. package/dist/preprocess.d.ts.map +0 -1
  174. package/dist/preprocess.js.map +0 -1
  175. package/dist/preprocessors/index.d.ts.map +0 -1
  176. package/dist/preprocessors/index.js.map +0 -1
  177. package/dist/preprocessors/mdx.d.ts.map +0 -1
  178. package/dist/preprocessors/mdx.js.map +0 -1
  179. package/dist/preprocessors/mermaid.d.ts.map +0 -1
  180. package/dist/preprocessors/mermaid.js.map +0 -1
  181. package/dist/preprocessors/registry.d.ts.map +0 -1
  182. package/dist/preprocessors/registry.js.map +0 -1
  183. package/dist/search-standalone.d.ts.map +0 -1
  184. package/dist/search-standalone.js.map +0 -1
  185. package/dist/search.d.ts.map +0 -1
  186. package/dist/search.js.map +0 -1
  187. package/dist/test-utils.d.ts.map +0 -1
  188. package/dist/test-utils.js.map +0 -1
  189. package/dist/text/chunker.d.ts.map +0 -1
  190. package/dist/text/chunker.js.map +0 -1
  191. package/dist/text/embedder.d.ts.map +0 -1
  192. package/dist/text/embedder.js.map +0 -1
  193. package/dist/text/index.d.ts.map +0 -1
  194. package/dist/text/index.js.map +0 -1
  195. package/dist/text/preprocessors/index.d.ts.map +0 -1
  196. package/dist/text/preprocessors/index.js.map +0 -1
  197. package/dist/text/preprocessors/mdx.d.ts.map +0 -1
  198. package/dist/text/preprocessors/mdx.js.map +0 -1
  199. package/dist/text/preprocessors/mermaid.d.ts.map +0 -1
  200. package/dist/text/preprocessors/mermaid.js.map +0 -1
  201. package/dist/text/preprocessors/registry.d.ts.map +0 -1
  202. package/dist/text/preprocessors/registry.js.map +0 -1
  203. package/dist/text/reranker.d.ts.map +0 -1
  204. package/dist/text/reranker.js.map +0 -1
  205. package/dist/text/tokenizer.d.ts.map +0 -1
  206. package/dist/text/tokenizer.js.map +0 -1
  207. package/dist/types.d.ts.map +0 -1
  208. package/dist/types.js.map +0 -1
@@ -4,6 +4,7 @@
4
4
  */
5
5
  import { getChunksByEmbeddingIds } from './db.js';
6
6
  import { config } from './config.js';
7
+ import { createMissingDependencyError } from './actionable-error-messages.js';
7
8
  /**
8
9
  * Search engine that provides semantic search capabilities
9
10
  * Implements the core search pipeline: query embedding → vector search → metadata retrieval → optional reranking
@@ -14,6 +15,7 @@ export class SearchEngine {
14
15
  indexManager;
15
16
  db;
16
17
  rerankFn;
18
+ contentResolver;
17
19
  /**
18
20
  * Creates a new SearchEngine with explicit dependency injection
19
21
  *
@@ -68,21 +70,29 @@ export class SearchEngine {
68
70
  * const search = new SearchEngine(customEmbedFn, indexManager, db);
69
71
  * ```
70
72
  */
71
- constructor(embedFn, indexManager, db, rerankFn) {
73
+ constructor(embedFn, indexManager, db, rerankFn, contentResolver) {
72
74
  this.embedFn = embedFn;
73
75
  this.indexManager = indexManager;
74
76
  this.db = db;
75
77
  this.rerankFn = rerankFn;
76
78
  // Validate required dependencies
77
79
  if (!embedFn || typeof embedFn !== 'function') {
78
- throw new Error('embedFn must be a valid function');
80
+ throw createMissingDependencyError('embedFn', 'function', {
81
+ operationContext: 'SearchEngine constructor'
82
+ });
79
83
  }
80
84
  if (!indexManager) {
81
- throw new Error('indexManager is required');
85
+ throw createMissingDependencyError('indexManager', 'object', {
86
+ operationContext: 'SearchEngine constructor'
87
+ });
82
88
  }
83
89
  if (!db) {
84
- throw new Error('db connection is required');
90
+ throw createMissingDependencyError('db', 'object', {
91
+ operationContext: 'SearchEngine constructor'
92
+ });
85
93
  }
94
+ // Initialize ContentResolver if provided, or create lazily when needed
95
+ this.contentResolver = contentResolver;
86
96
  }
87
97
  /**
88
98
  * Perform semantic search on the indexed documents
@@ -183,7 +193,8 @@ export class SearchEngine {
183
193
  id: chunk.document_id,
184
194
  source: chunk.document_source,
185
195
  title: chunk.document_title,
186
- contentType: chunk.document_content_type || 'text'
196
+ contentType: chunk.document_content_type || 'text',
197
+ contentId: chunk.document_content_id || undefined
187
198
  }
188
199
  });
189
200
  }
@@ -202,11 +213,78 @@ export class SearchEngine {
202
213
  rerankingEnabled: this.rerankFn !== undefined
203
214
  };
204
215
  }
216
+ /**
217
+ * Retrieve content by ID in the specified format
218
+ * @param contentId - Content ID to retrieve
219
+ * @param format - Format to return ('file' for CLI clients, 'base64' for MCP clients)
220
+ * @returns Promise that resolves to content in requested format
221
+ */
222
+ async getContent(contentId, format = 'file') {
223
+ // Lazy initialization of ContentResolver
224
+ if (!this.contentResolver) {
225
+ const { ContentResolver } = await import('./content-resolver.js');
226
+ this.contentResolver = new ContentResolver(this.db);
227
+ }
228
+ return this.contentResolver.getContent(contentId, format);
229
+ }
230
+ /**
231
+ * Retrieve multiple content items efficiently in batch
232
+ * @param contentIds - Array of content IDs to retrieve
233
+ * @param format - Format to return ('file' for CLI clients, 'base64' for MCP clients)
234
+ * @returns Promise that resolves to array of content in requested format
235
+ */
236
+ async getContentBatch(contentIds, format = 'file') {
237
+ // Lazy initialization of ContentResolver
238
+ if (!this.contentResolver) {
239
+ const { ContentResolver } = await import('./content-resolver.js');
240
+ this.contentResolver = new ContentResolver(this.db);
241
+ }
242
+ // Convert contentIds array to ContentRequest array
243
+ const requests = contentIds.map(contentId => ({ contentId, format }));
244
+ const results = await this.contentResolver.getContentBatch(requests);
245
+ // Extract content from results, maintaining order and handling errors
246
+ return results.map(result => {
247
+ if (!result.success) {
248
+ throw new Error(`Failed to retrieve content ${result.contentId}: ${result.error}`);
249
+ }
250
+ return result.content;
251
+ });
252
+ }
253
+ /**
254
+ * Retrieve content metadata for result enhancement
255
+ * @param contentId - Content ID to get metadata for
256
+ * @returns Promise that resolves to content metadata
257
+ */
258
+ async getContentMetadata(contentId) {
259
+ // Lazy initialization of ContentResolver
260
+ if (!this.contentResolver) {
261
+ const { ContentResolver } = await import('./content-resolver.js');
262
+ this.contentResolver = new ContentResolver(this.db);
263
+ }
264
+ return this.contentResolver.getContentMetadata(contentId);
265
+ }
266
+ /**
267
+ * Verify that content exists and is accessible
268
+ * @param contentId - Content ID to verify
269
+ * @returns Promise that resolves to true if content exists, false otherwise
270
+ */
271
+ async verifyContentExists(contentId) {
272
+ // Lazy initialization of ContentResolver
273
+ if (!this.contentResolver) {
274
+ const { ContentResolver } = await import('./content-resolver.js');
275
+ this.contentResolver = new ContentResolver(this.db);
276
+ }
277
+ return this.contentResolver.verifyContentExists(contentId);
278
+ }
205
279
  /**
206
280
  * Clean up resources - explicit cleanup method
207
281
  */
208
282
  async cleanup() {
209
283
  try {
284
+ // Clean up ContentResolver to prevent resource leaks
285
+ if (this.contentResolver && typeof this.contentResolver.cleanup === 'function') {
286
+ this.contentResolver.cleanup();
287
+ }
210
288
  await this.db.close();
211
289
  await this.indexManager.close();
212
290
  }
@@ -0,0 +1,145 @@
1
+ /**
2
+ * Streaming Operations for Large Content - Task 9.1 Implementation
3
+ * Provides memory-efficient streaming operations for content ingestion and retrieval
4
+ * Minimizes memory usage for large files through streaming algorithms
5
+ */
6
+ /**
7
+ * Progress callback for long-running operations
8
+ */
9
+ export interface ProgressCallback {
10
+ (bytesProcessed: number, totalBytes?: number): void;
11
+ }
12
+ /**
13
+ * Streaming hash calculation result
14
+ */
15
+ export interface StreamingHashResult {
16
+ hash: string;
17
+ bytesProcessed: number;
18
+ processingTimeMs: number;
19
+ }
20
+ /**
21
+ * Streaming file copy result
22
+ */
23
+ export interface StreamingCopyResult {
24
+ bytesWritten: number;
25
+ processingTimeMs: number;
26
+ hash?: string;
27
+ }
28
+ /**
29
+ * Configuration for streaming operations
30
+ */
31
+ export interface StreamingConfig {
32
+ chunkSize: number;
33
+ enableProgress: boolean;
34
+ enableHashing: boolean;
35
+ timeout: number;
36
+ }
37
+ /**
38
+ * StreamingOperations class provides memory-efficient operations for large content
39
+ */
40
+ export declare class StreamingOperations {
41
+ private config;
42
+ constructor(config?: Partial<StreamingConfig>);
43
+ /**
44
+ * Calculates SHA-256 hash of a file using streaming to minimize memory usage
45
+ * @param filePath - Path to the file to hash
46
+ * @param progressCallback - Optional callback for progress reporting
47
+ * @returns Promise that resolves to hash result
48
+ */
49
+ calculateFileHashStreaming(filePath: string, progressCallback?: ProgressCallback): Promise<StreamingHashResult>;
50
+ /**
51
+ * Calculates SHA-256 hash of a buffer using streaming to minimize memory usage
52
+ * @param content - Buffer to hash
53
+ * @param progressCallback - Optional callback for progress reporting
54
+ * @returns Promise that resolves to hash result
55
+ */
56
+ calculateBufferHashStreaming(content: Buffer, progressCallback?: ProgressCallback): Promise<StreamingHashResult>;
57
+ /**
58
+ * Copies a file using streaming operations with optional hashing
59
+ * @param sourcePath - Source file path
60
+ * @param destinationPath - Destination file path
61
+ * @param progressCallback - Optional callback for progress reporting
62
+ * @returns Promise that resolves to copy result
63
+ */
64
+ copyFileStreaming(sourcePath: string, destinationPath: string, progressCallback?: ProgressCallback): Promise<StreamingCopyResult>;
65
+ /**
66
+ * Writes buffer content to file using streaming operations
67
+ * @param content - Buffer to write
68
+ * @param destinationPath - Destination file path
69
+ * @param progressCallback - Optional callback for progress reporting
70
+ * @returns Promise that resolves to write result
71
+ */
72
+ writeBufferStreaming(content: Buffer, destinationPath: string, progressCallback?: ProgressCallback): Promise<StreamingCopyResult>;
73
+ /**
74
+ * Reads file content and converts to base64 using streaming to minimize memory usage
75
+ * @param filePath - Path to the file to read
76
+ * @param progressCallback - Optional callback for progress reporting
77
+ * @returns Promise that resolves to base64 string
78
+ */
79
+ readFileAsBase64Streaming(filePath: string, progressCallback?: ProgressCallback): Promise<string>;
80
+ /**
81
+ * Validates file integrity by comparing streaming hash with expected hash
82
+ * @param filePath - Path to the file to validate
83
+ * @param expectedHash - Expected SHA-256 hash
84
+ * @param progressCallback - Optional callback for progress reporting
85
+ * @returns Promise that resolves to validation result
86
+ */
87
+ validateFileIntegrityStreaming(filePath: string, expectedHash: string, progressCallback?: ProgressCallback): Promise<{
88
+ isValid: boolean;
89
+ actualHash: string;
90
+ bytesProcessed: number;
91
+ }>;
92
+ /**
93
+ * Gets file information without loading content into memory
94
+ * @param filePath - Path to the file
95
+ * @returns Promise that resolves to file information
96
+ */
97
+ getFileInfo(filePath: string): Promise<{
98
+ size: number;
99
+ isFile: boolean;
100
+ isDirectory: boolean;
101
+ lastModified: Date;
102
+ canRead: boolean;
103
+ canWrite: boolean;
104
+ }>;
105
+ /**
106
+ * Converts buffer to chunks for streaming
107
+ * @param buffer - Buffer to chunk
108
+ * @returns Generator that yields buffer chunks
109
+ */
110
+ private bufferToChunks;
111
+ /**
112
+ * Wraps a promise with timeout functionality
113
+ * @param promise - Promise to wrap
114
+ * @param timeoutMs - Timeout in milliseconds
115
+ * @param errorMessage - Error message for timeout
116
+ * @returns Promise that rejects if timeout is reached
117
+ */
118
+ private withTimeout;
119
+ }
120
+ /**
121
+ * Creates a StreamingOperations instance with default configuration
122
+ * @param config - Optional configuration overrides
123
+ * @returns StreamingOperations instance
124
+ */
125
+ export declare function createStreamingOperations(config?: Partial<StreamingConfig>): StreamingOperations;
126
+ /**
127
+ * Utility function to format bytes for progress reporting
128
+ * @param bytes - Number of bytes
129
+ * @returns Formatted string (e.g., "1.5 MB")
130
+ */
131
+ export declare function formatBytes(bytes: number): string;
132
+ /**
133
+ * Utility function to format processing time
134
+ * @param milliseconds - Processing time in milliseconds
135
+ * @returns Formatted string (e.g., "1.5s" or "150ms")
136
+ */
137
+ export declare function formatProcessingTime(milliseconds: number): string;
138
+ /**
139
+ * Utility function to calculate processing speed
140
+ * @param bytes - Number of bytes processed
141
+ * @param milliseconds - Processing time in milliseconds
142
+ * @returns Speed in MB/s
143
+ */
144
+ export declare function calculateProcessingSpeed(bytes: number, milliseconds: number): number;
145
+ //# sourceMappingURL=streaming-operations.d.ts.map
@@ -0,0 +1,409 @@
1
+ /**
2
+ * Streaming Operations for Large Content - Task 9.1 Implementation
3
+ * Provides memory-efficient streaming operations for content ingestion and retrieval
4
+ * Minimizes memory usage for large files through streaming algorithms
5
+ */
6
+ import { createHash } from 'crypto';
7
+ import { createReadStream, createWriteStream, promises as fs } from 'fs';
8
+ import { pipeline } from 'stream/promises';
9
+ import { Transform, Readable } from 'stream';
10
+ import { dirname } from 'path';
11
+ /**
12
+ * Default streaming configuration
13
+ */
14
+ const DEFAULT_STREAMING_CONFIG = {
15
+ chunkSize: 64 * 1024, // 64KB chunks
16
+ enableProgress: false,
17
+ enableHashing: false,
18
+ timeout: 300000 // 5 minutes
19
+ };
20
+ /**
21
+ * StreamingOperations class provides memory-efficient operations for large content
22
+ */
23
+ export class StreamingOperations {
24
+ config;
25
+ constructor(config = {}) {
26
+ this.config = { ...DEFAULT_STREAMING_CONFIG, ...config };
27
+ }
28
+ /**
29
+ * Calculates SHA-256 hash of a file using streaming to minimize memory usage
30
+ * @param filePath - Path to the file to hash
31
+ * @param progressCallback - Optional callback for progress reporting
32
+ * @returns Promise that resolves to hash result
33
+ */
34
+ async calculateFileHashStreaming(filePath, progressCallback) {
35
+ const startTime = Date.now();
36
+ let bytesProcessed = 0;
37
+ let totalBytes;
38
+ try {
39
+ // Get file size for progress reporting
40
+ if (this.config.enableProgress || progressCallback) {
41
+ const stats = await fs.stat(filePath);
42
+ totalBytes = stats.size;
43
+ }
44
+ const hash = createHash('sha256');
45
+ const readStream = createReadStream(filePath, {
46
+ highWaterMark: this.config.chunkSize
47
+ });
48
+ // Use promise-based approach instead of pipeline for better control
49
+ return new Promise((resolve, reject) => {
50
+ const timeoutId = setTimeout(() => {
51
+ readStream.destroy();
52
+ reject(new Error('File hash calculation timed out'));
53
+ }, this.config.timeout);
54
+ readStream.on('data', (chunk) => {
55
+ const buffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
56
+ hash.update(buffer);
57
+ bytesProcessed += buffer.length;
58
+ // Report progress if callback provided
59
+ if (progressCallback) {
60
+ progressCallback(bytesProcessed, totalBytes);
61
+ }
62
+ });
63
+ readStream.on('end', () => {
64
+ clearTimeout(timeoutId);
65
+ const processingTimeMs = Date.now() - startTime;
66
+ resolve({
67
+ hash: hash.digest('hex'),
68
+ bytesProcessed,
69
+ processingTimeMs
70
+ });
71
+ });
72
+ readStream.on('error', (error) => {
73
+ clearTimeout(timeoutId);
74
+ reject(new Error(`Failed to read file: ${error.message}`));
75
+ });
76
+ });
77
+ }
78
+ catch (error) {
79
+ throw new Error(`Failed to calculate file hash: ${error instanceof Error ? error.message : 'Unknown error'}`);
80
+ }
81
+ }
82
+ /**
83
+ * Calculates SHA-256 hash of a buffer using streaming to minimize memory usage
84
+ * @param content - Buffer to hash
85
+ * @param progressCallback - Optional callback for progress reporting
86
+ * @returns Promise that resolves to hash result
87
+ */
88
+ async calculateBufferHashStreaming(content, progressCallback) {
89
+ const startTime = Date.now();
90
+ let bytesProcessed = 0;
91
+ const totalBytes = content.length;
92
+ try {
93
+ const hash = createHash('sha256');
94
+ // Process buffer in chunks to avoid memory spikes
95
+ const chunkSize = this.config.chunkSize;
96
+ for (let offset = 0; offset < content.length; offset += chunkSize) {
97
+ const chunk = content.subarray(offset, Math.min(offset + chunkSize, content.length));
98
+ hash.update(chunk);
99
+ bytesProcessed += chunk.length;
100
+ // Report progress if callback provided
101
+ if (progressCallback) {
102
+ progressCallback(bytesProcessed, totalBytes);
103
+ }
104
+ // Yield control to event loop to prevent blocking
105
+ if (offset % (chunkSize * 10) === 0) {
106
+ await new Promise(resolve => setImmediate(resolve));
107
+ }
108
+ }
109
+ const processingTimeMs = Date.now() - startTime;
110
+ return {
111
+ hash: hash.digest('hex'),
112
+ bytesProcessed,
113
+ processingTimeMs
114
+ };
115
+ }
116
+ catch (error) {
117
+ throw new Error(`Failed to calculate buffer hash: ${error instanceof Error ? error.message : 'Unknown error'}`);
118
+ }
119
+ }
120
+ /**
121
+ * Copies a file using streaming operations with optional hashing
122
+ * @param sourcePath - Source file path
123
+ * @param destinationPath - Destination file path
124
+ * @param progressCallback - Optional callback for progress reporting
125
+ * @returns Promise that resolves to copy result
126
+ */
127
+ async copyFileStreaming(sourcePath, destinationPath, progressCallback) {
128
+ const startTime = Date.now();
129
+ let bytesWritten = 0;
130
+ let totalBytes;
131
+ let hash;
132
+ try {
133
+ // Get file size for progress reporting
134
+ if (this.config.enableProgress || progressCallback) {
135
+ const stats = await fs.stat(sourcePath);
136
+ totalBytes = stats.size;
137
+ }
138
+ // Ensure destination directory exists
139
+ await fs.mkdir(dirname(destinationPath), { recursive: true });
140
+ const readStream = createReadStream(sourcePath, {
141
+ highWaterMark: this.config.chunkSize
142
+ });
143
+ const writeStream = createWriteStream(destinationPath);
144
+ let hashCalculator;
145
+ if (this.config.enableHashing) {
146
+ hashCalculator = createHash('sha256');
147
+ }
148
+ // Create transform stream for progress tracking and optional hashing
149
+ const progressTransform = new Transform({
150
+ transform(chunk, encoding, callback) {
151
+ bytesWritten += chunk.length;
152
+ // Update hash if enabled
153
+ if (hashCalculator) {
154
+ hashCalculator.update(chunk);
155
+ }
156
+ // Report progress if callback provided
157
+ if (progressCallback) {
158
+ progressCallback(bytesWritten, totalBytes);
159
+ }
160
+ callback(null, chunk);
161
+ }
162
+ });
163
+ // Use pipeline for proper error handling and cleanup
164
+ await this.withTimeout(pipeline(readStream, progressTransform, writeStream), this.config.timeout, 'File copy operation timed out');
165
+ const processingTimeMs = Date.now() - startTime;
166
+ if (hashCalculator) {
167
+ hash = hashCalculator.digest('hex');
168
+ }
169
+ return {
170
+ bytesWritten,
171
+ processingTimeMs,
172
+ hash
173
+ };
174
+ }
175
+ catch (error) {
176
+ // Clean up destination file if copy failed
177
+ try {
178
+ await fs.unlink(destinationPath);
179
+ }
180
+ catch {
181
+ // Ignore cleanup errors
182
+ }
183
+ throw new Error(`Failed to copy file: ${error instanceof Error ? error.message : 'Unknown error'}`);
184
+ }
185
+ }
186
+ /**
187
+ * Writes buffer content to file using streaming operations
188
+ * @param content - Buffer to write
189
+ * @param destinationPath - Destination file path
190
+ * @param progressCallback - Optional callback for progress reporting
191
+ * @returns Promise that resolves to write result
192
+ */
193
+ async writeBufferStreaming(content, destinationPath, progressCallback) {
194
+ const startTime = Date.now();
195
+ let bytesWritten = 0;
196
+ const totalBytes = content.length;
197
+ let hash;
198
+ try {
199
+ // Ensure destination directory exists
200
+ await fs.mkdir(dirname(destinationPath), { recursive: true });
201
+ const writeStream = createWriteStream(destinationPath);
202
+ let hashCalculator;
203
+ if (this.config.enableHashing) {
204
+ hashCalculator = createHash('sha256');
205
+ }
206
+ // Create readable stream from buffer
207
+ const readableStream = Readable.from(this.bufferToChunks(content));
208
+ // Create transform stream for progress tracking and optional hashing
209
+ const progressTransform = new Transform({
210
+ transform(chunk, encoding, callback) {
211
+ bytesWritten += chunk.length;
212
+ // Update hash if enabled
213
+ if (hashCalculator) {
214
+ hashCalculator.update(chunk);
215
+ }
216
+ // Report progress if callback provided
217
+ if (progressCallback) {
218
+ progressCallback(bytesWritten, totalBytes);
219
+ }
220
+ callback(null, chunk);
221
+ }
222
+ });
223
+ // Use pipeline for proper error handling and cleanup
224
+ await this.withTimeout(pipeline(readableStream, progressTransform, writeStream), this.config.timeout, 'Buffer write operation timed out');
225
+ const processingTimeMs = Date.now() - startTime;
226
+ if (hashCalculator) {
227
+ hash = hashCalculator.digest('hex');
228
+ }
229
+ return {
230
+ bytesWritten,
231
+ processingTimeMs,
232
+ hash
233
+ };
234
+ }
235
+ catch (error) {
236
+ // Clean up destination file if write failed
237
+ try {
238
+ await fs.unlink(destinationPath);
239
+ }
240
+ catch {
241
+ // Ignore cleanup errors
242
+ }
243
+ throw new Error(`Failed to write buffer: ${error instanceof Error ? error.message : 'Unknown error'}`);
244
+ }
245
+ }
246
+ /**
247
+ * Reads file content and converts to base64 using streaming to minimize memory usage
248
+ * @param filePath - Path to the file to read
249
+ * @param progressCallback - Optional callback for progress reporting
250
+ * @returns Promise that resolves to base64 string
251
+ */
252
+ async readFileAsBase64Streaming(filePath, progressCallback) {
253
+ let bytesProcessed = 0;
254
+ let totalBytes;
255
+ try {
256
+ // Get file size for progress reporting
257
+ if (this.config.enableProgress || progressCallback) {
258
+ const stats = await fs.stat(filePath);
259
+ totalBytes = stats.size;
260
+ }
261
+ // For base64 conversion, we need to read the entire file to get correct encoding
262
+ // Streaming base64 conversion chunk by chunk doesn't work correctly because
263
+ // base64 encoding requires complete byte sequences
264
+ const content = await fs.readFile(filePath);
265
+ bytesProcessed = content.length;
266
+ // Report progress if callback provided
267
+ if (progressCallback) {
268
+ progressCallback(bytesProcessed, totalBytes);
269
+ }
270
+ // Convert to base64
271
+ return content.toString('base64');
272
+ }
273
+ catch (error) {
274
+ throw new Error(`Failed to read file as base64: ${error instanceof Error ? error.message : 'Unknown error'}`);
275
+ }
276
+ }
277
+ /**
278
+ * Validates file integrity by comparing streaming hash with expected hash
279
+ * @param filePath - Path to the file to validate
280
+ * @param expectedHash - Expected SHA-256 hash
281
+ * @param progressCallback - Optional callback for progress reporting
282
+ * @returns Promise that resolves to validation result
283
+ */
284
+ async validateFileIntegrityStreaming(filePath, expectedHash, progressCallback) {
285
+ try {
286
+ const result = await this.calculateFileHashStreaming(filePath, progressCallback);
287
+ return {
288
+ isValid: result.hash === expectedHash.toLowerCase(),
289
+ actualHash: result.hash,
290
+ bytesProcessed: result.bytesProcessed
291
+ };
292
+ }
293
+ catch (error) {
294
+ throw new Error(`Failed to validate file integrity: ${error instanceof Error ? error.message : 'Unknown error'}`);
295
+ }
296
+ }
297
+ /**
298
+ * Gets file information without loading content into memory
299
+ * @param filePath - Path to the file
300
+ * @returns Promise that resolves to file information
301
+ */
302
+ async getFileInfo(filePath) {
303
+ try {
304
+ const stats = await fs.stat(filePath);
305
+ // Check permissions
306
+ let canRead = false;
307
+ let canWrite = false;
308
+ try {
309
+ await fs.access(filePath, fs.constants.R_OK);
310
+ canRead = true;
311
+ }
312
+ catch {
313
+ // Cannot read
314
+ }
315
+ try {
316
+ await fs.access(filePath, fs.constants.W_OK);
317
+ canWrite = true;
318
+ }
319
+ catch {
320
+ // Cannot write
321
+ }
322
+ return {
323
+ size: stats.size,
324
+ isFile: stats.isFile(),
325
+ isDirectory: stats.isDirectory(),
326
+ lastModified: stats.mtime,
327
+ canRead,
328
+ canWrite
329
+ };
330
+ }
331
+ catch (error) {
332
+ throw new Error(`Failed to get file info: ${error instanceof Error ? error.message : 'Unknown error'}`);
333
+ }
334
+ }
335
+ // =============================================================================
336
+ // PRIVATE HELPER METHODS
337
+ // =============================================================================
338
+ /**
339
+ * Converts buffer to chunks for streaming
340
+ * @param buffer - Buffer to chunk
341
+ * @returns Generator that yields buffer chunks
342
+ */
343
+ *bufferToChunks(buffer) {
344
+ const chunkSize = this.config.chunkSize;
345
+ for (let offset = 0; offset < buffer.length; offset += chunkSize) {
346
+ yield buffer.subarray(offset, Math.min(offset + chunkSize, buffer.length));
347
+ }
348
+ }
349
+ /**
350
+ * Wraps a promise with timeout functionality
351
+ * @param promise - Promise to wrap
352
+ * @param timeoutMs - Timeout in milliseconds
353
+ * @param errorMessage - Error message for timeout
354
+ * @returns Promise that rejects if timeout is reached
355
+ */
356
+ async withTimeout(promise, timeoutMs, errorMessage) {
357
+ const timeoutPromise = new Promise((_, reject) => {
358
+ setTimeout(() => reject(new Error(errorMessage)), timeoutMs);
359
+ });
360
+ return Promise.race([promise, timeoutPromise]);
361
+ }
362
+ }
363
/**
 * Factory for StreamingOperations using default configuration.
 * @param config - Optional configuration overrides
 * @returns StreamingOperations instance
 */
export function createStreamingOperations(config) {
    const instance = new StreamingOperations(config);
    return instance;
}
371
/**
 * Utility function to format bytes for progress reporting.
 * @param bytes - Number of bytes (non-positive values render as "0 B")
 * @returns Formatted string (e.g., "1.5 MB")
 */
export function formatBytes(bytes) {
    // Fix: the original computed Math.floor(Math.log(bytes) / Math.log(1024))
    // unguarded, so negative input yielded "NaN undefined" and values in (0, 1)
    // produced a negative index into `sizes`.
    if (bytes <= 0)
        return '0 B';
    const k = 1024;
    const sizes = ['B', 'KB', 'MB', 'GB', 'TB'];
    // Clamp the unit index: values >= 1 PB would otherwise run past the table
    // and print "undefined" as the unit; they now render in TB.
    const rawIndex = Math.floor(Math.log(bytes) / Math.log(k));
    const i = Math.min(Math.max(rawIndex, 0), sizes.length - 1);
    return `${parseFloat((bytes / Math.pow(k, i)).toFixed(2))} ${sizes[i]}`;
}
384
/**
 * Utility function to format processing time.
 * Sub-second durations are shown as whole milliseconds; longer durations
 * are shown as seconds with one decimal place.
 * @param milliseconds - Processing time in milliseconds
 * @returns Formatted string (e.g., "1.5s" or "150ms")
 */
export function formatProcessingTime(milliseconds) {
    return milliseconds < 1000
        ? `${Math.round(milliseconds)}ms`
        : `${(milliseconds / 1000).toFixed(1)}s`;
}
397
/**
 * Utility function to calculate processing speed.
 * @param bytes - Number of bytes processed
 * @param milliseconds - Processing time in milliseconds
 * @returns Speed in MB/s (0 when elapsed time is zero)
 */
export function calculateProcessingSpeed(bytes, milliseconds) {
    // Guard against division by zero for instantaneous operations.
    if (milliseconds === 0) {
        return 0;
    }
    const BYTES_PER_MB = 1024 * 1024;
    // Same operation order as throughput-per-second, then scaled to MB/s.
    const bytesPerSecond = (bytes / milliseconds) * 1000;
    return bytesPerSecond / BYTES_PER_MB;
}
409
+ //# sourceMappingURL=streaming-operations.js.map