rag-lite-ts 1.0.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202) hide show
  1. package/README.md +606 -93
  2. package/dist/cli/indexer.js +192 -4
  3. package/dist/cli/search.js +50 -11
  4. package/dist/cli.js +183 -26
  5. package/dist/core/abstract-embedder.d.ts +125 -0
  6. package/dist/core/abstract-embedder.js +264 -0
  7. package/dist/core/actionable-error-messages.d.ts +60 -0
  8. package/dist/core/actionable-error-messages.js +397 -0
  9. package/dist/core/batch-processing-optimizer.d.ts +155 -0
  10. package/dist/core/batch-processing-optimizer.js +541 -0
  11. package/dist/core/chunker.d.ts +2 -0
  12. package/dist/core/cli-database-utils.d.ts +53 -0
  13. package/dist/core/cli-database-utils.js +239 -0
  14. package/dist/core/config.js +10 -3
  15. package/dist/core/content-errors.d.ts +111 -0
  16. package/dist/core/content-errors.js +362 -0
  17. package/dist/core/content-manager.d.ts +343 -0
  18. package/dist/core/content-manager.js +1504 -0
  19. package/dist/core/content-performance-optimizer.d.ts +150 -0
  20. package/dist/core/content-performance-optimizer.js +516 -0
  21. package/dist/core/content-resolver.d.ts +104 -0
  22. package/dist/core/content-resolver.js +285 -0
  23. package/dist/core/cross-modal-search.d.ts +164 -0
  24. package/dist/core/cross-modal-search.js +342 -0
  25. package/dist/core/database-connection-manager.d.ts +109 -0
  26. package/dist/core/database-connection-manager.js +304 -0
  27. package/dist/core/db.d.ts +141 -2
  28. package/dist/core/db.js +631 -89
  29. package/dist/core/embedder-factory.d.ts +176 -0
  30. package/dist/core/embedder-factory.js +338 -0
  31. package/dist/core/index.d.ts +3 -1
  32. package/dist/core/index.js +4 -1
  33. package/dist/core/ingestion.d.ts +85 -15
  34. package/dist/core/ingestion.js +510 -45
  35. package/dist/core/lazy-dependency-loader.d.ts +152 -0
  36. package/dist/core/lazy-dependency-loader.js +453 -0
  37. package/dist/core/mode-detection-service.d.ts +150 -0
  38. package/dist/core/mode-detection-service.js +565 -0
  39. package/dist/core/mode-model-validator.d.ts +92 -0
  40. package/dist/core/mode-model-validator.js +203 -0
  41. package/dist/core/model-registry.d.ts +120 -0
  42. package/dist/core/model-registry.js +415 -0
  43. package/dist/core/model-validator.d.ts +217 -0
  44. package/dist/core/model-validator.js +782 -0
  45. package/dist/core/polymorphic-search-factory.d.ts +154 -0
  46. package/dist/core/polymorphic-search-factory.js +344 -0
  47. package/dist/core/raglite-paths.d.ts +121 -0
  48. package/dist/core/raglite-paths.js +145 -0
  49. package/dist/core/reranking-config.d.ts +42 -0
  50. package/dist/core/reranking-config.js +156 -0
  51. package/dist/core/reranking-factory.d.ts +92 -0
  52. package/dist/core/reranking-factory.js +591 -0
  53. package/dist/core/reranking-strategies.d.ts +325 -0
  54. package/dist/core/reranking-strategies.js +720 -0
  55. package/dist/core/resource-cleanup.d.ts +163 -0
  56. package/dist/core/resource-cleanup.js +371 -0
  57. package/dist/core/resource-manager.d.ts +212 -0
  58. package/dist/core/resource-manager.js +564 -0
  59. package/dist/core/search.d.ts +28 -1
  60. package/dist/core/search.js +83 -5
  61. package/dist/core/streaming-operations.d.ts +145 -0
  62. package/dist/core/streaming-operations.js +409 -0
  63. package/dist/core/types.d.ts +3 -0
  64. package/dist/core/universal-embedder.d.ts +177 -0
  65. package/dist/core/universal-embedder.js +139 -0
  66. package/dist/core/validation-messages.d.ts +99 -0
  67. package/dist/core/validation-messages.js +334 -0
  68. package/dist/core/vector-index.js +7 -8
  69. package/dist/factories/index.d.ts +1 -1
  70. package/dist/factories/text-factory.d.ts +128 -34
  71. package/dist/factories/text-factory.js +346 -97
  72. package/dist/file-processor.d.ts +88 -2
  73. package/dist/file-processor.js +720 -17
  74. package/dist/index.d.ts +9 -0
  75. package/dist/index.js +11 -0
  76. package/dist/ingestion.d.ts +16 -0
  77. package/dist/ingestion.js +21 -0
  78. package/dist/mcp-server.d.ts +35 -3
  79. package/dist/mcp-server.js +1107 -31
  80. package/dist/multimodal/clip-embedder.d.ts +314 -0
  81. package/dist/multimodal/clip-embedder.js +945 -0
  82. package/dist/multimodal/index.d.ts +6 -0
  83. package/dist/multimodal/index.js +6 -0
  84. package/dist/run-error-recovery-tests.d.ts +7 -0
  85. package/dist/run-error-recovery-tests.js +101 -0
  86. package/dist/search.d.ts +26 -0
  87. package/dist/search.js +54 -1
  88. package/dist/test-utils.d.ts +8 -26
  89. package/dist/text/chunker.d.ts +1 -0
  90. package/dist/text/embedder.js +15 -8
  91. package/dist/text/index.d.ts +1 -0
  92. package/dist/text/index.js +1 -0
  93. package/dist/text/reranker.d.ts +1 -2
  94. package/dist/text/reranker.js +17 -47
  95. package/dist/text/sentence-transformer-embedder.d.ts +96 -0
  96. package/dist/text/sentence-transformer-embedder.js +340 -0
  97. package/dist/types.d.ts +39 -0
  98. package/dist/utils/vector-math.d.ts +31 -0
  99. package/dist/utils/vector-math.js +70 -0
  100. package/package.json +15 -3
  101. package/dist/api-errors.d.ts.map +0 -1
  102. package/dist/api-errors.js.map +0 -1
  103. package/dist/cli/indexer.d.ts.map +0 -1
  104. package/dist/cli/indexer.js.map +0 -1
  105. package/dist/cli/search.d.ts.map +0 -1
  106. package/dist/cli/search.js.map +0 -1
  107. package/dist/cli.d.ts.map +0 -1
  108. package/dist/cli.js.map +0 -1
  109. package/dist/config.d.ts.map +0 -1
  110. package/dist/config.js.map +0 -1
  111. package/dist/core/adapters.d.ts.map +0 -1
  112. package/dist/core/adapters.js.map +0 -1
  113. package/dist/core/chunker.d.ts.map +0 -1
  114. package/dist/core/chunker.js.map +0 -1
  115. package/dist/core/config.d.ts.map +0 -1
  116. package/dist/core/config.js.map +0 -1
  117. package/dist/core/db.d.ts.map +0 -1
  118. package/dist/core/db.js.map +0 -1
  119. package/dist/core/error-handler.d.ts.map +0 -1
  120. package/dist/core/error-handler.js.map +0 -1
  121. package/dist/core/index.d.ts.map +0 -1
  122. package/dist/core/index.js.map +0 -1
  123. package/dist/core/ingestion.d.ts.map +0 -1
  124. package/dist/core/ingestion.js.map +0 -1
  125. package/dist/core/interfaces.d.ts.map +0 -1
  126. package/dist/core/interfaces.js.map +0 -1
  127. package/dist/core/path-manager.d.ts.map +0 -1
  128. package/dist/core/path-manager.js.map +0 -1
  129. package/dist/core/search-example.d.ts +0 -25
  130. package/dist/core/search-example.d.ts.map +0 -1
  131. package/dist/core/search-example.js +0 -138
  132. package/dist/core/search-example.js.map +0 -1
  133. package/dist/core/search-pipeline-example.d.ts +0 -21
  134. package/dist/core/search-pipeline-example.d.ts.map +0 -1
  135. package/dist/core/search-pipeline-example.js +0 -188
  136. package/dist/core/search-pipeline-example.js.map +0 -1
  137. package/dist/core/search-pipeline.d.ts.map +0 -1
  138. package/dist/core/search-pipeline.js.map +0 -1
  139. package/dist/core/search.d.ts.map +0 -1
  140. package/dist/core/search.js.map +0 -1
  141. package/dist/core/types.d.ts.map +0 -1
  142. package/dist/core/types.js.map +0 -1
  143. package/dist/core/vector-index.d.ts.map +0 -1
  144. package/dist/core/vector-index.js.map +0 -1
  145. package/dist/dom-polyfills.d.ts.map +0 -1
  146. package/dist/dom-polyfills.js.map +0 -1
  147. package/dist/examples/clean-api-examples.d.ts +0 -44
  148. package/dist/examples/clean-api-examples.d.ts.map +0 -1
  149. package/dist/examples/clean-api-examples.js +0 -206
  150. package/dist/examples/clean-api-examples.js.map +0 -1
  151. package/dist/factories/index.d.ts.map +0 -1
  152. package/dist/factories/index.js.map +0 -1
  153. package/dist/factories/text-factory.d.ts.map +0 -1
  154. package/dist/factories/text-factory.js.map +0 -1
  155. package/dist/file-processor.d.ts.map +0 -1
  156. package/dist/file-processor.js.map +0 -1
  157. package/dist/index-manager.d.ts.map +0 -1
  158. package/dist/index-manager.js.map +0 -1
  159. package/dist/index.d.ts.map +0 -1
  160. package/dist/index.js.map +0 -1
  161. package/dist/indexer.d.ts.map +0 -1
  162. package/dist/indexer.js.map +0 -1
  163. package/dist/ingestion.d.ts.map +0 -1
  164. package/dist/ingestion.js.map +0 -1
  165. package/dist/mcp-server.d.ts.map +0 -1
  166. package/dist/mcp-server.js.map +0 -1
  167. package/dist/preprocess.d.ts.map +0 -1
  168. package/dist/preprocess.js.map +0 -1
  169. package/dist/preprocessors/index.d.ts.map +0 -1
  170. package/dist/preprocessors/index.js.map +0 -1
  171. package/dist/preprocessors/mdx.d.ts.map +0 -1
  172. package/dist/preprocessors/mdx.js.map +0 -1
  173. package/dist/preprocessors/mermaid.d.ts.map +0 -1
  174. package/dist/preprocessors/mermaid.js.map +0 -1
  175. package/dist/preprocessors/registry.d.ts.map +0 -1
  176. package/dist/preprocessors/registry.js.map +0 -1
  177. package/dist/search-standalone.d.ts.map +0 -1
  178. package/dist/search-standalone.js.map +0 -1
  179. package/dist/search.d.ts.map +0 -1
  180. package/dist/search.js.map +0 -1
  181. package/dist/test-utils.d.ts.map +0 -1
  182. package/dist/test-utils.js.map +0 -1
  183. package/dist/text/chunker.d.ts.map +0 -1
  184. package/dist/text/chunker.js.map +0 -1
  185. package/dist/text/embedder.d.ts.map +0 -1
  186. package/dist/text/embedder.js.map +0 -1
  187. package/dist/text/index.d.ts.map +0 -1
  188. package/dist/text/index.js.map +0 -1
  189. package/dist/text/preprocessors/index.d.ts.map +0 -1
  190. package/dist/text/preprocessors/index.js.map +0 -1
  191. package/dist/text/preprocessors/mdx.d.ts.map +0 -1
  192. package/dist/text/preprocessors/mdx.js.map +0 -1
  193. package/dist/text/preprocessors/mermaid.d.ts.map +0 -1
  194. package/dist/text/preprocessors/mermaid.js.map +0 -1
  195. package/dist/text/preprocessors/registry.d.ts.map +0 -1
  196. package/dist/text/preprocessors/registry.js.map +0 -1
  197. package/dist/text/reranker.d.ts.map +0 -1
  198. package/dist/text/reranker.js.map +0 -1
  199. package/dist/text/tokenizer.d.ts.map +0 -1
  200. package/dist/text/tokenizer.js.map +0 -1
  201. package/dist/types.d.ts.map +0 -1
  202. package/dist/types.js.map +0 -1
@@ -43,7 +43,15 @@ import * as mammoth from 'mammoth';
43
43
  /**
44
44
  * Supported file extensions for document ingestion
45
45
  */
46
- const SUPPORTED_EXTENSIONS = ['.md', '.txt', '.mdx', '.pdf', '.docx'];
46
+ const SUPPORTED_TEXT_EXTENSIONS = ['.md', '.txt', '.mdx', '.pdf', '.docx'];
47
+ /**
48
+ * Supported image file extensions for multimodal ingestion
49
+ */
50
+ const SUPPORTED_IMAGE_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'];
51
+ /**
52
+ * All supported file extensions (text + image)
53
+ */
54
+ const SUPPORTED_EXTENSIONS = [...SUPPORTED_TEXT_EXTENSIONS, ...SUPPORTED_IMAGE_EXTENSIONS];
47
55
  /**
48
56
  * Default options for file processing
49
57
  */
@@ -51,6 +59,15 @@ export const DEFAULT_FILE_PROCESSOR_OPTIONS = {
51
59
  recursive: true,
52
60
  maxFileSize: 10 * 1024 * 1024 // 10MB
53
61
  };
62
+ /**
63
+ * Default options for image-to-text processing
64
+ */
65
+ export const DEFAULT_IMAGE_TO_TEXT_OPTIONS = {
66
+ model: 'Xenova/vit-gpt2-image-captioning',
67
+ maxLength: 50,
68
+ batchSize: 4,
69
+ includeConfidence: false
70
+ };
54
71
  /**
55
72
  * Check if a file has a supported extension
56
73
  */
@@ -58,6 +75,93 @@ function isSupportedFile(filePath) {
58
75
  const ext = extname(filePath).toLowerCase();
59
76
  return SUPPORTED_EXTENSIONS.includes(ext);
60
77
  }
78
+ /**
79
+ * Determine content type based on file extension
80
+ */
81
+ function getContentType(filePath) {
82
+ const ext = extname(filePath).toLowerCase();
83
+ if (SUPPORTED_IMAGE_EXTENSIONS.includes(ext)) {
84
+ return 'image';
85
+ }
86
+ return 'text';
87
+ }
88
+ /**
89
+ * Check if a file is an image file
90
+ */
91
+ function isImageFile(filePath) {
92
+ const ext = extname(filePath).toLowerCase();
93
+ return SUPPORTED_IMAGE_EXTENSIONS.includes(ext);
94
+ }
95
+ /**
96
+ * Validate image file format and accessibility
97
+ */
98
+ async function validateImageFile(filePath) {
99
+ try {
100
+ const stats = await fs.stat(filePath);
101
+ // Check if file is readable
102
+ if (!stats.isFile()) {
103
+ return { valid: false, error: 'Path is not a file' };
104
+ }
105
+ // Check file size (images can be larger than text files)
106
+ const maxImageSize = 50 * 1024 * 1024; // 50MB for images
107
+ if (stats.size > maxImageSize) {
108
+ return {
109
+ valid: false,
110
+ error: `Image file size (${Math.round(stats.size / 1024 / 1024)}MB) exceeds maximum (50MB)`
111
+ };
112
+ }
113
+ // Check if file is empty
114
+ if (stats.size === 0) {
115
+ return { valid: false, error: 'Image file is empty' };
116
+ }
117
+ // Basic format validation by reading file header
118
+ const buffer = await fs.readFile(filePath, { encoding: null });
119
+ const ext = extname(filePath).toLowerCase();
120
+ // Validate file signatures (magic numbers)
121
+ if (ext === '.jpg' || ext === '.jpeg') {
122
+ if (buffer[0] !== 0xFF || buffer[1] !== 0xD8) {
123
+ return { valid: false, error: 'Invalid JPEG file format' };
124
+ }
125
+ }
126
+ else if (ext === '.png') {
127
+ const pngSignature = [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A];
128
+ for (let i = 0; i < pngSignature.length; i++) {
129
+ if (buffer[i] !== pngSignature[i]) {
130
+ return { valid: false, error: 'Invalid PNG file format' };
131
+ }
132
+ }
133
+ }
134
+ else if (ext === '.gif') {
135
+ const gifSignature = [0x47, 0x49, 0x46]; // "GIF"
136
+ for (let i = 0; i < gifSignature.length; i++) {
137
+ if (buffer[i] !== gifSignature[i]) {
138
+ return { valid: false, error: 'Invalid GIF file format' };
139
+ }
140
+ }
141
+ }
142
+ else if (ext === '.webp') {
143
+ // WebP: "RIFF" at start and "WEBP" at offset 8
144
+ if (buffer[0] !== 0x52 || buffer[1] !== 0x49 || buffer[2] !== 0x46 || buffer[3] !== 0x46) {
145
+ return { valid: false, error: 'Invalid WebP file format (missing RIFF header)' };
146
+ }
147
+ if (buffer[8] !== 0x57 || buffer[9] !== 0x45 || buffer[10] !== 0x42 || buffer[11] !== 0x50) {
148
+ return { valid: false, error: 'Invalid WebP file format (missing WEBP signature)' };
149
+ }
150
+ }
151
+ else if (ext === '.bmp') {
152
+ if (buffer[0] !== 0x42 || buffer[1] !== 0x4D) { // "BM"
153
+ return { valid: false, error: 'Invalid BMP file format' };
154
+ }
155
+ }
156
+ return { valid: true };
157
+ }
158
+ catch (error) {
159
+ return {
160
+ valid: false,
161
+ error: `Failed to validate image file: ${error instanceof Error ? error.message : String(error)}`
162
+ };
163
+ }
164
+ }
61
165
  /**
62
166
  * Recursively discover files in a directory
63
167
  */
@@ -81,21 +185,37 @@ async function discoverFilesRecursive(dirPath, options) {
81
185
  else if (entry.isFile()) {
82
186
  if (isSupportedFile(fullPath)) {
83
187
  try {
84
- // Check file size
188
+ // Check file size based on content type
85
189
  const stats = await fs.stat(fullPath);
86
- if (options.maxFileSize && stats.size > options.maxFileSize) {
190
+ const contentType = getContentType(fullPath);
191
+ // Different size limits for different content types
192
+ const maxSize = contentType === 'image'
193
+ ? 50 * 1024 * 1024 // 50MB for images
194
+ : (options.maxFileSize || 10 * 1024 * 1024); // 10MB for text files
195
+ if (stats.size > maxSize) {
87
196
  result.skipped.push({
88
197
  path: fullPath,
89
- reason: `File size (${stats.size} bytes) exceeds maximum (${options.maxFileSize} bytes)`
198
+ reason: `File size (${Math.round(stats.size / 1024 / 1024)}MB) exceeds maximum (${Math.round(maxSize / 1024 / 1024)}MB) for ${contentType} files`
90
199
  });
91
200
  continue;
92
201
  }
202
+ // Additional validation for image files
203
+ if (contentType === 'image') {
204
+ const validation = await validateImageFile(fullPath);
205
+ if (!validation.valid) {
206
+ result.skipped.push({
207
+ path: fullPath,
208
+ reason: validation.error || 'Invalid image file'
209
+ });
210
+ continue;
211
+ }
212
+ }
93
213
  result.files.push(fullPath);
94
214
  }
95
215
  catch (error) {
96
216
  result.skipped.push({
97
217
  path: fullPath,
98
- reason: `Failed to stat file: ${error instanceof Error ? error.message : String(error)}`
218
+ reason: `Failed to validate file: ${error instanceof Error ? error.message : String(error)}`
99
219
  });
100
220
  }
101
221
  }
@@ -125,20 +245,37 @@ export async function discoverFiles(path, options = DEFAULT_FILE_PROCESSOR_OPTIO
125
245
  files: [],
126
246
  skipped: [{
127
247
  path: resolvedPath,
128
- reason: `Unsupported file extension. Supported: ${SUPPORTED_EXTENSIONS.join(', ')}`
248
+ reason: `Unsupported file extension. Supported text: ${SUPPORTED_TEXT_EXTENSIONS.join(', ')}, images: ${SUPPORTED_IMAGE_EXTENSIONS.join(', ')}`
129
249
  }]
130
250
  };
131
251
  }
132
- // Check file size
133
- if (options.maxFileSize && stats.size > options.maxFileSize) {
252
+ const contentType = getContentType(resolvedPath);
253
+ // Check file size based on content type
254
+ const maxSize = contentType === 'image'
255
+ ? 50 * 1024 * 1024 // 50MB for images
256
+ : (options.maxFileSize || 10 * 1024 * 1024); // 10MB for text files
257
+ if (stats.size > maxSize) {
134
258
  return {
135
259
  files: [],
136
260
  skipped: [{
137
261
  path: resolvedPath,
138
- reason: `File size (${stats.size} bytes) exceeds maximum (${options.maxFileSize} bytes)`
262
+ reason: `File size (${Math.round(stats.size / 1024 / 1024)}MB) exceeds maximum (${Math.round(maxSize / 1024 / 1024)}MB) for ${contentType} files`
139
263
  }]
140
264
  };
141
265
  }
266
+ // Additional validation for image files
267
+ if (contentType === 'image') {
268
+ const validation = await validateImageFile(resolvedPath);
269
+ if (!validation.valid) {
270
+ return {
271
+ files: [],
272
+ skipped: [{
273
+ path: resolvedPath,
274
+ reason: validation.error || 'Invalid image file'
275
+ }]
276
+ };
277
+ }
278
+ }
142
279
  return {
143
280
  files: [resolvedPath],
144
281
  skipped: []
@@ -205,11 +342,424 @@ function extractTitle(content, filePath) {
205
342
  const ext = extname(filename);
206
343
  return ext ? filename.slice(0, -ext.length) : filename;
207
344
  }
345
+ /**
346
+ * Cache for image-to-text pipeline to avoid reloading
347
+ */
348
+ let imageToTextPipeline = null;
349
+ /**
350
+ * Initialize the image-to-text pipeline
351
+ */
352
+ async function initializeImageToTextPipeline(modelName = 'Xenova/vit-gpt2-image-captioning') {
353
+ if (imageToTextPipeline) {
354
+ return imageToTextPipeline;
355
+ }
356
+ try {
357
+ const { pipeline } = await import('@huggingface/transformers');
358
+ console.log(`Loading image-to-text model: ${modelName}`);
359
+ imageToTextPipeline = await pipeline('image-to-text', modelName);
360
+ console.log(`Successfully loaded image-to-text model: ${modelName}`);
361
+ return imageToTextPipeline;
362
+ }
363
+ catch (error) {
364
+ console.error(`Failed to load image-to-text model ${modelName}:`, error);
365
+ throw new Error(`Failed to initialize image-to-text pipeline: ${error instanceof Error ? error.message : String(error)}`);
366
+ }
367
+ }
368
+ /**
369
+ * Parse PNG image dimensions from file buffer
370
+ */
371
+ function parsePngDimensions(buffer) {
372
+ try {
373
+ // PNG signature: 89 50 4E 47 0D 0A 1A 0A
374
+ if (buffer.length < 24)
375
+ return null;
376
+ // Check PNG signature
377
+ const pngSignature = [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A];
378
+ for (let i = 0; i < pngSignature.length; i++) {
379
+ if (buffer[i] !== pngSignature[i])
380
+ return null;
381
+ }
382
+ // IHDR chunk starts at byte 8, dimensions at bytes 16-23
383
+ const width = buffer.readUInt32BE(16);
384
+ const height = buffer.readUInt32BE(20);
385
+ return { width, height };
386
+ }
387
+ catch (error) {
388
+ return null;
389
+ }
390
+ }
391
+ /**
392
+ * Parse JPEG image dimensions from file buffer
393
+ */
394
+ function parseJpegDimensions(buffer) {
395
+ try {
396
+ if (buffer.length < 4)
397
+ return null;
398
+ // Check JPEG signature
399
+ if (buffer[0] !== 0xFF || buffer[1] !== 0xD8)
400
+ return null;
401
+ let offset = 2;
402
+ while (offset < buffer.length - 8) {
403
+ // Find SOF (Start of Frame) markers
404
+ if (buffer[offset] === 0xFF) {
405
+ const marker = buffer[offset + 1];
406
+ // SOF0 (0xC0) or SOF2 (0xC2) markers contain dimensions
407
+ if (marker === 0xC0 || marker === 0xC2) {
408
+ const height = buffer.readUInt16BE(offset + 5);
409
+ const width = buffer.readUInt16BE(offset + 7);
410
+ return { width, height };
411
+ }
412
+ // Skip to next marker
413
+ const segmentLength = buffer.readUInt16BE(offset + 2);
414
+ offset += 2 + segmentLength;
415
+ }
416
+ else {
417
+ offset++;
418
+ }
419
+ }
420
+ return null;
421
+ }
422
+ catch (error) {
423
+ return null;
424
+ }
425
+ }
426
+ /**
427
+ * Parse GIF image dimensions from file buffer
428
+ */
429
+ function parseGifDimensions(buffer) {
430
+ try {
431
+ if (buffer.length < 10)
432
+ return null;
433
+ // Check GIF signature
434
+ const gifSignature = [0x47, 0x49, 0x46]; // "GIF"
435
+ for (let i = 0; i < gifSignature.length; i++) {
436
+ if (buffer[i] !== gifSignature[i])
437
+ return null;
438
+ }
439
+ // Dimensions are at bytes 6-9 (little endian)
440
+ const width = buffer.readUInt16LE(6);
441
+ const height = buffer.readUInt16LE(8);
442
+ return { width, height };
443
+ }
444
+ catch (error) {
445
+ return null;
446
+ }
447
+ }
448
+ /**
449
+ * Parse WebP image dimensions from file buffer
450
+ */
451
+ function parseWebpDimensions(buffer) {
452
+ try {
453
+ if (buffer.length < 30)
454
+ return null;
455
+ // Check WebP signature
456
+ if (buffer.readUInt32BE(0) !== 0x52494646)
457
+ return null; // "RIFF"
458
+ if (buffer.readUInt32BE(8) !== 0x57454250)
459
+ return null; // "WEBP"
460
+ // VP8 format
461
+ if (buffer.readUInt32BE(12) === 0x56503820) { // "VP8 "
462
+ const width = buffer.readUInt16LE(26) & 0x3FFF;
463
+ const height = buffer.readUInt16LE(28) & 0x3FFF;
464
+ return { width, height };
465
+ }
466
+ // VP8L format
467
+ if (buffer.readUInt32BE(12) === 0x5650384C) { // "VP8L"
468
+ const bits = buffer.readUInt32LE(21);
469
+ const width = (bits & 0x3FFF) + 1;
470
+ const height = ((bits >> 14) & 0x3FFF) + 1;
471
+ return { width, height };
472
+ }
473
+ return null;
474
+ }
475
+ catch (error) {
476
+ return null;
477
+ }
478
+ }
479
+ /**
480
+ * Parse BMP image dimensions from file buffer
481
+ */
482
+ function parseBmpDimensions(buffer) {
483
+ try {
484
+ if (buffer.length < 26)
485
+ return null;
486
+ // Check BMP signature
487
+ if (buffer[0] !== 0x42 || buffer[1] !== 0x4D)
488
+ return null; // "BM"
489
+ // Dimensions are at bytes 18-25 (little endian)
490
+ const width = buffer.readInt32LE(18);
491
+ const height = Math.abs(buffer.readInt32LE(22)); // Height can be negative
492
+ return { width, height };
493
+ }
494
+ catch (error) {
495
+ return null;
496
+ }
497
+ }
498
+ /**
499
+ * Extract image dimensions from file buffer based on format
500
+ */
501
+ function extractImageDimensions(buffer, format) {
502
+ switch (format.toLowerCase()) {
503
+ case 'png':
504
+ return parsePngDimensions(buffer);
505
+ case 'jpg':
506
+ case 'jpeg':
507
+ return parseJpegDimensions(buffer);
508
+ case 'gif':
509
+ return parseGifDimensions(buffer);
510
+ case 'webp':
511
+ return parseWebpDimensions(buffer);
512
+ case 'bmp':
513
+ return parseBmpDimensions(buffer);
514
+ default:
515
+ return null;
516
+ }
517
+ }
518
+ /**
519
+ * Extract metadata from an image file using native parsing
520
+ */
521
+ async function extractImageMetadata(imagePath) {
522
+ try {
523
+ const stats = await fs.stat(imagePath);
524
+ const format = extname(imagePath).toLowerCase().substring(1);
525
+ // Read file buffer for dimension extraction
526
+ const buffer = await fs.readFile(imagePath);
527
+ // Extract dimensions using native parsing
528
+ const dimensions = extractImageDimensions(buffer, format);
529
+ const imageMetadata = {
530
+ originalPath: imagePath,
531
+ dimensions: dimensions || { width: 0, height: 0 }, // Use 0 if dimensions can't be extracted
532
+ fileSize: stats.size,
533
+ format: format,
534
+ createdAt: stats.birthtime || stats.mtime
535
+ };
536
+ return imageMetadata;
537
+ }
538
+ catch (error) {
539
+ throw new Error(`Failed to extract metadata for image ${imagePath}: ${error instanceof Error ? error.message : String(error)}`);
540
+ }
541
+ }
542
+ /**
543
+ * Generate text description for a single image
544
+ */
545
+ async function generateImageDescription(imagePath, options = DEFAULT_IMAGE_TO_TEXT_OPTIONS) {
546
+ try {
547
+ const pipeline = await initializeImageToTextPipeline(options.model);
548
+ // Generate description
549
+ const result = await pipeline(imagePath, {
550
+ max_length: options.maxLength || 50,
551
+ num_beams: 4,
552
+ early_stopping: true
553
+ });
554
+ // Extract description and confidence
555
+ const description = Array.isArray(result) ? result[0]?.generated_text : result?.generated_text;
556
+ const confidence = Array.isArray(result) ? result[0]?.score : result?.score;
557
+ if (!description) {
558
+ throw new Error('No description generated for image');
559
+ }
560
+ // Clean up the description
561
+ const cleanDescription = description.trim();
562
+ return {
563
+ description: cleanDescription,
564
+ confidence: options.includeConfidence ? confidence : undefined,
565
+ model: options.model || DEFAULT_IMAGE_TO_TEXT_OPTIONS.model
566
+ };
567
+ }
568
+ catch (error) {
569
+ throw new Error(`Failed to generate description for image ${imagePath}: ${error instanceof Error ? error.message : String(error)}`);
570
+ }
571
+ }
572
+ /**
573
+ * Generate text descriptions for multiple images in batches
574
+ */
575
+ async function generateImageDescriptionsBatch(imagePaths, options = DEFAULT_IMAGE_TO_TEXT_OPTIONS) {
576
+ const results = [];
577
+ const batchSize = options.batchSize || DEFAULT_IMAGE_TO_TEXT_OPTIONS.batchSize;
578
+ // Process images in batches
579
+ for (let i = 0; i < imagePaths.length; i += batchSize) {
580
+ const batch = imagePaths.slice(i, i + batchSize);
581
+ console.log(`Processing image batch ${Math.floor(i / batchSize) + 1}/${Math.ceil(imagePaths.length / batchSize)} (${batch.length} images)`);
582
+ // Process batch in parallel
583
+ const batchPromises = batch.map(async (imagePath) => {
584
+ try {
585
+ const result = await generateImageDescription(imagePath, options);
586
+ return { path: imagePath, result };
587
+ }
588
+ catch (error) {
589
+ return {
590
+ path: imagePath,
591
+ error: error instanceof Error ? error.message : String(error)
592
+ };
593
+ }
594
+ });
595
+ const batchResults = await Promise.all(batchPromises);
596
+ results.push(...batchResults);
597
+ }
598
+ return results;
599
+ }
600
+ /**
601
+ * Generate text descriptions for multiple images using optimized batch processing
602
+ * Uses BatchProcessingOptimizer for memory-efficient processing of large image collections
603
+ */
604
+ async function generateImageDescriptionsBatchOptimized(imagePaths, options = DEFAULT_IMAGE_TO_TEXT_OPTIONS) {
605
+ // For small batches, use the existing implementation
606
+ if (imagePaths.length <= 10) {
607
+ return generateImageDescriptionsBatch(imagePaths, options);
608
+ }
609
+ try {
610
+ // Import batch processing optimizer
611
+ const { createImageBatchProcessor } = await import('./core/batch-processing-optimizer.js');
612
+ const batchProcessor = createImageBatchProcessor();
613
+ // Convert image paths to batch items
614
+ const batchItems = imagePaths.map(path => ({
615
+ content: path,
616
+ contentType: 'image',
617
+ metadata: { originalPath: path }
618
+ }));
619
+ // Create image description function
620
+ const imageDescriptionFunction = async (item) => {
621
+ try {
622
+ const result = await generateImageDescription(item.content, options);
623
+ return {
624
+ embedding_id: `img_desc_${Date.now()}_${Math.random()}`,
625
+ vector: new Float32Array([0]), // Placeholder vector
626
+ contentType: 'image',
627
+ metadata: {
628
+ path: item.content,
629
+ description: result.description,
630
+ confidence: result.confidence,
631
+ model: result.model
632
+ }
633
+ };
634
+ }
635
+ catch (error) {
636
+ throw new Error(`Failed to generate description for ${item.content}: ${error instanceof Error ? error.message : String(error)}`);
637
+ }
638
+ };
639
+ // Process with optimization and progress reporting
640
+ const batchResult = await batchProcessor.processBatch(batchItems, imageDescriptionFunction, (stats) => {
641
+ console.log(`Image description progress: ${stats.processedItems}/${stats.totalItems} (${Math.round((stats.processedItems / stats.totalItems) * 100)}%)`);
642
+ console.log(` Memory usage: ${stats.memoryUsageMB}MB (peak: ${stats.peakMemoryUsageMB}MB)`);
643
+ if (stats.failedItems > 0) {
644
+ console.log(` Failed items: ${stats.failedItems}`);
645
+ }
646
+ });
647
+ // Log final statistics
648
+ console.log(`✓ Image description generation complete:`);
649
+ console.log(` Processed: ${batchResult.stats.processedItems}/${batchResult.stats.totalItems}`);
650
+ console.log(` Failed: ${batchResult.stats.failedItems}`);
651
+ console.log(` Processing time: ${Math.round(batchResult.stats.processingTimeMs / 1000)}s`);
652
+ console.log(` Rate: ${Math.round(batchResult.stats.itemsPerSecond)} images/sec`);
653
+ console.log(` Peak memory usage: ${batchResult.stats.peakMemoryUsageMB}MB`);
654
+ if (batchResult.stats.retryCount > 0) {
655
+ console.log(` Retries: ${batchResult.stats.retryCount}`);
656
+ }
657
+ // Convert results back to expected format
658
+ const results = [];
659
+ // Add successful results
660
+ for (const result of batchResult.results) {
661
+ if (result.metadata?.description) {
662
+ results.push({
663
+ path: result.metadata.path,
664
+ result: {
665
+ description: result.metadata.description,
666
+ confidence: result.metadata.confidence,
667
+ model: result.metadata.model
668
+ }
669
+ });
670
+ }
671
+ }
672
+ // Add failed results
673
+ for (const error of batchResult.errors) {
674
+ results.push({
675
+ path: error.item.content,
676
+ error: error.error
677
+ });
678
+ }
679
+ return results;
680
+ }
681
+ catch (error) {
682
+ console.warn(`Optimized batch processing failed, falling back to standard batch processing: ${error instanceof Error ? error.message : String(error)}`);
683
+ // Fall back to existing implementation
684
+ return generateImageDescriptionsBatch(imagePaths, options);
685
+ }
686
+ }
687
+ /**
688
+ * Process image file to extract text description and metadata
689
+ */
690
+ async function processImageFile(filePath, pathManager, options = DEFAULT_IMAGE_TO_TEXT_OPTIONS) {
691
+ try {
692
+ // Extract image metadata first
693
+ const imageMetadata = await extractImageMetadata(filePath);
694
+ // Generate text description for the image
695
+ const descriptionResult = await generateImageDescription(filePath, options);
696
+ // Update metadata with description information
697
+ imageMetadata.description = descriptionResult.description;
698
+ imageMetadata.descriptionModel = descriptionResult.model;
699
+ imageMetadata.descriptionConfidence = descriptionResult.confidence;
700
+ // Create document with image description as content
701
+ const title = extractTitle('', filePath); // Use filename as title for images
702
+ // Create content that includes description and key metadata
703
+ const content = `Image: ${title}\nDescription: ${descriptionResult.description}\nDimensions: ${imageMetadata.dimensions.width}x${imageMetadata.dimensions.height}\nFormat: ${imageMetadata.format}`;
704
+ return {
705
+ source: pathManager.toStoragePath(filePath),
706
+ title,
707
+ content: content.trim(),
708
+ // Store comprehensive metadata about the image
709
+ metadata: {
710
+ contentType: 'image',
711
+ ...imageMetadata // Spread all image metadata fields
712
+ }
713
+ };
714
+ }
715
+ catch (error) {
716
+ // If processing fails, try to extract at least basic metadata
717
+ console.warn(`Failed to fully process image ${filePath}, attempting basic metadata extraction: ${error instanceof Error ? error.message : String(error)}`);
718
+ try {
719
+ const imageMetadata = await extractImageMetadata(filePath);
720
+ const title = extractTitle('', filePath);
721
+ const content = `Image: ${title}\nDimensions: ${imageMetadata.dimensions.width}x${imageMetadata.dimensions.height}\nFormat: ${imageMetadata.format}`;
722
+ return {
723
+ source: pathManager.toStoragePath(filePath),
724
+ title,
725
+ content: content.trim(),
726
+ metadata: {
727
+ contentType: 'image',
728
+ ...imageMetadata,
729
+ processingError: error instanceof Error ? error.message : String(error)
730
+ }
731
+ };
732
+ }
733
+ catch (metadataError) {
734
+ // Final fallback - create document with minimal information
735
+ console.warn(`Failed to extract any metadata for image ${filePath}, using minimal fallback: ${metadataError instanceof Error ? metadataError.message : String(metadataError)}`);
736
+ const title = extractTitle('', filePath);
737
+ const content = `Image: ${title}\nPath: ${filePath}`;
738
+ return {
739
+ source: pathManager.toStoragePath(filePath),
740
+ title,
741
+ content: content.trim(),
742
+ metadata: {
743
+ contentType: 'image',
744
+ originalPath: filePath,
745
+ processingError: error instanceof Error ? error.message : String(error),
746
+ metadataError: metadataError instanceof Error ? metadataError.message : String(metadataError)
747
+ }
748
+ };
749
+ }
750
+ }
751
+ }
208
752
  /**
209
753
  * Process a single file into a Document
210
754
  */
211
- async function processFile(filePath, pathManager) {
755
+ async function processFile(filePath, pathManager, imageToTextOptions) {
212
756
  const result = await safeExecute(async () => {
757
+ const contentType = getContentType(filePath);
758
+ // Handle image files differently
759
+ if (contentType === 'image') {
760
+ return await processImageFile(filePath, pathManager, imageToTextOptions);
761
+ }
762
+ // Handle text files (existing logic)
213
763
  let content;
214
764
  const ext = extname(filePath).toLowerCase();
215
765
  // Extract content based on file type
@@ -241,7 +791,10 @@ async function processFile(filePath, pathManager) {
241
791
  return {
242
792
  source: pathManager.toStoragePath(filePath), // Use path manager
243
793
  title,
244
- content: content.trim()
794
+ content: content.trim(),
795
+ metadata: {
796
+ contentType: 'text'
797
+ }
245
798
  };
246
799
  }, `File Processing: ${filePath}`, {
247
800
  category: ErrorCategory.FILE_SYSTEM,
@@ -256,14 +809,18 @@ async function processFile(filePath, pathManager) {
256
809
  * Process multiple files into Documents
257
810
  * Handles errors gracefully by skipping problematic files
258
811
  */
259
- export async function processFiles(filePaths, pathManager) {
812
+ export async function processFiles(filePaths, pathManager, imageToTextOptions) {
260
813
  const result = {
261
814
  documents: [],
262
815
  errors: []
263
816
  };
264
- for (const filePath of filePaths) {
817
+ // Separate image and text files for optimized processing
818
+ const imageFiles = filePaths.filter(path => getContentType(path) === 'image');
819
+ const textFiles = filePaths.filter(path => getContentType(path) === 'text');
820
+ // Process text files sequentially (existing behavior)
821
+ for (const filePath of textFiles) {
265
822
  try {
266
- const document = await processFile(filePath, pathManager);
823
+ const document = await processFile(filePath, pathManager, imageToTextOptions);
267
824
  result.documents.push(document);
268
825
  }
269
826
  catch (error) {
@@ -273,13 +830,82 @@ export async function processFiles(filePaths, pathManager) {
273
830
  });
274
831
  }
275
832
  }
833
+ // Process image files in batches for efficiency
834
+ if (imageFiles.length > 0) {
835
+ console.log(`Processing ${imageFiles.length} image files with optimized batch processing`);
836
+ try {
837
+ // Use optimized batch processing for image descriptions
838
+ const batchResults = await generateImageDescriptionsBatchOptimized(imageFiles, imageToTextOptions);
839
+ // Convert batch results to documents with metadata extraction
840
+ for (const batchResult of batchResults) {
841
+ try {
842
+ // Extract metadata for each image
843
+ const imageMetadata = await extractImageMetadata(batchResult.path);
844
+ if (batchResult.result) {
845
+ // Create document from successful description generation
846
+ imageMetadata.description = batchResult.result.description;
847
+ imageMetadata.descriptionModel = batchResult.result.model;
848
+ imageMetadata.descriptionConfidence = batchResult.result.confidence;
849
+ const title = extractTitle('', batchResult.path);
850
+ const content = `Image: ${title}\nDescription: ${batchResult.result.description}\nDimensions: ${imageMetadata.dimensions.width}x${imageMetadata.dimensions.height}\nFormat: ${imageMetadata.format}`;
851
+ result.documents.push({
852
+ source: pathManager.toStoragePath(batchResult.path),
853
+ title,
854
+ content: content.trim(),
855
+ metadata: {
856
+ contentType: 'image',
857
+ ...imageMetadata
858
+ }
859
+ });
860
+ }
861
+ else {
862
+ // Create fallback document for failed description generation
863
+ const title = extractTitle('', batchResult.path);
864
+ const content = `Image: ${title}\nDimensions: ${imageMetadata.dimensions.width}x${imageMetadata.dimensions.height}\nFormat: ${imageMetadata.format}`;
865
+ result.documents.push({
866
+ source: pathManager.toStoragePath(batchResult.path),
867
+ title,
868
+ content: content.trim(),
869
+ metadata: {
870
+ contentType: 'image',
871
+ ...imageMetadata,
872
+ processingError: batchResult.error
873
+ }
874
+ });
875
+ }
876
+ }
877
+ catch (error) {
878
+ result.errors.push({
879
+ path: batchResult.path,
880
+ error: error instanceof Error ? error.message : String(error)
881
+ });
882
+ }
883
+ }
884
+ }
885
+ catch (error) {
886
+ // If batch processing fails entirely, fall back to individual processing
887
+ console.warn(`Batch processing failed, falling back to individual processing: ${error instanceof Error ? error.message : String(error)}`);
888
+ for (const filePath of imageFiles) {
889
+ try {
890
+ const document = await processFile(filePath, pathManager, imageToTextOptions);
891
+ result.documents.push(document);
892
+ }
893
+ catch (error) {
894
+ result.errors.push({
895
+ path: filePath,
896
+ error: error instanceof Error ? error.message : String(error)
897
+ });
898
+ }
899
+ }
900
+ }
901
+ }
276
902
  return result;
277
903
  }
278
904
  /**
279
905
  * Complete file discovery and processing pipeline
280
906
  * Discovers files and processes them into Documents
281
907
  */
282
- export async function discoverAndProcessFiles(path, options = DEFAULT_FILE_PROCESSOR_OPTIONS, pathManager) {
908
+ export async function discoverAndProcessFiles(path, options = DEFAULT_FILE_PROCESSOR_OPTIONS, pathManager, imageToTextOptions) {
283
909
  console.log(`Discovering files in: ${path}`);
284
910
  // Discover files
285
911
  const discoveryResult = await discoverFiles(path, options);
@@ -291,10 +917,23 @@ export async function discoverAndProcessFiles(path, options = DEFAULT_FILE_PROCE
291
917
  }
292
918
  }
293
919
  console.log(`Found ${discoveryResult.files.length} supported files`);
920
+ // Count different content types
921
+ const imageFiles = discoveryResult.files.filter(file => getContentType(file) === 'image');
922
+ const textFiles = discoveryResult.files.filter(file => getContentType(file) === 'text');
923
+ if (imageFiles.length > 0) {
924
+ console.log(` - ${textFiles.length} text files`);
925
+ console.log(` - ${imageFiles.length} image files`);
926
+ if (imageToTextOptions?.model) {
927
+ console.log(`Using image-to-text model: ${imageToTextOptions.model}`);
928
+ }
929
+ else {
930
+ console.log(`Using default image-to-text model: ${DEFAULT_IMAGE_TO_TEXT_OPTIONS.model}`);
931
+ }
932
+ }
294
933
  // Create default path manager if not provided
295
934
  const effectivePathManager = pathManager || new DocumentPathManager(config.path_storage_strategy, resolve(path));
296
- // Process discovered files with path manager
297
- const processingResult = await processFiles(discoveryResult.files, effectivePathManager);
935
+ // Process discovered files with path manager and image-to-text options
936
+ const processingResult = await processFiles(discoveryResult.files, effectivePathManager, imageToTextOptions);
298
937
  // Log processing results
299
938
  if (processingResult.errors.length > 0) {
300
939
  console.log(`Failed to process ${processingResult.errors.length} files:`);
@@ -309,4 +948,68 @@ export async function discoverAndProcessFiles(path, options = DEFAULT_FILE_PROCE
309
948
  processingResult
310
949
  };
311
950
  }
951
+ /**
952
+ * Clean up image processing resources
953
+ * Call this when shutting down the application to free memory
954
+ */
955
+ export async function cleanupImageProcessingResources() {
956
+ // Clean up image-to-text pipeline
957
+ if (imageToTextPipeline) {
958
+ try {
959
+ // Dispose of the pipeline if it has a dispose method
960
+ if (typeof imageToTextPipeline.dispose === 'function') {
961
+ await imageToTextPipeline.dispose();
962
+ }
963
+ imageToTextPipeline = null;
964
+ console.log('Image-to-text pipeline cleaned up');
965
+ }
966
+ catch (error) {
967
+ console.warn('Error cleaning up image-to-text pipeline:', error);
968
+ }
969
+ }
970
+ }
971
+ /**
972
+ * Clean up image-to-text pipeline resources (legacy function for backward compatibility)
973
+ * @deprecated Use cleanupImageProcessingResources() instead
974
+ */
975
+ export async function cleanupImageToTextPipeline() {
976
+ return cleanupImageProcessingResources();
977
+ }
978
+ /**
979
+ * Generate description for a single image (exported for external use)
980
+ */
981
+ export async function generateImageDescriptionForFile(imagePath, options) {
982
+ return generateImageDescription(imagePath, { ...DEFAULT_IMAGE_TO_TEXT_OPTIONS, ...options });
983
+ }
984
+ /**
985
+ * Generate descriptions for multiple images (exported for external use)
986
+ */
987
+ export async function generateImageDescriptionsForFiles(imagePaths, options) {
988
+ return generateImageDescriptionsBatch(imagePaths, { ...DEFAULT_IMAGE_TO_TEXT_OPTIONS, ...options });
989
+ }
990
+ /**
991
+ * Extract metadata from a single image file (exported for external use)
992
+ */
993
+ export async function extractImageMetadataForFile(imagePath) {
994
+ return extractImageMetadata(imagePath);
995
+ }
996
+ /**
997
+ * Extract metadata from multiple image files (exported for external use)
998
+ */
999
+ export async function extractImageMetadataForFiles(imagePaths) {
1000
+ const results = [];
1001
+ for (const imagePath of imagePaths) {
1002
+ try {
1003
+ const metadata = await extractImageMetadata(imagePath);
1004
+ results.push({ path: imagePath, metadata });
1005
+ }
1006
+ catch (error) {
1007
+ results.push({
1008
+ path: imagePath,
1009
+ error: error instanceof Error ? error.message : String(error)
1010
+ });
1011
+ }
1012
+ }
1013
+ return results;
1014
+ }
312
1015
  //# sourceMappingURL=file-processor.js.map