@aj-archipelago/cortex 1.4.6 → 1.4.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/helper-apps/cortex-file-handler/package-lock.json +2 -2
  2. package/helper-apps/cortex-file-handler/package.json +1 -1
  3. package/helper-apps/cortex-file-handler/src/index.js +27 -4
  4. package/helper-apps/cortex-file-handler/src/services/storage/AzureStorageProvider.js +74 -10
  5. package/helper-apps/cortex-file-handler/src/services/storage/StorageService.js +23 -2
  6. package/helper-apps/cortex-file-handler/src/start.js +2 -0
  7. package/helper-apps/cortex-file-handler/tests/deleteOperations.test.js +287 -0
  8. package/helper-apps/cortex-file-handler/tests/start.test.js +1 -1
  9. package/lib/entityConstants.js +1 -1
  10. package/lib/fileUtils.js +1481 -0
  11. package/lib/pathwayTools.js +7 -1
  12. package/lib/util.js +2 -313
  13. package/package.json +4 -3
  14. package/pathways/image_qwen.js +1 -1
  15. package/pathways/system/entity/memory/sys_read_memory.js +17 -3
  16. package/pathways/system/entity/memory/sys_save_memory.js +22 -6
  17. package/pathways/system/entity/sys_entity_agent.js +21 -4
  18. package/pathways/system/entity/tools/sys_tool_analyzefile.js +171 -0
  19. package/pathways/system/entity/tools/sys_tool_codingagent.js +38 -4
  20. package/pathways/system/entity/tools/sys_tool_editfile.js +403 -0
  21. package/pathways/system/entity/tools/sys_tool_file_collection.js +433 -0
  22. package/pathways/system/entity/tools/sys_tool_image.js +172 -10
  23. package/pathways/system/entity/tools/sys_tool_image_gemini.js +123 -10
  24. package/pathways/system/entity/tools/sys_tool_readfile.js +217 -124
  25. package/pathways/system/entity/tools/sys_tool_validate_url.js +137 -0
  26. package/pathways/system/entity/tools/sys_tool_writefile.js +211 -0
  27. package/pathways/system/workspaces/run_workspace_prompt.js +4 -3
  28. package/pathways/transcribe_gemini.js +2 -1
  29. package/server/executeWorkspace.js +1 -1
  30. package/server/plugins/neuralSpacePlugin.js +2 -6
  31. package/server/plugins/openAiWhisperPlugin.js +2 -1
  32. package/server/plugins/replicateApiPlugin.js +4 -14
  33. package/server/typeDef.js +10 -1
  34. package/tests/integration/features/tools/fileCollection.test.js +858 -0
  35. package/tests/integration/features/tools/fileOperations.test.js +851 -0
  36. package/tests/integration/features/tools/writefile.test.js +350 -0
  37. package/tests/unit/core/fileCollection.test.js +259 -0
  38. package/tests/unit/core/util.test.js +320 -1
@@ -0,0 +1,1481 @@
1
+ import logger from "./logger.js";
2
+ import stream from 'stream';
3
+ import os from 'os';
4
+ import http from 'http';
5
+ import https from 'https';
6
+ import { URL } from 'url';
7
+ import { v4 as uuidv4 } from 'uuid';
8
+ import { promisify } from 'util';
9
+ import { axios } from './requestExecutor.js';
10
+ import { config } from '../config.js';
11
+ import fs from 'fs';
12
+ import path from 'path';
13
+ import FormData from 'form-data';
14
+ import xxhash from 'xxhash-wasm';
15
+ import mime from 'mime-types';
16
+
17
// Promise-returning form of stream.pipeline so it can be awaited.
const pipeline = promisify(stream.pipeline);

// URL of the media helper API, read from the 'whisperMediaApiUrl' config key.
// The functions in this file treat a falsy value as "helper not configured".
const MEDIA_API_URL = config.get('whisperMediaApiUrl');
19
+
20
// Module-level cache: the resolved xxhash instance, plus the in-flight
// initialization promise (non-null only while an initialization is running).
let xxhashInstance = null;
let xxhashInitPromise = null;

/**
 * Return the shared xxhash instance, initializing it on first use.
 * Callers that arrive while an initialization is running await that same
 * in-flight promise instead of starting another one.
 * @returns {Promise<Object>} xxhash instance
 */
async function getXXHashInstance() {
    if (xxhashInstance) {
        return xxhashInstance;
    }

    if (!xxhashInitPromise) {
        xxhashInitPromise = (async () => xxhash())()
            .then((instance) => {
                xxhashInstance = instance;
                return instance;
            })
            .finally(() => {
                // Drop the in-flight marker so a failed initialization can be retried
                xxhashInitPromise = null;
            });
    }

    return await xxhashInitPromise;
}
54
+
55
/**
 * Stream a file from disk and return its xxhash64 digest.
 * Uses xxhash64 to match the hash format used in labeeb and cortex file handler.
 * @param {string} filePath - Path to the file
 * @returns {Promise<string>} xxhash64 hash in hex format; rejects if the file cannot be read
 */
async function computeFileHash(filePath) {
    const hasher = await getXXHashInstance();

    // A fresh 64-bit hashing state per call, so concurrent hashes never share state
    const state = hasher.create64();

    for await (const chunk of fs.createReadStream(filePath)) {
        state.update(chunk);
    }

    return state.digest().toString(16);
}
74
+
75
/**
 * Return the xxhash64 digest of an in-memory buffer.
 * @param {Buffer} buffer - Buffer to hash
 * @returns {Promise<string>} xxhash64 hash in hex format
 */
async function computeBufferHash(buffer) {
    const state = (await getXXHashInstance()).create64();
    state.update(buffer);
    const digest = state.digest();
    return digest.toString(16);
}
86
+
87
/**
 * Best-effort removal of a temporary file or directory.
 * Never throws: a missing argument or missing path is logged as a warning,
 * and any filesystem error is logged as an error.
 * Note: the parameter name shadows the imported `path` module inside this function.
 * @param {string} path - File or directory to delete
 * @returns {Promise<void>}
 */
async function deleteTempPath(path) {
    try {
        if (!path) {
            logger.warn('Temporary path is not defined.');
            return;
        }

        const exists = fs.existsSync(path);
        if (!exists) {
            logger.warn(`Temporary path ${path} does not exist.`);
            return;
        }

        const entry = fs.statSync(path);
        if (entry.isDirectory()) {
            fs.rmSync(path, { recursive: true });
            logger.info(`Temporary folder ${path} and its contents deleted successfully.`);
            return;
        }
        if (entry.isFile()) {
            fs.unlinkSync(path);
            logger.info(`Temporary file ${path} deleted successfully.`);
        }
    } catch (err) {
        logger.error(`Error occurred while deleting the temporary path: ${err}`);
    }
}
109
+
110
/**
 * Build a filename of the form `<uuid-v4>.<extension>`.
 * @param {string} extension - Extension to append (without the leading dot)
 * @returns {string} Generated filename
 */
function generateUniqueFilename(extension) {
    const uniqueId = uuidv4();
    return `${uniqueId}.${extension}`;
}
113
+
114
/**
 * Download a file over HTTP(S) into the OS temp directory.
 * The local name is a fresh UUID plus the extension of the URL's last path
 * segment (no extension is appended when that segment has none).
 * Redirects are not followed: any status other than 200 rejects.
 * @param {string} fileUrl - Absolute http: or https: URL to download
 * @returns {Promise<string>} Path of the downloaded temp file
 * @throws {Error} If the URL is invalid, the request fails, the status is not 200,
 *                 or the file cannot be written
 */
const downloadFile = async (fileUrl) => {
    const parsedUrl = new URL(fileUrl);

    // Take the extension from the last path segment only. URL pathnames always
    // use '/', so the posix flavour is used regardless of host OS. Looking for
    // the last '.' anywhere in the pathname would turn '/api.v2/download' into
    // an "extension" containing a slash and produce an unwritable temp path.
    const fileExtension = path.posix.extname(parsedUrl.pathname).slice(1);
    const uniqueFilename = fileExtension ? generateUniqueFilename(fileExtension) : uuidv4();
    const localFilePath = path.join(os.tmpdir(), uniqueFilename);

    const client = parsedUrl.protocol === 'https:' ? https : http;

    try {
        const response = await new Promise((resolve, reject) => {
            client.get(parsedUrl, (res) => {
                if (res.statusCode === 200) {
                    resolve(res);
                    return;
                }
                // Drain the unused body so the socket is released
                res.resume();
                reject(new Error(`HTTP request failed with status code ${res.statusCode}`));
            }).on('error', reject);
        });

        await pipeline(response, fs.createWriteStream(localFilePath));
        logger.info(`Downloaded file to ${localFilePath}`);
        return localFilePath;
    } catch (error) {
        // Best-effort cleanup of a partial download; the file may not exist at all,
        // so a failed unlink is deliberately ignored and the original error is surfaced.
        await fs.promises.unlink(localFilePath).catch(() => {});
        throw error;
    }
};
149
+
150
/**
 * Ask the media helper API for the list of chunk URIs of a file.
 * When no helper URL is configured, the file itself is returned as the only chunk.
 * @param {string} file - URI of the media file
 * @param {string} requestId - Request identifier forwarded to the helper API
 * @returns {Promise<*>} Response body of the helper API, or `[file]` when no helper is configured
 * @throws Re-throws any error from the helper call after logging it
 */
async function getMediaChunks(file, requestId) {
    try {
        if (!MEDIA_API_URL) {
            logger.info(`No API_URL set, returning file as chunk`);
            return [file];
        }

        // Helper API responds with the list of file URIs
        const params = { uri: file, requestId };
        const response = await axios.get(MEDIA_API_URL, { params });
        return response.data;
    } catch (err) {
        logger.error(`Error getting media chunks list from api: ${err}`);
        throw err;
    }
}
165
+
166
/**
 * Tell the media helper API that processing for a request has completed.
 * Does nothing when no helper URL is configured. Errors are logged and not re-thrown.
 * @param {string} requestId - Request identifier to mark as completed
 * @returns {Promise<*>} Helper API response body, or undefined when no helper is
 *                       configured or the call fails
 */
async function markCompletedForCleanUp(requestId) {
    try {
        if (!MEDIA_API_URL) {
            return;
        }

        const response = await axios.delete(MEDIA_API_URL, { params: { requestId } });
        const { data } = response;
        logger.info(`Marked request ${requestId} as completed: ${JSON.stringify(data)}`);
        return data;
    } catch (err) {
        logger.error(`Error marking request ${requestId} as completed: ${err}`);
    }
}
178
+
179
/**
 * Ask the file handler to delete a stored file identified by its hash.
 * Never throws: every failure mode is logged and reported as `false`.
 * @param {string} hash - File hash to delete
 * @param {pathwayResolver} pathwayResolver - Optional pathway resolver (currently unused here)
 * @returns {Promise<boolean>} True only when the handler answers 200; false when the
 *                             input is invalid, no handler URL is configured, the file
 *                             is not found, or any other status/error occurs
 */
async function deleteFileByHash(hash, pathwayResolver = null) {
    if (typeof hash !== 'string' || !hash) {
        logger.warn('deleteFileByHash: hash is required and must be a string');
        return false;
    }

    const fileHandlerUrl = MEDIA_API_URL;
    if (!fileHandlerUrl) {
        logger.warn('deleteFileByHash: WHISPER_MEDIA_API_URL is not set, cannot delete file');
        return false;
    }

    try {
        // Append to an existing query string if the configured URL already has one
        const joiner = fileHandlerUrl.includes('?') ? '&' : '?';
        const deleteUrl = `${fileHandlerUrl}${joiner}hash=${encodeURIComponent(hash)}`;

        const { status } = await axios.delete(deleteUrl, {
            // Treat 2xx-4xx as regular responses so they are handled by the switch below
            validateStatus: (code) => code >= 200 && code < 500,
            timeout: 30000
        });

        switch (status) {
            case 200:
                logger.info(`Successfully deleted file with hash ${hash}`);
                return true;
            case 404:
                // Not an error - the file simply is not there
                logger.info(`File with hash ${hash} not found (may have already been deleted)`);
                return false;
            default:
                logger.warn(`Unexpected status ${status} when deleting file with hash ${hash}`);
                return false;
        }
    } catch (error) {
        if (error?.response?.status === 404) {
            logger.info(`File with hash ${hash} not found during deletion (may have already been deleted)`);
            return false;
        }

        // Deletion failures are reported, never thrown, so they cannot block the caller
        const errorMsg = error?.message || String(error);
        logger.warn(`Error deleting file with hash ${hash}: ${errorMsg}`);
        return false;
    }
}
229
+
230
/**
 * Extract file metadata from a single chat content object.
 * Recognized shapes:
 *   - `{ type: 'image_url', image_url: { url } }`
 *   - `{ type: 'file', url }`
 *   - `{ url }` with type 'image_url' or no type at all (direct URL object)
 * @param {Object} contentObj - Content object to inspect; non-object values yield no files
 * @returns {Array<{url: string, gcs: ?string, filename: ?string, hash: ?string, type: string}>}
 *          Zero or one normalized file entries (url + gcs, for file collection storage)
 */
function extractFileMetadataFromContent(contentObj) {
    // Parsed JSON may legitimately be null or a primitive; those carry no file data
    if (!contentObj || typeof contentObj !== 'object') {
        return [];
    }

    const { type } = contentObj;
    let url = null;
    let entryType = null;

    if (type === 'image_url' && contentObj.image_url?.url) {
        url = contentObj.image_url.url;
        entryType = 'image_url';
    } else if (type === 'file' && contentObj.url) {
        url = contentObj.url;
        entryType = 'file';
    } else if (contentObj.url && (type === 'image_url' || !type)) {
        // Direct URL object
        url = contentObj.url;
        entryType = type || 'file';
    }

    if (!url) {
        return [];
    }

    return [{
        url,
        gcs: contentObj.gcs || null,
        filename: contentObj.originalFilename || contentObj.name || contentObj.filename || null,
        hash: contentObj.hash || null,
        type: entryType
    }];
}
264
+
265
// Short-lived cache of loaded file collections, keyed by getCollectionCacheKey().
// Entries older than CACHE_TTL milliseconds are ignored by the loader.
const CACHE_TTL = 5000; // 5 seconds
const fileCollectionCache = new Map();

/**
 * Build the cache key for a context's file collection (the memoryFiles section).
 * @param {string} contextId - Context ID for the file collection
 * @param {string} contextKey - Optional context key; any falsy value maps to 'default'
 * @returns {string} Cache key
 */
function getCollectionCacheKey(contextId, contextKey) {
    const keyPart = contextKey || 'default';
    return `${contextId}-memoryFiles-${keyPart}`;
}
276
+
277
/**
 * Collect file metadata from every message of a chat history.
 * Message content may be an array of items (objects or JSON strings), a single
 * JSON string, or a single object. Items that fail to parse are skipped.
 * @param {Array} chatHistory - Chat history to scan
 * @returns {Array} Array of file metadata objects
 */
function extractFilesFromChatHistory(chatHistory) {
    if (!Array.isArray(chatHistory)) {
        return [];
    }

    const found = [];

    // Parse (when given a string) and extract; anything that is not valid JSON
    // or cannot be processed is silently skipped.
    const collectLenient = (raw) => {
        try {
            const parsed = typeof raw === 'string' ? JSON.parse(raw) : raw;
            found.push(...extractFileMetadataFromContent(parsed));
        } catch (e) {
            // Not JSON or couldn't be parsed, skip
        }
    };

    for (const message of chatHistory) {
        const content = message ? message.content : null;
        if (!content) {
            continue;
        }

        if (Array.isArray(content)) {
            for (const item of content) {
                collectLenient(item);
            }
        } else if (typeof content === 'string') {
            collectLenient(content);
        } else if (typeof content === 'object') {
            // Plain object content is passed through directly (errors propagate)
            found.push(...extractFileMetadataFromContent(content));
        }
    }

    return found;
}
323
+
324
/**
 * Load a context's file collection together with its version string
 * (the version is what optimistic locking compares on save).
 * Reads the 'memoryFiles' section through the `sys_read_memory` pathway unless a
 * fresh cache entry can be used. Any read or parse failure yields an empty collection.
 * @param {string} contextId - Context ID for the file collection
 * @param {string} contextKey - Optional context key for encryption
 * @param {boolean} useCache - Whether to check cache first (default: true)
 * @returns {Promise<{files: Array, version: string}>} File collection with version
 */
async function loadFileCollectionWithVersion(contextId, contextKey = null, useCache = true) {
    const freshVersion = () => new Date().toISOString();

    if (!contextId) {
        return { files: [], version: freshVersion() };
    }

    const cacheKey = getCollectionCacheKey(contextId, contextKey);

    if (useCache && fileCollectionCache.has(cacheKey)) {
        const cached = fileCollectionCache.get(cacheKey);
        const ageMs = Date.now() - cached.timestamp;
        if (ageMs < CACHE_TTL) {
            return { files: cached.collection, version: cached.version || freshVersion() };
        }
    }

    // Imported lazily at call time rather than at module load
    const { callPathway } = await import('./pathwayTools.js');
    let files = [];
    let version = freshVersion();

    try {
        const memoryContent = await callPathway('sys_read_memory', {
            contextId,
            section: 'memoryFiles',
            contextKey
        });

        if (memoryContent) {
            const parsed = JSON.parse(memoryContent);
            const isVersionedFormat =
                parsed && typeof parsed === 'object' && !Array.isArray(parsed) && parsed.files;

            if (isVersionedFormat) {
                // New format: { version, files }
                files = Array.isArray(parsed.files) ? parsed.files : [];
                version = parsed.version || freshVersion();
            } else if (Array.isArray(parsed)) {
                // Old format: bare array; give it a version so it can be migrated
                files = parsed;
                version = freshVersion();
            } else {
                // Unrecognized format
                files = [];
                version = freshVersion();
            }
        }
    } catch (e) {
        // Collection missing or unreadable: start from an empty collection
        files = [];
        version = freshVersion();
    }

    fileCollectionCache.set(cacheKey, { collection: files, version, timestamp: Date.now() });

    return { files, version };
}
392
+
393
/**
 * Load a context's file collection, discarding the version information.
 * @param {string} contextId - Context ID for the file collection
 * @param {string} contextKey - Optional context key for encryption
 * @param {boolean} useCache - Whether to check cache first (default: true)
 * @returns {Promise<Array>} File collection array
 */
async function loadFileCollection(contextId, contextKey = null, useCache = true) {
    const loaded = await loadFileCollectionWithVersion(contextId, contextKey, useCache);
    return loaded.files;
}
404
+
405
/**
 * Save a file collection to the 'memoryFiles' section, optionally guarded by a
 * version check (optimistic locking).
 *
 * When `expectedVersion` is given, the stored collection is re-read through
 * `sys_read_memory` immediately before saving and the save is refused if its
 * version differs. The check and the write are two separate pathway calls, so
 * this narrows the race window but does not eliminate it.
 *
 * Never throws: any failure is logged as a warning and reported as `false`,
 * which callers cannot distinguish from a version mismatch.
 *
 * @param {string} contextId - Context ID for the file collection
 * @param {string} contextKey - Optional context key for encryption
 * @param {Array} collection - File collection array
 * @param {string} expectedVersion - Expected version for optimistic locking (if provided)
 * @returns {Promise<boolean>} True if save succeeded, false on version mismatch or error
 */
async function saveFileCollectionWithVersion(contextId, contextKey, collection, expectedVersion = null) {
    const cacheKey = getCollectionCacheKey(contextId, contextKey);
    const newVersion = new Date().toISOString();

    try {
        // Imported lazily at call time rather than at module load
        const { callPathway } = await import('./pathwayTools.js');

        if (expectedVersion !== null) {
            // Read straight from memory (no cache) to compare against the latest stored version
            const memoryContent = await callPathway('sys_read_memory', {
                contextId,
                section: 'memoryFiles',
                contextKey
            });

            let currentVersion = null;
            let collectionExists = false;
            let isOldFormat = false;

            if (memoryContent && memoryContent.trim() !== '' && memoryContent.trim() !== '[]') {
                collectionExists = true;
                try {
                    const parsed = JSON.parse(memoryContent);
                    if (parsed && typeof parsed === 'object' && !Array.isArray(parsed) && parsed.version) {
                        // New format: { version, files }
                        currentVersion = parsed.version;
                    } else if (Array.isArray(parsed)) {
                        // Old format: bare array with no version; this save migrates it
                        isOldFormat = true;
                    }
                } catch (e) {
                    // Unparseable content counts as a version mismatch
                    currentVersion = null;
                }
            }

            // An absent/empty collection or an old-format one has nothing to conflict
            // with; only a versioned collection can produce a mismatch.
            if (collectionExists && !isOldFormat && currentVersion !== expectedVersion) {
                return false;
            }
        }

        await callPathway('sys_save_memory', {
            contextId,
            section: 'memoryFiles',
            aiMemory: JSON.stringify({ version: newVersion, files: collection }),
            contextKey
        });

        fileCollectionCache.set(cacheKey, {
            collection,
            version: newVersion,
            timestamp: Date.now()
        });

        return true;
    } catch (e) {
        // Best effort: report the failure through the module-level logger and return
        // false. (No dynamic re-import of the logger here, so the error path cannot
        // itself reject.)
        logger.warn(`Failed to save file collection: ${e?.message ?? e}`);
        return false;
    }
}
493
+
494
/**
 * Save a file collection without any version check (last write wins).
 * The success flag of the underlying save is discarded.
 * @param {string} contextId - Context ID for the file collection
 * @param {string} contextKey - Optional context key for encryption
 * @param {Array} collection - File collection array
 * @returns {Promise<void>}
 */
async function saveFileCollection(contextId, contextKey, collection) {
    const skipVersionCheck = null;
    await saveFileCollectionWithVersion(contextId, contextKey, collection, skipVersionCheck);
}
503
+
504
/**
 * Modify a file collection under optimistic locking, retrying on conflicts.
 * Each attempt loads the latest collection (bypassing the cache), passes a
 * shallow copy to `modifierCallback`, and saves the result guarded by the
 * version that was loaded. A refused save triggers another attempt after a
 * short exponential backoff (10ms doubling, capped at 100ms).
 *
 * The copy is shallow: entry objects are shared with the loaded collection.
 *
 * @param {string} contextId - Context ID for the file collection
 * @param {string} contextKey - Optional context key for encryption
 * @param {Function} modifierCallback - Receives (collection) and must return the
 *                                      modified collection array (may be async)
 * @param {number} maxRetries - Maximum number of attempts (default: 5)
 * @returns {Promise<Array>} The final modified collection array
 * @throws {Error} If contextId is missing or modifierCallback is not a function;
 *                 if the callback returns a non-array (fails immediately, since
 *                 retrying cannot fix a contract violation); if loading or the
 *                 callback still throws on the last attempt; or if every attempt's
 *                 save was refused
 */
async function modifyFileCollectionWithLock(contextId, contextKey, modifierCallback, maxRetries = 5) {
    if (!contextId) {
        throw new Error("contextId is required");
    }

    if (typeof modifierCallback !== 'function') {
        throw new Error("modifierCallback must be a function");
    }

    // Exponential backoff to reduce contention between concurrent writers
    const backoff = (attempt) => {
        const delay = Math.min(10 * Math.pow(2, attempt), 100); // Max 100ms
        return new Promise((resolve) => setTimeout(resolve, delay));
    };

    for (let attempt = 0; attempt < maxRetries; attempt++) {
        const isLastAttempt = attempt === maxRetries - 1;
        let outcome;

        try {
            const { files, version } = await loadFileCollectionWithVersion(contextId, contextKey, false);
            const modifiedCollection = await modifierCallback([...files]);

            outcome = Array.isArray(modifiedCollection)
                ? {
                    valid: true,
                    modifiedCollection,
                    saved: await saveFileCollectionWithVersion(contextId, contextKey, modifiedCollection, version)
                }
                : { valid: false };
        } catch (error) {
            // Load/callback failures may be transient: retry, then surface the last one
            if (isLastAttempt) {
                throw error;
            }
            await backoff(attempt);
            continue;
        }

        if (!outcome.valid) {
            throw new Error("modifierCallback must return an array");
        }

        if (outcome.saved) {
            return outcome.modifiedCollection;
        }

        // Save refused (version mismatch or save failure): back off and try again
        if (!isLastAttempt) {
            await backoff(attempt);
        }
    }

    throw new Error(`Failed to modify file collection after ${maxRetries} attempts due to concurrent modifications`);
}
571
+
572
/**
 * Add a file entry to a context's file collection.
 * When `fileUrl` is given and `url` is not already an Azure Blob / Google Cloud
 * Storage URL, the file is first uploaded via `uploadFileToCloud` and the
 * upload's url/gcs/hash are used for the entry.
 * @param {string} contextId - Context ID for the file collection
 * @param {string} contextKey - Optional context key for encryption
 * @param {string} url - Cloud storage URL (Azure URL) - may be null when fileUrl is provided
 * @param {string} gcs - Optional Google Cloud Storage URL
 * @param {string} filename - Filename or title for the file
 * @param {Array<string>} tags - Optional array of tags (non-arrays become [])
 * @param {string} notes - Optional notes or description
 * @param {string} hash - Optional file hash
 * @param {string} fileUrl - Optional: URL of file to upload (if not already in cloud storage)
 * @param {pathwayResolver} pathwayResolver - Optional pathway resolver, forwarded to the upload
 * @returns {Promise<Object>} File entry object with id
 * @throws {Error} If contextId or filename is missing, or no URL is available after the upload step
 */
async function addFileToCollection(contextId, contextKey, url, gcs, filename, tags = [], notes = '', hash = null, fileUrl = null, pathwayResolver = null) {
    if (!(contextId && filename)) {
        throw new Error("contextId and filename are required");
    }

    const resolved = { url, gcs, hash };

    // Upload only when there is a source URL and `url` is not already in cloud storage
    const needsUpload = fileUrl
        && !(url && (url.includes('blob.core.windows.net') || url.includes('storage.googleapis.com')));

    if (needsUpload) {
        const uploadResult = await uploadFileToCloud(fileUrl, null, filename, pathwayResolver);
        resolved.url = uploadResult.url;
        resolved.gcs = uploadResult.gcs;
        resolved.hash = uploadResult.hash || hash;
    }

    if (!resolved.url) {
        throw new Error("url or fileUrl is required");
    }

    // Built once, outside the locked section, so retries reuse the same entry and id
    const entryId = `${Date.now()}-${Math.random().toString(36).substring(2, 9)}`;
    const fileEntry = {
        id: entryId,
        url: resolved.url,
        gcs: resolved.gcs || null,
        filename: filename,
        tags: Array.isArray(tags) ? tags : [],
        notes: notes || '',
        hash: resolved.hash || null,
        addedDate: new Date().toISOString(),
        lastAccessed: new Date().toISOString()
    };

    const appendEntry = (collection) => {
        collection.push(fileEntry);
        return collection;
    };
    await modifyFileCollectionWithLock(contextId, contextKey, appendEntry);

    return fileEntry;
}
632
+
633
/**
 * Merge the files referenced in a chat history into the context's file collection.
 * Files are matched against existing entries by URL, then GCS URL, then hash.
 * A match has its `lastAccessed` refreshed and any missing url/gcs/hash/filename
 * filled in; unmatched files are appended as new entries.
 * @param {Array} chatHistory - Chat history to scan
 * @param {string} contextId - Context ID for the file collection
 * @param {string} contextKey - Optional context key for encryption
 * @returns {Promise<Array>} The resulting collection (empty array for invalid input)
 */
async function syncFilesToCollection(chatHistory, contextId, contextKey = null) {
    if (!Array.isArray(chatHistory) || !contextId) {
        return [];
    }

    const extractedFiles = extractFilesFromChatHistory(chatHistory);

    if (extractedFiles.length === 0) {
        // Nothing to merge: hand back the existing (possibly cached) collection
        return await loadFileCollection(contextId, contextKey, true);
    }

    const hashKey = (hash) => `hash:${hash}`;

    const mergeExtracted = (collection) => {
        // Lookup table: url / gcs url / "hash:<hash>" -> collection entry
        const lookup = new Map();
        const register = (source, entry) => {
            if (source.url) {
                lookup.set(source.url, entry);
            }
            if (source.gcs) {
                lookup.set(source.gcs, entry);
            }
            if (source.hash) {
                lookup.set(hashKey(source.hash), entry);
            }
        };
        collection.forEach((entry) => register(entry, entry));

        for (const file of extractedFiles) {
            // Precedence: URL, then GCS URL, then hash
            const existingFile =
                (file.url && lookup.get(file.url)) ||
                (file.gcs && lookup.get(file.gcs)) ||
                (file.hash && lookup.get(hashKey(file.hash))) ||
                null;

            if (existingFile) {
                existingFile.lastAccessed = new Date().toISOString();

                // Fill in only the fields the existing entry is missing
                for (const field of ['url', 'gcs', 'hash', 'filename']) {
                    if (file[field] && !existingFile[field]) {
                        existingFile[field] = file[field];
                    }
                }
                continue;
            }

            // Fallback name: last path segment of the URL, without its query string
            const derivedName = file.url ? file.url.split('/').pop().split('?')[0] : 'unknown';
            const fileEntry = {
                id: `${Date.now()}-${Math.random().toString(36).substring(2, 9)}`,
                url: file.url,
                gcs: file.gcs || null,
                filename: file.filename || derivedName,
                hash: file.hash || null,
                type: file.type || 'file',
                addedDate: new Date().toISOString(),
                lastAccessed: new Date().toISOString()
            };

            collection.push(fileEntry);
            register(file, fileEntry);
        }

        return collection;
    };

    return await modifyFileCollectionWithLock(contextId, contextKey, mergeExtracted);
}
728
+
729
/**
 * Load a context's file collection (cache allowed) and render it for a prompt template.
 * @param {string} contextId - Context ID for the file collection
 * @param {string} contextKey - Optional context key for encryption
 * @returns {Promise<string>} Formatted string of available files
 */
async function getAvailableFilesFromCollection(contextId, contextKey = null) {
    if (contextId) {
        const files = await loadFileCollection(contextId, contextKey, true);
        return formatFilesForTemplate(files);
    }

    return 'No files available.';
}
743
+
744
/**
 * Format a file collection for template display.
 * Shows the 10 most recently used files (by `lastAccessed`, falling back to
 * `addedDate`), one per line, under a header row. Entries with no usable date
 * sort after all dated entries, keeping their relative order.
 * @param {Array} collection - File collection array
 * @returns {string} Formatted table, or 'No files available.' for an empty/missing collection
 */
function formatFilesForTemplate(collection) {
    if (!collection || collection.length === 0) {
        return 'No files available.';
    }

    const MAX_SHOWN = 10;

    // Missing or unparseable dates would make the comparator return NaN, which
    // gives an inconsistent ordering; map them to 0 so they sort last.
    const timestampOf = (file) => {
        const ms = new Date(file.lastAccessed || file.addedDate || '').getTime();
        return Number.isNaN(ms) ? 0 : ms;
    };

    // Most recent first; each date is parsed once per entry
    const recentFiles = collection
        .map((file) => ({ file, ms: timestampOf(file) }))
        .sort((a, b) => b.ms - a.ms)
        .slice(0, MAX_SHOWN)
        .map(({ file }) => file);

    const totalFiles = collection.length;
    const hasMore = totalFiles > MAX_SHOWN;

    // One line per file: hash | filename | url | date added | notes
    const header = 'Hash | Filename | URL | Date Added | Notes';
    const separator = '-'.repeat(Math.max(header.length, 80));

    const fileList = recentFiles.map((file) => {
        const hash = file.hash || '';
        const filename = file.filename || 'Unnamed file';
        const url = file.url || '';
        const dateAdded = file.addedDate
            ? new Date(file.addedDate).toLocaleDateString('en-US', { month: 'short', day: 'numeric', year: 'numeric' })
            : '';
        const notes = file.notes || '';
        return `${hash} | ${filename} | ${url} | ${dateAdded} | ${notes}`;
    }).join('\n');

    let result = `${header}\n${separator}\n${fileList}`;

    if (hasMore) {
        result += `\n\nNote: Showing the last 10 most recently used files. ${totalFiles - 10} more file(s) are available in your collection. Use ListFileCollection or SearchFileCollection to see all files.`;
    }

    return result;
}
790
+
791
/**
 * Build the "available files" text for a conversation.
 *
 * With a contextId, files found in the chat history are first synced into the
 * file collection and the formatted collection is returned. Without one, the
 * result is simply the URLs extracted from the chat history, one per line.
 *
 * @param {Array} chatHistory - Chat history to scan
 * @param {string} contextId - Context ID for the file collection
 * @param {string} contextKey - Optional context key for encryption
 * @returns {Promise<string>} Formatted string of available files
 */
async function getAvailableFiles(chatHistory, contextId, contextKey = null) {
    if (contextId) {
        // Bring the collection up to date before rendering it.
        await syncFilesToCollection(chatHistory, contextId, contextKey);
        return getAvailableFilesFromCollection(contextId, contextKey);
    }

    // Legacy path: no collection available, list the URLs seen in the chat.
    const urls = [];
    for (const entry of extractFilesFromChatHistory(chatHistory)) {
        if (entry.url) {
            urls.push(entry.url);
        }
    }
    return urls.join('\n') || 'No files available.';
}
811
+
812
/**
 * Find a file in the collection by ID, URL, GCS URL, hash, or filename.
 *
 * Matching happens in two phases:
 *  1. Exact match of the (trimmed) parameter against `id`, `url`, `gcs` or `hash`.
 *     The first collection entry matching any of those fields wins.
 *  2. Case-insensitive filename matching. An identical base name wins
 *     immediately; otherwise the best-scoring fuzzy candidate is returned
 *     (same stem 0.9, base name ends with the parameter 0.7, stem contains the
 *     parameter's stem 0.6, filename contains the parameter 0.5, same
 *     extension only 0.2). Ties keep collection order.
 *
 * NOTE(review): the extension-only fallback means a request for "x.pdf" can
 * return an unrelated PDF when nothing else matches; kept for compatibility.
 *
 * @param {string} fileParam - File ID, URL (Azure or GCS), hash, or filename
 * @param {Array} collection - File collection array
 * @returns {Object|null} File entry from collection, or null if not found
 */
function findFileInCollection(fileParam, collection) {
    if (typeof fileParam !== 'string' || !Array.isArray(collection)) {
        return null;
    }

    // A blank parameter must never match: ''.endsWith / ''.includes are always
    // true and would otherwise select an arbitrary file.
    const trimmed = fileParam.trim();
    if (trimmed === '') {
        return null;
    }

    // Skip null / non-object entries instead of throwing on property access.
    const entries = collection.filter((file) => file && typeof file === 'object');

    // Phase 1: exact identifier match. The raw parameter is still accepted so
    // identifiers that legitimately carry surrounding whitespace keep working.
    const isKey = (value) => value === trimmed || value === fileParam;
    const exact = entries.find(
        (file) => isKey(file.id) || isKey(file.url) || isKey(file.gcs) || isKey(file.hash)
    );
    if (exact) {
        return exact;
    }

    // Phase 2: filename matching (case-insensitive).
    const baseNameOf = (value) => value.split('/').pop().split('\\').pop(); // strip any path
    const stemOf = (name) => name.replace(/\.[^.]*$/, ''); // strip extension
    const extOf = (name) => (name.includes('.') ? name.split('.').pop() : '');

    const normalizedParam = trimmed.toLowerCase();
    const paramBaseName = baseNameOf(normalizedParam);
    const paramStem = stemOf(paramBaseName);
    const paramExt = extOf(paramBaseName);

    let bestFile = null;
    let bestScore = 0;

    for (const file of entries) {
        if (typeof file.filename !== 'string' || file.filename === '') {
            continue;
        }

        const normalizedFilename = file.filename.toLowerCase();
        const fileBaseName = baseNameOf(normalizedFilename);
        const fileStem = stemOf(fileBaseName);
        const fileExt = extOf(fileBaseName);

        // Identical base name: definitive, stop searching.
        if (fileBaseName === paramBaseName) {
            return file;
        }

        let score = 0;
        if (fileStem === paramStem && paramStem.length > 0) {
            // Same name, different (or missing) extension
            score = 0.9;
        } else if (fileBaseName.endsWith(normalizedParam) && fileBaseName.length > normalizedParam.length) {
            // e.g. "my-document.pdf" ends with "document.pdf"
            score = 0.7;
        } else if (fileStem.includes(paramStem) && paramStem.length > 2) {
            // e.g. "document" matches "my-document.pdf"
            score = 0.6;
        } else if (normalizedFilename.includes(normalizedParam)) {
            score = 0.5;
        } else if (paramExt && fileExt === paramExt) {
            // Last resort: only the extension agrees
            score = 0.2;
        }

        // Strict '>' keeps the earliest entry among equally scored candidates.
        if (score > bestScore) {
            bestFile = file;
            bestScore = score;
        }
    }

    return bestFile;
}
921
+
922
/**
 * Resolve a file parameter to a URL by looking it up in the file collection.
 *
 * The parameter is always resolved through the collection: there is no
 * pass-through for values that already look like URLs, and without a
 * contextId nothing can be resolved. Lookup failures are logged as warnings
 * and reported as null rather than thrown.
 *
 * @param {string} fileParam - File ID, URL (Azure or GCS), hash, or filename from collection
 * @param {string} contextId - Context ID for the file collection
 * @param {string} contextKey - Optional context key for encryption
 * @param {Object} options - Optional configuration
 * @param {boolean} options.preferGcs - If true, prefer GCS URL over Azure URL when available
 * @returns {Promise<string|null>} Resolved file URL, or null if not found
 */
export async function resolveFileParameter(fileParam, contextId, contextKey = null, options = {}) {
    if (typeof fileParam !== 'string' || fileParam === '') {
        return null;
    }

    const lookupKey = fileParam.trim();
    const preferGcs = options.preferGcs ?? false;

    // The collection is addressed by contextId; nothing to search without it.
    if (!contextId) {
        return null;
    }

    try {
        const files = await loadFileCollection(contextId, contextKey, true);
        const match = findFileInCollection(lookupKey, files);

        if (!match) {
            return null;
        }
        if (preferGcs && match.gcs) {
            return match.gcs;
        }
        // Default to the regular (Azure) URL; an entry without one resolves to null.
        return match.url ? match.url : null;
    } catch (err) {
        logger.warn(`Failed to resolve file parameter "${lookupKey}": ${err.message}`);
        return null;
    }
}
970
+
971
/**
 * Build a chat-history content object for a file referenced by parameter.
 *
 * Without a contextId no lookup is possible, so the parameter itself is used
 * as the URL of a plain 'file' object. Otherwise the file is located in the
 * collection and described as either an 'image_url' or a 'file' object,
 * based on the entry's stored type or, failing that, its filename extension.
 *
 * @param {string} fileParam - File URL (Azure or GCS), file ID from collection, or file hash
 * @param {string} contextId - Context ID for the file collection
 * @param {string} contextKey - Optional context key for encryption
 * @returns {Promise<Object|null>} Content object in the format for chat history, or null if not found
 */
async function generateFileMessageContent(fileParam, contextId, contextKey = null) {
    if (typeof fileParam !== 'string' || !fileParam) {
        return null;
    }

    if (!contextId) {
        return { type: 'file', url: fileParam };
    }

    const files = await loadFileCollection(contextId, contextKey, true);
    const match = findFileInCollection(fileParam, files);
    if (!match) {
        return null;
    }

    // An explicit type on the entry wins; otherwise infer from the extension.
    let kind = match.type;
    if (!kind) {
        const imageExtensions = ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'webp', 'svg'];
        const name = match.filename || '';
        const ext = name.split('.').pop()?.toLowerCase() || '';
        kind = imageExtensions.includes(ext) ? 'image_url' : 'file';
    }

    // Fields common to both shapes (url and gcs are what plugins consume).
    const shared = {
        url: match.url,
        gcs: match.gcs || null,
        originalFilename: match.filename || null,
        hash: match.hash || null
    };

    return kind === 'image_url'
        ? { type: 'image_url', image_url: { url: match.url }, ...shared }
        : { type: 'file', ...shared };
}
1029
+
1030
/**
 * Inject a file into chat history as a content object.
 *
 * The file is appended as a new user message unless the history already
 * references it (same URL, GCS URL, or hash). The input array is never
 * mutated; a new array is returned when a message is added.
 *
 * A missing fileContent is a no-op: the history is returned as-is, or an empty
 * array when no valid history was supplied, so a message containing `null` is
 * never created.
 *
 * @param {Array} chatHistory - Chat history array
 * @param {Object} fileContent - Content object from generateFileMessageContent
 * @returns {Array} Chat history with file injected (or unchanged if already present)
 */
function injectFileIntoChatHistory(chatHistory, fileContent) {
    const history = Array.isArray(chatHistory) ? chatHistory : null;

    // Nothing to inject: check this first so we never wrap null in a message.
    if (!fileContent) {
        return history ?? [];
    }

    // OpenAI-compatible format: content is an array of objects (not JSON strings)
    const fileMessage = {
        role: 'user',
        content: [fileContent]
    };

    if (!history) {
        return [fileMessage];
    }

    // Identify the file by any of its known handles to detect duplicates.
    const fileUrl = fileContent.url || fileContent.image_url?.url;
    const fileGcs = fileContent.gcs;
    const fileHash = fileContent.hash;

    const fileAlreadyExists = extractFilesFromChatHistory(history).some((existingFile) =>
        Boolean(
            (fileUrl && existingFile.url === fileUrl) ||
            (fileGcs && existingFile.gcs === fileGcs) ||
            (fileHash && existingFile.hash === fileHash)
        )
    );

    return fileAlreadyExists ? history : [...history, fileMessage];
}
1084
+
1085
/**
 * Check if a file exists by hash using the file handler.
 *
 * Issues a GET to the file handler with `hash=<hash>&checkHash=true`. A 200
 * response carrying a `url` means the file exists; converted URLs (e.g.
 * XLSX->CSV, DOCX->TXT) are preferred over the originals when present.
 *
 * This is a best-effort optimisation: request failures are logged as warnings
 * (through the pathway resolver when it provides `logWarning`, otherwise
 * through the module logger) and reported as "not found".
 *
 * @param {string} hash - File hash to check
 * @param {string} fileHandlerUrl - File handler service URL
 * @param {pathwayResolver} pathwayResolver - Optional pathway resolver for logging
 * @returns {Promise<Object|null>} {url, gcs, hash} if file exists, null otherwise
 */
async function checkHashExists(hash, fileHandlerUrl, pathwayResolver = null) {
    if (!hash || !fileHandlerUrl) {
        return null;
    }

    try {
        const separator = fileHandlerUrl.includes('?') ? '&' : '?';
        // Encode the hash so unexpected characters cannot alter the query string.
        const checkHashUrl = `${fileHandlerUrl}${separator}hash=${encodeURIComponent(hash)}&checkHash=true`;

        const checkResponse = await axios.get(checkHashUrl, {
            timeout: 10000,
            // Treat 4xx (e.g. "not found") as a normal response; only 5xx rejects.
            validateStatus: (status) => status >= 200 && status < 500
        });

        const data = checkResponse.data;
        if (checkResponse.status === 200 && data && data.url) {
            return {
                url: data.converted?.url || data.url,
                gcs: data.converted?.gcs || data.gcs || null,
                hash: data.hash || hash
            };
        }

        return null;
    } catch (checkError) {
        let errorMsg;
        if (checkError?.message) {
            errorMsg = checkError.message;
        } else if (checkError?.errors && Array.isArray(checkError.errors)) {
            // Handle AggregateError
            errorMsg = checkError.errors.map(e => e?.message || String(e)).join('; ');
        } else {
            errorMsg = String(checkError);
        }

        const warningMessage = `checkHash failed: ${errorMsg}`;
        if (pathwayResolver && pathwayResolver.logWarning) {
            pathwayResolver.logWarning(warningMessage);
        } else {
            // Without a resolver the failure used to vanish silently.
            logger.warn(warningMessage);
        }
        return null;
    }
}
1139
+
1140
/**
 * Generic function to upload a file to cloud storage via the file handler.
 *
 * Accepts a URL (downloaded to a temp file first), a base64 string, or a
 * Buffer. The content hash is computed and checked against the file handler
 * before uploading, so an already-stored file is returned without re-upload.
 * Temp files are always removed, whether the upload succeeds or fails.
 *
 * A caller-supplied filename is reduced to its final path component before it
 * is used on disk, so values such as "../x" or "dir/file.txt" cannot escape
 * (or fail to resolve inside) the temp directory.
 *
 * @param {string|Buffer} fileInput - URL to download from, or base64 string, or Buffer
 * @param {string} mimeType - MIME type of the file (optional for URLs)
 * @param {string} filename - Optional filename (will be inferred if not provided)
 * @param {pathwayResolver} pathwayResolver - Optional pathway resolver for logging
 * @returns {Promise<Object>} {url, gcs, hash}
 * @throws {Error} If the file handler URL is not configured, the input type is
 *   unsupported, the download fails, or the upload fails / returns no URL
 */
async function uploadFileToCloud(fileInput, mimeType = null, filename = null, pathwayResolver = null) {
    let tempFilePath = null;
    let tempDir = null;
    let fileBuffer = null;
    let fileHash = null;

    // Send a message to the pathway resolver when it offers the given method,
    // otherwise to the module logger.
    const log = (resolverMethod, loggerMethod, message) => {
        if (pathwayResolver && pathwayResolver[resolverMethod]) {
            pathwayResolver[resolverMethod](message);
        } else {
            logger[loggerMethod](message);
        }
    };

    // Bare file name derived from the caller's filename, or null when unusable.
    // Both separator styles are stripped regardless of the host platform.
    let safeFilename = null;
    if (typeof filename === 'string') {
        const base = path.basename(filename.trim().replace(/\\/g, '/'));
        if (base && base !== '.' && base !== '..') {
            safeFilename = base;
        }
    }

    try {
        const fileHandlerUrl = MEDIA_API_URL;
        if (!fileHandlerUrl) {
            throw new Error('WHISPER_MEDIA_API_URL is not set');
        }

        // Handle different input types
        if (typeof fileInput === 'string') {
            if (fileInput.startsWith('http://') || fileInput.startsWith('https://')) {
                // Download the URL (remote or cloud) so the hash can be computed
                // locally; the local file is what gets uploaded, not the URL.
                tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'file-upload-'));

                let downloadFilename = safeFilename;
                if (!downloadFilename) {
                    let extension = 'bin';
                    try {
                        // extname only looks at the last path segment, so a dot
                        // in a directory name cannot leak a '/' into the name.
                        extension = path.posix.extname(new URL(fileInput).pathname).slice(1) || 'bin';
                    } catch (e) {
                        // URL parsing failed, use default
                    }
                    downloadFilename = `download_${Date.now()}.${extension}`;
                }
                tempFilePath = path.join(tempDir, downloadFilename);

                const downloadResponse = await axios.get(fileInput, {
                    responseType: 'stream',
                    timeout: 60000,
                    validateStatus: (status) => status >= 200 && status < 400
                });

                if (downloadResponse.status !== 200) {
                    throw new Error(`Failed to download file: ${downloadResponse.status}`);
                }

                await pipeline(downloadResponse.data, fs.createWriteStream(tempFilePath));

                // Read the downloaded file into buffer to compute hash
                fileBuffer = fs.readFileSync(tempFilePath);
            } else {
                // It's base64 data
                fileBuffer = Buffer.from(fileInput, 'base64');
            }
        } else if (Buffer.isBuffer(fileInput)) {
            fileBuffer = fileInput;
        } else {
            throw new Error('fileInput must be a URL string, base64 string, or Buffer');
        }

        if (fileBuffer) {
            fileHash = await computeBufferHash(fileBuffer);

            // Skip the upload entirely when the handler already has this content.
            const existingFile = await checkHashExists(fileHash, fileHandlerUrl, pathwayResolver);
            if (existingFile) {
                return existingFile;
            }

            // Buffer / base64 input has no file on disk yet: materialise one.
            if (!tempFilePath) {
                let uploadFilename = safeFilename;
                if (!uploadFilename) {
                    // Use the MIME subtype as extension, without parameters
                    // such as "; charset=utf-8".
                    const subtype = typeof mimeType === 'string'
                        ? (mimeType.split('/')[1] || '').split(';')[0].trim()
                        : '';
                    uploadFilename = `upload_${Date.now()}.${subtype || 'bin'}`;
                }

                if (!tempDir) {
                    tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'file-upload-'));
                }
                tempFilePath = path.join(tempDir, uploadFilename);
                fs.writeFileSync(tempFilePath, fileBuffer);
            }
        }

        if (!tempFilePath) {
            throw new Error('No file to upload - tempFilePath not created');
        }

        const requestId = uuidv4();
        const formData = new FormData();
        formData.append('file', fs.createReadStream(tempFilePath), {
            filename: path.basename(tempFilePath),
            contentType: mimeType || 'application/octet-stream'
        });
        // Add hash for deduplication if we computed it
        if (fileHash) {
            formData.append('hash', fileHash);
        }

        // Append requestId parameter
        const separator = fileHandlerUrl.includes('?') ? '&' : '?';
        const uploadUrl = `${fileHandlerUrl}${separator}requestId=${requestId}`;

        const uploadResponse = await axios.post(uploadUrl, formData, {
            headers: {
                ...formData.getHeaders()
            },
            timeout: 30000
        });

        if (!uploadResponse.data || !uploadResponse.data.url) {
            throw new Error('No URL returned from file handler');
        }

        return {
            url: uploadResponse.data.url,
            gcs: uploadResponse.data.gcs || null,
            hash: uploadResponse.data.hash || fileHash
        };
    } catch (error) {
        let errorMsg;
        if (error?.message) {
            errorMsg = error.message;
        } else if (error?.errors && Array.isArray(error.errors)) {
            // Handle AggregateError
            errorMsg = error.errors.map(e => e?.message || String(e)).join('; ');
        } else {
            errorMsg = String(error);
        }
        log('logError', 'error', `Failed to upload file: ${errorMsg}`);
        // Rethrow the original error so callers keep its type and stack.
        throw error;
    } finally {
        // Clean up temp files - always runs regardless of success or failure
        if (tempDir && fs.existsSync(tempDir)) {
            try {
                fs.rmSync(tempDir, { recursive: true, force: true });
            } catch (cleanupError) {
                log('logWarning', 'warn', `Failed to clean up temp directory: ${cleanupError.message}`);
            }
        } else if (tempFilePath && fs.existsSync(tempFilePath)) {
            // Fallback: if tempDir doesn't exist but tempFilePath does, delete just the file
            try {
                fs.unlinkSync(tempFilePath);
            } catch (cleanupError) {
                log('logWarning', 'warn', `Failed to clean up temp file: ${cleanupError.message}`);
            }
        }
    }
}
1337
+
1338
// Convenience wrapper for uploading base64 image data to cloud storage.
// Delegates to uploadFileToCloud, passing no filename.
const uploadImageToCloud = async (base64Data, mimeType, pathwayResolver = null) =>
    uploadFileToCloud(base64Data, mimeType, null, pathwayResolver);
1343
+
1344
/**
 * Convert file hashes to content format suitable for LLM processing.
 *
 * Each hash is resolved independently and yields one JSON string:
 *  - an "image_url" object when the file handler knows the hash,
 *  - a "file_hash" placeholder (flagged `_cortex_needs_resolution`) when no
 *    file handler URL is configured or the lookup did not return a file,
 *  - a "file_error" object carrying the error message when resolution throws.
 *
 * @param {Array<string>} fileHashes - Array of file hashes to resolve
 * @param {Object} config - Configuration object with file service endpoints
 * @returns {Promise<Array<string>>} Array of stringified file content objects
 */
async function resolveFileHashesToContent(fileHashes, config) {
    if (!fileHashes || fileHashes.length === 0) {
        return [];
    }

    // Serialise a resolved file; converted URLs take precedence when present.
    const asImageContent = (hash, fileData, fileUrl) => {
        const preferredUrl = fileData.converted?.url || fileUrl;
        return JSON.stringify({
            type: "image_url",
            url: preferredUrl,
            image_url: { url: preferredUrl },
            gcs: fileData.converted?.gcs || fileData.gcs, // GCS URL for Gemini models
            originalFilename: fileData.filename,
            hash: hash
        });
    };

    const resolveOne = async (hash) => {
        try {
            // The file handler (cortex-file-handler) endpoint comes from config.
            const fileHandlerUrl = config?.get?.('whisperMediaApiUrl');
            const hasHandler = Boolean(fileHandlerUrl) && fileHandlerUrl !== 'null';

            if (hasHandler) {
                const existingFile = await checkHashExists(hash, fileHandlerUrl);
                if (existingFile) {
                    return asImageContent(hash, existingFile, existingFile.url);
                }

                // Backward-compatibility fallback: query the handler directly
                // when the shared check did not produce a file.
                const response = await axios.get(fileHandlerUrl, {
                    params: { hash: hash, checkHash: true }
                });
                if (response.status === 200) {
                    const fileData = response.data;
                    return asImageContent(hash, fileData, fileData.shortLivedUrl || fileData.url);
                }
            }

            // Unresolved: hand back a placeholder marking that resolution is needed.
            return JSON.stringify({
                type: "file_hash",
                hash: hash,
                _cortex_needs_resolution: true
            });
        } catch (error) {
            return JSON.stringify({
                type: "file_error",
                hash: hash,
                error: error.message
            });
        }
    };

    return Promise.all(fileHashes.map((hash) => resolveOne(hash)));
}
1416
+
1417
/**
 * Look up the MIME type for a filename or file path.
 * Detection is delegated to the mime-types package; the default is returned
 * when no name is given or the lookup yields nothing.
 * @param {string} filenameOrPath - Filename or full file path
 * @param {string} defaultMimeType - Optional default MIME type if detection fails (default: 'application/octet-stream')
 * @returns {string} MIME type string
 */
function getMimeTypeFromFilename(filenameOrPath, defaultMimeType = 'application/octet-stream') {
    // Short-circuit keeps mime.lookup from being called with an empty value.
    const detected = filenameOrPath ? mime.lookup(filenameOrPath) : null;
    return detected ? detected : defaultMimeType;
}
1433
+
1434
/**
 * Look up the MIME type for a file extension.
 * The extension is given a leading dot when it lacks one before being handed
 * to mime.lookup; the default is returned when no extension is given or the
 * lookup yields nothing.
 * @param {string} extension - File extension (with or without leading dot, e.g., '.txt' or 'txt')
 * @param {string} defaultMimeType - Optional default MIME type if detection fails (default: 'application/octet-stream')
 * @returns {string} MIME type string
 */
function getMimeTypeFromExtension(extension, defaultMimeType = 'application/octet-stream') {
    if (extension) {
        const dotted = extension.startsWith('.') ? extension : `.${extension}`;
        const detected = mime.lookup(dotted);
        if (detected) {
            return detected;
        }
    }
    return defaultMimeType;
}
1450
+
1451
+ export {
1452
+ computeFileHash,
1453
+ computeBufferHash,
1454
+ deleteTempPath,
1455
+ deleteFileByHash,
1456
+ downloadFile,
1457
+ generateUniqueFilename,
1458
+ getMediaChunks,
1459
+ markCompletedForCleanUp,
1460
+ extractFileMetadataFromContent,
1461
+ extractFilesFromChatHistory,
1462
+ syncFilesToCollection,
1463
+ getAvailableFilesFromCollection,
1464
+ formatFilesForTemplate,
1465
+ getAvailableFiles,
1466
+ findFileInCollection,
1467
+ // resolveFileParameter is exported inline above
1468
+ generateFileMessageContent,
1469
+ injectFileIntoChatHistory,
1470
+ addFileToCollection,
1471
+ loadFileCollection,
1472
+ saveFileCollection,
1473
+ modifyFileCollectionWithLock,
1474
+ checkHashExists,
1475
+ uploadFileToCloud,
1476
+ uploadImageToCloud,
1477
+ resolveFileHashesToContent,
1478
+ getMimeTypeFromFilename,
1479
+ getMimeTypeFromExtension
1480
+ };
1481
+