@juspay/neurolink 9.5.2 → 9.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (356) hide show
  1. package/CHANGELOG.md +16 -0
  2. package/README.md +29 -25
  3. package/dist/agent/directTools.d.ts +5 -5
  4. package/dist/cli/commands/config.d.ts +9 -9
  5. package/dist/cli/commands/serve.d.ts +37 -0
  6. package/dist/cli/commands/serve.js +302 -229
  7. package/dist/cli/commands/setup-anthropic.d.ts +2 -2
  8. package/dist/cli/commands/setup-azure.d.ts +2 -2
  9. package/dist/cli/commands/setup-bedrock.d.ts +2 -2
  10. package/dist/cli/commands/setup-gcp.d.ts +2 -2
  11. package/dist/cli/commands/setup-google-ai.d.ts +2 -2
  12. package/dist/cli/commands/setup-huggingface.d.ts +2 -2
  13. package/dist/cli/commands/setup-mistral.d.ts +2 -2
  14. package/dist/cli/commands/setup-openai.d.ts +2 -2
  15. package/dist/cli/commands/setup.d.ts +2 -2
  16. package/dist/cli/factories/commandFactory.js +16 -2
  17. package/dist/cli/loop/optionsSchema.d.ts +2 -2
  18. package/dist/cli/loop/session.d.ts +4 -0
  19. package/dist/cli/loop/session.js +49 -4
  20. package/dist/cli/utils/interactiveSetup.d.ts +4 -4
  21. package/dist/config/conversationMemory.d.ts +2 -0
  22. package/dist/config/conversationMemory.js +5 -5
  23. package/dist/constants/contextWindows.d.ts +46 -0
  24. package/dist/constants/contextWindows.js +156 -0
  25. package/dist/context/budgetChecker.d.ts +18 -0
  26. package/dist/context/budgetChecker.js +71 -0
  27. package/dist/context/contextCompactor.d.ts +22 -0
  28. package/dist/context/contextCompactor.js +106 -0
  29. package/dist/context/effectiveHistory.d.ts +52 -0
  30. package/dist/context/effectiveHistory.js +105 -0
  31. package/dist/context/errorDetection.d.ts +14 -0
  32. package/dist/context/errorDetection.js +124 -0
  33. package/dist/context/fileSummarizationService.d.ts +54 -0
  34. package/dist/context/fileSummarizationService.js +255 -0
  35. package/dist/context/fileSummarizer.d.ts +56 -0
  36. package/dist/context/fileSummarizer.js +145 -0
  37. package/dist/context/fileTokenBudget.d.ts +53 -0
  38. package/dist/context/fileTokenBudget.js +127 -0
  39. package/dist/context/prompts/summarizationPrompt.d.ts +17 -0
  40. package/dist/context/prompts/summarizationPrompt.js +110 -0
  41. package/dist/context/stages/fileReadDeduplicator.d.ts +10 -0
  42. package/dist/context/stages/fileReadDeduplicator.js +66 -0
  43. package/dist/context/stages/slidingWindowTruncator.d.ts +11 -0
  44. package/dist/context/stages/slidingWindowTruncator.js +42 -0
  45. package/dist/context/stages/structuredSummarizer.d.ts +10 -0
  46. package/dist/context/stages/structuredSummarizer.js +49 -0
  47. package/dist/context/stages/toolOutputPruner.d.ts +10 -0
  48. package/dist/context/stages/toolOutputPruner.js +52 -0
  49. package/dist/context/summarizationEngine.d.ts +45 -0
  50. package/dist/context/summarizationEngine.js +110 -0
  51. package/dist/context/toolOutputLimits.d.ts +17 -0
  52. package/dist/context/toolOutputLimits.js +84 -0
  53. package/dist/context/toolPairRepair.d.ts +16 -0
  54. package/dist/context/toolPairRepair.js +66 -0
  55. package/dist/core/conversationMemoryManager.d.ts +5 -15
  56. package/dist/core/conversationMemoryManager.js +15 -75
  57. package/dist/core/modules/MessageBuilder.d.ts +1 -1
  58. package/dist/core/modules/MessageBuilder.js +2 -0
  59. package/dist/core/modules/TelemetryHandler.d.ts +2 -3
  60. package/dist/core/modules/TelemetryHandler.js +3 -3
  61. package/dist/core/modules/ToolsManager.d.ts +2 -2
  62. package/dist/core/redisConversationMemoryManager.d.ts +8 -14
  63. package/dist/core/redisConversationMemoryManager.js +69 -78
  64. package/dist/factories/providerFactory.d.ts +2 -2
  65. package/dist/files/fileReferenceRegistry.d.ts +276 -0
  66. package/dist/files/fileReferenceRegistry.js +1543 -0
  67. package/dist/files/fileTools.d.ts +423 -0
  68. package/dist/files/fileTools.js +449 -0
  69. package/dist/files/index.d.ts +14 -0
  70. package/dist/files/index.js +13 -0
  71. package/dist/files/streamingReader.d.ts +93 -0
  72. package/dist/files/streamingReader.js +321 -0
  73. package/dist/files/types.d.ts +23 -0
  74. package/dist/files/types.js +23 -0
  75. package/dist/image-gen/imageGenTools.d.ts +2 -2
  76. package/dist/image-gen/types.d.ts +12 -12
  77. package/dist/lib/agent/directTools.d.ts +7 -7
  78. package/dist/lib/config/conversationMemory.d.ts +2 -0
  79. package/dist/lib/config/conversationMemory.js +5 -5
  80. package/dist/lib/constants/contextWindows.d.ts +46 -0
  81. package/dist/lib/constants/contextWindows.js +157 -0
  82. package/dist/lib/context/budgetChecker.d.ts +18 -0
  83. package/dist/lib/context/budgetChecker.js +72 -0
  84. package/dist/lib/context/contextCompactor.d.ts +22 -0
  85. package/dist/lib/context/contextCompactor.js +107 -0
  86. package/dist/lib/context/effectiveHistory.d.ts +52 -0
  87. package/dist/lib/context/effectiveHistory.js +106 -0
  88. package/dist/lib/context/errorDetection.d.ts +14 -0
  89. package/dist/lib/context/errorDetection.js +125 -0
  90. package/dist/lib/context/fileSummarizationService.d.ts +54 -0
  91. package/dist/lib/context/fileSummarizationService.js +256 -0
  92. package/dist/lib/context/fileSummarizer.d.ts +56 -0
  93. package/dist/lib/context/fileSummarizer.js +146 -0
  94. package/dist/lib/context/fileTokenBudget.d.ts +53 -0
  95. package/dist/lib/context/fileTokenBudget.js +128 -0
  96. package/dist/lib/context/prompts/summarizationPrompt.d.ts +17 -0
  97. package/dist/lib/context/prompts/summarizationPrompt.js +111 -0
  98. package/dist/lib/context/stages/fileReadDeduplicator.d.ts +10 -0
  99. package/dist/lib/context/stages/fileReadDeduplicator.js +67 -0
  100. package/dist/lib/context/stages/slidingWindowTruncator.d.ts +11 -0
  101. package/dist/lib/context/stages/slidingWindowTruncator.js +43 -0
  102. package/dist/lib/context/stages/structuredSummarizer.d.ts +10 -0
  103. package/dist/lib/context/stages/structuredSummarizer.js +50 -0
  104. package/dist/lib/context/stages/toolOutputPruner.d.ts +10 -0
  105. package/dist/lib/context/stages/toolOutputPruner.js +53 -0
  106. package/dist/lib/context/summarizationEngine.d.ts +45 -0
  107. package/dist/lib/context/summarizationEngine.js +111 -0
  108. package/dist/lib/context/toolOutputLimits.d.ts +17 -0
  109. package/dist/lib/context/toolOutputLimits.js +85 -0
  110. package/dist/lib/context/toolPairRepair.d.ts +16 -0
  111. package/dist/lib/context/toolPairRepair.js +67 -0
  112. package/dist/lib/core/conversationMemoryManager.d.ts +5 -15
  113. package/dist/lib/core/conversationMemoryManager.js +15 -75
  114. package/dist/lib/core/modules/MessageBuilder.d.ts +1 -1
  115. package/dist/lib/core/modules/MessageBuilder.js +2 -0
  116. package/dist/lib/core/modules/TelemetryHandler.d.ts +2 -3
  117. package/dist/lib/core/modules/TelemetryHandler.js +3 -3
  118. package/dist/lib/core/modules/ToolsManager.d.ts +2 -2
  119. package/dist/lib/core/redisConversationMemoryManager.d.ts +8 -14
  120. package/dist/lib/core/redisConversationMemoryManager.js +69 -78
  121. package/dist/lib/factories/providerFactory.d.ts +2 -2
  122. package/dist/lib/files/fileReferenceRegistry.d.ts +276 -0
  123. package/dist/lib/files/fileReferenceRegistry.js +1544 -0
  124. package/dist/lib/files/fileTools.d.ts +423 -0
  125. package/dist/lib/files/fileTools.js +450 -0
  126. package/dist/lib/files/index.d.ts +14 -0
  127. package/dist/lib/files/index.js +14 -0
  128. package/dist/lib/files/streamingReader.d.ts +93 -0
  129. package/dist/lib/files/streamingReader.js +322 -0
  130. package/dist/lib/files/types.d.ts +23 -0
  131. package/dist/lib/files/types.js +24 -0
  132. package/dist/lib/image-gen/imageGenTools.d.ts +2 -2
  133. package/dist/lib/image-gen/types.d.ts +12 -12
  134. package/dist/lib/memory/mem0Initializer.d.ts +2 -2
  135. package/dist/lib/neurolink.d.ts +61 -2
  136. package/dist/lib/neurolink.js +619 -307
  137. package/dist/lib/processors/archive/ArchiveProcessor.d.ts +327 -0
  138. package/dist/lib/processors/archive/ArchiveProcessor.js +1309 -0
  139. package/dist/lib/processors/archive/index.d.ts +33 -0
  140. package/dist/lib/processors/archive/index.js +43 -0
  141. package/dist/lib/processors/base/types.d.ts +70 -64
  142. package/dist/lib/processors/base/types.js +6 -0
  143. package/dist/lib/processors/cli/fileProcessorCli.d.ts +8 -8
  144. package/dist/lib/processors/cli/fileProcessorCli.js +5 -5
  145. package/dist/lib/processors/config/mimeTypes.js +25 -0
  146. package/dist/lib/processors/config/sizeLimits.d.ts +52 -40
  147. package/dist/lib/processors/config/sizeLimits.js +56 -44
  148. package/dist/lib/processors/document/ExcelProcessor.d.ts +14 -0
  149. package/dist/lib/processors/document/ExcelProcessor.js +72 -1
  150. package/dist/lib/processors/document/PptxProcessor.d.ts +63 -0
  151. package/dist/lib/processors/document/PptxProcessor.js +158 -0
  152. package/dist/lib/processors/document/index.d.ts +1 -0
  153. package/dist/lib/processors/document/index.js +6 -0
  154. package/dist/lib/processors/errors/FileErrorCode.d.ts +2 -2
  155. package/dist/lib/processors/errors/errorHelpers.d.ts +2 -2
  156. package/dist/lib/processors/errors/errorSerializer.d.ts +4 -4
  157. package/dist/lib/processors/index.d.ts +8 -2
  158. package/dist/lib/processors/index.js +5 -2
  159. package/dist/lib/processors/integration/FileProcessorIntegration.d.ts +8 -8
  160. package/dist/lib/processors/integration/FileProcessorIntegration.js +7 -7
  161. package/dist/lib/processors/media/AudioProcessor.d.ts +328 -0
  162. package/dist/lib/processors/media/AudioProcessor.js +708 -0
  163. package/dist/lib/processors/media/VideoProcessor.d.ts +350 -0
  164. package/dist/lib/processors/media/VideoProcessor.js +992 -0
  165. package/dist/lib/processors/media/index.d.ts +27 -0
  166. package/dist/lib/processors/media/index.js +37 -0
  167. package/dist/lib/processors/registry/ProcessorRegistry.d.ts +19 -5
  168. package/dist/lib/processors/registry/ProcessorRegistry.js +103 -8
  169. package/dist/lib/processors/registry/index.d.ts +1 -1
  170. package/dist/lib/processors/registry/index.js +1 -1
  171. package/dist/lib/processors/registry/types.d.ts +2 -2
  172. package/dist/lib/providers/googleAiStudio.d.ts +34 -0
  173. package/dist/lib/providers/googleAiStudio.js +267 -397
  174. package/dist/lib/providers/googleVertex.d.ts +55 -1
  175. package/dist/lib/providers/googleVertex.js +452 -719
  176. package/dist/lib/providers/sagemaker/detection.d.ts +6 -6
  177. package/dist/lib/providers/sagemaker/diagnostics.d.ts +4 -4
  178. package/dist/lib/providers/sagemaker/parsers.d.ts +4 -4
  179. package/dist/lib/rag/chunkers/RecursiveChunker.js +2 -2
  180. package/dist/lib/rag/document/loaders.d.ts +6 -71
  181. package/dist/lib/rag/document/loaders.js +5 -5
  182. package/dist/lib/rag/graphRag/graphRAG.js +26 -9
  183. package/dist/lib/rag/metadata/MetadataExtractorFactory.d.ts +5 -55
  184. package/dist/lib/rag/metadata/metadataExtractor.js +6 -3
  185. package/dist/lib/rag/pipeline/RAGPipeline.d.ts +8 -126
  186. package/dist/lib/rag/pipeline/RAGPipeline.js +11 -11
  187. package/dist/lib/rag/pipeline/contextAssembly.d.ts +3 -42
  188. package/dist/lib/rag/pipeline/contextAssembly.js +6 -3
  189. package/dist/lib/rag/reranker/RerankerFactory.d.ts +5 -60
  190. package/dist/lib/rag/resilience/CircuitBreaker.d.ts +3 -33
  191. package/dist/lib/rag/resilience/RetryHandler.d.ts +2 -21
  192. package/dist/lib/rag/retrieval/hybridSearch.d.ts +3 -41
  193. package/dist/lib/rag/retrieval/vectorQueryTool.d.ts +2 -13
  194. package/dist/lib/rag/retrieval/vectorQueryTool.js +4 -3
  195. package/dist/lib/rag/types.d.ts +3 -3
  196. package/dist/lib/sdk/toolRegistration.d.ts +2 -2
  197. package/dist/lib/server/middleware/cache.d.ts +2 -2
  198. package/dist/lib/server/middleware/rateLimit.d.ts +2 -2
  199. package/dist/lib/server/routes/mcpRoutes.js +277 -249
  200. package/dist/lib/server/routes/memoryRoutes.js +287 -281
  201. package/dist/lib/server/utils/validation.d.ts +10 -10
  202. package/dist/lib/session/globalSessionState.d.ts +2 -2
  203. package/dist/lib/telemetry/telemetryService.d.ts +2 -2
  204. package/dist/lib/types/common.d.ts +39 -0
  205. package/dist/lib/types/contextTypes.d.ts +255 -0
  206. package/dist/lib/types/contextTypes.js +0 -2
  207. package/dist/lib/types/conversation.d.ts +62 -0
  208. package/dist/lib/types/conversationMemoryInterface.d.ts +27 -0
  209. package/dist/lib/types/conversationMemoryInterface.js +7 -0
  210. package/dist/lib/types/fileReferenceTypes.d.ts +222 -0
  211. package/dist/lib/types/fileReferenceTypes.js +9 -0
  212. package/dist/lib/types/fileTypes.d.ts +26 -3
  213. package/dist/lib/types/generateTypes.d.ts +22 -1
  214. package/dist/lib/types/index.d.ts +4 -5
  215. package/dist/lib/types/index.js +8 -10
  216. package/dist/lib/types/modelTypes.d.ts +2 -2
  217. package/dist/lib/types/processorTypes.d.ts +597 -0
  218. package/dist/lib/types/processorTypes.js +91 -0
  219. package/dist/lib/types/ragTypes.d.ts +481 -0
  220. package/dist/lib/types/ragTypes.js +8 -0
  221. package/dist/lib/types/sdkTypes.d.ts +17 -18
  222. package/dist/lib/types/streamTypes.d.ts +11 -1
  223. package/dist/lib/utils/async/retry.d.ts +2 -2
  224. package/dist/lib/utils/async/withTimeout.js +3 -1
  225. package/dist/lib/utils/conversationMemory.d.ts +12 -6
  226. package/dist/lib/utils/conversationMemory.js +76 -36
  227. package/dist/lib/utils/fileDetector.d.ts +62 -0
  228. package/dist/lib/utils/fileDetector.js +1014 -14
  229. package/dist/lib/utils/json/safeParse.d.ts +2 -2
  230. package/dist/lib/utils/messageBuilder.js +806 -153
  231. package/dist/lib/utils/modelChoices.d.ts +2 -2
  232. package/dist/lib/utils/multimodalOptionsBuilder.d.ts +2 -1
  233. package/dist/lib/utils/multimodalOptionsBuilder.js +1 -0
  234. package/dist/lib/utils/rateLimiter.d.ts +2 -2
  235. package/dist/lib/utils/sanitizers/filename.d.ts +4 -4
  236. package/dist/lib/utils/sanitizers/svg.d.ts +2 -2
  237. package/dist/lib/utils/thinkingConfig.d.ts +6 -6
  238. package/dist/lib/utils/tokenEstimation.d.ts +68 -0
  239. package/dist/lib/utils/tokenEstimation.js +113 -0
  240. package/dist/lib/utils/tokenUtils.d.ts +4 -4
  241. package/dist/lib/utils/ttsProcessor.d.ts +2 -2
  242. package/dist/lib/workflow/config.d.ts +150 -150
  243. package/dist/memory/mem0Initializer.d.ts +2 -2
  244. package/dist/neurolink.d.ts +61 -2
  245. package/dist/neurolink.js +619 -307
  246. package/dist/processors/archive/ArchiveProcessor.d.ts +327 -0
  247. package/dist/processors/archive/ArchiveProcessor.js +1308 -0
  248. package/dist/processors/archive/index.d.ts +33 -0
  249. package/dist/processors/archive/index.js +42 -0
  250. package/dist/processors/base/types.d.ts +70 -64
  251. package/dist/processors/base/types.js +6 -0
  252. package/dist/processors/cli/fileProcessorCli.d.ts +8 -8
  253. package/dist/processors/cli/fileProcessorCli.js +5 -5
  254. package/dist/processors/config/mimeTypes.js +25 -0
  255. package/dist/processors/config/sizeLimits.d.ts +52 -40
  256. package/dist/processors/config/sizeLimits.js +56 -44
  257. package/dist/processors/document/ExcelProcessor.d.ts +14 -0
  258. package/dist/processors/document/ExcelProcessor.js +72 -1
  259. package/dist/processors/document/PptxProcessor.d.ts +63 -0
  260. package/dist/processors/document/PptxProcessor.js +157 -0
  261. package/dist/processors/document/index.d.ts +1 -0
  262. package/dist/processors/document/index.js +6 -0
  263. package/dist/processors/errors/FileErrorCode.d.ts +2 -2
  264. package/dist/processors/errors/errorHelpers.d.ts +2 -2
  265. package/dist/processors/errors/errorSerializer.d.ts +4 -4
  266. package/dist/processors/index.d.ts +8 -2
  267. package/dist/processors/index.js +5 -2
  268. package/dist/processors/integration/FileProcessorIntegration.d.ts +8 -8
  269. package/dist/processors/integration/FileProcessorIntegration.js +7 -7
  270. package/dist/processors/media/AudioProcessor.d.ts +328 -0
  271. package/dist/processors/media/AudioProcessor.js +707 -0
  272. package/dist/processors/media/VideoProcessor.d.ts +350 -0
  273. package/dist/processors/media/VideoProcessor.js +991 -0
  274. package/dist/processors/media/ffprobe-static.d.ts +4 -0
  275. package/dist/processors/media/index.d.ts +27 -0
  276. package/dist/processors/media/index.js +36 -0
  277. package/dist/processors/registry/ProcessorRegistry.d.ts +19 -5
  278. package/dist/processors/registry/ProcessorRegistry.js +103 -8
  279. package/dist/processors/registry/index.d.ts +1 -1
  280. package/dist/processors/registry/index.js +1 -1
  281. package/dist/processors/registry/types.d.ts +2 -2
  282. package/dist/providers/googleAiStudio.d.ts +34 -0
  283. package/dist/providers/googleAiStudio.js +267 -397
  284. package/dist/providers/googleVertex.d.ts +55 -1
  285. package/dist/providers/googleVertex.js +452 -719
  286. package/dist/providers/sagemaker/detection.d.ts +6 -6
  287. package/dist/providers/sagemaker/diagnostics.d.ts +4 -4
  288. package/dist/providers/sagemaker/parsers.d.ts +4 -4
  289. package/dist/rag/chunkers/RecursiveChunker.js +2 -2
  290. package/dist/rag/document/loaders.d.ts +6 -71
  291. package/dist/rag/document/loaders.js +5 -5
  292. package/dist/rag/graphRag/graphRAG.js +26 -9
  293. package/dist/rag/metadata/MetadataExtractorFactory.d.ts +5 -55
  294. package/dist/rag/metadata/metadataExtractor.js +6 -3
  295. package/dist/rag/pipeline/RAGPipeline.d.ts +8 -126
  296. package/dist/rag/pipeline/RAGPipeline.js +11 -11
  297. package/dist/rag/pipeline/contextAssembly.d.ts +3 -42
  298. package/dist/rag/pipeline/contextAssembly.js +6 -3
  299. package/dist/rag/reranker/RerankerFactory.d.ts +5 -60
  300. package/dist/rag/resilience/CircuitBreaker.d.ts +3 -33
  301. package/dist/rag/resilience/RetryHandler.d.ts +2 -21
  302. package/dist/rag/retrieval/hybridSearch.d.ts +3 -41
  303. package/dist/rag/retrieval/vectorQueryTool.d.ts +2 -13
  304. package/dist/rag/retrieval/vectorQueryTool.js +4 -3
  305. package/dist/rag/types.d.ts +3 -3
  306. package/dist/sdk/toolRegistration.d.ts +2 -2
  307. package/dist/server/middleware/cache.d.ts +2 -2
  308. package/dist/server/middleware/rateLimit.d.ts +2 -2
  309. package/dist/server/routes/mcpRoutes.js +277 -249
  310. package/dist/server/routes/memoryRoutes.js +287 -281
  311. package/dist/server/utils/validation.d.ts +4 -4
  312. package/dist/session/globalSessionState.d.ts +2 -2
  313. package/dist/telemetry/telemetryService.d.ts +2 -2
  314. package/dist/types/common.d.ts +39 -0
  315. package/dist/types/contextTypes.d.ts +255 -0
  316. package/dist/types/contextTypes.js +0 -2
  317. package/dist/types/conversation.d.ts +62 -0
  318. package/dist/types/conversationMemoryInterface.d.ts +27 -0
  319. package/dist/types/conversationMemoryInterface.js +6 -0
  320. package/dist/types/fileReferenceTypes.d.ts +222 -0
  321. package/dist/types/fileReferenceTypes.js +8 -0
  322. package/dist/types/fileTypes.d.ts +26 -3
  323. package/dist/types/generateTypes.d.ts +22 -1
  324. package/dist/types/index.d.ts +4 -5
  325. package/dist/types/index.js +8 -10
  326. package/dist/types/processorTypes.d.ts +597 -0
  327. package/dist/types/processorTypes.js +90 -0
  328. package/dist/types/ragTypes.d.ts +481 -0
  329. package/dist/types/ragTypes.js +7 -0
  330. package/dist/types/sdkTypes.d.ts +17 -18
  331. package/dist/types/streamTypes.d.ts +11 -1
  332. package/dist/utils/async/retry.d.ts +2 -2
  333. package/dist/utils/async/withTimeout.js +3 -1
  334. package/dist/utils/conversationMemory.d.ts +12 -6
  335. package/dist/utils/conversationMemory.js +76 -36
  336. package/dist/utils/fileDetector.d.ts +62 -0
  337. package/dist/utils/fileDetector.js +1014 -14
  338. package/dist/utils/json/safeParse.d.ts +2 -2
  339. package/dist/utils/messageBuilder.js +806 -153
  340. package/dist/utils/modelChoices.d.ts +2 -2
  341. package/dist/utils/multimodalOptionsBuilder.d.ts +2 -1
  342. package/dist/utils/multimodalOptionsBuilder.js +1 -0
  343. package/dist/utils/rateLimiter.d.ts +2 -2
  344. package/dist/utils/sanitizers/filename.d.ts +4 -4
  345. package/dist/utils/sanitizers/svg.d.ts +2 -2
  346. package/dist/utils/thinkingConfig.d.ts +6 -6
  347. package/dist/utils/tokenEstimation.d.ts +68 -0
  348. package/dist/utils/tokenEstimation.js +112 -0
  349. package/dist/utils/tokenUtils.d.ts +4 -4
  350. package/dist/utils/ttsProcessor.d.ts +2 -2
  351. package/dist/workflow/config.d.ts +104 -104
  352. package/package.json +18 -6
  353. package/dist/lib/utils/conversationMemoryUtils.d.ts +0 -25
  354. package/dist/lib/utils/conversationMemoryUtils.js +0 -138
  355. package/dist/utils/conversationMemoryUtils.d.ts +0 -25
  356. package/dist/utils/conversationMemoryUtils.js +0 -137
@@ -0,0 +1,1543 @@
1
+ /**
2
+ * File Reference Registry
3
+ *
4
+ * Central registry for managing file references in on-demand processing mode.
5
+ * Files are registered with lightweight metadata and previews. Full content
6
+ * is processed on-demand when the LLM requests it via tools.
7
+ *
8
+ * This module is the core of the file reference architecture, replacing
9
+ * the previous "load everything upfront" pattern for files that exceed
10
+ * the tiny/small size tiers.
11
+ *
12
+ * @module files/fileReferenceRegistry
13
+ */
14
+ import { randomUUID } from "node:crypto";
15
+ import { mkdir, readFile, stat, unlink, writeFile } from "node:fs/promises";
16
+ import { tmpdir } from "node:os";
17
+ import { basename, extname, join } from "node:path";
18
+ import { estimatePostProcessingTokens } from "../context/fileTokenBudget.js";
19
+ import { logger } from "../utils/logger.js";
20
+ import { StreamingReader } from "./streamingReader.js";
21
+ import { SIZE_TIER_THRESHOLDS } from "./types.js";
/** Default number of files the registry holds before LRU eviction kicks in. */
const DEFAULT_MAX_FILES = 100;
/** Default cap on temp bytes: 1 GB (1024^3 bytes). */
const DEFAULT_MAX_TEMP_BYTES = 1024 ** 3;
/** Default preview length, measured in characters. */
const DEFAULT_PREVIEW_CHARS = 2000;
/** Largest file size accepted for registration: 2 GB (2 * 1024^3 bytes). */
const MAX_ACCEPTED_SIZE = 2 * 1024 ** 3;
30
+ /**
31
+ * Registry for managing file references with on-demand processing.
32
+ *
33
+ * Design decisions:
34
+ * - One instance per NeuroLink SDK instance (not global singleton)
35
+ * - File buffers persisted to temp dir for later streaming access
36
+ * - LRU eviction when maxFiles exceeded
37
+ * - Thread-safe via sequential async operations (Node.js single-threaded)
38
+ *
39
+ * @example
40
+ * ```typescript
41
+ * const registry = new FileReferenceRegistry();
42
+ * const ref = await registry.register(buffer, 'buffer', {
43
+ * filename: 'report.xlsx',
44
+ * });
45
+ * console.log(ref.sizeTier); // 'medium'
46
+ * console.log(ref.preview); // First 2000 chars of processed content
47
+ * console.log(ref.estimatedTokens); // Type-aware estimate
48
+ *
49
+ * // Later, LLM requests specific section
50
+ * const section = await registry.readSection(ref.id, 1, 50, 5000);
51
+ * ```
52
+ */
53
+ export class FileReferenceRegistry {
54
+ files = new Map();
55
+ tempDir;
56
+ maxFiles;
57
+ maxTempBytes;
58
+ defaultPreviewChars;
59
+ currentTempBytes = 0;
60
+ tempDirCreated = false;
61
+ constructor(options = {}) {
62
+ this.tempDir =
63
+ options.tempDir || join(tmpdir(), "neurolink-files", randomUUID());
64
+ this.maxFiles = options.maxFiles ?? DEFAULT_MAX_FILES;
65
+ this.maxTempBytes = options.maxTempBytes ?? DEFAULT_MAX_TEMP_BYTES;
66
+ this.defaultPreviewChars =
67
+ options.defaultPreviewChars ?? DEFAULT_PREVIEW_CHARS;
68
+ }
69
+ /**
70
+ * Register a file from a Buffer.
71
+ *
72
+ * This is the primary registration method. It performs lightweight analysis:
73
+ * 1. Detect file type from magic bytes (first 1KB)
74
+ * 2. Determine size tier
75
+ * 3. Extract preview (first N chars of text, or metadata for binary)
76
+ * 4. Persist buffer to temp directory for later streaming access
77
+ *
78
+ * Total time: ~1-5ms for most files (no full processing).
79
+ *
80
+ * @param buffer - File content as Buffer
81
+ * @param source - How the file was provided ('buffer', 'url', 'path', 'datauri')
82
+ * @param options - Registration options
83
+ * @returns FileReference with metadata and preview
84
+ */
85
+ async register(buffer, source = "buffer", options = {}) {
86
+ const sizeBytes = buffer.length;
87
+ // Reject oversized files
88
+ if (sizeBytes > MAX_ACCEPTED_SIZE) {
89
+ const sizeMB = (sizeBytes / (1024 * 1024)).toFixed(1);
90
+ throw new Error(`File too large (${sizeMB} MB). Maximum accepted size is 2 GB.`);
91
+ }
92
+ // Detect file type from magic bytes and extension.
93
+ // If the provided filename has no extension, append one guessed from magic bytes
94
+ // so downstream processors (e.g., VideoProcessor) can validate by extension.
95
+ let filename = options.filename || `file-${Date.now()}${this.guessExtension(buffer)}`;
96
+ if (!extname(filename)) {
97
+ const guessedExt = this.guessExtension(buffer);
98
+ if (guessedExt) {
99
+ filename = `${filename}${guessedExt}`;
100
+ }
101
+ }
102
+ const ext = extname(filename).toLowerCase().replace(".", "");
103
+ const detectedType = options.fileType || this.detectType(buffer, ext);
104
+ const mimeType = this.guessMimeType(detectedType, ext);
105
+ const sizeTier = FileReferenceRegistry.classifySizeTier(sizeBytes);
106
+ // Generate preview (fast — only reads first N chars)
107
+ const preview = this.extractPreview(buffer, detectedType, options.maxPreviewChars ?? this.defaultPreviewChars);
108
+ // Estimate post-processing tokens (type-aware)
109
+ const estimatedTokens = estimatePostProcessingTokens(sizeBytes, detectedType);
110
+ // Create reference
111
+ const ref = {
112
+ id: randomUUID(),
113
+ source,
114
+ filename,
115
+ sizeBytes,
116
+ detectedType,
117
+ mimeType,
118
+ sizeTier,
119
+ estimatedTokens,
120
+ preview,
121
+ status: "registered",
122
+ registeredAt: Date.now(),
123
+ lastAccessedAt: Date.now(),
124
+ extension: ext || undefined,
125
+ };
126
+ // Persist buffer to temp directory (unless skipped or tiny)
127
+ if (!options.skipTempPersist && sizeTier !== "tiny") {
128
+ try {
129
+ const tempPath = await this.persistToTemp(ref.id, buffer, ext);
130
+ ref.tempPath = tempPath;
131
+ }
132
+ catch (err) {
133
+ logger.warn(`[FileReferenceRegistry] Failed to persist ${filename} to temp: ${err instanceof Error ? err.message : String(err)}`);
134
+ // Continue without temp persistence — buffer-based access still works
135
+ }
136
+ }
137
+ // For tiny files, store the processed content inline
138
+ if (sizeTier === "tiny") {
139
+ ref.processedContent = this.isTextType(detectedType, buffer)
140
+ ? buffer.toString("utf-8")
141
+ : preview;
142
+ ref.status = "processed";
143
+ }
144
+ else {
145
+ ref.status = "previewed";
146
+ }
147
+ // Evict LRU entries if at capacity
148
+ if (this.files.size >= this.maxFiles) {
149
+ this.evictLRU();
150
+ }
151
+ this.files.set(ref.id, ref);
152
+ logger.info(`[FileReferenceRegistry] Registered "${filename}" (${this.formatSize(sizeBytes)}, ` +
153
+ `tier=${sizeTier}, type=${detectedType}, ~${estimatedTokens} tokens)`);
154
+ return ref;
155
+ }
156
+ /**
157
+ * Register a file from a file path on disk.
158
+ *
159
+ * Does NOT read the entire file — only reads the first 1KB for type detection
160
+ * and preview. The file path is stored for later streaming access.
161
+ *
162
+ * @param filePath - Absolute path to the file
163
+ * @param options - Registration options
164
+ * @returns FileReference with metadata and preview
165
+ */
166
+ async registerFromPath(filePath, options = {}) {
167
+ const fileStat = await stat(filePath);
168
+ const sizeBytes = fileStat.size;
169
+ if (sizeBytes > MAX_ACCEPTED_SIZE) {
170
+ const sizeMB = (sizeBytes / (1024 * 1024)).toFixed(1);
171
+ throw new Error(`File too large (${sizeMB} MB). Maximum accepted size is 2 GB.`);
172
+ }
173
+ const filename = options.filename || basename(filePath);
174
+ const ext = extname(filename).toLowerCase().replace(".", "");
175
+ const detectedType = options.fileType || this.detectTypeFromExtension(ext);
176
+ const mimeType = this.guessMimeType(detectedType, ext);
177
+ const sizeTier = FileReferenceRegistry.classifySizeTier(sizeBytes);
178
+ const estimatedTokens = estimatePostProcessingTokens(sizeBytes, detectedType);
179
+ // Read preview from file (streaming — only first N bytes)
180
+ let preview;
181
+ try {
182
+ preview = await StreamingReader.readPreview(filePath, options.maxPreviewChars ?? this.defaultPreviewChars);
183
+ }
184
+ catch {
185
+ preview = `[File: ${filename}, ${this.formatSize(sizeBytes)}, type: ${detectedType}]`;
186
+ }
187
+ const ref = {
188
+ id: randomUUID(),
189
+ source: "path",
190
+ originalPath: filePath,
191
+ filename,
192
+ sizeBytes,
193
+ detectedType,
194
+ mimeType,
195
+ sizeTier,
196
+ estimatedTokens,
197
+ preview,
198
+ status: "previewed",
199
+ registeredAt: Date.now(),
200
+ lastAccessedAt: Date.now(),
201
+ extension: ext || undefined,
202
+ };
203
+ // For path-based files, no need to persist — we already have the path
204
+ // Store the original path as the access point
205
+ ref.tempPath = filePath;
206
+ if (this.files.size >= this.maxFiles) {
207
+ this.evictLRU();
208
+ }
209
+ this.files.set(ref.id, ref);
210
+ logger.info(`[FileReferenceRegistry] Registered from path "${filename}" ` +
211
+ `(${this.formatSize(sizeBytes)}, tier=${sizeTier}, type=${detectedType})`);
212
+ return ref;
213
+ }
214
+ /**
215
+ * Get a file reference by ID.
216
+ * Updates lastAccessedAt for LRU tracking.
217
+ */
218
+ get(id) {
219
+ const ref = this.files.get(id);
220
+ if (ref) {
221
+ ref.lastAccessedAt = Date.now();
222
+ }
223
+ return ref;
224
+ }
225
+ /**
226
+ * Get a file reference by ID or filename.
227
+ * Tries ID lookup first, then falls back to filename match.
228
+ * This handles the common case where an LLM uses the filename
229
+ * instead of the UUID when calling file tools.
230
+ *
231
+ * @param idOrName - UUID or filename to search for
232
+ * @returns File reference if found, undefined otherwise
233
+ */
234
+ getByIdOrFilename(idOrName) {
235
+ // Try direct ID lookup first (most common, O(1))
236
+ const byId = this.get(idOrName);
237
+ if (byId) {
238
+ return byId;
239
+ }
240
+ // Fallback: search by filename (case-insensitive)
241
+ const lowerName = idOrName.toLowerCase();
242
+ for (const ref of this.files.values()) {
243
+ if (ref.filename.toLowerCase() === lowerName) {
244
+ ref.lastAccessedAt = Date.now();
245
+ return ref;
246
+ }
247
+ }
248
+ // Fallback: search by basename (without path)
249
+ for (const ref of this.files.values()) {
250
+ const refBasename = ref.filename.split("/").pop()?.toLowerCase() ?? "";
251
+ if (refBasename === lowerName) {
252
+ ref.lastAccessedAt = Date.now();
253
+ return ref;
254
+ }
255
+ }
256
+ return undefined;
257
+ }
258
+ /**
259
+ * Ensure a file has been processed (binary content extracted to text).
260
+ *
261
+ * For text files this is a no-op. For binary files (PDF, XLSX, video, etc.)
262
+ * this triggers on-demand processing if it hasn't happened yet. After this
263
+ * call, ref.processedContent and ref.preview contain extracted text.
264
+ *
265
+ * Used by file tools (get_file_preview) to ensure the preview contains
266
+ * real content instead of placeholder metadata strings.
267
+ */
268
+ async ensureProcessed(fileId) {
269
+ const ref = this.get(fileId);
270
+ if (!ref) {
271
+ return;
272
+ }
273
+ if (!ref.processedContent && !this.isTextType(ref.detectedType)) {
274
+ await this.processFileOnDemand(ref);
275
+ }
276
+ }
277
+ /**
278
+ * Extract targeted content from a registered file.
279
+ *
280
+ * This is the core dispatch method for the `extract_file_content` tool.
281
+ * Routes extraction to the appropriate processor based on file type and
282
+ * the parameters provided.
283
+ *
284
+ * @param params - Extraction parameters (file_id + type-specific options)
285
+ * @returns Extraction result with text and/or images
286
+ */
287
+ async extractContent(params) {
288
+ const ref = this.getByIdOrFilename(params.file_id);
289
+ if (!ref) {
290
+ return {
291
+ success: false,
292
+ error: `File not found: "${params.file_id}". Use list_attached_files to see available files.`,
293
+ };
294
+ }
295
+ try {
296
+ // Text-like types don't need raw buffer — they use readSection
297
+ // which works from processedContent (tiny files) or tempPath (larger files)
298
+ if (this.isTextType(ref.detectedType) ||
299
+ ref.detectedType === "csv" ||
300
+ ref.detectedType === "svg" ||
301
+ ref.detectedType === "unknown") {
302
+ return await this.extractTextTargeted(ref, params);
303
+ }
304
+ // Binary types need the raw buffer for processor-specific extraction
305
+ const buffer = ref.tempPath ? await readFile(ref.tempPath) : null;
306
+ if (!buffer) {
307
+ return {
308
+ success: false,
309
+ error: `No file data available for "${ref.filename}". The file may have been evicted from cache.`,
310
+ };
311
+ }
312
+ switch (ref.detectedType) {
313
+ case "video":
314
+ return await this.extractVideoTargeted(buffer, ref, params);
315
+ case "pdf":
316
+ return await this.extractPdfTargeted(buffer, ref, params);
317
+ case "xlsx":
318
+ return await this.extractExcelTargeted(buffer, ref, params);
319
+ case "pptx":
320
+ return await this.extractPptxTargeted(buffer, ref, params);
321
+ case "archive":
322
+ return await this.extractArchiveTargeted(buffer, ref, params);
323
+ case "audio":
324
+ return await this.extractAudioTargeted(buffer, ref, params);
325
+ default:
326
+ // Fallback for any unrecognized binary type
327
+ return await this.extractTextTargeted(ref, params);
328
+ }
329
+ }
330
+ catch (err) {
331
+ return {
332
+ success: false,
333
+ error: `Extraction failed for "${ref.filename}": ${err instanceof Error ? err.message : String(err)}`,
334
+ };
335
+ }
336
+ }
337
+ // ─── Targeted Extraction Dispatchers ──────────────────────────────
338
+ async extractVideoTargeted(buffer, ref, params) {
339
+ const { videoProcessor } = await import("../processors/media/VideoProcessor.js");
340
+ // If time range specified, extract frames from that range
341
+ if (params.start_time !== undefined && params.end_time !== undefined) {
342
+ const frames = await videoProcessor.extractFrameRange(buffer, ref.filename, params.start_time, params.end_time, params.frame_count ?? 5);
343
+ return {
344
+ success: true,
345
+ text: `Extracted ${frames.length} frames from ${ref.filename} (${params.start_time}s - ${params.end_time}s)`,
346
+ images: frames,
347
+ metadata: {
348
+ startTime: params.start_time,
349
+ endTime: params.end_time,
350
+ frameCount: frames.length,
351
+ },
352
+ };
353
+ }
354
+ // No time range: return full metadata + initial keyframes
355
+ if (!ref.processedContent) {
356
+ await this.processFileOnDemand(ref);
357
+ }
358
+ return {
359
+ success: true,
360
+ text: ref.processedContent || `[Video: ${ref.filename}]`,
361
+ images: ref.extractedImages ?? undefined,
362
+ };
363
+ }
364
+ async extractPdfTargeted(buffer, ref, params) {
365
+ // If specific pages requested, extract those pages
366
+ const pages = params.pages ??
367
+ (params.page_range
368
+ ? Array.from({ length: params.page_range.end - params.page_range.start + 1 }, (_, i) => (params.page_range ?? { start: 0 }).start + i)
369
+ : undefined);
370
+ if (pages && pages.length > 0) {
371
+ try {
372
+ const { PDFParse } = await import("pdf-parse");
373
+ const pdf = new PDFParse({ data: new Uint8Array(buffer) });
374
+ try {
375
+ const firstPage = Math.min(...pages);
376
+ const lastPage = Math.max(...pages);
377
+ const textResult = await pdf.getText({
378
+ first: firstPage,
379
+ last: lastPage,
380
+ });
381
+ const totalPages = textResult.total || 0;
382
+ const text = textResult.text?.trim() || "(No text found on the requested pages)";
383
+ // Note: pdf-parse extracts a contiguous range (first..last).
384
+ // For non-contiguous page requests (e.g., [1, 5, 12]), the result
385
+ // includes all pages in the range. This is a limitation of pdf-parse.
386
+ const rangeNote = firstPage !== lastPage
387
+ ? ` (extracted pages ${firstPage}-${lastPage})`
388
+ : "";
389
+ return {
390
+ success: true,
391
+ text: `## Pages ${pages.join(", ")} of ${ref.filename}${rangeNote}\n` +
392
+ `Total pages in document: ${totalPages}\n\n${text}`,
393
+ metadata: {
394
+ requestedPages: pages,
395
+ extractedRange: { first: firstPage, last: lastPage },
396
+ totalPages,
397
+ },
398
+ };
399
+ }
400
+ finally {
401
+ await pdf.destroy().catch(() => {
402
+ /* cleanup - ignore destroy errors */
403
+ });
404
+ }
405
+ }
406
+ catch (err) {
407
+ return {
408
+ success: false,
409
+ error: `PDF page extraction failed: ${err instanceof Error ? err.message : String(err)}`,
410
+ };
411
+ }
412
+ }
413
+ // No specific pages: return full content
414
+ if (!ref.processedContent) {
415
+ await this.processFileOnDemand(ref);
416
+ }
417
+ return {
418
+ success: true,
419
+ text: ref.processedContent || `[PDF: ${ref.filename}]`,
420
+ };
421
+ }
422
+ async extractExcelTargeted(buffer, ref, params) {
423
+ const { excelProcessor } = await import("../processors/document/ExcelProcessor.js");
424
+ const text = await excelProcessor.extractSheetRange(buffer, params.sheet, params.row_range?.start ?? 1, params.row_range?.end, params.columns);
425
+ return {
426
+ success: true,
427
+ text,
428
+ metadata: {
429
+ sheet: params.sheet,
430
+ rowRange: params.row_range,
431
+ columns: params.columns,
432
+ },
433
+ };
434
+ }
435
+ async extractPptxTargeted(buffer, ref, params) {
436
+ const pages = params.pages ??
437
+ (params.page_range
438
+ ? Array.from({ length: params.page_range.end - params.page_range.start + 1 }, (_, i) => (params.page_range ?? { start: 0 }).start + i)
439
+ : undefined);
440
+ if (pages && pages.length > 0) {
441
+ const { PptxProcessor } = await import("../processors/document/PptxProcessor.js");
442
+ const text = await PptxProcessor.extractSlides(buffer, pages);
443
+ return {
444
+ success: true,
445
+ text,
446
+ metadata: { slides: pages },
447
+ };
448
+ }
449
+ // Full extraction
450
+ if (!ref.processedContent) {
451
+ await this.processFileOnDemand(ref);
452
+ }
453
+ return {
454
+ success: true,
455
+ text: ref.processedContent || `[PPTX: ${ref.filename}]`,
456
+ };
457
+ }
458
+ async extractArchiveTargeted(buffer, ref, params) {
459
+ if (params.entry_path) {
460
+ const { archiveProcessor } = await import("../processors/archive/ArchiveProcessor.js");
461
+ const text = await archiveProcessor.extractEntry(buffer, params.entry_path);
462
+ return {
463
+ success: true,
464
+ text,
465
+ metadata: { entryPath: params.entry_path },
466
+ };
467
+ }
468
+ // No specific entry: return full listing
469
+ if (!ref.processedContent) {
470
+ await this.processFileOnDemand(ref);
471
+ }
472
+ return {
473
+ success: true,
474
+ text: ref.processedContent || `[Archive: ${ref.filename}]`,
475
+ };
476
+ }
477
+ async extractAudioTargeted(_buffer, ref, _params) {
478
+ // Audio doesn't have sub-section extraction yet — return full metadata
479
+ if (!ref.processedContent) {
480
+ await this.processFileOnDemand(ref);
481
+ }
482
+ return {
483
+ success: true,
484
+ text: ref.processedContent || `[Audio: ${ref.filename}]`,
485
+ };
486
+ }
487
+ async extractTextTargeted(ref, params) {
488
+ // For text files, use line-range reading
489
+ const startLine = params.page_range?.start ?? params.row_range?.start ?? 1;
490
+ const endLine = params.page_range?.end ?? params.row_range?.end;
491
+ const result = await this.readSection(ref.id, startLine, endLine, 50_000);
492
+ return {
493
+ success: true,
494
+ text: result.content,
495
+ metadata: {
496
+ startLine: result.startLine,
497
+ endLine: result.endLine,
498
+ totalLines: result.totalLines,
499
+ truncated: result.truncated,
500
+ },
501
+ };
502
+ }
503
+ /**
504
+ * List all registered files.
505
+ * Returns a lightweight summary suitable for the LLM.
506
+ */
507
+ list() {
508
+ return Array.from(this.files.values());
509
+ }
510
+ /**
511
+ * Generate a formatted table of all registered files for the LLM.
512
+ */
513
+ listFormatted() {
514
+ const files = this.list();
515
+ if (files.length === 0) {
516
+ return "No files attached.";
517
+ }
518
+ const header = "| # | Filename | Type | Size | Tier | Est. Tokens | Status |\n" +
519
+ "|---|----------|------|------|------|-------------|--------|\n";
520
+ const rows = files.map((f, i) => `| ${i + 1} | ${f.filename} | ${f.detectedType} | ${this.formatSize(f.sizeBytes)} | ` +
521
+ `${f.sizeTier} | ~${f.estimatedTokens.toLocaleString()} | ${f.status} |`);
522
+ return header + rows.join("\n");
523
+ }
524
+ /**
525
+ * Read a section of a registered file.
526
+ *
527
+ * Uses StreamingReader for memory-efficient access.
528
+ *
529
+ * @param fileId - File reference ID
530
+ * @param startLine - Starting line (1-indexed)
531
+ * @param endLine - Ending line (1-indexed)
532
+ * @param tokenBudget - Maximum tokens to return
533
+ * @param provider - Provider name for token estimation
534
+ * @returns FileReadResult
535
+ */
536
+ async readSection(fileId, startLine = 1, endLine, tokenBudget = 50_000, provider) {
537
+ const ref = this.get(fileId);
538
+ if (!ref) {
539
+ throw new Error(`File reference not found: ${fileId}`);
540
+ }
541
+ // Process binary files on first read — the lazy registration path
542
+ // stores raw binary to temp but never runs processors. We must process
543
+ // on-demand so the LLM gets extracted text, not garbled binary.
544
+ if (!ref.processedContent && !this.isTextType(ref.detectedType)) {
545
+ await this.processFileOnDemand(ref);
546
+ }
547
+ // If content is already cached (or was just processed), use buffer reader
548
+ if (ref.processedContent) {
549
+ return StreamingReader.readFromBuffer(Buffer.from(ref.processedContent, "utf-8"), {
550
+ startLine,
551
+ endLine,
552
+ tokenBudget,
553
+ provider,
554
+ });
555
+ }
556
+ // If we have a temp path or original path, use streaming reader
557
+ // (text files that were not processed on-demand)
558
+ const filePath = ref.tempPath || ref.originalPath;
559
+ if (filePath) {
560
+ const result = await StreamingReader.readLines(filePath, {
561
+ startLine,
562
+ endLine,
563
+ tokenBudget,
564
+ provider,
565
+ });
566
+ // Cache total lines for future reference
567
+ if (!ref.totalLines) {
568
+ ref.totalLines = result.totalLines;
569
+ }
570
+ return result;
571
+ }
572
+ throw new Error(`No accessible content for file "${ref.filename}" (id: ${fileId})`);
573
+ }
574
+ /**
575
+ * Search within a registered file.
576
+ *
577
+ * @param fileId - File reference ID
578
+ * @param pattern - Search pattern (string or regex)
579
+ * @param maxMatches - Maximum matches to return
580
+ * @returns FileSearchResult
581
+ */
582
+ async search(fileId, pattern, maxMatches = 50) {
583
+ const ref = this.get(fileId);
584
+ if (!ref) {
585
+ throw new Error(`File reference not found: ${fileId}`);
586
+ }
587
+ // Process binary files on first search — same lazy processing as readSection().
588
+ // Without this, search would scan raw PDF/XLSX binary bytes for text patterns.
589
+ if (!ref.processedContent && !this.isTextType(ref.detectedType)) {
590
+ await this.processFileOnDemand(ref);
591
+ }
592
+ // Search in processedContent if available (binary files after on-demand processing, or tiny files)
593
+ if (ref.processedContent) {
594
+ return FileReferenceRegistry.searchInMemory(ref.processedContent, pattern, maxMatches);
595
+ }
596
+ // For text files: use streaming search on the raw temp file (content IS valid UTF-8)
597
+ const filePath = ref.tempPath || ref.originalPath;
598
+ if (filePath) {
599
+ return StreamingReader.searchInFile(filePath, pattern, {
600
+ maxMatches,
601
+ });
602
+ }
603
+ throw new Error(`No searchable content for file "${ref.filename}" (id: ${fileId})`);
604
+ }
605
+ /**
606
+ * Search within in-memory content (for tiny files without temp paths).
607
+ */
608
+ static searchInMemory(content, pattern, maxMatches) {
609
+ const regex = new RegExp(pattern.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"), "i");
610
+ const lines = content.split("\n");
611
+ const matches = [];
612
+ let totalMatches = 0;
613
+ for (let i = 0; i < lines.length; i++) {
614
+ if (regex.test(lines[i])) {
615
+ totalMatches++;
616
+ if (matches.length < maxMatches) {
617
+ matches.push({
618
+ lineNumber: i + 1,
619
+ line: lines[i],
620
+ contextBefore: lines.slice(Math.max(0, i - 3), i),
621
+ contextAfter: lines.slice(i + 1, Math.min(lines.length, i + 4)),
622
+ });
623
+ }
624
+ }
625
+ }
626
+ return {
627
+ matches,
628
+ totalMatches,
629
+ truncated: totalMatches > maxMatches,
630
+ };
631
+ }
632
+ /**
633
+ * Store a summary for a file reference.
634
+ */
635
+ setSummary(fileId, summary) {
636
+ const ref = this.files.get(fileId);
637
+ if (ref) {
638
+ ref.summary = summary;
639
+ ref.status = "processed";
640
+ ref.lastAccessedAt = Date.now();
641
+ }
642
+ }
643
+ /**
644
+ * Remove a file reference and clean up its temp file.
645
+ */
646
+ async remove(fileId) {
647
+ const ref = this.files.get(fileId);
648
+ if (!ref) {
649
+ return false;
650
+ }
651
+ // Clean up temp file (only if we created it, not for original paths)
652
+ if (ref.tempPath && ref.source !== "path") {
653
+ try {
654
+ await unlink(ref.tempPath);
655
+ this.currentTempBytes -= ref.sizeBytes;
656
+ }
657
+ catch {
658
+ // Temp file may already be cleaned up
659
+ }
660
+ }
661
+ this.files.delete(fileId);
662
+ return true;
663
+ }
664
+ /**
665
+ * Clear all file references and clean up temp directory.
666
+ */
667
+ async clear() {
668
+ const ids = Array.from(this.files.keys());
669
+ for (const id of ids) {
670
+ await this.remove(id);
671
+ }
672
+ this.files.clear();
673
+ this.currentTempBytes = 0;
674
+ }
675
+ /**
676
+ * Get the number of registered files.
677
+ */
678
+ get size() {
679
+ return this.files.size;
680
+ }
681
+ /**
682
+ * Generate the preview text for the initial prompt.
683
+ *
684
+ * Returns a compact summary of all registered files that uses ~50-100 tokens
685
+ * per file instead of full content. The LLM can use file tools to access
686
+ * more content as needed.
687
+ *
688
+ * @returns Formatted string for prompt injection
689
+ */
690
+ async generatePromptPreview() {
691
+ const files = this.list();
692
+ if (files.length === 0) {
693
+ return "";
694
+ }
695
+ // Ensure binary files are processed so previews contain real content
696
+ // (e.g., video metadata, audio tags) instead of placeholder strings.
697
+ for (const ref of files) {
698
+ if (!ref.processedContent && !this.isTextType(ref.detectedType)) {
699
+ await this.processFileOnDemand(ref);
700
+ }
701
+ }
702
+ const sections = [];
703
+ sections.push(`\n\n## Attached Files (${files.length})\n`);
704
+ for (const ref of files) {
705
+ const sizeStr = this.formatSize(ref.sizeBytes);
706
+ sections.push(`### File: "${ref.filename}" (${sizeStr}, ${ref.detectedType})`);
707
+ if (ref.sizeTier === "tiny" && ref.processedContent) {
708
+ // Tiny files: include full content inline
709
+ sections.push(ref.processedContent);
710
+ }
711
+ else {
712
+ // Larger files: include preview + guidance
713
+ sections.push(`**Preview** (first ${this.defaultPreviewChars} chars):`);
714
+ sections.push(ref.preview);
715
+ // Add type-specific extraction hints
716
+ const hint = FileReferenceRegistry.getExtractionHint(ref.detectedType, sizeStr);
717
+ if (hint) {
718
+ sections.push(`\n> ${hint}`);
719
+ }
720
+ else if (ref.sizeTier !== "small") {
721
+ sections.push(`\n> This file is ${sizeStr}. Use \`read_file_section\` to read specific ` +
722
+ `sections, \`search_in_file\` to search, or \`summarize_file\` for a full summary.`);
723
+ }
724
+ }
725
+ sections.push(""); // blank line between files
726
+ }
727
+ return sections.join("\n");
728
+ }
729
+ // ─── Private Methods ────────────────────────────────────────────
730
+ /**
731
+ * Get type-specific extraction hints for the LLM prompt.
732
+ * Tells the LLM what parameters it can use with extract_file_content.
733
+ */
734
+ static getExtractionHint(type, sizeStr) {
735
+ switch (type) {
736
+ case "video":
737
+ return (`This video is ${sizeStr}. Use \`extract_file_content\` with \`start_time\`/\`end_time\` ` +
738
+ `to get frames from specific time ranges (e.g., start_time=5, end_time=10, frame_count=3). ` +
739
+ `Initial keyframes are already provided above.`);
740
+ case "pdf":
741
+ return (`This PDF is ${sizeStr}. Use \`extract_file_content\` with \`pages\` (e.g., [1, 3, 5]) ` +
742
+ `or \`page_range\` (e.g., {start: 1, end: 10}) to get specific pages. ` +
743
+ `Use \`read_file_section\` for line-range access or \`search_in_file\` to search.`);
744
+ case "xlsx":
745
+ return (`This spreadsheet is ${sizeStr}. Use \`extract_file_content\` with \`sheet\` (name or index), ` +
746
+ `\`row_range\` (e.g., {start: 1, end: 50}), and \`columns\` (e.g., ["A", "B", "D"]) ` +
747
+ `for targeted data extraction.`);
748
+ case "pptx":
749
+ return (`This presentation is ${sizeStr}. Use \`extract_file_content\` with \`pages\` ` +
750
+ `(e.g., [1, 3, 5]) to extract specific slides.`);
751
+ case "archive":
752
+ return (`This archive is ${sizeStr}. Use \`extract_file_content\` with \`entry_path\` ` +
753
+ `(e.g., "src/index.ts") to extract a specific file from the archive.`);
754
+ case "audio":
755
+ return (`This audio file is ${sizeStr}. Metadata is shown above. ` +
756
+ `Use \`read_file_section\` or \`search_in_file\` for text-based access.`);
757
+ default:
758
+ return null;
759
+ }
760
+ }
761
+ /**
762
+ * Classify a file into a size tier based on byte size.
763
+ */
764
+ static classifySizeTier(sizeBytes) {
765
+ if (sizeBytes <= SIZE_TIER_THRESHOLDS.TINY_MAX) {
766
+ return "tiny";
767
+ }
768
+ if (sizeBytes <= SIZE_TIER_THRESHOLDS.SMALL_MAX) {
769
+ return "small";
770
+ }
771
+ if (sizeBytes <= SIZE_TIER_THRESHOLDS.MEDIUM_MAX) {
772
+ return "medium";
773
+ }
774
+ if (sizeBytes <= SIZE_TIER_THRESHOLDS.LARGE_MAX) {
775
+ return "large";
776
+ }
777
+ if (sizeBytes <= SIZE_TIER_THRESHOLDS.HUGE_MAX) {
778
+ return "huge";
779
+ }
780
+ return "oversized";
781
+ }
782
+ /**
783
+ * Process a binary file on-demand, extracting text content via the
784
+ * appropriate processor. This bridges the gap between the lazy registration
785
+ * path (which stores raw binary) and the LLM read tools (which need text).
786
+ *
787
+ * Called lazily on first readSection() or search() for non-text files.
788
+ * Results are cached in ref.processedContent for subsequent reads.
789
+ */
790
+ async processFileOnDemand(ref) {
791
+ // Prevent concurrent processing of the same file
792
+ if (ref.status === "processing") {
793
+ return;
794
+ }
795
+ ref.status = "processing";
796
+ try {
797
+ const buffer = ref.tempPath ? await readFile(ref.tempPath) : null;
798
+ if (!buffer) {
799
+ ref.status = "error";
800
+ logger.warn(`[FileReferenceRegistry] No buffer available for on-demand processing: "${ref.filename}"`);
801
+ return;
802
+ }
803
+ let extractedText = null;
804
+ switch (ref.detectedType) {
805
+ case "pdf":
806
+ extractedText = await this.extractPdfText(buffer);
807
+ break;
808
+ case "xlsx":
809
+ extractedText = await this.extractExcelText(buffer, ref);
810
+ break;
811
+ case "docx":
812
+ extractedText = await this.extractWordText(buffer, ref);
813
+ break;
814
+ case "pptx":
815
+ extractedText = await this.extractPptxText(buffer);
816
+ break;
817
+ case "video":
818
+ extractedText = await this.extractVideoContent(buffer, ref);
819
+ break;
820
+ case "audio":
821
+ extractedText = await this.extractAudioContent(buffer, ref);
822
+ break;
823
+ case "archive":
824
+ extractedText = await this.extractArchiveContent(buffer, ref);
825
+ break;
826
+ default:
827
+ // For unknown binary types, provide a descriptive fallback
828
+ extractedText =
829
+ `[Binary file: ${ref.filename}, ${this.formatSize(ref.sizeBytes)}, type: ${ref.detectedType}]\n` +
830
+ `This file could not be processed into text content.`;
831
+ break;
832
+ }
833
+ if (extractedText) {
834
+ ref.processedContent = extractedText;
835
+ ref.status = "processed";
836
+ // Update the preview with actual content instead of placeholder metadata
837
+ const previewChars = this.defaultPreviewChars;
838
+ if (extractedText.length <= previewChars) {
839
+ ref.preview = extractedText;
840
+ }
841
+ else {
842
+ const lastNewline = extractedText.lastIndexOf("\n", previewChars);
843
+ ref.preview =
844
+ lastNewline > previewChars * 0.8
845
+ ? extractedText.substring(0, lastNewline)
846
+ : extractedText.substring(0, previewChars) + "\n...[truncated]";
847
+ }
848
+ logger.info(`[FileReferenceRegistry] On-demand processed "${ref.filename}" ` +
849
+ `(${ref.detectedType}, ${this.formatSize(ref.sizeBytes)}) → ${extractedText.length} chars`);
850
+ }
851
+ else {
852
+ ref.processedContent =
853
+ `[${ref.detectedType.toUpperCase()} file: ${ref.filename}, ${this.formatSize(ref.sizeBytes)}]\n` +
854
+ `Content could not be extracted. The file may be corrupted or in an unsupported format.`;
855
+ ref.preview = ref.processedContent;
856
+ ref.status = "processed";
857
+ }
858
+ }
859
+ catch (err) {
860
+ const errorMsg = err instanceof Error ? err.message : String(err);
861
+ logger.warn(`[FileReferenceRegistry] On-demand processing failed for "${ref.filename}": ${errorMsg}`);
862
+ ref.processedContent =
863
+ `[Processing error for ${ref.filename}]\n` +
864
+ `Type: ${ref.detectedType}, Size: ${this.formatSize(ref.sizeBytes)}\n` +
865
+ `Error: ${errorMsg}`;
866
+ ref.preview = ref.processedContent;
867
+ ref.status = "error";
868
+ }
869
+ }
870
+ /**
871
+ * Extract text from a PDF buffer using pdf-parse v2 (pdfjs-dist under the hood).
872
+ *
873
+ * Handles compressed streams (FlateDecode), CMap-encoded text, modern PDFs,
874
+ * and most text-based PDF formats. For scanned/image-only PDFs where no text
875
+ * can be extracted, falls back to a descriptive message.
876
+ */
877
+ async extractPdfText(buffer) {
878
+ try {
879
+ const { PDFParse } = await import("pdf-parse");
880
+ const pdf = new PDFParse({
881
+ data: new Uint8Array(buffer),
882
+ });
883
+ try {
884
+ const textResult = await pdf.getText({
885
+ // Limit to first 100 pages to avoid unbounded processing
886
+ last: 100,
887
+ });
888
+ const text = textResult.text?.trim();
889
+ if (!text || text.length === 0) {
890
+ // No text found — likely a scanned/image-only PDF
891
+ const pageCount = textResult.total || 0;
892
+ return (`[PDF document: ${this.formatSize(buffer.length)}, ${pageCount} page(s)]\n` +
893
+ `This PDF appears to contain scanned images or non-extractable content.\n` +
894
+ `Text could not be extracted from the document. The content may consist of:\n` +
895
+ `- Scanned pages (images of text, not searchable text)\n` +
896
+ `- Forms or graphical content\n` +
897
+ `- Protected/encrypted content`);
898
+ }
899
+ // Clean up excessive blank lines
900
+ const cleaned = text.replace(/\n{3,}/g, "\n\n");
901
+ return cleaned;
902
+ }
903
+ finally {
904
+ // Always clean up the PDF instance to free pdfjs-dist resources
905
+ await pdf.destroy().catch(() => {
906
+ /* cleanup - ignore destroy errors */
907
+ });
908
+ }
909
+ }
910
+ catch (err) {
911
+ logger.warn(`[FileReferenceRegistry] PDF text extraction failed: ${err instanceof Error ? err.message : String(err)}`);
912
+ return null;
913
+ }
914
+ }
915
+ /**
916
+ * Extract text content from an Excel file using ExcelProcessor.
917
+ */
918
+ async extractExcelText(buffer, ref) {
919
+ try {
920
+ const { processExcel } = await import("../processors/document/ExcelProcessor.js");
921
+ const result = await processExcel({
922
+ id: ref.id,
923
+ name: ref.filename,
924
+ mimetype: ref.mimeType,
925
+ size: ref.sizeBytes,
926
+ buffer,
927
+ });
928
+ if (!result.success || !result.data) {
929
+ return null;
930
+ }
931
+ // Format worksheets as TSV text for LLM consumption
932
+ const worksheets = result.data.worksheets;
933
+ if (worksheets && worksheets.length > 0) {
934
+ const sections = [];
935
+ for (const ws of worksheets) {
936
+ sections.push(`## Sheet: ${ws.name}`);
937
+ if (ws.headers.length > 0) {
938
+ sections.push(ws.headers.join("\t"));
939
+ }
940
+ for (const row of ws.rows) {
941
+ sections.push(row.map((cell) => (cell === null ? "" : String(cell))).join("\t"));
942
+ }
943
+ sections.push("");
944
+ }
945
+ return sections.join("\n");
946
+ }
947
+ return null;
948
+ }
949
+ catch (err) {
950
+ logger.warn(`[FileReferenceRegistry] Excel extraction failed: ${err instanceof Error ? err.message : String(err)}`);
951
+ return null;
952
+ }
953
+ }
954
+ /**
955
+ * Extract text content from a Word document using WordProcessor.
956
+ */
957
+ async extractWordText(buffer, ref) {
958
+ try {
959
+ const { processWord } = await import("../processors/document/WordProcessor.js");
960
+ const result = await processWord({
961
+ id: ref.id,
962
+ name: ref.filename,
963
+ mimetype: ref.mimeType,
964
+ size: ref.sizeBytes,
965
+ buffer,
966
+ });
967
+ if (!result.success || !result.data) {
968
+ return null;
969
+ }
970
+ return result.data.textContent || null;
971
+ }
972
+ catch (err) {
973
+ logger.warn(`[FileReferenceRegistry] Word extraction failed: ${err instanceof Error ? err.message : String(err)}`);
974
+ return null;
975
+ }
976
+ }
977
+ /**
978
+ * Extract text from a PowerPoint file using PptxProcessor.
979
+ */
980
+ async extractPptxText(buffer) {
981
+ try {
982
+ const { PptxProcessor } = await import("../processors/document/PptxProcessor.js");
983
+ return await PptxProcessor.extractText(buffer);
984
+ }
985
+ catch (err) {
986
+ logger.warn(`[FileReferenceRegistry] PPTX extraction failed: ${err instanceof Error ? err.message : String(err)}`);
987
+ return null;
988
+ }
989
+ }
990
+ /**
991
+ * Extract metadata and content from a video file using VideoProcessor.
992
+ */
993
+ async extractVideoContent(buffer, ref) {
994
+ try {
995
+ const { processVideo } = await import("../processors/media/VideoProcessor.js");
996
+ const result = await processVideo({
997
+ id: ref.id,
998
+ name: ref.filename,
999
+ mimetype: ref.mimeType,
1000
+ size: ref.sizeBytes,
1001
+ buffer,
1002
+ });
1003
+ if (!result.success || !result.data) {
1004
+ return null;
1005
+ }
1006
+ // Store keyframe images on the reference for injection into the prompt
1007
+ if (result.data.keyframes && result.data.keyframes.length > 0) {
1008
+ ref.extractedImages = result.data.keyframes;
1009
+ logger.info(`[FileReferenceRegistry] Extracted ${result.data.keyframes.length} keyframes from "${ref.filename}"`);
1010
+ }
1011
+ return result.data.textContent || null;
1012
+ }
1013
+ catch (err) {
1014
+ logger.warn(`[FileReferenceRegistry] Video extraction failed: ${err instanceof Error ? err.message : String(err)}`);
1015
+ // Provide basic metadata even on failure
1016
+ return (`[Video file: ${ref.filename}, ${this.formatSize(ref.sizeBytes)}]\n` +
1017
+ `Video processing requires ffmpeg/ffprobe. Metadata could not be extracted.\n` +
1018
+ `Error: ${err instanceof Error ? err.message : String(err)}`);
1019
+ }
1020
+ }
1021
+ /**
1022
+ * Extract metadata and content from an audio file using AudioProcessor.
1023
+ */
1024
+ async extractAudioContent(buffer, ref) {
1025
+ try {
1026
+ const { processAudio } = await import("../processors/media/AudioProcessor.js");
1027
+ const result = await processAudio({
1028
+ id: ref.id,
1029
+ name: ref.filename,
1030
+ mimetype: ref.mimeType,
1031
+ size: ref.sizeBytes,
1032
+ buffer,
1033
+ });
1034
+ if (!result.success || !result.data) {
1035
+ return null;
1036
+ }
1037
+ return result.data.textContent || null;
1038
+ }
1039
+ catch (err) {
1040
+ logger.warn(`[FileReferenceRegistry] Audio extraction failed: ${err instanceof Error ? err.message : String(err)}`);
1041
+ return (`[Audio file: ${ref.filename}, ${this.formatSize(ref.sizeBytes)}]\n` +
1042
+ `Audio processing failed. Error: ${err instanceof Error ? err.message : String(err)}`);
1043
+ }
1044
+ }
1045
+ /**
1046
+ * Extract file listing from an archive using ArchiveProcessor.
1047
+ */
1048
+ async extractArchiveContent(buffer, ref) {
1049
+ try {
1050
+ const { processArchive } = await import("../processors/archive/ArchiveProcessor.js");
1051
+ const result = await processArchive({
1052
+ id: ref.id,
1053
+ name: ref.filename,
1054
+ mimetype: ref.mimeType,
1055
+ size: ref.sizeBytes,
1056
+ buffer,
1057
+ });
1058
+ if (!result.success || !result.data) {
1059
+ return null;
1060
+ }
1061
+ return result.data.textContent || null;
1062
+ }
1063
+ catch (err) {
1064
+ logger.warn(`[FileReferenceRegistry] Archive extraction failed: ${err instanceof Error ? err.message : String(err)}`);
1065
+ return null;
1066
+ }
1067
+ }
1068
+ /**
1069
+ * Extract a preview from a buffer.
1070
+ * For text: first N characters.
1071
+ * For binary: type-specific metadata.
1072
+ */
1073
+ extractPreview(buffer, type, maxChars) {
1074
+ if (this.isTextType(type, buffer)) {
1075
+ // Text-based: extract first N characters
1076
+ const text = buffer.toString("utf-8", 0, Math.min(buffer.length, maxChars + 100));
1077
+ if (text.length <= maxChars) {
1078
+ return text;
1079
+ }
1080
+ // Break at line boundary
1081
+ const lastNewline = text.lastIndexOf("\n", maxChars);
1082
+ if (lastNewline > maxChars * 0.8) {
1083
+ return text.substring(0, lastNewline);
1084
+ }
1085
+ return text.substring(0, maxChars) + "\n...[truncated]";
1086
+ }
1087
+ // Binary types: type-specific preview
1088
+ const sizeMB = (buffer.length / (1024 * 1024)).toFixed(2);
1089
+ switch (type) {
1090
+ case "image":
1091
+ return `[Image file: ${sizeMB} MB]`;
1092
+ case "video":
1093
+ return `[Video file: ${sizeMB} MB — use read tools for metadata/keyframes]`;
1094
+ case "audio":
1095
+ return `[Audio file: ${sizeMB} MB — use read tools for metadata/transcript]`;
1096
+ case "archive":
1097
+ return `[Archive file: ${sizeMB} MB — use read tools for file listing]`;
1098
+ case "pdf":
1099
+ return `[PDF document: ${sizeMB} MB — use read tools for page content]`;
1100
+ default:
1101
+ return `[Binary file: ${sizeMB} MB, type: ${type}]`;
1102
+ }
1103
+ }
1104
+ /**
1105
+ * Detect file type from buffer magic bytes and extension.
1106
+ */
1107
+ detectType(buffer, ext) {
1108
+ // Check magic bytes first
1109
+ if (buffer.length >= 4) {
1110
+ const header = buffer.subarray(0, 8);
1111
+ // PNG: 89 50 4E 47
1112
+ if (header[0] === 0x89 &&
1113
+ header[1] === 0x50 &&
1114
+ header[2] === 0x4e &&
1115
+ header[3] === 0x47) {
1116
+ return "image";
1117
+ }
1118
+ // JPEG: FF D8 FF
1119
+ if (header[0] === 0xff && header[1] === 0xd8 && header[2] === 0xff) {
1120
+ return "image";
1121
+ }
1122
+ // GIF: 47 49 46
1123
+ if (header[0] === 0x47 && header[1] === 0x49 && header[2] === 0x46) {
1124
+ return "image";
1125
+ }
1126
+ // WebP: 52 49 46 46 ... 57 45 42 50
1127
+ if (header[0] === 0x52 &&
1128
+ header[1] === 0x49 &&
1129
+ header[2] === 0x46 &&
1130
+ header[3] === 0x46 &&
1131
+ buffer.length >= 12 &&
1132
+ buffer[8] === 0x57 &&
1133
+ buffer[9] === 0x45 &&
1134
+ buffer[10] === 0x42 &&
1135
+ buffer[11] === 0x50) {
1136
+ return "image";
1137
+ }
1138
+ // PDF: 25 50 44 46
1139
+ if (header[0] === 0x25 &&
1140
+ header[1] === 0x50 &&
1141
+ header[2] === 0x44 &&
1142
+ header[3] === 0x46) {
1143
+ return "pdf";
1144
+ }
1145
+ // ZIP (and derivatives: xlsx, docx, pptx)
1146
+ if (header[0] === 0x50 && header[1] === 0x4b) {
1147
+ // Differentiate by extension
1148
+ if (ext === "xlsx") {
1149
+ return "xlsx";
1150
+ }
1151
+ if (ext === "docx") {
1152
+ return "docx";
1153
+ }
1154
+ if (ext === "pptx") {
1155
+ return "pptx";
1156
+ }
1157
+ return "archive";
1158
+ }
1159
+ // MP4/M4A: ftyp
1160
+ if (buffer.length >= 8 &&
1161
+ buffer[4] === 0x66 &&
1162
+ buffer[5] === 0x74 &&
1163
+ buffer[6] === 0x79 &&
1164
+ buffer[7] === 0x70) {
1165
+ if (["m4a", "aac"].includes(ext)) {
1166
+ return "audio";
1167
+ }
1168
+ return "video";
1169
+ }
1170
+ // ID3 (MP3): 49 44 33
1171
+ if (header[0] === 0x49 && header[1] === 0x44 && header[2] === 0x33) {
1172
+ return "audio";
1173
+ }
1174
+ // OGG: 4F 67 67 53
1175
+ if (header[0] === 0x4f &&
1176
+ header[1] === 0x67 &&
1177
+ header[2] === 0x67 &&
1178
+ header[3] === 0x53) {
1179
+ return "audio";
1180
+ }
1181
+ // FLAC: 66 4C 61 43
1182
+ if (header[0] === 0x66 &&
1183
+ header[1] === 0x4c &&
1184
+ header[2] === 0x61 &&
1185
+ header[3] === 0x43) {
1186
+ return "audio";
1187
+ }
1188
+ // WAV: 52 49 46 46 ... 57 41 56 45
1189
+ if (header[0] === 0x52 &&
1190
+ header[1] === 0x49 &&
1191
+ header[2] === 0x46 &&
1192
+ header[3] === 0x46 &&
1193
+ buffer.length >= 12 &&
1194
+ buffer[8] === 0x57 &&
1195
+ buffer[9] === 0x41 &&
1196
+ buffer[10] === 0x56 &&
1197
+ buffer[11] === 0x45) {
1198
+ return "audio";
1199
+ }
1200
+ // MKV/WebM: 1A 45 DF A3
1201
+ if (header[0] === 0x1a &&
1202
+ header[1] === 0x45 &&
1203
+ header[2] === 0xdf &&
1204
+ header[3] === 0xa3) {
1205
+ if (ext === "webm") {
1206
+ return "video";
1207
+ }
1208
+ return "video";
1209
+ }
1210
+ // AVI: 52 49 46 46 ... 41 56 49 20
1211
+ if (header[0] === 0x52 &&
1212
+ header[1] === 0x49 &&
1213
+ header[2] === 0x46 &&
1214
+ header[3] === 0x46 &&
1215
+ buffer.length >= 12 &&
1216
+ buffer[8] === 0x41 &&
1217
+ buffer[9] === 0x56 &&
1218
+ buffer[10] === 0x49 &&
1219
+ buffer[11] === 0x20) {
1220
+ return "video";
1221
+ }
1222
+ }
1223
+ // Fall back to extension
1224
+ return this.detectTypeFromExtension(ext);
1225
+ }
1226
+ /**
1227
+ * Detect file type from extension alone.
1228
+ */
1229
+ detectTypeFromExtension(ext) {
1230
+ const extensionMap = {
1231
+ // Images
1232
+ png: "image",
1233
+ jpg: "image",
1234
+ jpeg: "image",
1235
+ gif: "image",
1236
+ webp: "image",
1237
+ bmp: "image",
1238
+ tiff: "image",
1239
+ ico: "image",
1240
+ // Video
1241
+ mp4: "video",
1242
+ mkv: "video",
1243
+ webm: "video",
1244
+ avi: "video",
1245
+ mov: "video",
1246
+ m4v: "video",
1247
+ // Audio
1248
+ mp3: "audio",
1249
+ wav: "audio",
1250
+ ogg: "audio",
1251
+ flac: "audio",
1252
+ aac: "audio",
1253
+ m4a: "audio",
1254
+ wma: "audio",
1255
+ // Documents
1256
+ pdf: "pdf",
1257
+ docx: "docx",
1258
+ pptx: "pptx",
1259
+ xlsx: "xlsx",
1260
+ // Data
1261
+ csv: "csv",
1262
+ tsv: "csv",
1263
+ // Markup
1264
+ svg: "svg",
1265
+ // Archives
1266
+ zip: "archive",
1267
+ tar: "archive",
1268
+ gz: "archive",
1269
+ tgz: "archive",
1270
+ "7z": "archive",
1271
+ rar: "archive",
1272
+ // Text & Code
1273
+ txt: "text",
1274
+ md: "text",
1275
+ log: "text",
1276
+ json: "text",
1277
+ yaml: "text",
1278
+ yml: "text",
1279
+ xml: "text",
1280
+ html: "text",
1281
+ htm: "text",
1282
+ css: "text",
1283
+ js: "text",
1284
+ ts: "text",
1285
+ jsx: "text",
1286
+ tsx: "text",
1287
+ py: "text",
1288
+ java: "text",
1289
+ go: "text",
1290
+ rs: "text",
1291
+ rb: "text",
1292
+ php: "text",
1293
+ c: "text",
1294
+ cpp: "text",
1295
+ h: "text",
1296
+ cs: "text",
1297
+ swift: "text",
1298
+ kt: "text",
1299
+ scala: "text",
1300
+ sql: "text",
1301
+ sh: "text",
1302
+ bash: "text",
1303
+ zsh: "text",
1304
+ toml: "text",
1305
+ ini: "text",
1306
+ cfg: "text",
1307
+ env: "text",
1308
+ dockerfile: "text",
1309
+ makefile: "text",
1310
+ };
1311
+ return extensionMap[ext.toLowerCase()] || "unknown";
1312
+ }
1313
+ /**
1314
+ * Whether a file type contains readable text content.
1315
+ * For "unknown" types, optionally checks the buffer for valid UTF-8 text.
1316
+ */
1317
+ isTextType(type, buffer) {
1318
+ if (["text", "csv", "svg"].includes(type)) {
1319
+ return true;
1320
+ }
1321
+ // For unknown types, heuristically check if the buffer is likely text
1322
+ if (type === "unknown" && buffer && buffer.length > 0) {
1323
+ return FileReferenceRegistry.looksLikeText(buffer);
1324
+ }
1325
+ return false;
1326
+ }
1327
+ /**
1328
+ * Heuristic check: does a buffer look like valid text content?
1329
+ * Checks the first 512 bytes for mostly printable ASCII/UTF-8 characters.
1330
+ * Returns true if >90% of bytes are printable (ASCII 0x20-0x7E, tab, newline, CR).
1331
+ */
1332
+ static looksLikeText(buffer) {
1333
+ const sampleSize = Math.min(buffer.length, 512);
1334
+ let printable = 0;
1335
+ for (let i = 0; i < sampleSize; i++) {
1336
+ const b = buffer[i];
1337
+ // Printable ASCII, tab, newline, carriage return, or high bytes (UTF-8 multibyte)
1338
+ if ((b >= 0x20 && b <= 0x7e) ||
1339
+ b === 0x09 ||
1340
+ b === 0x0a ||
1341
+ b === 0x0d ||
1342
+ b >= 0x80) {
1343
+ printable++;
1344
+ }
1345
+ }
1346
+ return printable / sampleSize > 0.9;
1347
+ }
1348
+ /**
1349
+ * Guess MIME type from file type and extension.
1350
+ */
1351
+ guessMimeType(type, ext) {
1352
+ const mimeMap = {
1353
+ // By file type
1354
+ csv: "text/csv",
1355
+ svg: "image/svg+xml",
1356
+ pdf: "application/pdf",
1357
+ docx: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
1358
+ pptx: "application/vnd.openxmlformats-officedocument.presentationml.presentation",
1359
+ xlsx: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
1360
+ video: "video/mp4",
1361
+ audio: "audio/mpeg",
1362
+ archive: "application/zip",
1363
+ image: "image/png",
1364
+ };
1365
+ if (mimeMap[type]) {
1366
+ return mimeMap[type];
1367
+ }
1368
+ // By extension
1369
+ const extMime = {
1370
+ png: "image/png",
1371
+ jpg: "image/jpeg",
1372
+ jpeg: "image/jpeg",
1373
+ gif: "image/gif",
1374
+ webp: "image/webp",
1375
+ mp4: "video/mp4",
1376
+ mkv: "video/x-matroska",
1377
+ webm: "video/webm",
1378
+ avi: "video/x-msvideo",
1379
+ mov: "video/quicktime",
1380
+ mp3: "audio/mpeg",
1381
+ wav: "audio/wav",
1382
+ ogg: "audio/ogg",
1383
+ flac: "audio/flac",
1384
+ json: "application/json",
1385
+ xml: "application/xml",
1386
+ html: "text/html",
1387
+ css: "text/css",
1388
+ js: "text/javascript",
1389
+ ts: "text/typescript",
1390
+ py: "text/x-python",
1391
+ zip: "application/zip",
1392
+ tar: "application/x-tar",
1393
+ gz: "application/gzip",
1394
+ };
1395
+ return extMime[ext.toLowerCase()] || "application/octet-stream";
1396
+ }
1397
+ /**
1398
+ * Guess file extension from magic bytes.
1399
+ */
1400
+ guessExtension(buffer) {
1401
+ if (buffer.length < 4) {
1402
+ return "";
1403
+ }
1404
+ if (buffer[0] === 0x89 && buffer[1] === 0x50) {
1405
+ return ".png";
1406
+ }
1407
+ if (buffer[0] === 0xff && buffer[1] === 0xd8) {
1408
+ return ".jpg";
1409
+ }
1410
+ if (buffer[0] === 0x25 && buffer[1] === 0x50) {
1411
+ return ".pdf";
1412
+ }
1413
+ if (buffer[0] === 0x50 && buffer[1] === 0x4b) {
1414
+ return ".zip";
1415
+ }
1416
+ if (buffer[0] === 0x49 && buffer[1] === 0x44) {
1417
+ return ".mp3";
1418
+ }
1419
+ // MP4/MOV/M4V — ftyp atom at offset 4
1420
+ if (buffer.length >= 8 &&
1421
+ buffer[4] === 0x66 &&
1422
+ buffer[5] === 0x74 &&
1423
+ buffer[6] === 0x79 &&
1424
+ buffer[7] === 0x70) {
1425
+ // Check the brand to distinguish MOV vs MP4
1426
+ const brand = buffer.toString("ascii", 8, 12);
1427
+ if (brand === "qt ") {
1428
+ return ".mov";
1429
+ }
1430
+ return ".mp4";
1431
+ }
1432
+ // MKV/WebM — EBML header (0x1A 0x45 0xDF 0xA3)
1433
+ if (buffer.length >= 4 &&
1434
+ buffer[0] === 0x1a &&
1435
+ buffer[1] === 0x45 &&
1436
+ buffer[2] === 0xdf &&
1437
+ buffer[3] === 0xa3) {
1438
+ return ".mkv";
1439
+ }
1440
+ // AVI — RIFF....AVI
1441
+ if (buffer.length >= 12 &&
1442
+ buffer[0] === 0x52 &&
1443
+ buffer[1] === 0x49 &&
1444
+ buffer[2] === 0x46 &&
1445
+ buffer[3] === 0x46 &&
1446
+ buffer[8] === 0x41 &&
1447
+ buffer[9] === 0x56 &&
1448
+ buffer[10] === 0x49) {
1449
+ return ".avi";
1450
+ }
1451
+ // WAV — RIFF....WAVE
1452
+ if (buffer.length >= 12 &&
1453
+ buffer[0] === 0x52 &&
1454
+ buffer[1] === 0x49 &&
1455
+ buffer[2] === 0x46 &&
1456
+ buffer[3] === 0x46 &&
1457
+ buffer[8] === 0x57 &&
1458
+ buffer[9] === 0x41 &&
1459
+ buffer[10] === 0x56 &&
1460
+ buffer[11] === 0x45) {
1461
+ return ".wav";
1462
+ }
1463
+ // FLAC
1464
+ if (buffer.length >= 4 &&
1465
+ buffer[0] === 0x66 &&
1466
+ buffer[1] === 0x4c &&
1467
+ buffer[2] === 0x61 &&
1468
+ buffer[3] === 0x43) {
1469
+ return ".flac";
1470
+ }
1471
+ // OGG
1472
+ if (buffer.length >= 4 &&
1473
+ buffer[0] === 0x4f &&
1474
+ buffer[1] === 0x67 &&
1475
+ buffer[2] === 0x67 &&
1476
+ buffer[3] === 0x53) {
1477
+ return ".ogg";
1478
+ }
1479
+ return "";
1480
+ }
1481
+ /**
1482
+ * Persist a buffer to the temp directory.
1483
+ */
1484
+ async persistToTemp(id, buffer, ext) {
1485
+ // Check temp space budget
1486
+ if (this.currentTempBytes + buffer.length > this.maxTempBytes) {
1487
+ // Try evicting oldest files
1488
+ this.evictLRU();
1489
+ if (this.currentTempBytes + buffer.length > this.maxTempBytes) {
1490
+ throw new Error(`Temp directory budget exceeded (${this.formatSize(this.maxTempBytes)})`);
1491
+ }
1492
+ }
1493
+ // Ensure temp directory exists
1494
+ if (!this.tempDirCreated) {
1495
+ await mkdir(this.tempDir, { recursive: true });
1496
+ this.tempDirCreated = true;
1497
+ }
1498
+ const tempPath = join(this.tempDir, `${id}${ext ? `.${ext}` : ""}`);
1499
+ await writeFile(tempPath, buffer);
1500
+ this.currentTempBytes += buffer.length;
1501
+ return tempPath;
1502
+ }
1503
+ /**
1504
+ * Evict the least recently used file reference.
1505
+ */
1506
+ evictLRU() {
1507
+ let oldest = null;
1508
+ let oldestId = null;
1509
+ for (const [id, ref] of this.files) {
1510
+ if (!oldest || ref.lastAccessedAt < oldest.lastAccessedAt) {
1511
+ oldest = ref;
1512
+ oldestId = id;
1513
+ }
1514
+ }
1515
+ if (oldestId && oldest) {
1516
+ logger.info(`[FileReferenceRegistry] Evicting LRU: "${oldest.filename}" ` +
1517
+ `(last accessed ${new Date(oldest.lastAccessedAt).toISOString()})`);
1518
+ // Clean up temp file if we created it
1519
+ if (oldest.tempPath && oldest.source !== "path") {
1520
+ unlink(oldest.tempPath).catch(() => {
1521
+ // Ignore cleanup errors
1522
+ });
1523
+ this.currentTempBytes -= oldest.sizeBytes;
1524
+ }
1525
+ this.files.delete(oldestId);
1526
+ }
1527
+ }
1528
+ /**
1529
+ * Format byte size as human-readable string.
1530
+ */
1531
+ formatSize(bytes) {
1532
+ if (bytes < 1024) {
1533
+ return `${bytes} B`;
1534
+ }
1535
+ if (bytes < 1024 * 1024) {
1536
+ return `${(bytes / 1024).toFixed(1)} KB`;
1537
+ }
1538
+ if (bytes < 1024 * 1024 * 1024) {
1539
+ return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
1540
+ }
1541
+ return `${(bytes / (1024 * 1024 * 1024)).toFixed(2)} GB`;
1542
+ }
1543
+ }