@juspay/neurolink 9.32.0 → 9.33.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (475)
  1. package/CHANGELOG.md +12 -0
  2. package/dist/auth/anthropicOAuth.js +1 -1
  3. package/dist/cli/commands/proxy.js +18 -5
  4. package/dist/client/aiSdkAdapter.js +1 -1
  5. package/dist/client/index.js +137 -501
  6. package/dist/core/factory.js +0 -1
  7. package/dist/core/redisConversationMemoryManager.js +1 -1
  8. package/dist/features/ppt/slideGenerator.js +0 -1
  9. package/dist/features/ppt/utils.js +0 -1
  10. package/dist/lib/neurolink.d.ts +10 -0
  11. package/dist/lib/neurolink.js +41 -7
  12. package/dist/lib/server/routes/claudeProxyRoutes.js +45 -9
  13. package/dist/lib/types/generateTypes.d.ts +16 -0
  14. package/dist/lib/types/streamTypes.d.ts +15 -0
  15. package/dist/mcp/elicitationProtocol.js +1 -1
  16. package/dist/mcp/servers/agent/directToolsServer.js +0 -1
  17. package/dist/neurolink.d.ts +10 -0
  18. package/dist/neurolink.js +41 -7
  19. package/dist/providers/azureOpenai.js +1 -1
  20. package/dist/providers/huggingFace.js +0 -1
  21. package/dist/providers/openaiCompatible.js +0 -1
  22. package/dist/sdk/toolRegistration.js +0 -1
  23. package/dist/server/openapi/generator.js +1 -1
  24. package/dist/server/routes/claudeProxyRoutes.js +45 -9
  25. package/dist/types/configTypes.js +0 -5
  26. package/dist/types/generateTypes.d.ts +16 -0
  27. package/dist/types/modelTypes.js +0 -1
  28. package/dist/types/streamTypes.d.ts +15 -0
  29. package/dist/types/tools.js +0 -1
  30. package/dist/types/typeAliases.js +0 -1
  31. package/dist/types/utilities.js +1 -1
  32. package/dist/types/workflowTypes.js +0 -1
  33. package/dist/utils/providerRetry.js +0 -1
  34. package/dist/utils/providerUtils.js +0 -1
  35. package/package.json +2 -2
  36. package/dist/client/adapters/providerImageAdapter.js +0 -588
  37. package/dist/client/adapters/tts/googleTTSHandler.js +0 -344
  38. package/dist/client/adapters/video/directorPipeline.js +0 -516
  39. package/dist/client/adapters/video/ffmpegAdapter.js +0 -206
  40. package/dist/client/adapters/video/frameExtractor.js +0 -143
  41. package/dist/client/adapters/video/vertexVideoHandler.js +0 -763
  42. package/dist/client/adapters/video/videoAnalyzer.js +0 -238
  43. package/dist/client/adapters/video/videoMerger.js +0 -171
  44. package/dist/client/agent/directTools.js +0 -840
  45. package/dist/client/auth/AuthProviderFactory.js +0 -111
  46. package/dist/client/auth/AuthProviderRegistry.js +0 -190
  47. package/dist/client/auth/RequestContext.js +0 -78
  48. package/dist/client/auth/accountPool.js +0 -178
  49. package/dist/client/auth/anthropicOAuth.js +0 -974
  50. package/dist/client/auth/authContext.js +0 -314
  51. package/dist/client/auth/errors.js +0 -39
  52. package/dist/client/auth/index.js +0 -61
  53. package/dist/client/auth/middleware/AuthMiddleware.js +0 -519
  54. package/dist/client/auth/middleware/rateLimitByUser.js +0 -554
  55. package/dist/client/auth/providers/BaseAuthProvider.js +0 -723
  56. package/dist/client/auth/providers/CognitoProvider.js +0 -304
  57. package/dist/client/auth/providers/KeycloakProvider.js +0 -393
  58. package/dist/client/auth/providers/auth0.js +0 -274
  59. package/dist/client/auth/providers/betterAuth.js +0 -182
  60. package/dist/client/auth/providers/clerk.js +0 -317
  61. package/dist/client/auth/providers/custom.js +0 -112
  62. package/dist/client/auth/providers/firebase.js +0 -226
  63. package/dist/client/auth/providers/jwt.js +0 -212
  64. package/dist/client/auth/providers/oauth2.js +0 -303
  65. package/dist/client/auth/providers/supabase.js +0 -259
  66. package/dist/client/auth/providers/workos.js +0 -284
  67. package/dist/client/auth/serverBridge.js +0 -25
  68. package/dist/client/auth/sessionManager.js +0 -437
  69. package/dist/client/auth/tokenStore.js +0 -799
  70. package/dist/client/client/aiSdkAdapter.js +0 -487
  71. package/dist/client/client/auth.js +0 -473
  72. package/dist/client/client/errors.js +0 -552
  73. package/dist/client/client/httpClient.js +0 -837
  74. package/dist/client/client/index.js +0 -172
  75. package/dist/client/client/interceptors.js +0 -601
  76. package/dist/client/client/sseClient.js +0 -545
  77. package/dist/client/client/streamingClient.js +0 -917
  78. package/dist/client/client/wsClient.js +0 -369
  79. package/dist/client/config/configManager.js +0 -303
  80. package/dist/client/config/conversationMemory.js +0 -86
  81. package/dist/client/config/taskClassificationConfig.js +0 -148
  82. package/dist/client/constants/contextWindows.js +0 -295
  83. package/dist/client/constants/enums.js +0 -853
  84. package/dist/client/constants/index.js +0 -207
  85. package/dist/client/constants/performance.js +0 -389
  86. package/dist/client/constants/retry.js +0 -266
  87. package/dist/client/constants/timeouts.js +0 -182
  88. package/dist/client/constants/tokens.js +0 -380
  89. package/dist/client/constants/videoErrors.js +0 -46
  90. package/dist/client/context/budgetChecker.js +0 -98
  91. package/dist/client/context/contextCompactor.js +0 -205
  92. package/dist/client/context/emergencyTruncation.js +0 -88
  93. package/dist/client/context/errorDetection.js +0 -171
  94. package/dist/client/context/errors.js +0 -21
  95. package/dist/client/context/fileTokenBudget.js +0 -127
  96. package/dist/client/context/prompts/summarizationPrompt.js +0 -117
  97. package/dist/client/context/stages/fileReadDeduplicator.js +0 -66
  98. package/dist/client/context/stages/slidingWindowTruncator.js +0 -190
  99. package/dist/client/context/stages/structuredSummarizer.js +0 -99
  100. package/dist/client/context/stages/toolOutputPruner.js +0 -52
  101. package/dist/client/context/summarizationEngine.js +0 -136
  102. package/dist/client/context/toolOutputLimits.js +0 -78
  103. package/dist/client/context/toolPairRepair.js +0 -66
  104. package/dist/client/core/analytics.js +0 -88
  105. package/dist/client/core/baseProvider.js +0 -1385
  106. package/dist/client/core/constants.js +0 -140
  107. package/dist/client/core/conversationMemoryFactory.js +0 -141
  108. package/dist/client/core/conversationMemoryInitializer.js +0 -128
  109. package/dist/client/core/conversationMemoryManager.js +0 -344
  110. package/dist/client/core/dynamicModels.js +0 -358
  111. package/dist/client/core/evaluation.js +0 -309
  112. package/dist/client/core/evaluationProviders.js +0 -248
  113. package/dist/client/core/factory.js +0 -412
  114. package/dist/client/core/infrastructure/baseError.js +0 -22
  115. package/dist/client/core/infrastructure/baseFactory.js +0 -54
  116. package/dist/client/core/infrastructure/baseRegistry.js +0 -53
  117. package/dist/client/core/infrastructure/index.js +0 -5
  118. package/dist/client/core/infrastructure/retry.js +0 -20
  119. package/dist/client/core/infrastructure/typedEventEmitter.js +0 -23
  120. package/dist/client/core/modelConfiguration.js +0 -851
  121. package/dist/client/core/modules/GenerationHandler.js +0 -588
  122. package/dist/client/core/modules/MessageBuilder.js +0 -273
  123. package/dist/client/core/modules/StreamHandler.js +0 -185
  124. package/dist/client/core/modules/TelemetryHandler.js +0 -203
  125. package/dist/client/core/modules/ToolsManager.js +0 -499
  126. package/dist/client/core/modules/Utilities.js +0 -331
  127. package/dist/client/core/redisConversationMemoryManager.js +0 -1435
  128. package/dist/client/core/streamAnalytics.js +0 -131
  129. package/dist/client/evaluation/contextBuilder.js +0 -134
  130. package/dist/client/evaluation/index.js +0 -61
  131. package/dist/client/evaluation/prompts.js +0 -73
  132. package/dist/client/evaluation/ragasEvaluator.js +0 -110
  133. package/dist/client/evaluation/retryManager.js +0 -78
  134. package/dist/client/evaluation/scoring.js +0 -61
  135. package/dist/client/factories/providerFactory.js +0 -166
  136. package/dist/client/factories/providerRegistry.js +0 -166
  137. package/dist/client/features/ppt/constants.js +0 -896
  138. package/dist/client/features/ppt/contentPlanner.js +0 -529
  139. package/dist/client/features/ppt/presentationOrchestrator.js +0 -236
  140. package/dist/client/features/ppt/slideGenerator.js +0 -532
  141. package/dist/client/features/ppt/slideRenderers.js +0 -2383
  142. package/dist/client/features/ppt/slideTypeInference.js +0 -405
  143. package/dist/client/features/ppt/types.js +0 -13
  144. package/dist/client/features/ppt/utils.js +0 -443
  145. package/dist/client/files/fileReferenceRegistry.js +0 -1543
  146. package/dist/client/files/fileTools.js +0 -450
  147. package/dist/client/files/streamingReader.js +0 -321
  148. package/dist/client/files/types.js +0 -23
  149. package/dist/client/hitl/hitlErrors.js +0 -54
  150. package/dist/client/hitl/hitlManager.js +0 -460
  151. package/dist/client/mcp/agentExposure.js +0 -356
  152. package/dist/client/mcp/auth/index.js +0 -11
  153. package/dist/client/mcp/auth/oauthClientProvider.js +0 -325
  154. package/dist/client/mcp/auth/tokenStorage.js +0 -134
  155. package/dist/client/mcp/batching/index.js +0 -10
  156. package/dist/client/mcp/batching/requestBatcher.js +0 -441
  157. package/dist/client/mcp/caching/index.js +0 -10
  158. package/dist/client/mcp/caching/toolCache.js +0 -433
  159. package/dist/client/mcp/elicitation/elicitationManager.js +0 -376
  160. package/dist/client/mcp/elicitation/index.js +0 -11
  161. package/dist/client/mcp/elicitation/types.js +0 -10
  162. package/dist/client/mcp/elicitationProtocol.js +0 -375
  163. package/dist/client/mcp/enhancedToolDiscovery.js +0 -481
  164. package/dist/client/mcp/externalServerManager.js +0 -1478
  165. package/dist/client/mcp/factory.js +0 -161
  166. package/dist/client/mcp/flexibleToolValidator.js +0 -161
  167. package/dist/client/mcp/httpRateLimiter.js +0 -391
  168. package/dist/client/mcp/httpRetryHandler.js +0 -178
  169. package/dist/client/mcp/index.js +0 -74
  170. package/dist/client/mcp/mcpCircuitBreaker.js +0 -427
  171. package/dist/client/mcp/mcpClientFactory.js +0 -708
  172. package/dist/client/mcp/mcpRegistryClient.js +0 -488
  173. package/dist/client/mcp/mcpServerBase.js +0 -373
  174. package/dist/client/mcp/multiServerManager.js +0 -579
  175. package/dist/client/mcp/registry.js +0 -158
  176. package/dist/client/mcp/routing/index.js +0 -10
  177. package/dist/client/mcp/routing/toolRouter.js +0 -416
  178. package/dist/client/mcp/serverCapabilities.js +0 -502
  179. package/dist/client/mcp/servers/agent/directToolsServer.js +0 -150
  180. package/dist/client/mcp/toolAnnotations.js +0 -239
  181. package/dist/client/mcp/toolConverter.js +0 -258
  182. package/dist/client/mcp/toolDiscoveryService.js +0 -798
  183. package/dist/client/mcp/toolIntegration.js +0 -334
  184. package/dist/client/mcp/toolRegistry.js +0 -729
  185. package/dist/client/memory/hippocampusInitializer.js +0 -19
  186. package/dist/client/memory/memoryRetrievalTools.js +0 -166
  187. package/dist/client/middleware/builtin/analytics.js +0 -132
  188. package/dist/client/middleware/builtin/autoEvaluation.js +0 -203
  189. package/dist/client/middleware/builtin/guardrails.js +0 -109
  190. package/dist/client/middleware/builtin/lifecycle.js +0 -168
  191. package/dist/client/middleware/factory.js +0 -327
  192. package/dist/client/middleware/registry.js +0 -295
  193. package/dist/client/middleware/utils/guardrailsUtils.js +0 -396
  194. package/dist/client/models/anthropicModels.js +0 -527
  195. package/dist/client/neurolink.js +0 -8233
  196. package/dist/client/observability/exporterRegistry.js +0 -413
  197. package/dist/client/observability/exporters/arizeExporter.js +0 -138
  198. package/dist/client/observability/exporters/baseExporter.js +0 -190
  199. package/dist/client/observability/exporters/braintrustExporter.js +0 -154
  200. package/dist/client/observability/exporters/datadogExporter.js +0 -196
  201. package/dist/client/observability/exporters/laminarExporter.js +0 -302
  202. package/dist/client/observability/exporters/langfuseExporter.js +0 -209
  203. package/dist/client/observability/exporters/langsmithExporter.js +0 -143
  204. package/dist/client/observability/exporters/otelExporter.js +0 -164
  205. package/dist/client/observability/exporters/posthogExporter.js +0 -287
  206. package/dist/client/observability/exporters/sentryExporter.js +0 -165
  207. package/dist/client/observability/index.js +0 -31
  208. package/dist/client/observability/metricsAggregator.js +0 -556
  209. package/dist/client/observability/otelBridge.js +0 -131
  210. package/dist/client/observability/retryPolicy.js +0 -383
  211. package/dist/client/observability/sampling/samplers.js +0 -216
  212. package/dist/client/observability/spanProcessor.js +0 -303
  213. package/dist/client/observability/tokenTracker.js +0 -413
  214. package/dist/client/observability/types/exporterTypes.js +0 -5
  215. package/dist/client/observability/types/index.js +0 -4
  216. package/dist/client/observability/types/spanTypes.js +0 -92
  217. package/dist/client/observability/utils/safeMetadata.js +0 -25
  218. package/dist/client/observability/utils/spanSerializer.js +0 -292
  219. package/dist/client/processors/archive/ArchiveProcessor.js +0 -1308
  220. package/dist/client/processors/base/BaseFileProcessor.js +0 -614
  221. package/dist/client/processors/base/types.js +0 -82
  222. package/dist/client/processors/config/fileTypes.js +0 -520
  223. package/dist/client/processors/config/index.js +0 -92
  224. package/dist/client/processors/config/languageMap.js +0 -410
  225. package/dist/client/processors/config/mimeTypes.js +0 -363
  226. package/dist/client/processors/config/sizeLimits.js +0 -258
  227. package/dist/client/processors/document/ExcelProcessor.js +0 -590
  228. package/dist/client/processors/document/OpenDocumentProcessor.js +0 -212
  229. package/dist/client/processors/document/PptxProcessor.js +0 -157
  230. package/dist/client/processors/document/RtfProcessor.js +0 -361
  231. package/dist/client/processors/document/WordProcessor.js +0 -353
  232. package/dist/client/processors/errors/FileErrorCode.js +0 -255
  233. package/dist/client/processors/errors/errorHelpers.js +0 -386
  234. package/dist/client/processors/errors/errorSerializer.js +0 -507
  235. package/dist/client/processors/errors/index.js +0 -49
  236. package/dist/client/processors/markup/SvgProcessor.js +0 -240
  237. package/dist/client/processors/media/AudioProcessor.js +0 -707
  238. package/dist/client/processors/media/VideoProcessor.js +0 -1045
  239. package/dist/client/providers/amazonBedrock.js +0 -1512
  240. package/dist/client/providers/amazonSagemaker.js +0 -162
  241. package/dist/client/providers/anthropic.js +0 -831
  242. package/dist/client/providers/azureOpenai.js +0 -143
  243. package/dist/client/providers/googleAiStudio.js +0 -1200
  244. package/dist/client/providers/googleNativeGemini3.js +0 -543
  245. package/dist/client/providers/googleVertex.js +0 -2936
  246. package/dist/client/providers/huggingFace.js +0 -315
  247. package/dist/client/providers/litellm.js +0 -488
  248. package/dist/client/providers/mistral.js +0 -157
  249. package/dist/client/providers/ollama.js +0 -1579
  250. package/dist/client/providers/openAI.js +0 -627
  251. package/dist/client/providers/openRouter.js +0 -543
  252. package/dist/client/providers/openaiCompatible.js +0 -290
  253. package/dist/client/providers/providerTypeUtils.js +0 -46
  254. package/dist/client/providers/sagemaker/adaptive-semaphore.js +0 -215
  255. package/dist/client/providers/sagemaker/client.js +0 -472
  256. package/dist/client/providers/sagemaker/config.js +0 -317
  257. package/dist/client/providers/sagemaker/detection.js +0 -606
  258. package/dist/client/providers/sagemaker/error-constants.js +0 -227
  259. package/dist/client/providers/sagemaker/errors.js +0 -299
  260. package/dist/client/providers/sagemaker/language-model.js +0 -775
  261. package/dist/client/providers/sagemaker/parsers.js +0 -634
  262. package/dist/client/providers/sagemaker/streaming.js +0 -331
  263. package/dist/client/providers/sagemaker/structured-parser.js +0 -625
  264. package/dist/client/proxy/accountQuota.js +0 -162
  265. package/dist/client/proxy/claudeFormat.js +0 -595
  266. package/dist/client/proxy/modelRouter.js +0 -29
  267. package/dist/client/proxy/oauthFetch.js +0 -367
  268. package/dist/client/proxy/proxyFetch.js +0 -586
  269. package/dist/client/proxy/requestLogger.js +0 -207
  270. package/dist/client/proxy/tokenRefresh.js +0 -124
  271. package/dist/client/proxy/usageStats.js +0 -74
  272. package/dist/client/proxy/utils/noProxyUtils.js +0 -149
  273. package/dist/client/rag/ChunkerFactory.js +0 -320
  274. package/dist/client/rag/ChunkerRegistry.js +0 -421
  275. package/dist/client/rag/chunkers/BaseChunker.js +0 -143
  276. package/dist/client/rag/chunkers/CharacterChunker.js +0 -28
  277. package/dist/client/rag/chunkers/HTMLChunker.js +0 -38
  278. package/dist/client/rag/chunkers/JSONChunker.js +0 -68
  279. package/dist/client/rag/chunkers/LaTeXChunker.js +0 -63
  280. package/dist/client/rag/chunkers/MarkdownChunker.js +0 -306
  281. package/dist/client/rag/chunkers/RecursiveChunker.js +0 -139
  282. package/dist/client/rag/chunkers/SemanticMarkdownChunker.js +0 -138
  283. package/dist/client/rag/chunkers/SentenceChunker.js +0 -66
  284. package/dist/client/rag/chunkers/TokenChunker.js +0 -61
  285. package/dist/client/rag/chunkers/index.js +0 -15
  286. package/dist/client/rag/chunking/characterChunker.js +0 -142
  287. package/dist/client/rag/chunking/chunkerRegistry.js +0 -194
  288. package/dist/client/rag/chunking/htmlChunker.js +0 -247
  289. package/dist/client/rag/chunking/index.js +0 -17
  290. package/dist/client/rag/chunking/jsonChunker.js +0 -281
  291. package/dist/client/rag/chunking/latexChunker.js +0 -251
  292. package/dist/client/rag/chunking/markdownChunker.js +0 -373
  293. package/dist/client/rag/chunking/recursiveChunker.js +0 -148
  294. package/dist/client/rag/chunking/semanticChunker.js +0 -306
  295. package/dist/client/rag/chunking/sentenceChunker.js +0 -230
  296. package/dist/client/rag/chunking/tokenChunker.js +0 -183
  297. package/dist/client/rag/document/MDocument.js +0 -392
  298. package/dist/client/rag/document/index.js +0 -5
  299. package/dist/client/rag/document/loaders.js +0 -500
  300. package/dist/client/rag/errors/RAGError.js +0 -274
  301. package/dist/client/rag/errors/index.js +0 -6
  302. package/dist/client/rag/graphRag/graphRAG.js +0 -401
  303. package/dist/client/rag/graphRag/index.js +0 -4
  304. package/dist/client/rag/index.js +0 -141
  305. package/dist/client/rag/metadata/MetadataExtractorFactory.js +0 -418
  306. package/dist/client/rag/metadata/MetadataExtractorRegistry.js +0 -362
  307. package/dist/client/rag/metadata/index.js +0 -9
  308. package/dist/client/rag/metadata/metadataExtractor.js +0 -280
  309. package/dist/client/rag/pipeline/RAGPipeline.js +0 -436
  310. package/dist/client/rag/pipeline/contextAssembly.js +0 -341
  311. package/dist/client/rag/pipeline/index.js +0 -5
  312. package/dist/client/rag/ragIntegration.js +0 -321
  313. package/dist/client/rag/reranker/RerankerFactory.js +0 -430
  314. package/dist/client/rag/reranker/RerankerRegistry.js +0 -402
  315. package/dist/client/rag/reranker/index.js +0 -9
  316. package/dist/client/rag/reranker/reranker.js +0 -277
  317. package/dist/client/rag/resilience/CircuitBreaker.js +0 -431
  318. package/dist/client/rag/resilience/RetryHandler.js +0 -304
  319. package/dist/client/rag/resilience/index.js +0 -7
  320. package/dist/client/rag/retrieval/hybridSearch.js +0 -335
  321. package/dist/client/rag/retrieval/index.js +0 -5
  322. package/dist/client/rag/retrieval/vectorQueryTool.js +0 -307
  323. package/dist/client/rag/types.js +0 -8
  324. package/dist/client/sdk/toolRegistration.js +0 -377
  325. package/dist/client/server/abstract/baseServerAdapter.js +0 -575
  326. package/dist/client/server/adapters/expressAdapter.js +0 -486
  327. package/dist/client/server/adapters/fastifyAdapter.js +0 -472
  328. package/dist/client/server/adapters/honoAdapter.js +0 -632
  329. package/dist/client/server/adapters/koaAdapter.js +0 -510
  330. package/dist/client/server/errors.js +0 -486
  331. package/dist/client/server/factory/serverAdapterFactory.js +0 -160
  332. package/dist/client/server/index.js +0 -108
  333. package/dist/client/server/middleware/abortSignal.js +0 -111
  334. package/dist/client/server/middleware/auth.js +0 -388
  335. package/dist/client/server/middleware/cache.js +0 -359
  336. package/dist/client/server/middleware/common.js +0 -281
  337. package/dist/client/server/middleware/deprecation.js +0 -190
  338. package/dist/client/server/middleware/mcpBodyAttachment.js +0 -63
  339. package/dist/client/server/middleware/rateLimit.js +0 -227
  340. package/dist/client/server/middleware/validation.js +0 -388
  341. package/dist/client/server/openapi/generator.js +0 -398
  342. package/dist/client/server/openapi/index.js +0 -36
  343. package/dist/client/server/openapi/schemas.js +0 -695
  344. package/dist/client/server/openapi/templates.js +0 -374
  345. package/dist/client/server/routes/agentRoutes.js +0 -189
  346. package/dist/client/server/routes/claudeProxyRoutes.js +0 -1600
  347. package/dist/client/server/routes/healthRoutes.js +0 -187
  348. package/dist/client/server/routes/index.js +0 -57
  349. package/dist/client/server/routes/mcpRoutes.js +0 -342
  350. package/dist/client/server/routes/memoryRoutes.js +0 -350
  351. package/dist/client/server/routes/openApiRoutes.js +0 -126
  352. package/dist/client/server/routes/toolRoutes.js +0 -199
  353. package/dist/client/server/streaming/dataStream.js +0 -486
  354. package/dist/client/server/streaming/index.js +0 -11
  355. package/dist/client/server/types.js +0 -67
  356. package/dist/client/server/utils/redaction.js +0 -334
  357. package/dist/client/server/utils/validation.js +0 -243
  358. package/dist/client/server/websocket/WebSocketHandler.js +0 -383
  359. package/dist/client/server/websocket/index.js +0 -4
  360. package/dist/client/services/server/ai/observability/instrumentation.js +0 -808
  361. package/dist/client/telemetry/attributes.js +0 -100
  362. package/dist/client/telemetry/index.js +0 -26
  363. package/dist/client/telemetry/telemetryService.js +0 -308
  364. package/dist/client/telemetry/tracers.js +0 -17
  365. package/dist/client/telemetry/withSpan.js +0 -34
  366. package/dist/client/types/actionTypes.js +0 -6
  367. package/dist/client/types/analytics.js +0 -5
  368. package/dist/client/types/authTypes.js +0 -9
  369. package/dist/client/types/circuitBreakerErrors.js +0 -34
  370. package/dist/client/types/cli.js +0 -21
  371. package/dist/client/types/clientTypes.js +0 -10
  372. package/dist/client/types/common.js +0 -51
  373. package/dist/client/types/configTypes.js +0 -49
  374. package/dist/client/types/content.js +0 -19
  375. package/dist/client/types/contextTypes.js +0 -400
  376. package/dist/client/types/conversation.js +0 -47
  377. package/dist/client/types/conversationMemoryInterface.js +0 -6
  378. package/dist/client/types/domainTypes.js +0 -5
  379. package/dist/client/types/errors.js +0 -167
  380. package/dist/client/types/evaluation.js +0 -5
  381. package/dist/client/types/evaluationProviders.js +0 -5
  382. package/dist/client/types/evaluationTypes.js +0 -1
  383. package/dist/client/types/externalMcp.js +0 -6
  384. package/dist/client/types/fileReferenceTypes.js +0 -8
  385. package/dist/client/types/fileTypes.js +0 -4
  386. package/dist/client/types/generateTypes.js +0 -1
  387. package/dist/client/types/guardrails.js +0 -1
  388. package/dist/client/types/hitlTypes.js +0 -8
  389. package/dist/client/types/index.js +0 -57
  390. package/dist/client/types/mcpTypes.js +0 -5
  391. package/dist/client/types/middlewareTypes.js +0 -1
  392. package/dist/client/types/modelTypes.js +0 -30
  393. package/dist/client/types/multimodal.js +0 -135
  394. package/dist/client/types/observability.js +0 -6
  395. package/dist/client/types/pptTypes.js +0 -82
  396. package/dist/client/types/providers.js +0 -111
  397. package/dist/client/types/proxyTypes.js +0 -16
  398. package/dist/client/types/ragTypes.js +0 -7
  399. package/dist/client/types/sdkTypes.js +0 -8
  400. package/dist/client/types/serviceTypes.js +0 -5
  401. package/dist/client/types/streamTypes.js +0 -1
  402. package/dist/client/types/subscriptionTypes.js +0 -9
  403. package/dist/client/types/taskClassificationTypes.js +0 -5
  404. package/dist/client/types/tools.js +0 -24
  405. package/dist/client/types/ttsTypes.js +0 -57
  406. package/dist/client/types/typeAliases.js +0 -48
  407. package/dist/client/types/utilities.js +0 -4
  408. package/dist/client/types/workflowTypes.js +0 -30
  409. package/dist/client/utils/async/withTimeout.js +0 -98
  410. package/dist/client/utils/asyncMutex.js +0 -60
  411. package/dist/client/utils/conversationMemory.js +0 -431
  412. package/dist/client/utils/csvProcessor.js +0 -846
  413. package/dist/client/utils/errorHandling.js +0 -936
  414. package/dist/client/utils/evaluationUtils.js +0 -131
  415. package/dist/client/utils/factoryProcessing.js +0 -589
  416. package/dist/client/utils/fileDetector.js +0 -2161
  417. package/dist/client/utils/imageCache.js +0 -376
  418. package/dist/client/utils/imageProcessor.js +0 -704
  419. package/dist/client/utils/logger.js +0 -491
  420. package/dist/client/utils/mcpDefaults.js +0 -134
  421. package/dist/client/utils/messageBuilder.js +0 -1653
  422. package/dist/client/utils/modelAliasResolver.js +0 -54
  423. package/dist/client/utils/modelDetection.js +0 -80
  424. package/dist/client/utils/modelRouter.js +0 -292
  425. package/dist/client/utils/multimodalOptionsBuilder.js +0 -65
  426. package/dist/client/utils/observabilityHelpers.js +0 -47
  427. package/dist/client/utils/parameterValidation.js +0 -966
  428. package/dist/client/utils/pdfProcessor.js +0 -410
  429. package/dist/client/utils/performance.js +0 -222
  430. package/dist/client/utils/pricing.js +0 -340
  431. package/dist/client/utils/promptRedaction.js +0 -62
  432. package/dist/client/utils/providerConfig.js +0 -1009
  433. package/dist/client/utils/providerHealth.js +0 -1237
  434. package/dist/client/utils/providerRetry.js +0 -112
  435. package/dist/client/utils/providerUtils.js +0 -434
  436. package/dist/client/utils/rateLimiter.js +0 -200
  437. package/dist/client/utils/redis.js +0 -368
  438. package/dist/client/utils/retryHandler.js +0 -269
  439. package/dist/client/utils/retryability.js +0 -22
  440. package/dist/client/utils/sanitizers/svg.js +0 -481
  441. package/dist/client/utils/schemaConversion.js +0 -255
  442. package/dist/client/utils/taskClassificationUtils.js +0 -149
  443. package/dist/client/utils/taskClassifier.js +0 -94
  444. package/dist/client/utils/thinkingConfig.js +0 -104
  445. package/dist/client/utils/timeout.js +0 -359
  446. package/dist/client/utils/tokenEstimation.js +0 -142
  447. package/dist/client/utils/tokenLimits.js +0 -125
  448. package/dist/client/utils/tokenUtils.js +0 -239
  449. package/dist/client/utils/toolUtils.js +0 -75
  450. package/dist/client/utils/transformationUtils.js +0 -554
  451. package/dist/client/utils/ttsProcessor.js +0 -286
  452. package/dist/client/utils/typeUtils.js +0 -97
  453. package/dist/client/utils/videoAnalysisProcessor.js +0 -67
  454. package/dist/client/workflow/config.js +0 -398
  455. package/dist/client/workflow/core/ensembleExecutor.js +0 -407
  456. package/dist/client/workflow/core/judgeScorer.js +0 -544
  457. package/dist/client/workflow/core/responseConditioner.js +0 -225
  458. package/dist/client/workflow/core/types/conditionerTypes.js +0 -7
  459. package/dist/client/workflow/core/types/ensembleTypes.js +0 -7
  460. package/dist/client/workflow/core/types/index.js +0 -7
  461. package/dist/client/workflow/core/types/judgeTypes.js +0 -7
  462. package/dist/client/workflow/core/types/layerTypes.js +0 -7
  463. package/dist/client/workflow/core/types/registryTypes.js +0 -7
  464. package/dist/client/workflow/core/workflowRegistry.js +0 -304
  465. package/dist/client/workflow/core/workflowRunner.js +0 -586
  466. package/dist/client/workflow/index.js +0 -50
  467. package/dist/client/workflow/types.js +0 -9
  468. package/dist/client/workflow/utils/types/index.js +0 -7
  469. package/dist/client/workflow/utils/workflowMetrics.js +0 -311
  470. package/dist/client/workflow/utils/workflowValidation.js +0 -420
  471. package/dist/client/workflow/workflows/adaptiveWorkflow.js +0 -366
  472. package/dist/client/workflow/workflows/consensusWorkflow.js +0 -192
  473. package/dist/client/workflow/workflows/fallbackWorkflow.js +0 -225
  474. package/dist/client/workflow/workflows/multiJudgeWorkflow.js +0 -351
  475. /package/dist/client/{client/reactHooks.js → reactHooks.js} +0 -0
@@ -1,2161 +0,0 @@
1
- /**
2
- * File Type Detection Utility
3
- * Centralized file detection for all multimodal file types
4
- * Uses multi-strategy approach for reliable type identification
5
- */
6
- import { readFile, stat } from "fs/promises";
7
- import { getGlobalDispatcher, interceptors, request } from "undici";
8
- import { archiveProcessor } from "../processors/archive/ArchiveProcessor.js";
9
- import { audioProcessor } from "../processors/media/AudioProcessor.js";
10
- import { videoProcessor } from "../processors/media/VideoProcessor.js";
11
- import { tracers, ATTR, withSpan } from "../telemetry/index.js";
12
- import { CSVProcessor } from "./csvProcessor.js";
13
- import { ImageProcessor } from "./imageProcessor.js";
14
- import { logger } from "./logger.js";
15
- import { PDFProcessor } from "./pdfProcessor.js";
16
/**
 * Default retry configuration constants
 */
const DEFAULT_MAX_RETRIES = 3;
const DEFAULT_RETRY_DELAY = 1000; // milliseconds
/**
 * Retryable network error codes (Node.js/undici network errors)
 */
const RETRYABLE_ERROR_CODES = [
    "ETIMEDOUT",
    "ECONNRESET",
    "ECONNREFUSED",
    "ENOTFOUND",
    "ENETUNREACH",
    "EAI_AGAIN",
    "EPIPE",
    "ECONNABORTED",
    "UND_ERR_CONNECT_TIMEOUT",
    "UND_ERR_HEADERS_TIMEOUT",
    "UND_ERR_BODY_TIMEOUT",
    "UND_ERR_SOCKET",
];
/**
 * Non-retryable HTTP status codes (client errors)
 */
const NON_RETRYABLE_STATUS_CODES = [400, 401, 403, 404, 405];
/**
 * Retryable HTTP status codes (server errors + rate limiting)
 */
const RETRYABLE_STATUS_CODES = [429, 500, 502, 503, 504];
/**
 * Check if an error is a recoverable network error that should be retried.
 *
 * Checks are applied in this order:
 *  1. `error.code` (upper-cased) is listed in RETRYABLE_ERROR_CODES.
 *  2. The message contains "HTTP <3 digits>": a status in
 *     NON_RETRYABLE_STATUS_CODES returns false, one in
 *     RETRYABLE_STATUS_CODES returns true.
 *  3. The lower-cased message contains one of the transient keywords.
 *
 * @param error - Value to classify; anything that is not an Error is not retryable
 * @returns True if error is retryable (transient network issue)
 */
function isRetryableNetworkError(error) {
    if (!(error instanceof Error)) {
        return false;
    }
    const errorMessage = typeof error.message === "string" ? error.message.toLowerCase() : "";
    // `code` is not guaranteed to be a string (some errors carry numeric codes);
    // calling toUpperCase() on a number would throw from inside the classifier.
    const errorCode = typeof error.code === "string" ? error.code.toUpperCase() : undefined;
    if (errorCode && RETRYABLE_ERROR_CODES.includes(errorCode)) {
        return true;
    }
    // Check HTTP status code if present in error message (e.g., "HTTP 503")
    const httpStatusMatch = errorMessage.match(/http\s*(\d{3})/);
    if (httpStatusMatch) {
        const statusCode = Number.parseInt(httpStatusMatch[1], 10);
        if (NON_RETRYABLE_STATUS_CODES.includes(statusCode)) {
            return false;
        }
        if (RETRYABLE_STATUS_CODES.includes(statusCode)) {
            return true;
        }
    }
    // Fall back to scanning the message for wording typical of transient failures
    const transientKeywords = [
        "timeout",
        "timed out",
        "connection reset",
        "econnreset",
        "etimedout",
        "network error",
        "socket hang up",
        "enotfound",
        "getaddrinfo",
        "unavailable",
        "service unavailable",
    ];
    return transientKeywords.some((keyword) => errorMessage.includes(keyword));
}
91
/**
 * Execute an operation with automatic retry logic on transient network errors.
 *
 * The operation is always attempted at least once. After a failure that
 * isRetryableNetworkError() classifies as retryable, the call waits
 * `retryDelay * 2 ** attempt` milliseconds and tries again, up to
 * `maxRetries` additional attempts.
 *
 * @param operation - Async function to execute
 * @param options - Retry configuration options (`maxRetries`, `retryDelay`)
 * @returns Promise resolving to the operation result
 * @throws The original error if all retry attempts fail or it is non-retryable
 */
async function withRetry(operation, options = {}) {
    // Normalise maxRetries to a non-negative integer. A negative value used to
    // skip the loop entirely (the operation never ran), and a fractional value
    // made the "last attempt" equality check unreachable, so the real error was
    // replaced by the generic one at the bottom of this function.
    const requestedRetries = Number(options?.maxRetries ?? DEFAULT_MAX_RETRIES);
    const maxRetries = Number.isNaN(requestedRetries)
        ? DEFAULT_MAX_RETRIES
        : Math.max(0, Math.floor(requestedRetries));
    const retryDelay = options?.retryDelay ?? DEFAULT_RETRY_DELAY;
    for (let attempt = 0; attempt <= maxRetries; attempt++) {
        try {
            return await operation();
        }
        catch (error) {
            const isRetryable = isRetryableNetworkError(error);
            const isLastAttempt = attempt === maxRetries;
            if (!isRetryable || isLastAttempt) {
                throw error;
            }
            // Exponential backoff: retryDelay, 2x, 4x, ...
            const delay = retryDelay * 2 ** attempt;
            logger.debug("Retrying network operation after transient error", {
                attempt: attempt + 1,
                maxRetries,
                delay,
                error: error instanceof Error ? error.message : String(error),
            });
            await new Promise((resolve) => setTimeout(resolve, delay));
        }
    }
    // Defensive: every loop iteration either returns or throws
    throw new Error("Retry logic failed unexpectedly");
}
126
/**
 * Report whether text is a JSON object or array.
 *
 * The trimmed text must be wrapped in a matching `{…}` or `[…]` pair and
 * must also parse successfully with JSON.parse.
 */
function hasJsonMarkers(text) {
    const candidate = text.trim();
    const wrapsObject = candidate.startsWith("{") && candidate.endsWith("}");
    const wrapsArray = candidate.startsWith("[") && candidate.endsWith("]");
    if (!(wrapsObject || wrapsArray)) {
        return false;
    }
    try {
        JSON.parse(candidate);
    }
    catch {
        return false;
    }
    return true;
}
149
/**
 * Render a byte count as a human-readable string.
 *
 * Values below 1024 are shown as whole bytes; larger values are scaled by
 * powers of 1024 into KB, MB or GB with two decimals. GB is the largest unit.
 */
function formatFileSize(bytes) {
    const STEP = 1024;
    if (bytes < STEP) {
        return `${bytes} bytes`;
    }
    let divisor = STEP;
    for (const unit of ["KB", "MB"]) {
        if (bytes < divisor * STEP) {
            return `${(bytes / divisor).toFixed(2)} ${unit}`;
        }
        divisor *= STEP;
    }
    return `${(bytes / divisor).toFixed(2)} GB`;
}
164
- /**
165
- * Centralized file type detection and processing
166
- *
167
- * @example
168
- * ```typescript
169
- * // Auto-detect and process any file
170
- * const result = await FileDetector.detectAndProcess("data.csv");
171
- * logger.info(result.type); // 'csv'
172
- * ```
173
- */
174
- export class FileDetector {
175
- // FD-017: Replace hardcoded timeouts with constants.
176
- // These defaults ensure consistent timeout behavior across all file-detection logic.
177
- static DEFAULT_NETWORK_TIMEOUT = 30000; // 30 seconds
178
- static DEFAULT_HEAD_TIMEOUT = 5000; // 5 seconds
179
- /**
180
- * Auto-detect file type and process in one call
181
- *
182
- * Runs detection strategies in priority order:
183
- * 1. MagicBytesStrategy (95% confidence) - Binary file headers
184
- * 2. MimeTypeStrategy (85% confidence) - HTTP Content-Type for URLs
185
- * 3. ExtensionStrategy (70% confidence) - File extension
186
- * 4. ContentHeuristicStrategy (75% confidence) - Content analysis
187
- *
188
- * @param input - File path, URL, Buffer, or data URI
189
- * @param options - Detection and processing options
190
- * @returns Processed file result with type and content
191
- */
192
- static async detectAndProcess(input, options) {
193
- // Derive filename and size for tracing before detection runs
194
- const inputFilename = FileDetector.deriveInputFilename(input);
195
- const inputSizeBytes = FileDetector.deriveInputSize(input);
196
- return withSpan({
197
- name: "neurolink.file.detect_and_process",
198
- tracer: tracers.file,
199
- attributes: {
200
- [ATTR.FILE_NAME]: inputFilename,
201
- [ATTR.FILE_SIZE_BYTES]: inputSizeBytes,
202
- },
203
- }, async (span) => {
204
- const detection = await FileDetector.detect(input, options);
205
- span.setAttribute(ATTR.FILE_CATEGORY, detection.type);
206
- span.setAttribute(ATTR.FILE_MIMETYPE, detection.mimeType || "unknown");
207
- span.setAttribute(ATTR.FILE_CONFIDENCE, detection.metadata.confidence);
208
- logger.info(`[NEUROLINK] File detected: ${inputFilename} (${detection.mimeType || "unknown"}, ${formatFileSize(inputSizeBytes)}) → category: ${detection.type}`);
209
- // FD-018: Comprehensive fallback parsing for extension-less files
210
- if (options?.allowedTypes &&
211
- !options.allowedTypes.includes(detection.type)) {
212
- const content = await FileDetector.loadContent(input, detection, options);
213
- const errors = [];
214
- for (const allowedType of options.allowedTypes) {
215
- try {
216
- const result = await FileDetector.tryFallbackParsing(content, allowedType, options);
217
- if (result) {
218
- logger.info(`[FileDetector] ✅ ${allowedType.toUpperCase()} fallback successful`);
219
- const outputLength = typeof result.content === "string"
220
- ? result.content.length
221
- : result.content?.length || 0;
222
- span.setAttribute(ATTR.FILE_OUTPUT_LENGTH, outputLength);
223
- span.setAttribute(ATTR.FILE_SUCCESS, true);
224
- span.setAttribute(ATTR.FILE_PROCESSOR_USED, `fallback:${allowedType}`);
225
- logger.info(`[NEUROLINK] File processed: ${inputFilename} → ${outputLength} bytes output (fallback: ${allowedType})`);
226
- return result;
227
- }
228
- }
229
- catch (error) {
230
- const errorMsg = error instanceof Error ? error.message : String(error);
231
- errors.push(`${allowedType}: ${errorMsg}`);
232
- logger.debug(`[FileDetector] ${allowedType} fallback failed: ${errorMsg}`);
233
- }
234
- }
235
- logger.warn(`[FileDetector] All fallback parsing failed for type "${detection.type}". ` +
236
- `Attempted: ${options.allowedTypes.join(", ")}. Falling through to universal handler.`);
237
- const csvOptions = options?.csvOptions;
238
- const result = await FileDetector.processFile(content, detection, csvOptions, options?.provider);
239
- FileDetector.setFileResultSpanAttributes(span, result, inputFilename, detection.type);
240
- return result;
241
- }
242
- const content = await FileDetector.loadContent(input, detection, options);
243
- const csvOptions = options?.csvOptions;
244
- const result = await FileDetector.processFile(content, detection, csvOptions, options?.provider);
245
- FileDetector.setFileResultSpanAttributes(span, result, inputFilename, detection.type);
246
- return result;
247
- });
248
- }
249
- /**
250
- * Set span attributes and log after file processing completes.
251
- */
252
- static setFileResultSpanAttributes(span, result, filename, processorType) {
253
- const outputLength = typeof result.content === "string"
254
- ? result.content.length
255
- : result.content?.length || 0;
256
- const hasImages = Array.isArray(result.images)
257
- ? result.images.length > 0
258
- : false;
259
- const imageCount = Array.isArray(result.images)
260
- ? result.images.length
261
- : 0;
262
- span.setAttribute(ATTR.FILE_OUTPUT_LENGTH, outputLength);
263
- span.setAttribute(ATTR.FILE_SUCCESS, true);
264
- span.setAttribute(ATTR.FILE_PROCESSOR_USED, processorType);
265
- span.setAttribute(ATTR.FILE_HAS_IMAGES, hasImages);
266
- span.setAttribute(ATTR.FILE_IMAGE_COUNT, imageCount);
267
- logger.info(`[NEUROLINK] File processed: ${filename} → ${outputLength} bytes output` +
268
- (imageCount > 0 ? ` + ${imageCount} image(s)` : "") +
269
- ` (processor: ${processorType})`);
270
- }
271
- /**
272
- * Derive a human-readable filename from FileInput for tracing.
273
- */
274
- static deriveInputFilename(input) {
275
- if (typeof input === "string") {
276
- if (input.startsWith("data:")) {
277
- return "data-uri";
278
- }
279
- if (input.startsWith("http")) {
280
- try {
281
- return new URL(input).pathname.split("/").pop() || "url-file";
282
- }
283
- catch {
284
- return "url-file";
285
- }
286
- }
287
- // File path
288
- return input.split("/").pop() || input.split("\\").pop() || "file";
289
- }
290
- if (Buffer.isBuffer(input)) {
291
- return "buffer";
292
- }
293
- return "unknown-input";
294
- }
295
- /**
296
- * Derive byte size from FileInput for tracing.
297
- */
298
- static deriveInputSize(input) {
299
- if (Buffer.isBuffer(input)) {
300
- return input.length;
301
- }
302
- if (typeof input === "string") {
303
- if (input.startsWith("data:")) {
304
- // Rough estimate: base64 is ~4/3 of raw
305
- const base64Part = input.split(",")[1];
306
- return base64Part ? Math.floor((base64Part.length * 3) / 4) : 0;
307
- }
308
- return input.length; // path or URL string length (not file size)
309
- }
310
- return 0;
311
- }
312
- /**
313
- * Try fallback parsing for a specific file type
314
- * Used when file detection returns "unknown" but we want to try parsing anyway
315
- */
316
- static async tryFallbackParsing(content, fileType, options) {
317
- logger.info(`[FileDetector] Attempting ${fileType.toUpperCase()} fallback parsing`);
318
- switch (fileType) {
319
- case "csv": {
320
- // Try CSV parsing
321
- const csvOptions = options?.csvOptions;
322
- const result = await CSVProcessor.process(content, csvOptions);
323
- logger.info(`[FileDetector] CSV fallback: ${result.metadata?.rowCount || 0} rows, ${result.metadata?.columnCount || 0} columns`);
324
- return result;
325
- }
326
- case "text": {
327
- // Try text parsing - check if content is valid UTF-8 text
328
- const textContent = content.toString("utf-8");
329
- // Validate it's actually text (no null bytes, mostly printable)
330
- if (FileDetector.isValidText(textContent)) {
331
- return {
332
- type: "text",
333
- content: textContent,
334
- mimeType: FileDetector.guessTextMimeType(textContent),
335
- metadata: {
336
- confidence: 70,
337
- size: content.length,
338
- },
339
- };
340
- }
341
- throw new Error("Content does not appear to be valid text");
342
- }
343
- case "image": {
344
- // Image requires magic bytes - can't fallback without detection
345
- throw new Error("Image type requires binary detection, cannot fallback parse");
346
- }
347
- case "pdf": {
348
- // PDF requires magic bytes - can't fallback without detection
349
- throw new Error("PDF type requires binary detection, cannot fallback parse");
350
- }
351
- case "audio": {
352
- // Audio requires magic bytes - can't fallback without detection
353
- throw new Error("Audio type requires binary detection, cannot fallback parse");
354
- }
355
- case "video": {
356
- // Video requires magic bytes - can't fallback without detection
357
- throw new Error("Video type requires binary detection, cannot fallback parse");
358
- }
359
- case "archive": {
360
- // Archive requires magic bytes - can't fallback without detection
361
- throw new Error("Archive type requires binary detection, cannot fallback parse");
362
- }
363
- case "xlsx": {
364
- // Document formats require binary detection
365
- throw new Error("Excel type requires binary detection, cannot fallback parse");
366
- }
367
- case "docx": {
368
- throw new Error("Word type requires binary detection, cannot fallback parse");
369
- }
370
- case "pptx": {
371
- throw new Error("PowerPoint type requires binary detection, cannot fallback parse");
372
- }
373
- case "svg": {
374
- // SVG can be detected from text content
375
- const svgContent = content.toString("utf-8");
376
- if (svgContent.includes("<svg") && svgContent.includes("</svg>")) {
377
- return {
378
- type: "svg",
379
- content: svgContent,
380
- mimeType: "image/svg+xml",
381
- metadata: {
382
- confidence: 70,
383
- size: content.length,
384
- },
385
- };
386
- }
387
- throw new Error("Content does not appear to be valid SVG");
388
- }
389
- default:
390
- return null;
391
- }
392
- }
393
- /**
394
- * Check if content is valid text (UTF-8, mostly printable)
395
- */
396
- static isValidText(content) {
397
- // Check for null bytes which indicate binary content
398
- if (content.includes("\0")) {
399
- return false;
400
- }
401
- // Check if content has reasonable amount of printable characters
402
- let printableCount = 0;
403
- for (let i = 0; i < content.length; i++) {
404
- const code = content.charCodeAt(i);
405
- if ((code >= 32 && code < 127) || // ASCII printable
406
- code === 9 || // Tab
407
- code === 10 || // Newline
408
- code === 13 || // Carriage return
409
- code > 127 // Unicode (non-ASCII)
410
- ) {
411
- printableCount++;
412
- }
413
- }
414
- // At least 90% should be printable
415
- return printableCount / content.length >= 0.9;
416
- }
417
- /**
418
- * Guess the MIME type for text content based on content patterns
419
- */
420
- static guessTextMimeType(content) {
421
- const trimmed = content.trim();
422
- // Check for JSON
423
- if ((trimmed.startsWith("{") && trimmed.endsWith("}")) ||
424
- (trimmed.startsWith("[") && trimmed.endsWith("]"))) {
425
- try {
426
- JSON.parse(trimmed);
427
- return "application/json";
428
- }
429
- catch {
430
- // Not valid JSON, continue checking
431
- }
432
- }
433
- // Check for XML/HTML using stricter detection
434
- if (FileDetector.looksLikeXMLStrict(trimmed)) {
435
- const isHTML = trimmed.includes("<!DOCTYPE html") ||
436
- trimmed.toLowerCase().includes("<html") ||
437
- trimmed.includes("<head") ||
438
- trimmed.includes("<body");
439
- return isHTML ? "text/html" : "application/xml";
440
- }
441
- // Check for YAML using robust multi-indicator detection
442
- if (FileDetector.looksLikeYAMLStrict(trimmed)) {
443
- return "application/yaml";
444
- }
445
- // Default to plain text
446
- return "text/plain";
447
- }
448
- /**
449
- * Strict YAML detection for guessTextMimeType
450
- * Similar to ContentHeuristicStrategy but requires at least 2 indicators
451
- * to avoid false positives from simple key: value patterns
452
- */
453
- static looksLikeYAMLStrict(text) {
454
- if (text.length === 0) {
455
- return false;
456
- }
457
- const lines = text.split("\n");
458
- // For single-line content, only --- or ... qualify as YAML
459
- if (lines.length === 1) {
460
- return text === "---" || text === "...";
461
- }
462
- // Collect YAML indicators (requires at least 2 for positive detection)
463
- const indicators = [];
464
- // Indicator 1: Document start marker (---)
465
- indicators.push(text.startsWith("---"));
466
- // Indicator 2: Document end marker (...)
467
- indicators.push(/^\.\.\.$|[\n]\.\.\.$/.test(text));
468
- // Indicator 3: YAML list items (- followed by space)
469
- indicators.push(/^[\s]*-\s+[^-]/m.test(text));
470
- // Indicator 4: Multiple key-value pairs (at least 2)
471
- const keyValuePattern = /^[\s]*[a-zA-Z_][a-zA-Z0-9_-]*:\s*(.+)$/;
472
- const keyValueMatches = lines.filter((line) => keyValuePattern.test(line)).length;
473
- indicators.push(keyValueMatches >= 2);
474
- // Require at least 2 indicators for confident YAML detection
475
- const matchCount = indicators.filter(Boolean).length;
476
- return matchCount >= 2;
477
- }
478
- /**
479
- * Strict XML detection for guessTextMimeType
480
- * Ensures content has proper XML declaration or valid tag structure with closing tags
481
- * Prevents false positives from arbitrary content starting with <
482
- */
483
- static looksLikeXMLStrict(content) {
484
- // XML declaration is a definitive marker
485
- if (content.startsWith("<?xml")) {
486
- return true;
487
- }
488
- // Must start with < for XML/HTML
489
- if (!content.startsWith("<")) {
490
- return false;
491
- }
492
- // Check for HTML DOCTYPE declaration
493
- if (content.includes("<!DOCTYPE html")) {
494
- return true;
495
- }
496
- // Must have valid opening tag structure: <tagname
497
- // Not just any < character like "< something"
498
- const hasValidOpeningTag = /<[a-zA-Z][a-zA-Z0-9-]*(?:\s[^>]*)?>/;
499
- if (!hasValidOpeningTag.test(content)) {
500
- return false;
501
- }
502
- // Must have at least one closing tag or self-closing tag to be valid XML/HTML
503
- const hasClosingTag = /<\/[a-zA-Z][a-zA-Z0-9-]*>/.test(content);
504
- const hasSelfClosingTag = /<[a-zA-Z][a-zA-Z0-9-]*(?:\s[^>]*)?\s*\/\s*>/.test(content);
505
- return hasClosingTag || hasSelfClosingTag;
506
- }
507
- /**
508
- * Detect file type using multi-strategy approach
509
- * Stops at first strategy with confidence >= threshold (default: 80%)
510
- */
511
- static async detect(input, options) {
512
- const confidenceThreshold = options?.confidenceThreshold ?? 80;
513
- const strategies = [
514
- new MagicBytesStrategy(),
515
- new MimeTypeStrategy(),
516
- new ExtensionStrategy(),
517
- new ContentHeuristicStrategy(),
518
- ];
519
- let best = null;
520
- for (const strategy of strategies) {
521
- const result = await strategy.detect(input);
522
- if (!best || result.metadata.confidence > best.metadata.confidence) {
523
- best = result;
524
- }
525
- if (result.metadata.confidence >= confidenceThreshold) {
526
- logger.info(`[FileDetector] Type: ${result.type} (${result.metadata.confidence}%)`);
527
- return result;
528
- }
529
- }
530
- logger.warn(`[FileDetector] Low confidence: ${best?.type ?? "unknown"} (${best?.metadata.confidence ?? 0}%)`);
531
- return best;
532
- }
533
- /**
534
- * Load file content from various sources
535
- */
536
- static async loadContent(input, detection, options) {
537
- let source = detection.source;
538
- if (source === "buffer" && !Buffer.isBuffer(input)) {
539
- if (typeof input === "string") {
540
- if (input.startsWith("data:")) {
541
- source = "datauri";
542
- }
543
- else if (input.startsWith("http://") ||
544
- input.startsWith("https://")) {
545
- source = "url";
546
- }
547
- else {
548
- source = "path";
549
- }
550
- }
551
- }
552
- switch (source) {
553
- case "url":
554
- return await FileDetector.loadFromURL(input, options);
555
- case "path":
556
- return await FileDetector.loadFromPath(input, options);
557
- case "buffer":
558
- return input;
559
- case "datauri":
560
- return FileDetector.loadFromDataURI(input);
561
- default:
562
- throw new Error(`Unknown source: ${source}`);
563
- }
564
- }
565
- /**
566
- * SDK-8: Format an informative placeholder when a file processor fails.
567
- * Instead of bare "[Video file: name]" strings, include size, format, and
568
- * the reason for failure so the LLM can acknowledge the attachment.
569
- */
570
- static formatInformativePlaceholder(typeName, filename, content, detection, error) {
571
- const sizeStr = content.length < 1024
572
- ? `${content.length} bytes`
573
- : content.length < 1024 * 1024
574
- ? `${(content.length / 1024).toFixed(1)} KB`
575
- : `${(content.length / (1024 * 1024)).toFixed(1)} MB`;
576
- const errorMsg = error instanceof Error
577
- ? error.message
578
- : error
579
- ? String(error)
580
- : "Processing returned no usable content";
581
- return (`[${typeName} File: "${filename}"]\n` +
582
- `Size: ${sizeStr}\n` +
583
- `Format: ${detection.mimeType || "unknown"}\n` +
584
- `Error: Could not extract content (${errorMsg}).\n` +
585
- `The file was attached but could not be fully analyzed.`);
586
- }
587
- /**
588
- * Extract metadata and printable strings from an unrecognized binary file.
589
- * This is the "extract what you can" path for unknown file types.
590
- *
591
- * Extracts:
592
- * - File size (human-readable)
593
- * - MIME type / detected format
594
- * - First N bytes as hex dump (for identification)
595
- * - Printable ASCII/UTF-8 strings found in the binary (like `strings` command)
596
- * - Known file signatures that we don't have full processors for
597
- *
598
- * @param content Raw file buffer
599
- * @param detection Detection result (may be "unknown")
600
- * @param filename Original filename (if known)
601
- * @returns Formatted text summary suitable for LLM consumption
602
- */
603
- static extractBinaryMetadata(content, detection, filename) {
604
- const parts = [];
605
- // Header
606
- const ext = detection.extension
607
- ? `.${detection.extension}`
608
- : filename.includes(".")
609
- ? filename.slice(filename.lastIndexOf("."))
610
- : "";
611
- const typeLabel = ext
612
- ? `${ext.toUpperCase().slice(1)} file`
613
- : "Binary file";
614
- parts.push(`[${typeLabel}: "${filename}"]`);
615
- // Basic metadata
616
- const sizeStr = formatFileSize(content.length);
617
- parts.push(`Size: ${sizeStr}`);
618
- if (detection.mimeType &&
619
- detection.mimeType !== "application/octet-stream") {
620
- parts.push(`Format: ${detection.mimeType}`);
621
- }
622
- // Known binary signature identification (broader than our processing capabilities)
623
- const sigLabel = FileDetector.identifyBinarySignature(content);
624
- if (sigLabel) {
625
- parts.push(`Identified as: ${sigLabel}`);
626
- }
627
- // Hex dump of first 32 bytes for identification
628
- const hexPreview = content
629
- .subarray(0, Math.min(32, content.length))
630
- .toString("hex")
631
- .match(/.{1,2}/g)
632
- ?.join(" ");
633
- if (hexPreview) {
634
- parts.push(`Header bytes: ${hexPreview}`);
635
- }
636
- // Extract printable strings (similar to Unix `strings` command)
637
- const strings = FileDetector.extractPrintableStrings(content, 4, 50);
638
- if (strings.length > 0) {
639
- parts.push(`\nEmbedded text found (${strings.length} string${strings.length > 1 ? "s" : ""}):`);
640
- for (const s of strings) {
641
- parts.push(` "${s}"`);
642
- }
643
- }
644
- parts.push(`\nThis file was attached but its format is not fully supported for content extraction.`);
645
- parts.push(`The above metadata and any embedded text have been extracted for context.`);
646
- return parts.join("\n");
647
- }
648
- /**
649
- * Identify known binary file signatures beyond what we can process.
650
- * Returns a human-readable description, or null if unrecognized.
651
- */
652
- static identifyBinarySignature(buf) {
653
- if (buf.length < 4) {
654
- return null;
655
- }
656
- // SQLite: "SQLite format 3\0"
657
- if (buf.length >= 16 &&
658
- buf.subarray(0, 15).toString("ascii") === "SQLite format 3") {
659
- return "SQLite database";
660
- }
661
- // WOFF: "wOFF"
662
- if (buf[0] === 0x77 &&
663
- buf[1] === 0x4f &&
664
- buf[2] === 0x46 &&
665
- buf[3] === 0x46) {
666
- return "WOFF font";
667
- }
668
- // WOFF2: "wOF2"
669
- if (buf[0] === 0x77 &&
670
- buf[1] === 0x4f &&
671
- buf[2] === 0x46 &&
672
- buf[3] === 0x32) {
673
- return "WOFF2 font";
674
- }
675
- // TrueType/OpenType: starts with 0x00010000 or "OTTO"
676
- if ((buf[0] === 0x00 &&
677
- buf[1] === 0x01 &&
678
- buf[2] === 0x00 &&
679
- buf[3] === 0x00) ||
680
- (buf[0] === 0x4f && buf[1] === 0x54 && buf[2] === 0x54 && buf[3] === 0x4f)) {
681
- return "TrueType/OpenType font";
682
- }
683
- // ELF executable: \x7fELF
684
- if (buf[0] === 0x7f &&
685
- buf[1] === 0x45 &&
686
- buf[2] === 0x4c &&
687
- buf[3] === 0x46) {
688
- return "ELF executable/library";
689
- }
690
- // Mach-O: 0xFEEDFACE or 0xFEEDFACF (64-bit) or 0xCAFEBABE (universal)
691
- if ((buf[0] === 0xfe &&
692
- buf[1] === 0xed &&
693
- buf[2] === 0xfa &&
694
- buf[3] === 0xce) ||
695
- (buf[0] === 0xfe &&
696
- buf[1] === 0xed &&
697
- buf[2] === 0xfa &&
698
- buf[3] === 0xcf) ||
699
- (buf[0] === 0xca && buf[1] === 0xfe && buf[2] === 0xba && buf[3] === 0xbe)) {
700
- return "Mach-O executable/library";
701
- }
702
- // PE/Windows executable: "MZ"
703
- if (buf[0] === 0x4d && buf[1] === 0x5a) {
704
- return "Windows PE executable/DLL";
705
- }
706
- // WebAssembly: "\0asm"
707
- if (buf[0] === 0x00 &&
708
- buf[1] === 0x61 &&
709
- buf[2] === 0x73 &&
710
- buf[3] === 0x6d) {
711
- return "WebAssembly binary";
712
- }
713
- // DWG (AutoCAD): starts with "AC10"
714
- if (buf[0] === 0x41 &&
715
- buf[1] === 0x43 &&
716
- buf[2] === 0x31 &&
717
- buf[3] === 0x30) {
718
- return "AutoCAD DWG drawing";
719
- }
720
- // BZ2: "BZ" + 'h'
721
- if (buf[0] === 0x42 && buf[1] === 0x5a && buf[2] === 0x68) {
722
- return "BZip2 compressed archive";
723
- }
724
- // XZ: 0xFD + "7zXZ"
725
- if (buf.length >= 6 &&
726
- buf[0] === 0xfd &&
727
- buf[1] === 0x37 &&
728
- buf[2] === 0x7a &&
729
- buf[3] === 0x58 &&
730
- buf[4] === 0x5a &&
731
- buf[5] === 0x00) {
732
- return "XZ compressed archive";
733
- }
734
- // 7z: "7z" + BC AF 27 1C
735
- if (buf.length >= 6 &&
736
- buf[0] === 0x37 &&
737
- buf[1] === 0x7a &&
738
- buf[2] === 0xbc &&
739
- buf[3] === 0xaf &&
740
- buf[4] === 0x27 &&
741
- buf[5] === 0x1c) {
742
- return "7-Zip archive";
743
- }
744
- // ISO 9660: "CD001" at offset 32769
745
- if (buf.length > 32773 &&
746
- buf.subarray(32769, 32774).toString("ascii") === "CD001") {
747
- return "ISO 9660 disc image";
748
- }
749
- // Apache Parquet: "PAR1"
750
- if (buf[0] === 0x50 &&
751
- buf[1] === 0x41 &&
752
- buf[2] === 0x52 &&
753
- buf[3] === 0x31) {
754
- return "Apache Parquet data file";
755
- }
756
- // Protocol Buffers compiled: (no fixed magic, skip)
757
- // TIFF (already handled as image, but including for completeness)
758
- if ((buf[0] === 0x49 &&
759
- buf[1] === 0x49 &&
760
- buf[2] === 0x2a &&
761
- buf[3] === 0x00) ||
762
- (buf[0] === 0x4d && buf[1] === 0x4d && buf[2] === 0x00 && buf[3] === 0x2a)) {
763
- return "TIFF image";
764
- }
765
- // ICO: 00 00 01 00
766
- if (buf[0] === 0x00 &&
767
- buf[1] === 0x00 &&
768
- buf[2] === 0x01 &&
769
- buf[3] === 0x00) {
770
- return "ICO icon image";
771
- }
772
- return null;
773
- }
774
- /**
775
- * Extract printable ASCII strings from a binary buffer.
776
- * Similar to the Unix `strings` utility.
777
- *
778
- * @param buf Buffer to scan
779
- * @param minLength Minimum string length to include (default 4)
780
- * @param maxStrings Maximum number of strings to return (default 50)
781
- * @returns Array of printable strings found in the binary
782
- */
783
- static extractPrintableStrings(buf, minLength = 4, maxStrings = 50) {
784
- const strings = [];
785
- let current = "";
786
- // Only scan first 64KB to avoid huge processing time
787
- const scanLimit = Math.min(buf.length, 64 * 1024);
788
- for (let i = 0; i < scanLimit; i++) {
789
- const byte = buf[i];
790
- // Printable ASCII range (space through tilde) plus tab
791
- if ((byte >= 0x20 && byte <= 0x7e) || byte === 0x09) {
792
- current += String.fromCharCode(byte);
793
- }
794
- else {
795
- if (current.length >= minLength) {
796
- strings.push(current);
797
- if (strings.length >= maxStrings) {
798
- break;
799
- }
800
- }
801
- current = "";
802
- }
803
- }
804
- // Flush last string
805
- if (current.length >= minLength && strings.length < maxStrings) {
806
- strings.push(current);
807
- }
808
- return strings;
809
- }
810
- /**
811
- * Route to appropriate processor
812
- */
813
- static async processFile(content, detection, options, provider) {
814
- switch (detection.type) {
815
- case "csv":
816
- // Pass original extension through to CSV processor; if detection has none,
817
- // fall back to any extension provided in csvOptions.
818
- return await CSVProcessor.process(content, {
819
- ...options,
820
- extension: detection.extension ?? options?.extension,
821
- });
822
- case "image":
823
- return await ImageProcessor.process(content);
824
- case "pdf":
825
- return await PDFProcessor.process(content, { provider });
826
- case "svg":
827
- // SVG is processed as text content (sanitized XML markup)
828
- // AI providers don't support SVG as image format, so we extract text content
829
- return await FileDetector.processSvgAsText(content, detection);
830
- case "video":
831
- return await FileDetector.processVideoFile(content, detection);
832
- case "audio":
833
- return await FileDetector.processAudioFile(content, detection);
834
- case "archive":
835
- return await FileDetector.processArchiveFile(content, detection);
836
- case "xlsx":
837
- return await FileDetector.processXlsxFile(content, detection);
838
- case "docx":
839
- return await FileDetector.processDocxFile(content, detection);
840
- case "pptx":
841
- return await FileDetector.processPptxFile(content, detection);
842
- case "text":
843
- return {
844
- type: "text",
845
- content: content.toString("utf-8"),
846
- mimeType: detection.mimeType || "text/plain",
847
- metadata: detection.metadata,
848
- };
849
- default: {
850
- // Graceful degradation: try to treat unknown types as text if content is valid UTF-8
851
- const unknownContent = content.toString("utf-8");
852
- if (FileDetector.isValidText(unknownContent)) {
853
- logger.warn(`[FileDetector] Unknown type "${detection.type}", treating as text`);
854
- return {
855
- type: "text",
856
- content: unknownContent,
857
- mimeType: detection.mimeType || "text/plain",
858
- metadata: detection.metadata,
859
- };
860
- }
861
- // Binary file that we can't fully process — extract what we can
862
- // (metadata, printable strings, signature identification)
863
- const filename = detection.metadata.filename || "file";
864
- logger.warn(`[FileDetector] Unknown binary type "${detection.type}", extracting metadata for "${filename}"`);
865
- return {
866
- type: "unknown",
867
- content: FileDetector.extractBinaryMetadata(content, detection, filename),
868
- mimeType: detection.mimeType || "application/octet-stream",
869
- metadata: detection.metadata,
870
- };
871
- }
872
- }
873
- }
874
- /**
875
- * Process video file: extract metadata, keyframes, and subtitles via VideoProcessor
876
- */
877
- static async processVideoFile(content, detection) {
878
- const videoFilename = detection.metadata.filename || "video";
879
- try {
880
- const videoResult = await videoProcessor.processFile({
881
- id: videoFilename,
882
- name: videoFilename,
883
- mimetype: detection.mimeType || "video/mp4",
884
- size: content.length,
885
- buffer: content,
886
- });
887
- if (videoResult.success && videoResult.data) {
888
- return {
889
- type: "video",
890
- content: videoResult.data.textContent ||
891
- FileDetector.formatInformativePlaceholder("Video", videoFilename, content, detection),
892
- mimeType: detection.mimeType,
893
- images: videoResult.data.keyframes && videoResult.data.keyframes.length > 0
894
- ? videoResult.data.keyframes
895
- : undefined,
896
- metadata: {
897
- ...detection.metadata,
898
- frameCount: videoResult.data.frameCount,
899
- hasKeyframes: videoResult.data.hasKeyframes,
900
- },
901
- };
902
- }
903
- }
904
- catch (videoError) {
905
- logger.warn(`[FileDetector] VideoProcessor failed for ${videoFilename}, using fallback`, videoError instanceof Error ? videoError.message : String(videoError));
906
- return {
907
- type: "video",
908
- content: FileDetector.formatInformativePlaceholder("Video", videoFilename, content, detection, videoError),
909
- mimeType: detection.mimeType,
910
- metadata: detection.metadata,
911
- };
912
- }
913
- // Fallback if processor returned no data
914
- return {
915
- type: "video",
916
- content: FileDetector.formatInformativePlaceholder("Video", videoFilename, content, detection),
917
- mimeType: detection.mimeType,
918
- metadata: detection.metadata,
919
- };
920
- }
921
- /**
922
- * Process audio file: extract metadata, tags, and cover art via AudioProcessor
923
- */
924
- static async processAudioFile(content, detection) {
925
- const audioFilename = detection.metadata.filename || "audio";
926
- try {
927
- const audioResult = await audioProcessor.processFile({
928
- id: audioFilename,
929
- name: audioFilename,
930
- mimetype: detection.mimeType || "audio/mpeg",
931
- size: content.length,
932
- buffer: content,
933
- });
934
- if (audioResult.success && audioResult.data) {
935
- return {
936
- type: "audio",
937
- content: audioResult.data.textContent ||
938
- FileDetector.formatInformativePlaceholder("Audio", audioFilename, content, detection),
939
- mimeType: detection.mimeType,
940
- // Surface embedded cover art as an image content block
941
- images: audioResult.data.coverArt
942
- ? [audioResult.data.coverArt]
943
- : undefined,
944
- metadata: detection.metadata,
945
- };
946
- }
947
- }
948
- catch (audioError) {
949
- logger.warn(`[FileDetector] AudioProcessor failed for ${audioFilename}, using fallback`, audioError instanceof Error ? audioError.message : String(audioError));
950
- return {
951
- type: "audio",
952
- content: FileDetector.formatInformativePlaceholder("Audio", audioFilename, content, detection, audioError),
953
- mimeType: detection.mimeType,
954
- metadata: detection.metadata,
955
- };
956
- }
957
- // Fallback if processor returned no data
958
- return {
959
- type: "audio",
960
- content: FileDetector.formatInformativePlaceholder("Audio", audioFilename, content, detection),
961
- mimeType: detection.mimeType,
962
- metadata: detection.metadata,
963
- };
964
- }
965
- /**
966
- * Process archive file: list contents and extract metadata via ArchiveProcessor
967
- */
968
- static async processArchiveFile(content, detection) {
969
- const archiveFilename = detection.metadata.filename || "archive";
970
- try {
971
- const archiveResult = await archiveProcessor.processFile({
972
- id: archiveFilename,
973
- name: archiveFilename,
974
- mimetype: detection.mimeType || "application/zip",
975
- size: content.length,
976
- buffer: content,
977
- });
978
- if (archiveResult.success && archiveResult.data) {
979
- return {
980
- type: "archive",
981
- content: archiveResult.data.textContent ||
982
- FileDetector.formatInformativePlaceholder("Archive", archiveFilename, content, detection),
983
- mimeType: detection.mimeType,
984
- metadata: detection.metadata,
985
- };
986
- }
987
- }
988
- catch (archiveError) {
989
- logger.warn(`[FileDetector] ArchiveProcessor failed for ${archiveFilename}, using fallback`, archiveError instanceof Error
990
- ? archiveError.message
991
- : String(archiveError));
992
- return {
993
- type: "archive",
994
- content: FileDetector.formatInformativePlaceholder("Archive", archiveFilename, content, detection, archiveError),
995
- mimeType: detection.mimeType,
996
- metadata: detection.metadata,
997
- };
998
- }
999
- // Fallback if processor returned no data
1000
- return {
1001
- type: "archive",
1002
- content: FileDetector.formatInformativePlaceholder("Archive", archiveFilename, content, detection),
1003
- mimeType: detection.mimeType,
1004
- metadata: detection.metadata,
1005
- };
1006
- }
1007
- /**
1008
- * Process Excel/OpenDocument spreadsheet file via ExcelProcessor or OpenDocumentProcessor
1009
- */
1010
- static async processXlsxFile(content, detection) {
1011
- const xlsxFilename = detection.metadata.filename || "spreadsheet";
1012
- try {
1013
- const ext = detection.extension?.toLowerCase();
1014
- if (ext === "ods") {
1015
- const { openDocumentProcessor } = await import("../processors/document/OpenDocumentProcessor.js");
1016
- const odsResult = await openDocumentProcessor.processFile({
1017
- id: xlsxFilename,
1018
- name: xlsxFilename,
1019
- mimetype: detection.mimeType ||
1020
- "application/vnd.oasis.opendocument.spreadsheet",
1021
- size: content.length,
1022
- buffer: content,
1023
- });
1024
- if (odsResult.success && odsResult.data) {
1025
- return {
1026
- type: "xlsx",
1027
- content: odsResult.data.textContent ||
1028
- FileDetector.formatInformativePlaceholder("Spreadsheet", xlsxFilename, content, detection),
1029
- mimeType: detection.mimeType,
1030
- metadata: detection.metadata,
1031
- };
1032
- }
1033
- }
1034
- else {
1035
- const { excelProcessor } = await import("../processors/document/ExcelProcessor.js");
1036
- const xlsxResult = await excelProcessor.processFile({
1037
- id: xlsxFilename,
1038
- name: xlsxFilename,
1039
- mimetype: detection.mimeType ||
1040
- "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
1041
- size: content.length,
1042
- buffer: content,
1043
- });
1044
- if (xlsxResult.success && xlsxResult.data) {
1045
- // Build text content from worksheets
1046
- const sheets = xlsxResult.data.worksheets || [];
1047
- let textContent = `Spreadsheet: ${sheets.length} sheet(s), ${xlsxResult.data.totalRows} total rows\n`;
1048
- for (const sheet of sheets) {
1049
- textContent += `\n### Sheet: ${sheet.name}\n`;
1050
- textContent += `Columns (${sheet.columnCount}): ${sheet.headers.join(", ")}\n`;
1051
- textContent += `Rows: ${sheet.rowCount}\n`;
1052
- // Include first rows as sample data
1053
- const sampleRows = sheet.rows.slice(0, 20);
1054
- const rowText = sampleRows
1055
- .map((row) => row.map((c) => String(c ?? "")).join("\t"))
1056
- .join("\n");
1057
- if (!rowText) {
1058
- continue;
1059
- }
1060
- textContent += `\nData:\n${sheet.headers.join("\t")}\n${rowText}\n`;
1061
- const remaining = sheet.rowCount - 20;
1062
- if (remaining > 0) {
1063
- textContent += `... (${remaining} more rows)\n`;
1064
- }
1065
- }
1066
- return {
1067
- type: "xlsx",
1068
- content: textContent,
1069
- mimeType: detection.mimeType,
1070
- metadata: detection.metadata,
1071
- };
1072
- }
1073
- }
1074
- }
1075
- catch (xlsxError) {
1076
- logger.warn(`[FileDetector] ExcelProcessor failed for ${xlsxFilename}, using fallback`, xlsxError instanceof Error ? xlsxError.message : String(xlsxError));
1077
- return {
1078
- type: "xlsx",
1079
- content: FileDetector.formatInformativePlaceholder("Spreadsheet", xlsxFilename, content, detection, xlsxError),
1080
- mimeType: detection.mimeType,
1081
- metadata: detection.metadata,
1082
- };
1083
- }
1084
- // Fallback if processor returned no data
1085
- return {
1086
- type: "xlsx",
1087
- content: FileDetector.formatInformativePlaceholder("Spreadsheet", xlsxFilename, content, detection),
1088
- mimeType: detection.mimeType,
1089
- metadata: detection.metadata,
1090
- };
1091
- }
1092
- /**
1093
- * Process Word/OpenDocument/RTF document via WordProcessor, OpenDocumentProcessor, or RtfProcessor
1094
- */
1095
- static async processDocxFile(content, detection) {
1096
- const docxFilename = detection.metadata.filename || "document";
1097
- const ext = detection.extension?.toLowerCase();
1098
- try {
1099
- if (ext === "odt") {
1100
- const { openDocumentProcessor } = await import("../processors/document/OpenDocumentProcessor.js");
1101
- const odtResult = await openDocumentProcessor.processFile({
1102
- id: docxFilename,
1103
- name: docxFilename,
1104
- mimetype: detection.mimeType || "application/vnd.oasis.opendocument.text",
1105
- size: content.length,
1106
- buffer: content,
1107
- });
1108
- if (odtResult.success && odtResult.data) {
1109
- return {
1110
- type: "docx",
1111
- content: odtResult.data.textContent ||
1112
- FileDetector.formatInformativePlaceholder("Document", docxFilename, content, detection),
1113
- mimeType: detection.mimeType,
1114
- metadata: detection.metadata,
1115
- };
1116
- }
1117
- }
1118
- else if (ext === "rtf") {
1119
- const { rtfProcessor } = await import("../processors/document/RtfProcessor.js");
1120
- const rtfResult = await rtfProcessor.processFile({
1121
- id: docxFilename,
1122
- name: docxFilename,
1123
- mimetype: detection.mimeType || "application/rtf",
1124
- size: content.length,
1125
- buffer: content,
1126
- });
1127
- if (rtfResult.success && rtfResult.data) {
1128
- return {
1129
- type: "docx",
1130
- content: rtfResult.data.textContent ||
1131
- FileDetector.formatInformativePlaceholder("Document", docxFilename, content, detection),
1132
- mimeType: detection.mimeType,
1133
- metadata: detection.metadata,
1134
- };
1135
- }
1136
- }
1137
- else {
1138
- const { wordProcessor } = await import("../processors/document/WordProcessor.js");
1139
- const docxResult = await wordProcessor.processFile({
1140
- id: docxFilename,
1141
- name: docxFilename,
1142
- mimetype: detection.mimeType ||
1143
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
1144
- size: content.length,
1145
- buffer: content,
1146
- });
1147
- if (docxResult.success && docxResult.data) {
1148
- return {
1149
- type: "docx",
1150
- content: docxResult.data.textContent ||
1151
- FileDetector.formatInformativePlaceholder("Document", docxFilename, content, detection),
1152
- mimeType: detection.mimeType,
1153
- metadata: detection.metadata,
1154
- };
1155
- }
1156
- }
1157
- }
1158
- catch (docxError) {
1159
- logger.warn(`[FileDetector] Document processor failed for ${docxFilename}, using fallback`, docxError instanceof Error ? docxError.message : String(docxError));
1160
- return {
1161
- type: "docx",
1162
- content: FileDetector.formatInformativePlaceholder("Document", docxFilename, content, detection, docxError),
1163
- mimeType: detection.mimeType,
1164
- metadata: detection.metadata,
1165
- };
1166
- }
1167
- // Fallback if processor returned no data
1168
- return {
1169
- type: "docx",
1170
- content: FileDetector.formatInformativePlaceholder("Document", docxFilename, content, detection),
1171
- mimeType: detection.mimeType,
1172
- metadata: detection.metadata,
1173
- };
1174
- }
1175
- /**
1176
- * Process PowerPoint/OpenDocument presentation via PptxProcessor
1177
- */
1178
- static async processPptxFile(content, detection) {
1179
- const pptxFilename = detection.metadata.filename || "presentation";
1180
- try {
1181
- const { PptxProcessor } = await import("../processors/document/PptxProcessor.js");
1182
- const pptxResult = await PptxProcessor.extractText(content);
1183
- if (pptxResult) {
1184
- return {
1185
- type: "pptx",
1186
- content: pptxResult,
1187
- mimeType: detection.mimeType,
1188
- metadata: detection.metadata,
1189
- };
1190
- }
1191
- }
1192
- catch (pptxError) {
1193
- logger.warn(`[FileDetector] PptxProcessor failed for ${pptxFilename}, using fallback`, pptxError instanceof Error ? pptxError.message : String(pptxError));
1194
- return {
1195
- type: "pptx",
1196
- content: FileDetector.formatInformativePlaceholder("Presentation", pptxFilename, content, detection, pptxError),
1197
- mimeType: detection.mimeType,
1198
- metadata: detection.metadata,
1199
- };
1200
- }
1201
- // Fallback if processor returned no content
1202
- return {
1203
- type: "pptx",
1204
- content: FileDetector.formatInformativePlaceholder("Presentation", pptxFilename, content, detection),
1205
- mimeType: detection.mimeType,
1206
- metadata: detection.metadata,
1207
- };
1208
- }
1209
- /**
1210
- * Process SVG file as text content
1211
- * Uses SvgProcessor for security sanitization (removes XSS vectors)
1212
- * Returns sanitized SVG markup as text for AI analysis
1213
- */
1214
- static async processSvgAsText(content, detection) {
1215
- try {
1216
- // Dynamic import to avoid circular dependencies
1217
- const { processSvg } = await import("../processors/markup/SvgProcessor.js");
1218
- const result = await processSvg({
1219
- id: "svg-file",
1220
- name: detection.metadata.filename || "image.svg",
1221
- mimetype: "image/svg+xml",
1222
- size: content.length,
1223
- buffer: content,
1224
- });
1225
- if (result.success && result.data) {
1226
- logger.info(`[FileDetector] SVG processed as text: ${detection.metadata.filename || "image.svg"}`);
1227
- return {
1228
- type: "svg",
1229
- content: result.data.textContent, // Sanitized SVG content
1230
- mimeType: "image/svg+xml",
1231
- metadata: {
1232
- confidence: detection.metadata.confidence,
1233
- size: content.length,
1234
- filename: detection.metadata.filename,
1235
- extension: detection.extension,
1236
- },
1237
- };
1238
- }
1239
- else {
1240
- // Fail closed: return safe empty SVG instead of raw unsanitized content
1241
- logger.warn(`[FileDetector] SVG processor failed, returning safe empty SVG: ${result.error?.userMessage}`);
1242
- return {
1243
- type: "svg",
1244
- content: '<svg xmlns="http://www.w3.org/2000/svg"></svg>',
1245
- mimeType: "image/svg+xml",
1246
- metadata: {
1247
- confidence: detection.metadata.confidence,
1248
- size: content.length,
1249
- filename: detection.metadata.filename,
1250
- extension: detection.extension,
1251
- },
1252
- };
1253
- }
1254
- }
1255
- catch (error) {
1256
- // Fail closed: return safe empty SVG instead of raw unsanitized content
1257
- logger.warn(`[FileDetector] SVG processor not available, returning safe empty SVG: ${error instanceof Error ? error.message : String(error)}`);
1258
- return {
1259
- type: "svg",
1260
- content: '<svg xmlns="http://www.w3.org/2000/svg"></svg>',
1261
- mimeType: "image/svg+xml",
1262
- metadata: {
1263
- confidence: detection.metadata.confidence,
1264
- size: content.length,
1265
- filename: detection.metadata.filename,
1266
- extension: detection.extension,
1267
- },
1268
- };
1269
- }
1270
- }
1271
- /**
1272
- * Load file from URL with automatic retry on transient network errors
1273
- */
1274
- static async loadFromURL(url, options) {
1275
- const maxSize = options?.maxSize || 200 * 1024 * 1024; // 200MB default (matches Curator memory-safety cap)
1276
- const timeout = options?.timeout || FileDetector.DEFAULT_NETWORK_TIMEOUT;
1277
- const maxRetries = options?.maxRetries ?? DEFAULT_MAX_RETRIES;
1278
- const retryDelay = options?.retryDelay ?? DEFAULT_RETRY_DELAY;
1279
- return withRetry(async () => {
1280
- const response = await request(url, {
1281
- dispatcher: getGlobalDispatcher().compose(interceptors.redirect({ maxRedirections: 5 })),
1282
- method: "GET",
1283
- headersTimeout: timeout,
1284
- bodyTimeout: timeout,
1285
- });
1286
- if (response.statusCode !== 200) {
1287
- throw new Error(`HTTP ${response.statusCode}`);
1288
- }
1289
- const chunks = [];
1290
- let totalSize = 0;
1291
- for await (const chunk of response.body) {
1292
- totalSize += chunk.length;
1293
- if (totalSize > maxSize) {
1294
- throw new Error(`File too large: ${formatFileSize(totalSize)} (max: ${formatFileSize(maxSize)})`);
1295
- }
1296
- chunks.push(chunk);
1297
- }
1298
- return Buffer.concat(chunks);
1299
- }, { maxRetries, retryDelay });
1300
- }
1301
- /**
1302
- * Load file from filesystem path
1303
- */
1304
- static async loadFromPath(path, options) {
1305
- const maxSize = options?.maxSize || 200 * 1024 * 1024; // 200MB default (matches Curator memory-safety cap)
1306
- const statInfo = await stat(path);
1307
- if (!statInfo.isFile()) {
1308
- throw new Error("Not a file");
1309
- }
1310
- if (statInfo.size > maxSize) {
1311
- throw new Error(`File too large: ${formatFileSize(statInfo.size)} (max: ${formatFileSize(maxSize)})`);
1312
- }
1313
- return await readFile(path);
1314
- }
1315
- /**
1316
- * Load file from data URI
1317
- */
1318
- static loadFromDataURI(dataUri) {
1319
- const match = dataUri.match(/^data:([^;]+);base64,(.+)$/);
1320
- if (!match) {
1321
- throw new Error("Invalid data URI format");
1322
- }
1323
- return Buffer.from(match[2], "base64");
1324
- }
1325
- }
1326
- /**
1327
- * Strategy 1: Magic Bytes Detection (95% confidence)
1328
- * Detects file type from binary file headers
1329
- */
1330
/**
 * Detects a file type from the signature bytes at the start of a Buffer.
 *
 * Non-Buffer input and unrecognised signatures yield the `unknown()` result
 * (confidence 0). Checks run in a fixed order and the first match wins.
 */
class MagicBytesStrategy {
    /**
     * @param {*} input - Candidate data; only Buffers are inspected.
     * @returns {Promise<object>} Detection result with `type`, `mimeType`,
     *   `extension: null`, `source: "buffer"` and `metadata.confidence`.
     */
    async detect(input) {
        if (!Buffer.isBuffer(input)) {
            return this.unknown();
        }
        if (this.isPNG(input)) {
            return this.result("image", "image/png", 95);
        }
        if (this.isJPEG(input)) {
            return this.result("image", "image/jpeg", 95);
        }
        if (this.isGIF(input)) {
            return this.result("image", "image/gif", 95);
        }
        if (this.isWebP(input)) {
            return this.result("image", "image/webp", 95);
        }
        if (this.isPDF(input)) {
            return this.result("pdf", "application/pdf", 95);
        }
        // ISO base media container: "ftyp" at offset 4. The brand that follows
        // decides whether this is video, audio or a still image.
        if (this.hasAscii(input, "ftyp", 4)) {
            return this.detectFtypBrand(input);
        }
        // MKV/WebM: EBML header
        if (this.hasBytes(input, [0x1a, 0x45, 0xdf, 0xa3])) {
            return this.result("video", "video/x-matroska", 90);
        }
        // AVI: "RIFF" + "AVI "
        if (this.hasAscii(input, "RIFF") && this.hasAscii(input, "AVI ", 8)) {
            return this.result("video", "video/x-msvideo", 95);
        }
        // WAV: "RIFF" + "WAVE"
        if (this.hasAscii(input, "RIFF") && this.hasAscii(input, "WAVE", 8)) {
            return this.result("audio", "audio/wav", 95);
        }
        // MP3: ID3 tag
        if (this.hasAscii(input, "ID3")) {
            return this.result("audio", "audio/mpeg", 95);
        }
        // MP3: frame sync word (0xFF followed by three set bits); lower
        // confidence because two bytes are a weak signature.
        if (input.length >= 2 && input[0] === 0xff && (input[1] & 0xe0) === 0xe0) {
            return this.result("audio", "audio/mpeg", 80);
        }
        // FLAC: "fLaC"
        if (this.hasAscii(input, "fLaC")) {
            return this.result("audio", "audio/flac", 95);
        }
        // OGG: "OggS"
        if (this.hasAscii(input, "OggS")) {
            return this.result("audio", "audio/ogg", 90);
        }
        // ZIP: "PK\x03\x04"
        // NOTE: Many document formats (OOXML: .xlsx, .docx, .pptx; ODF: .odt, .ods)
        // are internally ZIP archives and share these magic bytes. We return a lower
        // confidence (70%) so the ExtensionStrategy (85%) can override with the correct
        // document type when a file path with extension is available. For raw buffers
        // without path info, this falls through to archive as a safe default.
        if (this.hasBytes(input, [0x50, 0x4b, 0x03, 0x04])) {
            return this.result("archive", "application/zip", 70);
        }
        // GZIP: 1F 8B
        if (this.hasBytes(input, [0x1f, 0x8b])) {
            return this.result("archive", "application/gzip", 90);
        }
        // RAR: "Rar!"
        if (this.hasAscii(input, "Rar!")) {
            return this.result("archive", "application/x-rar-compressed", 95);
        }
        return this.unknown();
    }
    /**
     * Classify an `ftyp` container by the four-character brand at bytes 8-11.
     *
     * Only brands whose target types already appear elsewhere in this file
     * (AVIF image, M4A audio, QuickTime video) are special-cased. Any other
     * or missing brand keeps the generic `video/mp4` answer.
     */
    detectFtypBrand(buf) {
        const brand = buf.length >= 12 ? buf.toString("latin1", 8, 12) : "";
        switch (brand) {
            case "avif":
            case "avis":
                return this.result("image", "image/avif", 95);
            case "M4A ":
            case "M4B ":
                return this.result("audio", "audio/mp4", 95);
            case "qt  ":
                return this.result("video", "video/quicktime", 95);
            default:
                return this.result("video", "video/mp4", 95);
        }
    }
    /** True when `buf` holds exactly `bytes` starting at `offset`. */
    hasBytes(buf, bytes, offset = 0) {
        return (buf.length >= offset + bytes.length &&
            bytes.every((byte, index) => buf[offset + index] === byte));
    }
    /** True when `buf` holds the single-byte characters of `text` at `offset`. */
    hasAscii(buf, text, offset = 0) {
        const end = offset + text.length;
        // latin1 maps each byte to exactly one character, so this is a byte compare.
        return buf.length >= end && buf.toString("latin1", offset, end) === text;
    }
    isPNG(buf) {
        return this.hasBytes(buf, [0x89, 0x50, 0x4e, 0x47]);
    }
    isJPEG(buf) {
        return this.hasBytes(buf, [0xff, 0xd8, 0xff]);
    }
    isGIF(buf) {
        return this.hasAscii(buf, "GIF8");
    }
    isWebP(buf) {
        return this.hasAscii(buf, "RIFF") && this.hasAscii(buf, "WEBP", 8);
    }
    isPDF(buf) {
        return this.hasAscii(buf, "%PDF-");
    }
    /** Build a positive detection result for a buffer source. */
    result(type, mime, confidence) {
        return {
            type,
            mimeType: mime,
            extension: null,
            source: "buffer",
            metadata: { confidence },
        };
    }
    /** Build the "no match" result (confidence 0). */
    unknown() {
        return {
            type: "unknown",
            mimeType: "application/octet-stream",
            extension: null,
            source: "buffer",
            metadata: { confidence: 0 },
        };
    }
}
1488
- /**
1489
- * Strategy 2: MIME Type Detection (85% confidence)
1490
- * Detects file type from HTTP Content-Type headers
1491
- */
1492
/**
 * Detects a file type from the Content-Type header of an HTTP(S) resource.
 *
 * A HEAD request is issued for http:// and https:// strings. Any other input,
 * a failed request or a non-success status yields the `unknown()` result.
 */
class MimeTypeStrategy {
    /**
     * @param {*} input - Candidate source; only http(s) URL strings are probed.
     * @returns {Promise<object>} Detection result. Confidence is 85 when the
     *   header maps to a known type and 0 otherwise.
     */
    async detect(input) {
        if (typeof input !== "string" || !this.isURL(input)) {
            return this.unknown();
        }
        try {
            const response = await request(input, {
                dispatcher: getGlobalDispatcher().compose(interceptors.redirect({ maxRedirections: 5 })),
                method: "HEAD",
                headersTimeout: FileDetector.DEFAULT_HEAD_TIMEOUT,
                bodyTimeout: FileDetector.DEFAULT_HEAD_TIMEOUT,
            });
            // The Content-Type of an error response describes the error page,
            // not the resource, so it must not be reported at 85% confidence.
            if (response.statusCode < 200 || response.statusCode >= 300) {
                return this.unknown();
            }
            // A repeated header arrives as an array; use its first value.
            const header = response.headers["content-type"];
            const contentType = (Array.isArray(header) ? header[0] : header) || "";
            const type = this.mimeToFileType(contentType);
            return {
                type,
                mimeType: contentType.split(";")[0].trim(),
                extension: null,
                source: "url",
                metadata: { confidence: type !== "unknown" ? 85 : 0 },
            };
        }
        catch {
            // Network or parsing failure: let the other strategies decide.
            return this.unknown();
        }
    }
    /**
     * Map a Content-Type value to a file type.
     *
     * Parameters after `;` are ignored and matching is case-insensitive.
     * Exact matches are consulted before the `image/`, `video/`, `audio/` and
     * `text/` prefixes, so `image/svg+xml`, `text/csv` and `text/rtf` keep
     * their dedicated types.
     *
     * @param {string} mime - Raw header value.
     * @returns {string} One of the file type names, or "unknown".
     */
    mimeToFileType(mime) {
        const lower = mime.toLowerCase().split(";")[0].trim();
        const exactTypes = new Map([
            // CSV
            ["text/csv", "csv"],
            ["text/tab-separated-values", "csv"],
            // SVG is processed as text/markup, NOT as image
            ["image/svg+xml", "svg"],
            // PDF
            ["application/pdf", "pdf"],
            // Word-processing documents: OOXML, legacy Word, OpenDocument, RTF
            ["application/vnd.openxmlformats-officedocument.wordprocessingml.document", "docx"],
            ["application/msword", "docx"],
            ["application/vnd.oasis.opendocument.text", "docx"],
            ["application/rtf", "docx"],
            ["text/rtf", "docx"],
            // Spreadsheets
            ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "xlsx"],
            ["application/vnd.ms-excel", "xlsx"],
            ["application/vnd.oasis.opendocument.spreadsheet", "xlsx"],
            // Presentations
            ["application/vnd.openxmlformats-officedocument.presentationml.presentation", "pptx"],
            ["application/vnd.ms-powerpoint", "pptx"],
            ["application/vnd.oasis.opendocument.presentation", "pptx"],
            // Archive formats
            ["application/zip", "archive"],
            ["application/x-zip-compressed", "archive"],
            ["application/gzip", "archive"],
            ["application/x-gzip", "archive"],
            ["application/x-tar", "archive"],
            ["application/x-compressed-tar", "archive"],
            ["application/java-archive", "archive"],
            ["application/x-rar-compressed", "archive"],
            ["application/vnd.rar", "archive"],
            ["application/x-7z-compressed", "archive"],
            // Structured text served under application/*
            ["application/json", "text"],
            ["application/xml", "text"],
            ["application/yaml", "text"],
            ["application/x-yaml", "text"],
        ]);
        const exact = exactTypes.get(lower);
        if (exact !== undefined) {
            return exact;
        }
        if (lower.startsWith("image/")) {
            return "image";
        }
        if (lower.startsWith("video/")) {
            return "video";
        }
        if (lower.startsWith("audio/")) {
            return "audio";
        }
        // Every remaining text/* subtype (plain, markdown, html, x-*, ...) is text.
        if (lower.startsWith("text/")) {
            return "text";
        }
        return "unknown";
    }
    /** True for strings that begin with an http:// or https:// scheme. */
    isURL(str) {
        return str.startsWith("http://") || str.startsWith("https://");
    }
    /** Build the "no match" result (confidence 0). */
    unknown() {
        return {
            type: "unknown",
            mimeType: "application/octet-stream",
            extension: null,
            source: "buffer",
            metadata: { confidence: 0 },
        };
    }
}
1625
- /**
1626
- * Strategy 3: Extension Detection (85% confidence)
1627
- * Detects file type from file extension
1628
- */
1629
- class ExtensionStrategy {
1630
- async detect(input) {
1631
- if (typeof input !== "string") {
1632
- return this.unknown();
1633
- }
1634
- const ext = this.getExtension(input);
1635
- if (!ext) {
1636
- return this.unknown();
1637
- }
1638
- const typeMap = {
1639
- csv: "csv",
1640
- tsv: "csv",
1641
- jpg: "image",
1642
- jpeg: "image",
1643
- png: "image",
1644
- gif: "image",
1645
- webp: "image",
1646
- bmp: "image",
1647
- tiff: "image",
1648
- tif: "image",
1649
- // SVG is handled as text/markup, NOT as image
1650
- // AI providers don't support SVG format, so we process it as sanitized text
1651
- svg: "svg",
1652
- avif: "image",
1653
- pdf: "pdf",
1654
- // Video formats
1655
- mp4: "video",
1656
- mkv: "video",
1657
- mov: "video",
1658
- avi: "video",
1659
- webm: "video",
1660
- wmv: "video",
1661
- flv: "video",
1662
- // Audio formats
1663
- mp3: "audio",
1664
- wav: "audio",
1665
- ogg: "audio",
1666
- flac: "audio",
1667
- m4a: "audio",
1668
- aac: "audio",
1669
- wma: "audio",
1670
- opus: "audio",
1671
- // Archive formats
1672
- zip: "archive",
1673
- tar: "archive",
1674
- gz: "archive",
1675
- tgz: "archive",
1676
- rar: "archive",
1677
- "7z": "archive",
1678
- jar: "archive",
1679
- // Document formats (ZIP-based internally)
1680
- xlsx: "xlsx",
1681
- xls: "xlsx",
1682
- docx: "docx",
1683
- doc: "docx",
1684
- pptx: "pptx",
1685
- ppt: "pptx",
1686
- odt: "docx", // OpenDocument text → processed like docx
1687
- ods: "xlsx", // OpenDocument spreadsheet → processed like xlsx
1688
- odp: "pptx", // OpenDocument presentation → processed like pptx
1689
- rtf: "docx", // RTF → processed like docx (text extraction)
1690
- // Text/markup formats
1691
- txt: "text",
1692
- md: "text",
1693
- markdown: "text",
1694
- json: "text",
1695
- xml: "text",
1696
- yaml: "text",
1697
- yml: "text",
1698
- html: "text",
1699
- htm: "text",
1700
- css: "text",
1701
- log: "text",
1702
- conf: "text",
1703
- cfg: "text",
1704
- ini: "text",
1705
- env: "text",
1706
- toml: "text",
1707
- properties: "text",
1708
- gitignore: "text",
1709
- dockerignore: "text",
1710
- editorconfig: "text",
1711
- prettierrc: "text",
1712
- eslintrc: "text",
1713
- babelrc: "text",
1714
- // Source code formats
1715
- js: "text",
1716
- mjs: "text",
1717
- cjs: "text",
1718
- jsx: "text",
1719
- ts: "text",
1720
- tsx: "text",
1721
- py: "text",
1722
- java: "text",
1723
- go: "text",
1724
- rs: "text",
1725
- rb: "text",
1726
- php: "text",
1727
- c: "text",
1728
- cpp: "text",
1729
- cc: "text",
1730
- h: "text",
1731
- hpp: "text",
1732
- cs: "text",
1733
- swift: "text",
1734
- kt: "text",
1735
- kts: "text",
1736
- scala: "text",
1737
- sh: "text",
1738
- bash: "text",
1739
- zsh: "text",
1740
- ps1: "text",
1741
- sql: "text",
1742
- r: "text",
1743
- lua: "text",
1744
- pl: "text",
1745
- perl: "text",
1746
- dart: "text",
1747
- ex: "text",
1748
- exs: "text",
1749
- erl: "text",
1750
- hs: "text",
1751
- clj: "text",
1752
- lisp: "text",
1753
- vim: "text",
1754
- // Additional video/image
1755
- m4v: "video",
1756
- ico: "image",
1757
- };
1758
- const type = typeMap[ext.toLowerCase()];
1759
- return {
1760
- type: type || "unknown",
1761
- mimeType: this.getMimeType(ext),
1762
- extension: ext,
1763
- source: this.detectSource(input),
1764
- metadata: { confidence: type ? 85 : 0 },
1765
- };
1766
- }
1767
- getExtension(input) {
1768
- if (this.isURL(input)) {
1769
- const url = new URL(input);
1770
- const match = url.pathname.match(/\.([^.]+)$/);
1771
- return match ? match[1] : null;
1772
- }
1773
- const match = input.match(/\.([^.]+)$/);
1774
- return match ? match[1] : null;
1775
- }
1776
- isURL(str) {
1777
- return str.startsWith("http://") || str.startsWith("https://");
1778
- }
1779
- detectSource(input) {
1780
- if (input.startsWith("data:")) {
1781
- return "datauri";
1782
- }
1783
- if (this.isURL(input)) {
1784
- return "url";
1785
- }
1786
- return "path";
1787
- }
1788
- getMimeType(ext) {
1789
- const mimeMap = {
1790
- csv: "text/csv",
1791
- tsv: "text/tab-separated-values",
1792
- jpg: "image/jpeg",
1793
- jpeg: "image/jpeg",
1794
- png: "image/png",
1795
- gif: "image/gif",
1796
- webp: "image/webp",
1797
- bmp: "image/bmp",
1798
- tiff: "image/tiff",
1799
- tif: "image/tiff",
1800
- svg: "image/svg+xml",
1801
- avif: "image/avif",
1802
- pdf: "application/pdf",
1803
- // Video MIME types
1804
- mp4: "video/mp4",
1805
- mkv: "video/x-matroska",
1806
- mov: "video/quicktime",
1807
- avi: "video/x-msvideo",
1808
- webm: "video/webm",
1809
- wmv: "video/x-ms-wmv",
1810
- flv: "video/x-flv",
1811
- // Audio MIME types
1812
- mp3: "audio/mpeg",
1813
- wav: "audio/wav",
1814
- ogg: "audio/ogg",
1815
- flac: "audio/flac",
1816
- m4a: "audio/mp4",
1817
- aac: "audio/aac",
1818
- wma: "audio/x-ms-wma",
1819
- opus: "audio/opus",
1820
- // Archive MIME types
1821
- zip: "application/zip",
1822
- tar: "application/x-tar",
1823
- gz: "application/gzip",
1824
- tgz: "application/gzip",
1825
- rar: "application/x-rar-compressed",
1826
- "7z": "application/x-7z-compressed",
1827
- jar: "application/java-archive",
1828
- // Document MIME types
1829
- xlsx: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
1830
- xls: "application/vnd.ms-excel",
1831
- docx: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
1832
- doc: "application/msword",
1833
- pptx: "application/vnd.openxmlformats-officedocument.presentationml.presentation",
1834
- ppt: "application/vnd.ms-powerpoint",
1835
- odt: "application/vnd.oasis.opendocument.text",
1836
- ods: "application/vnd.oasis.opendocument.spreadsheet",
1837
- odp: "application/vnd.oasis.opendocument.presentation",
1838
- rtf: "application/rtf",
1839
- // Text/markup MIME types
1840
- txt: "text/plain",
1841
- md: "text/markdown",
1842
- markdown: "text/markdown",
1843
- json: "application/json",
1844
- xml: "application/xml",
1845
- yaml: "application/yaml",
1846
- yml: "application/yaml",
1847
- html: "text/html",
1848
- htm: "text/html",
1849
- css: "text/css",
1850
- log: "text/plain",
1851
- conf: "text/plain",
1852
- cfg: "text/plain",
1853
- ini: "text/plain",
1854
- env: "text/plain",
1855
- toml: "text/plain",
1856
- properties: "text/plain",
1857
- gitignore: "text/plain",
1858
- dockerignore: "text/plain",
1859
- editorconfig: "text/plain",
1860
- prettierrc: "application/json",
1861
- eslintrc: "application/json",
1862
- babelrc: "application/json",
1863
- // Source code MIME types
1864
- js: "text/javascript",
1865
- mjs: "text/javascript",
1866
- cjs: "text/javascript",
1867
- jsx: "text/javascript",
1868
- ts: "text/typescript",
1869
- tsx: "text/typescript",
1870
- py: "text/x-python",
1871
- java: "text/x-java-source",
1872
- go: "text/x-go",
1873
- rs: "text/x-rustsrc",
1874
- rb: "text/x-ruby",
1875
- php: "text/x-php",
1876
- c: "text/x-c",
1877
- cpp: "text/x-c++",
1878
- cc: "text/x-c++",
1879
- h: "text/x-c",
1880
- hpp: "text/x-c++",
1881
- cs: "text/x-csharp",
1882
- swift: "text/x-swift",
1883
- kt: "text/x-kotlin",
1884
- kts: "text/x-kotlin",
1885
- scala: "text/x-scala",
1886
- sh: "text/x-shellscript",
1887
- bash: "text/x-shellscript",
1888
- zsh: "text/x-shellscript",
1889
- ps1: "text/x-powershell",
1890
- sql: "text/x-sql",
1891
- r: "text/x-r",
1892
- lua: "text/x-lua",
1893
- pl: "text/x-perl",
1894
- perl: "text/x-perl",
1895
- dart: "text/x-dart",
1896
- ex: "text/x-elixir",
1897
- exs: "text/x-elixir",
1898
- erl: "text/x-erlang",
1899
- hs: "text/x-haskell",
1900
- clj: "text/x-clojure",
1901
- lisp: "text/x-lisp",
1902
- vim: "text/plain",
1903
- // Additional video/image
1904
- m4v: "video/mp4",
1905
- ico: "image/x-icon",
1906
- };
1907
- return mimeMap[ext.toLowerCase()] || "application/octet-stream";
1908
- }
1909
- unknown() {
1910
- return {
1911
- type: "unknown",
1912
- mimeType: "application/octet-stream",
1913
- extension: null,
1914
- source: "buffer",
1915
- metadata: { confidence: 0 },
1916
- };
1917
- }
1918
- }
1919
/**
 * Strategy 4: Content Heuristics (60-75% confidence)
 * Detects file type by analyzing content patterns in a sample of the data
 * (at most the first 2000 bytes, decoded as UTF-8).
 */
class ContentHeuristicStrategy {
    /**
     * Classify the input by inspecting its content.
     *
     * @param {Buffer|string} input - a Buffer, a base64 data URI, or a file
     *   path. http(s) URLs are never fetched and yield the unknown result.
     * @returns {Promise<object>} a detection result; the unknown result when
     *   the content cannot be loaded or matches no heuristic
     */
    async detect(input) {
        let buffer;
        if (Buffer.isBuffer(input)) {
            buffer = input;
        }
        else if (typeof input === "string") {
            if (input.startsWith("data:")) {
                // Accept any media type / parameter list before ";base64,"
                // (e.g. "data:text/csv;charset=utf-8;base64,..."). The
                // header may not contain a comma, so the payload is never
                // mistaken for part of it.
                const match = input.match(/^data:[^,]*;base64,(.+)$/);
                if (!match) {
                    return this.unknown();
                }
                buffer = Buffer.from(match[1], "base64");
            }
            else if (input.startsWith("http://") || input.startsWith("https://")) {
                // URL - can't analyze without making an HTTP request here
                return this.unknown();
            }
            else {
                // File path - try to load it; an unreadable path is simply
                // "unknown" for this best-effort strategy.
                try {
                    buffer = await readFile(input);
                }
                catch {
                    return this.unknown();
                }
            }
        }
        else {
            return this.unknown();
        }
        const sample = buffer.toString("utf-8", 0, Math.min(2000, buffer.length));
        // Check for JSON first (more specific than CSV)
        if (this.looksLikeJSON(sample)) {
            return this.result("text", "application/json", 75);
        }
        // Check CSV after JSON (CSV is more generic)
        if (this.looksLikeCSV(sample)) {
            return this.result("csv", "text/csv", 75);
        }
        // Check for XML/HTML
        if (this.looksLikeXML(sample)) {
            const mime = this.looksLikeHTML(sample) ? "text/html" : "application/xml";
            return this.result("text", mime, 70);
        }
        // Check for YAML
        if (this.looksLikeYAML(sample)) {
            return this.result("text", "application/yaml", 70);
        }
        // Check for plain text (if mostly printable characters)
        if (this.looksLikeText(sample)) {
            return this.result("text", "text/plain", 60);
        }
        return this.unknown();
    }
    /**
     * Heuristic CSV check: needs at least two lines. With a delimiter in the
     * first line, at least 80% of lines must have the same delimiter count as
     * the first line. Without one, the text is checked as a single-column CSV.
     *
     * @param {string} text - content sample
     * @returns {boolean} true when the sample looks like delimited data
     */
    looksLikeCSV(text) {
        const lines = text.trim().split("\n");
        if (lines.length < 2) {
            return false;
        }
        // Detect delimiter from first line (first candidate present wins)
        const firstLine = lines[0];
        const delimiters = [",", ";", "\t", "|"];
        const delimiter = delimiters.find((d) => firstLine.includes(d));
        // Single-column CSV check (no delimiter)
        if (!delimiter) {
            // Exclude content that looks like other structured formats
            // YAML indicators
            if (text.startsWith("---") ||
                /^[\s]*-\s+/m.test(text) ||
                /^[\s]*[a-zA-Z_][a-zA-Z0-9_-]*:\s*/m.test(text)) {
                return false;
            }
            // XML/HTML indicators
            if (text.startsWith("<") || text.includes("<?xml")) {
                return false;
            }
            // JSON indicators
            if ((text.startsWith("{") && text.includes("}")) ||
                (text.startsWith("[") && text.includes("]"))) {
                return false;
            }
            // More than 4 words on any line suggests prose, not data
            const hasProsePattern = lines.some((line) => line.trim().split(/\s+/).length > 4);
            if (hasProsePattern) {
                return false;
            }
            // Check for consistent line structure (not binary, reasonable lengths)
            const hasReasonableLengths = lines.every((l) => l.length > 0 && l.length < 1000);
            const noBinaryChars = !text.includes("\0");
            // Line lengths must stay reasonably uniform: the coefficient of
            // variation (stdDev / mean) has to be below 0.75.
            const lengths = lines.map((l) => l.length);
            const avgLength = lengths.reduce((a, b) => a + b, 0) / lengths.length;
            const variance = lengths.reduce((sum, len) => sum + (len - avgLength) ** 2, 0) /
                lengths.length;
            const stdDev = Math.sqrt(variance);
            const hasUniformLengths = stdDev / avgLength < 0.75;
            return hasReasonableLengths && noBinaryChars && hasUniformLengths;
        }
        // Count delimiters per line and check consistency. Splitting on the
        // literal delimiter avoids building a RegExp from a character that
        // may be a regex metacharacter (such as "|").
        const counts = lines.map((line) => line.split(delimiter).length - 1);
        const firstCount = counts[0];
        const consistentLines = counts.filter((c) => c === firstCount).length;
        return consistentLines / lines.length >= 0.8;
    }
    /**
     * Delegate JSON detection to the shared hasJsonMarkers helper.
     *
     * @param {string} text - content sample
     * @returns {boolean} whatever hasJsonMarkers reports for the sample
     */
    looksLikeJSON(text) {
        // hasJsonMarkers now does full validation including JSON.parse
        return hasJsonMarkers(text);
    }
    /**
     * Case-insensitive check for the two HTML markers used by this strategy:
     * an HTML doctype or an opening html tag. HTML treats both as
     * case-insensitive, so "<!doctype html>" and "<HTML>" must count too.
     *
     * @param {string} text - content sample
     * @returns {boolean} true when either marker is present
     */
    looksLikeHTML(text) {
        const lowered = text.toLowerCase();
        return lowered.includes("<!doctype html") || lowered.includes("<html");
    }
    /**
     * Heuristic XML/HTML check.
     *
     * @param {string} text - content sample
     * @returns {boolean} true for an XML declaration, HTML markers, or
     *   content that starts with "<" and contains both a well-formed opening
     *   tag and a closing or self-closing tag
     */
    looksLikeXML(text) {
        const trimmed = text.trim();
        // XML declaration is a definitive marker
        if (trimmed.startsWith("<?xml")) {
            return true;
        }
        // Check for HTML DOCTYPE or tags
        if (this.looksLikeHTML(trimmed)) {
            return true;
        }
        // Strict validation for arbitrary content starting with <:
        // Must have proper tag structure with at least one closing tag
        if (!trimmed.startsWith("<")) {
            return false;
        }
        // Must have valid opening tag structure: <tagname followed by space or >
        // Not just any < character
        const hasValidOpeningTag = /<[a-zA-Z][a-zA-Z0-9-]*(?:\s[^>]*)?>/;
        if (!hasValidOpeningTag.test(trimmed)) {
            return false;
        }
        // Must have at least one closing tag or self-closing tag to be valid XML/HTML
        const hasClosingTag = /<\/[a-zA-Z][a-zA-Z0-9-]*>/.test(trimmed);
        const hasSelfClosingTag = /<[a-zA-Z][a-zA-Z0-9-]*(?:\s[^>]*)?\s*\/\s*>/.test(trimmed);
        return hasClosingTag || hasSelfClosingTag;
    }
    /**
     * Heuristic YAML check: evaluates eight independent indicators and
     * requires at least two of them to match. Single-line content only
     * counts when it is exactly a document marker ("---" or "...").
     *
     * @param {string} text - content sample
     * @returns {boolean} true when the sample looks like YAML
     */
    looksLikeYAML(text) {
        const trimmed = text.trim();
        if (trimmed.length === 0) {
            return false;
        }
        // For single-line content, be very conservative about YAML detection
        const lines = trimmed.split("\n");
        if (lines.length === 1) {
            // Single line can only be YAML if it's a document marker
            return trimmed === "---" || trimmed === "...";
        }
        // Collect YAML indicators (requires at least 2 for positive detection)
        const indicators = [];
        // Indicator 1: Document start marker (---)
        indicators.push(trimmed.startsWith("---"));
        // Indicator 2: Document end marker (...) as the whole text or its last line
        indicators.push(/^\.\.\.$|[\n]\.\.\.$/.test(trimmed));
        // Indicator 3: YAML list items (- followed by space at line start)
        indicators.push(/^[\s]*-\s+[^-]/m.test(trimmed));
        // Indicator 4: Multiple key-value pairs (at least 2)
        // Allow hyphens and underscores in keys, support nested keys
        const keyValuePattern = /^[\s]*[a-zA-Z_][a-zA-Z0-9_-]*:\s*(.+)$/;
        const keyValueMatches = lines.filter((line) => keyValuePattern.test(line)).length;
        indicators.push(keyValueMatches >= 2);
        // Indicator 5: Nested indentation pattern (common in YAML objects/lists).
        // Only the first 10 lines are examined: a line ending in ":" or "-"
        // followed by a more deeply indented, non-empty line.
        let hasNesting = false;
        const sampleLines = lines.slice(0, 10);
        for (let i = 0; i < sampleLines.length - 1; i++) {
            const currentLine = sampleLines[i].trim();
            const nextLine = sampleLines[i + 1];
            if (currentLine.length > 0 &&
                nextLine.length > 0 &&
                /[:-]$/.test(currentLine)) {
                const currentIndent = sampleLines[i].match(/^[\s]*/)?.[0].length ?? 0;
                const nextIndent = nextLine.match(/^[\s]*/)?.[0].length ?? 0;
                if (nextIndent > currentIndent) {
                    hasNesting = true;
                    break;
                }
            }
        }
        indicators.push(hasNesting);
        // Indicator 6: YAML comments (# followed by space)
        indicators.push(/^\s*#\s+/m.test(trimmed));
        // Indicator 7: List continuation (multiple items with - )
        const listItemCount = lines.filter((line) => /^[\s]*-[\s]/.test(line)).length;
        indicators.push(listItemCount >= 2);
        // Indicator 8: Inline maps or complex structures
        indicators.push(/{\s*[a-zA-Z_]/.test(trimmed) || /\[.*\]/.test(trimmed));
        // Require at least 2 indicators for confident YAML detection
        const matchCount = indicators.filter(Boolean).length;
        return matchCount >= 2;
    }
    /**
     * Heuristic plain-text check: no NUL characters and at least 85% of the
     * UTF-16 code units are printable ASCII, tab, LF, CR, or above 127.
     *
     * @param {string} text - content sample
     * @returns {boolean} true when the sample looks like text; false for an
     *   empty sample
     */
    looksLikeText(text) {
        // An empty sample has no printable ratio to measure
        if (text.length === 0) {
            return false;
        }
        // Check if content has null bytes (binary indicator)
        if (text.includes("\0")) {
            return false;
        }
        // Count printable characters
        let printable = 0;
        for (let i = 0; i < text.length; i++) {
            const code = text.charCodeAt(i);
            if ((code >= 32 && code < 127) || // ASCII printable
                code === 9 || // Tab
                code === 10 || // Newline
                code === 13 || // Carriage return
                code > 127 // Unicode
            ) {
                printable++;
            }
        }
        // At least 85% should be printable for text
        return printable / text.length >= 0.85;
    }
    /**
     * Build a positive detection result.
     *
     * @param {string} type - detected file type
     * @param {string} mime - detected MIME type
     * @param {number} confidence - confidence score stored in metadata
     * @returns {object} result with a null extension and source "buffer"
     */
    result(type, mime, confidence) {
        return {
            type,
            mimeType: mime,
            extension: null,
            source: "buffer",
            metadata: { confidence },
        };
    }
    /**
     * Build the fallback result used when nothing could be detected.
     *
     * @returns {object} result with type "unknown" and a confidence of 0
     */
    unknown() {
        return {
            type: "unknown",
            mimeType: "application/octet-stream",
            extension: null,
            source: "buffer",
            metadata: { confidence: 0 },
        };
    }
}