@juspay/neurolink 9.32.0 → 9.32.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (467) hide show
  1. package/CHANGELOG.md +6 -0
  2. package/dist/auth/anthropicOAuth.js +1 -1
  3. package/dist/cli/commands/proxy.js +18 -5
  4. package/dist/client/aiSdkAdapter.js +1 -1
  5. package/dist/client/index.js +137 -501
  6. package/dist/core/factory.js +0 -1
  7. package/dist/core/redisConversationMemoryManager.js +1 -1
  8. package/dist/features/ppt/slideGenerator.js +0 -1
  9. package/dist/features/ppt/utils.js +0 -1
  10. package/dist/lib/server/routes/claudeProxyRoutes.js +45 -9
  11. package/dist/mcp/elicitationProtocol.js +1 -1
  12. package/dist/mcp/servers/agent/directToolsServer.js +0 -1
  13. package/dist/providers/azureOpenai.js +1 -1
  14. package/dist/providers/huggingFace.js +0 -1
  15. package/dist/providers/openaiCompatible.js +0 -1
  16. package/dist/sdk/toolRegistration.js +0 -1
  17. package/dist/server/openapi/generator.js +1 -1
  18. package/dist/server/routes/claudeProxyRoutes.js +45 -9
  19. package/dist/types/configTypes.js +0 -5
  20. package/dist/types/modelTypes.js +0 -1
  21. package/dist/types/tools.js +0 -1
  22. package/dist/types/typeAliases.js +0 -1
  23. package/dist/types/utilities.js +1 -1
  24. package/dist/types/workflowTypes.js +0 -1
  25. package/dist/utils/providerRetry.js +0 -1
  26. package/dist/utils/providerUtils.js +0 -1
  27. package/package.json +2 -2
  28. package/dist/client/adapters/providerImageAdapter.js +0 -588
  29. package/dist/client/adapters/tts/googleTTSHandler.js +0 -344
  30. package/dist/client/adapters/video/directorPipeline.js +0 -516
  31. package/dist/client/adapters/video/ffmpegAdapter.js +0 -206
  32. package/dist/client/adapters/video/frameExtractor.js +0 -143
  33. package/dist/client/adapters/video/vertexVideoHandler.js +0 -763
  34. package/dist/client/adapters/video/videoAnalyzer.js +0 -238
  35. package/dist/client/adapters/video/videoMerger.js +0 -171
  36. package/dist/client/agent/directTools.js +0 -840
  37. package/dist/client/auth/AuthProviderFactory.js +0 -111
  38. package/dist/client/auth/AuthProviderRegistry.js +0 -190
  39. package/dist/client/auth/RequestContext.js +0 -78
  40. package/dist/client/auth/accountPool.js +0 -178
  41. package/dist/client/auth/anthropicOAuth.js +0 -974
  42. package/dist/client/auth/authContext.js +0 -314
  43. package/dist/client/auth/errors.js +0 -39
  44. package/dist/client/auth/index.js +0 -61
  45. package/dist/client/auth/middleware/AuthMiddleware.js +0 -519
  46. package/dist/client/auth/middleware/rateLimitByUser.js +0 -554
  47. package/dist/client/auth/providers/BaseAuthProvider.js +0 -723
  48. package/dist/client/auth/providers/CognitoProvider.js +0 -304
  49. package/dist/client/auth/providers/KeycloakProvider.js +0 -393
  50. package/dist/client/auth/providers/auth0.js +0 -274
  51. package/dist/client/auth/providers/betterAuth.js +0 -182
  52. package/dist/client/auth/providers/clerk.js +0 -317
  53. package/dist/client/auth/providers/custom.js +0 -112
  54. package/dist/client/auth/providers/firebase.js +0 -226
  55. package/dist/client/auth/providers/jwt.js +0 -212
  56. package/dist/client/auth/providers/oauth2.js +0 -303
  57. package/dist/client/auth/providers/supabase.js +0 -259
  58. package/dist/client/auth/providers/workos.js +0 -284
  59. package/dist/client/auth/serverBridge.js +0 -25
  60. package/dist/client/auth/sessionManager.js +0 -437
  61. package/dist/client/auth/tokenStore.js +0 -799
  62. package/dist/client/client/aiSdkAdapter.js +0 -487
  63. package/dist/client/client/auth.js +0 -473
  64. package/dist/client/client/errors.js +0 -552
  65. package/dist/client/client/httpClient.js +0 -837
  66. package/dist/client/client/index.js +0 -172
  67. package/dist/client/client/interceptors.js +0 -601
  68. package/dist/client/client/sseClient.js +0 -545
  69. package/dist/client/client/streamingClient.js +0 -917
  70. package/dist/client/client/wsClient.js +0 -369
  71. package/dist/client/config/configManager.js +0 -303
  72. package/dist/client/config/conversationMemory.js +0 -86
  73. package/dist/client/config/taskClassificationConfig.js +0 -148
  74. package/dist/client/constants/contextWindows.js +0 -295
  75. package/dist/client/constants/enums.js +0 -853
  76. package/dist/client/constants/index.js +0 -207
  77. package/dist/client/constants/performance.js +0 -389
  78. package/dist/client/constants/retry.js +0 -266
  79. package/dist/client/constants/timeouts.js +0 -182
  80. package/dist/client/constants/tokens.js +0 -380
  81. package/dist/client/constants/videoErrors.js +0 -46
  82. package/dist/client/context/budgetChecker.js +0 -98
  83. package/dist/client/context/contextCompactor.js +0 -205
  84. package/dist/client/context/emergencyTruncation.js +0 -88
  85. package/dist/client/context/errorDetection.js +0 -171
  86. package/dist/client/context/errors.js +0 -21
  87. package/dist/client/context/fileTokenBudget.js +0 -127
  88. package/dist/client/context/prompts/summarizationPrompt.js +0 -117
  89. package/dist/client/context/stages/fileReadDeduplicator.js +0 -66
  90. package/dist/client/context/stages/slidingWindowTruncator.js +0 -190
  91. package/dist/client/context/stages/structuredSummarizer.js +0 -99
  92. package/dist/client/context/stages/toolOutputPruner.js +0 -52
  93. package/dist/client/context/summarizationEngine.js +0 -136
  94. package/dist/client/context/toolOutputLimits.js +0 -78
  95. package/dist/client/context/toolPairRepair.js +0 -66
  96. package/dist/client/core/analytics.js +0 -88
  97. package/dist/client/core/baseProvider.js +0 -1385
  98. package/dist/client/core/constants.js +0 -140
  99. package/dist/client/core/conversationMemoryFactory.js +0 -141
  100. package/dist/client/core/conversationMemoryInitializer.js +0 -128
  101. package/dist/client/core/conversationMemoryManager.js +0 -344
  102. package/dist/client/core/dynamicModels.js +0 -358
  103. package/dist/client/core/evaluation.js +0 -309
  104. package/dist/client/core/evaluationProviders.js +0 -248
  105. package/dist/client/core/factory.js +0 -412
  106. package/dist/client/core/infrastructure/baseError.js +0 -22
  107. package/dist/client/core/infrastructure/baseFactory.js +0 -54
  108. package/dist/client/core/infrastructure/baseRegistry.js +0 -53
  109. package/dist/client/core/infrastructure/index.js +0 -5
  110. package/dist/client/core/infrastructure/retry.js +0 -20
  111. package/dist/client/core/infrastructure/typedEventEmitter.js +0 -23
  112. package/dist/client/core/modelConfiguration.js +0 -851
  113. package/dist/client/core/modules/GenerationHandler.js +0 -588
  114. package/dist/client/core/modules/MessageBuilder.js +0 -273
  115. package/dist/client/core/modules/StreamHandler.js +0 -185
  116. package/dist/client/core/modules/TelemetryHandler.js +0 -203
  117. package/dist/client/core/modules/ToolsManager.js +0 -499
  118. package/dist/client/core/modules/Utilities.js +0 -331
  119. package/dist/client/core/redisConversationMemoryManager.js +0 -1435
  120. package/dist/client/core/streamAnalytics.js +0 -131
  121. package/dist/client/evaluation/contextBuilder.js +0 -134
  122. package/dist/client/evaluation/index.js +0 -61
  123. package/dist/client/evaluation/prompts.js +0 -73
  124. package/dist/client/evaluation/ragasEvaluator.js +0 -110
  125. package/dist/client/evaluation/retryManager.js +0 -78
  126. package/dist/client/evaluation/scoring.js +0 -61
  127. package/dist/client/factories/providerFactory.js +0 -166
  128. package/dist/client/factories/providerRegistry.js +0 -166
  129. package/dist/client/features/ppt/constants.js +0 -896
  130. package/dist/client/features/ppt/contentPlanner.js +0 -529
  131. package/dist/client/features/ppt/presentationOrchestrator.js +0 -236
  132. package/dist/client/features/ppt/slideGenerator.js +0 -532
  133. package/dist/client/features/ppt/slideRenderers.js +0 -2383
  134. package/dist/client/features/ppt/slideTypeInference.js +0 -405
  135. package/dist/client/features/ppt/types.js +0 -13
  136. package/dist/client/features/ppt/utils.js +0 -443
  137. package/dist/client/files/fileReferenceRegistry.js +0 -1543
  138. package/dist/client/files/fileTools.js +0 -450
  139. package/dist/client/files/streamingReader.js +0 -321
  140. package/dist/client/files/types.js +0 -23
  141. package/dist/client/hitl/hitlErrors.js +0 -54
  142. package/dist/client/hitl/hitlManager.js +0 -460
  143. package/dist/client/mcp/agentExposure.js +0 -356
  144. package/dist/client/mcp/auth/index.js +0 -11
  145. package/dist/client/mcp/auth/oauthClientProvider.js +0 -325
  146. package/dist/client/mcp/auth/tokenStorage.js +0 -134
  147. package/dist/client/mcp/batching/index.js +0 -10
  148. package/dist/client/mcp/batching/requestBatcher.js +0 -441
  149. package/dist/client/mcp/caching/index.js +0 -10
  150. package/dist/client/mcp/caching/toolCache.js +0 -433
  151. package/dist/client/mcp/elicitation/elicitationManager.js +0 -376
  152. package/dist/client/mcp/elicitation/index.js +0 -11
  153. package/dist/client/mcp/elicitation/types.js +0 -10
  154. package/dist/client/mcp/elicitationProtocol.js +0 -375
  155. package/dist/client/mcp/enhancedToolDiscovery.js +0 -481
  156. package/dist/client/mcp/externalServerManager.js +0 -1478
  157. package/dist/client/mcp/factory.js +0 -161
  158. package/dist/client/mcp/flexibleToolValidator.js +0 -161
  159. package/dist/client/mcp/httpRateLimiter.js +0 -391
  160. package/dist/client/mcp/httpRetryHandler.js +0 -178
  161. package/dist/client/mcp/index.js +0 -74
  162. package/dist/client/mcp/mcpCircuitBreaker.js +0 -427
  163. package/dist/client/mcp/mcpClientFactory.js +0 -708
  164. package/dist/client/mcp/mcpRegistryClient.js +0 -488
  165. package/dist/client/mcp/mcpServerBase.js +0 -373
  166. package/dist/client/mcp/multiServerManager.js +0 -579
  167. package/dist/client/mcp/registry.js +0 -158
  168. package/dist/client/mcp/routing/index.js +0 -10
  169. package/dist/client/mcp/routing/toolRouter.js +0 -416
  170. package/dist/client/mcp/serverCapabilities.js +0 -502
  171. package/dist/client/mcp/servers/agent/directToolsServer.js +0 -150
  172. package/dist/client/mcp/toolAnnotations.js +0 -239
  173. package/dist/client/mcp/toolConverter.js +0 -258
  174. package/dist/client/mcp/toolDiscoveryService.js +0 -798
  175. package/dist/client/mcp/toolIntegration.js +0 -334
  176. package/dist/client/mcp/toolRegistry.js +0 -729
  177. package/dist/client/memory/hippocampusInitializer.js +0 -19
  178. package/dist/client/memory/memoryRetrievalTools.js +0 -166
  179. package/dist/client/middleware/builtin/analytics.js +0 -132
  180. package/dist/client/middleware/builtin/autoEvaluation.js +0 -203
  181. package/dist/client/middleware/builtin/guardrails.js +0 -109
  182. package/dist/client/middleware/builtin/lifecycle.js +0 -168
  183. package/dist/client/middleware/factory.js +0 -327
  184. package/dist/client/middleware/registry.js +0 -295
  185. package/dist/client/middleware/utils/guardrailsUtils.js +0 -396
  186. package/dist/client/models/anthropicModels.js +0 -527
  187. package/dist/client/neurolink.js +0 -8233
  188. package/dist/client/observability/exporterRegistry.js +0 -413
  189. package/dist/client/observability/exporters/arizeExporter.js +0 -138
  190. package/dist/client/observability/exporters/baseExporter.js +0 -190
  191. package/dist/client/observability/exporters/braintrustExporter.js +0 -154
  192. package/dist/client/observability/exporters/datadogExporter.js +0 -196
  193. package/dist/client/observability/exporters/laminarExporter.js +0 -302
  194. package/dist/client/observability/exporters/langfuseExporter.js +0 -209
  195. package/dist/client/observability/exporters/langsmithExporter.js +0 -143
  196. package/dist/client/observability/exporters/otelExporter.js +0 -164
  197. package/dist/client/observability/exporters/posthogExporter.js +0 -287
  198. package/dist/client/observability/exporters/sentryExporter.js +0 -165
  199. package/dist/client/observability/index.js +0 -31
  200. package/dist/client/observability/metricsAggregator.js +0 -556
  201. package/dist/client/observability/otelBridge.js +0 -131
  202. package/dist/client/observability/retryPolicy.js +0 -383
  203. package/dist/client/observability/sampling/samplers.js +0 -216
  204. package/dist/client/observability/spanProcessor.js +0 -303
  205. package/dist/client/observability/tokenTracker.js +0 -413
  206. package/dist/client/observability/types/exporterTypes.js +0 -5
  207. package/dist/client/observability/types/index.js +0 -4
  208. package/dist/client/observability/types/spanTypes.js +0 -92
  209. package/dist/client/observability/utils/safeMetadata.js +0 -25
  210. package/dist/client/observability/utils/spanSerializer.js +0 -292
  211. package/dist/client/processors/archive/ArchiveProcessor.js +0 -1308
  212. package/dist/client/processors/base/BaseFileProcessor.js +0 -614
  213. package/dist/client/processors/base/types.js +0 -82
  214. package/dist/client/processors/config/fileTypes.js +0 -520
  215. package/dist/client/processors/config/index.js +0 -92
  216. package/dist/client/processors/config/languageMap.js +0 -410
  217. package/dist/client/processors/config/mimeTypes.js +0 -363
  218. package/dist/client/processors/config/sizeLimits.js +0 -258
  219. package/dist/client/processors/document/ExcelProcessor.js +0 -590
  220. package/dist/client/processors/document/OpenDocumentProcessor.js +0 -212
  221. package/dist/client/processors/document/PptxProcessor.js +0 -157
  222. package/dist/client/processors/document/RtfProcessor.js +0 -361
  223. package/dist/client/processors/document/WordProcessor.js +0 -353
  224. package/dist/client/processors/errors/FileErrorCode.js +0 -255
  225. package/dist/client/processors/errors/errorHelpers.js +0 -386
  226. package/dist/client/processors/errors/errorSerializer.js +0 -507
  227. package/dist/client/processors/errors/index.js +0 -49
  228. package/dist/client/processors/markup/SvgProcessor.js +0 -240
  229. package/dist/client/processors/media/AudioProcessor.js +0 -707
  230. package/dist/client/processors/media/VideoProcessor.js +0 -1045
  231. package/dist/client/providers/amazonBedrock.js +0 -1512
  232. package/dist/client/providers/amazonSagemaker.js +0 -162
  233. package/dist/client/providers/anthropic.js +0 -831
  234. package/dist/client/providers/azureOpenai.js +0 -143
  235. package/dist/client/providers/googleAiStudio.js +0 -1200
  236. package/dist/client/providers/googleNativeGemini3.js +0 -543
  237. package/dist/client/providers/googleVertex.js +0 -2936
  238. package/dist/client/providers/huggingFace.js +0 -315
  239. package/dist/client/providers/litellm.js +0 -488
  240. package/dist/client/providers/mistral.js +0 -157
  241. package/dist/client/providers/ollama.js +0 -1579
  242. package/dist/client/providers/openAI.js +0 -627
  243. package/dist/client/providers/openRouter.js +0 -543
  244. package/dist/client/providers/openaiCompatible.js +0 -290
  245. package/dist/client/providers/providerTypeUtils.js +0 -46
  246. package/dist/client/providers/sagemaker/adaptive-semaphore.js +0 -215
  247. package/dist/client/providers/sagemaker/client.js +0 -472
  248. package/dist/client/providers/sagemaker/config.js +0 -317
  249. package/dist/client/providers/sagemaker/detection.js +0 -606
  250. package/dist/client/providers/sagemaker/error-constants.js +0 -227
  251. package/dist/client/providers/sagemaker/errors.js +0 -299
  252. package/dist/client/providers/sagemaker/language-model.js +0 -775
  253. package/dist/client/providers/sagemaker/parsers.js +0 -634
  254. package/dist/client/providers/sagemaker/streaming.js +0 -331
  255. package/dist/client/providers/sagemaker/structured-parser.js +0 -625
  256. package/dist/client/proxy/accountQuota.js +0 -162
  257. package/dist/client/proxy/claudeFormat.js +0 -595
  258. package/dist/client/proxy/modelRouter.js +0 -29
  259. package/dist/client/proxy/oauthFetch.js +0 -367
  260. package/dist/client/proxy/proxyFetch.js +0 -586
  261. package/dist/client/proxy/requestLogger.js +0 -207
  262. package/dist/client/proxy/tokenRefresh.js +0 -124
  263. package/dist/client/proxy/usageStats.js +0 -74
  264. package/dist/client/proxy/utils/noProxyUtils.js +0 -149
  265. package/dist/client/rag/ChunkerFactory.js +0 -320
  266. package/dist/client/rag/ChunkerRegistry.js +0 -421
  267. package/dist/client/rag/chunkers/BaseChunker.js +0 -143
  268. package/dist/client/rag/chunkers/CharacterChunker.js +0 -28
  269. package/dist/client/rag/chunkers/HTMLChunker.js +0 -38
  270. package/dist/client/rag/chunkers/JSONChunker.js +0 -68
  271. package/dist/client/rag/chunkers/LaTeXChunker.js +0 -63
  272. package/dist/client/rag/chunkers/MarkdownChunker.js +0 -306
  273. package/dist/client/rag/chunkers/RecursiveChunker.js +0 -139
  274. package/dist/client/rag/chunkers/SemanticMarkdownChunker.js +0 -138
  275. package/dist/client/rag/chunkers/SentenceChunker.js +0 -66
  276. package/dist/client/rag/chunkers/TokenChunker.js +0 -61
  277. package/dist/client/rag/chunkers/index.js +0 -15
  278. package/dist/client/rag/chunking/characterChunker.js +0 -142
  279. package/dist/client/rag/chunking/chunkerRegistry.js +0 -194
  280. package/dist/client/rag/chunking/htmlChunker.js +0 -247
  281. package/dist/client/rag/chunking/index.js +0 -17
  282. package/dist/client/rag/chunking/jsonChunker.js +0 -281
  283. package/dist/client/rag/chunking/latexChunker.js +0 -251
  284. package/dist/client/rag/chunking/markdownChunker.js +0 -373
  285. package/dist/client/rag/chunking/recursiveChunker.js +0 -148
  286. package/dist/client/rag/chunking/semanticChunker.js +0 -306
  287. package/dist/client/rag/chunking/sentenceChunker.js +0 -230
  288. package/dist/client/rag/chunking/tokenChunker.js +0 -183
  289. package/dist/client/rag/document/MDocument.js +0 -392
  290. package/dist/client/rag/document/index.js +0 -5
  291. package/dist/client/rag/document/loaders.js +0 -500
  292. package/dist/client/rag/errors/RAGError.js +0 -274
  293. package/dist/client/rag/errors/index.js +0 -6
  294. package/dist/client/rag/graphRag/graphRAG.js +0 -401
  295. package/dist/client/rag/graphRag/index.js +0 -4
  296. package/dist/client/rag/index.js +0 -141
  297. package/dist/client/rag/metadata/MetadataExtractorFactory.js +0 -418
  298. package/dist/client/rag/metadata/MetadataExtractorRegistry.js +0 -362
  299. package/dist/client/rag/metadata/index.js +0 -9
  300. package/dist/client/rag/metadata/metadataExtractor.js +0 -280
  301. package/dist/client/rag/pipeline/RAGPipeline.js +0 -436
  302. package/dist/client/rag/pipeline/contextAssembly.js +0 -341
  303. package/dist/client/rag/pipeline/index.js +0 -5
  304. package/dist/client/rag/ragIntegration.js +0 -321
  305. package/dist/client/rag/reranker/RerankerFactory.js +0 -430
  306. package/dist/client/rag/reranker/RerankerRegistry.js +0 -402
  307. package/dist/client/rag/reranker/index.js +0 -9
  308. package/dist/client/rag/reranker/reranker.js +0 -277
  309. package/dist/client/rag/resilience/CircuitBreaker.js +0 -431
  310. package/dist/client/rag/resilience/RetryHandler.js +0 -304
  311. package/dist/client/rag/resilience/index.js +0 -7
  312. package/dist/client/rag/retrieval/hybridSearch.js +0 -335
  313. package/dist/client/rag/retrieval/index.js +0 -5
  314. package/dist/client/rag/retrieval/vectorQueryTool.js +0 -307
  315. package/dist/client/rag/types.js +0 -8
  316. package/dist/client/sdk/toolRegistration.js +0 -377
  317. package/dist/client/server/abstract/baseServerAdapter.js +0 -575
  318. package/dist/client/server/adapters/expressAdapter.js +0 -486
  319. package/dist/client/server/adapters/fastifyAdapter.js +0 -472
  320. package/dist/client/server/adapters/honoAdapter.js +0 -632
  321. package/dist/client/server/adapters/koaAdapter.js +0 -510
  322. package/dist/client/server/errors.js +0 -486
  323. package/dist/client/server/factory/serverAdapterFactory.js +0 -160
  324. package/dist/client/server/index.js +0 -108
  325. package/dist/client/server/middleware/abortSignal.js +0 -111
  326. package/dist/client/server/middleware/auth.js +0 -388
  327. package/dist/client/server/middleware/cache.js +0 -359
  328. package/dist/client/server/middleware/common.js +0 -281
  329. package/dist/client/server/middleware/deprecation.js +0 -190
  330. package/dist/client/server/middleware/mcpBodyAttachment.js +0 -63
  331. package/dist/client/server/middleware/rateLimit.js +0 -227
  332. package/dist/client/server/middleware/validation.js +0 -388
  333. package/dist/client/server/openapi/generator.js +0 -398
  334. package/dist/client/server/openapi/index.js +0 -36
  335. package/dist/client/server/openapi/schemas.js +0 -695
  336. package/dist/client/server/openapi/templates.js +0 -374
  337. package/dist/client/server/routes/agentRoutes.js +0 -189
  338. package/dist/client/server/routes/claudeProxyRoutes.js +0 -1600
  339. package/dist/client/server/routes/healthRoutes.js +0 -187
  340. package/dist/client/server/routes/index.js +0 -57
  341. package/dist/client/server/routes/mcpRoutes.js +0 -342
  342. package/dist/client/server/routes/memoryRoutes.js +0 -350
  343. package/dist/client/server/routes/openApiRoutes.js +0 -126
  344. package/dist/client/server/routes/toolRoutes.js +0 -199
  345. package/dist/client/server/streaming/dataStream.js +0 -486
  346. package/dist/client/server/streaming/index.js +0 -11
  347. package/dist/client/server/types.js +0 -67
  348. package/dist/client/server/utils/redaction.js +0 -334
  349. package/dist/client/server/utils/validation.js +0 -243
  350. package/dist/client/server/websocket/WebSocketHandler.js +0 -383
  351. package/dist/client/server/websocket/index.js +0 -4
  352. package/dist/client/services/server/ai/observability/instrumentation.js +0 -808
  353. package/dist/client/telemetry/attributes.js +0 -100
  354. package/dist/client/telemetry/index.js +0 -26
  355. package/dist/client/telemetry/telemetryService.js +0 -308
  356. package/dist/client/telemetry/tracers.js +0 -17
  357. package/dist/client/telemetry/withSpan.js +0 -34
  358. package/dist/client/types/actionTypes.js +0 -6
  359. package/dist/client/types/analytics.js +0 -5
  360. package/dist/client/types/authTypes.js +0 -9
  361. package/dist/client/types/circuitBreakerErrors.js +0 -34
  362. package/dist/client/types/cli.js +0 -21
  363. package/dist/client/types/clientTypes.js +0 -10
  364. package/dist/client/types/common.js +0 -51
  365. package/dist/client/types/configTypes.js +0 -49
  366. package/dist/client/types/content.js +0 -19
  367. package/dist/client/types/contextTypes.js +0 -400
  368. package/dist/client/types/conversation.js +0 -47
  369. package/dist/client/types/conversationMemoryInterface.js +0 -6
  370. package/dist/client/types/domainTypes.js +0 -5
  371. package/dist/client/types/errors.js +0 -167
  372. package/dist/client/types/evaluation.js +0 -5
  373. package/dist/client/types/evaluationProviders.js +0 -5
  374. package/dist/client/types/evaluationTypes.js +0 -1
  375. package/dist/client/types/externalMcp.js +0 -6
  376. package/dist/client/types/fileReferenceTypes.js +0 -8
  377. package/dist/client/types/fileTypes.js +0 -4
  378. package/dist/client/types/generateTypes.js +0 -1
  379. package/dist/client/types/guardrails.js +0 -1
  380. package/dist/client/types/hitlTypes.js +0 -8
  381. package/dist/client/types/index.js +0 -57
  382. package/dist/client/types/mcpTypes.js +0 -5
  383. package/dist/client/types/middlewareTypes.js +0 -1
  384. package/dist/client/types/modelTypes.js +0 -30
  385. package/dist/client/types/multimodal.js +0 -135
  386. package/dist/client/types/observability.js +0 -6
  387. package/dist/client/types/pptTypes.js +0 -82
  388. package/dist/client/types/providers.js +0 -111
  389. package/dist/client/types/proxyTypes.js +0 -16
  390. package/dist/client/types/ragTypes.js +0 -7
  391. package/dist/client/types/sdkTypes.js +0 -8
  392. package/dist/client/types/serviceTypes.js +0 -5
  393. package/dist/client/types/streamTypes.js +0 -1
  394. package/dist/client/types/subscriptionTypes.js +0 -9
  395. package/dist/client/types/taskClassificationTypes.js +0 -5
  396. package/dist/client/types/tools.js +0 -24
  397. package/dist/client/types/ttsTypes.js +0 -57
  398. package/dist/client/types/typeAliases.js +0 -48
  399. package/dist/client/types/utilities.js +0 -4
  400. package/dist/client/types/workflowTypes.js +0 -30
  401. package/dist/client/utils/async/withTimeout.js +0 -98
  402. package/dist/client/utils/asyncMutex.js +0 -60
  403. package/dist/client/utils/conversationMemory.js +0 -431
  404. package/dist/client/utils/csvProcessor.js +0 -846
  405. package/dist/client/utils/errorHandling.js +0 -936
  406. package/dist/client/utils/evaluationUtils.js +0 -131
  407. package/dist/client/utils/factoryProcessing.js +0 -589
  408. package/dist/client/utils/fileDetector.js +0 -2161
  409. package/dist/client/utils/imageCache.js +0 -376
  410. package/dist/client/utils/imageProcessor.js +0 -704
  411. package/dist/client/utils/logger.js +0 -491
  412. package/dist/client/utils/mcpDefaults.js +0 -134
  413. package/dist/client/utils/messageBuilder.js +0 -1653
  414. package/dist/client/utils/modelAliasResolver.js +0 -54
  415. package/dist/client/utils/modelDetection.js +0 -80
  416. package/dist/client/utils/modelRouter.js +0 -292
  417. package/dist/client/utils/multimodalOptionsBuilder.js +0 -65
  418. package/dist/client/utils/observabilityHelpers.js +0 -47
  419. package/dist/client/utils/parameterValidation.js +0 -966
  420. package/dist/client/utils/pdfProcessor.js +0 -410
  421. package/dist/client/utils/performance.js +0 -222
  422. package/dist/client/utils/pricing.js +0 -340
  423. package/dist/client/utils/promptRedaction.js +0 -62
  424. package/dist/client/utils/providerConfig.js +0 -1009
  425. package/dist/client/utils/providerHealth.js +0 -1237
  426. package/dist/client/utils/providerRetry.js +0 -112
  427. package/dist/client/utils/providerUtils.js +0 -434
  428. package/dist/client/utils/rateLimiter.js +0 -200
  429. package/dist/client/utils/redis.js +0 -368
  430. package/dist/client/utils/retryHandler.js +0 -269
  431. package/dist/client/utils/retryability.js +0 -22
  432. package/dist/client/utils/sanitizers/svg.js +0 -481
  433. package/dist/client/utils/schemaConversion.js +0 -255
  434. package/dist/client/utils/taskClassificationUtils.js +0 -149
  435. package/dist/client/utils/taskClassifier.js +0 -94
  436. package/dist/client/utils/thinkingConfig.js +0 -104
  437. package/dist/client/utils/timeout.js +0 -359
  438. package/dist/client/utils/tokenEstimation.js +0 -142
  439. package/dist/client/utils/tokenLimits.js +0 -125
  440. package/dist/client/utils/tokenUtils.js +0 -239
  441. package/dist/client/utils/toolUtils.js +0 -75
  442. package/dist/client/utils/transformationUtils.js +0 -554
  443. package/dist/client/utils/ttsProcessor.js +0 -286
  444. package/dist/client/utils/typeUtils.js +0 -97
  445. package/dist/client/utils/videoAnalysisProcessor.js +0 -67
  446. package/dist/client/workflow/config.js +0 -398
  447. package/dist/client/workflow/core/ensembleExecutor.js +0 -407
  448. package/dist/client/workflow/core/judgeScorer.js +0 -544
  449. package/dist/client/workflow/core/responseConditioner.js +0 -225
  450. package/dist/client/workflow/core/types/conditionerTypes.js +0 -7
  451. package/dist/client/workflow/core/types/ensembleTypes.js +0 -7
  452. package/dist/client/workflow/core/types/index.js +0 -7
  453. package/dist/client/workflow/core/types/judgeTypes.js +0 -7
  454. package/dist/client/workflow/core/types/layerTypes.js +0 -7
  455. package/dist/client/workflow/core/types/registryTypes.js +0 -7
  456. package/dist/client/workflow/core/workflowRegistry.js +0 -304
  457. package/dist/client/workflow/core/workflowRunner.js +0 -586
  458. package/dist/client/workflow/index.js +0 -50
  459. package/dist/client/workflow/types.js +0 -9
  460. package/dist/client/workflow/utils/types/index.js +0 -7
  461. package/dist/client/workflow/utils/workflowMetrics.js +0 -311
  462. package/dist/client/workflow/utils/workflowValidation.js +0 -420
  463. package/dist/client/workflow/workflows/adaptiveWorkflow.js +0 -366
  464. package/dist/client/workflow/workflows/consensusWorkflow.js +0 -192
  465. package/dist/client/workflow/workflows/fallbackWorkflow.js +0 -225
  466. package/dist/client/workflow/workflows/multiJudgeWorkflow.js +0 -351
  467. /package/dist/client/{client/reactHooks.js → reactHooks.js} +0 -0
@@ -1,2161 +0,0 @@
1
- /**
2
- * File Type Detection Utility
3
- * Centralized file detection for all multimodal file types
4
- * Uses multi-strategy approach for reliable type identification
5
- */
6
- import { readFile, stat } from "fs/promises";
7
- import { getGlobalDispatcher, interceptors, request } from "undici";
8
- import { archiveProcessor } from "../processors/archive/ArchiveProcessor.js";
9
- import { audioProcessor } from "../processors/media/AudioProcessor.js";
10
- import { videoProcessor } from "../processors/media/VideoProcessor.js";
11
- import { tracers, ATTR, withSpan } from "../telemetry/index.js";
12
- import { CSVProcessor } from "./csvProcessor.js";
13
- import { ImageProcessor } from "./imageProcessor.js";
14
- import { logger } from "./logger.js";
15
- import { PDFProcessor } from "./pdfProcessor.js";
16
/**
 * Retry defaults used when the caller does not override them.
 * The delay is the base value in milliseconds.
 */
const DEFAULT_MAX_RETRIES = 3;
const DEFAULT_RETRY_DELAY = 1000;

/**
 * Error codes (Node.js system errors and undici `UND_ERR_*` codes) that are
 * treated as transient and therefore retryable.
 */
const RETRYABLE_ERROR_CODES = [
  // Node.js / OS level network failures
  "ETIMEDOUT",
  "ECONNRESET",
  "ECONNREFUSED",
  "ENOTFOUND",
  "ENETUNREACH",
  "EAI_AGAIN",
  "EPIPE",
  "ECONNABORTED",
  // undici client failures
  "UND_ERR_CONNECT_TIMEOUT",
  "UND_ERR_HEADERS_TIMEOUT",
  "UND_ERR_BODY_TIMEOUT",
  "UND_ERR_SOCKET",
];

/** HTTP client-error statuses that are never retried. */
const NON_RETRYABLE_STATUS_CODES = [400, 401, 403, 404, 405];

/** HTTP statuses that are retried: rate limiting plus server-side errors. */
const RETRYABLE_STATUS_CODES = [429, 500, 502, 503, 504];
46
/**
 * Decide whether a failure looks like a transient network problem that is
 * worth retrying.
 *
 * Checks, in order:
 * 1. the error's `code` against RETRYABLE_ERROR_CODES (case-insensitive),
 * 2. an "HTTP <status>" fragment in the message against the non-retryable
 *    and retryable status lists,
 * 3. a list of transient keywords in the message.
 *
 * @param error - Value caught from a failed operation
 * @returns True when the error is an Error instance matching one of the
 *   transient patterns; false otherwise (including for non-Error values)
 */
function isRetryableNetworkError(error) {
  if (!(error instanceof Error)) {
    return false;
  }
  const errorMessage = error.message.toLowerCase();
  // `code` is not guaranteed to be a string (some error types carry numeric
  // codes); only string codes can be compared against the retryable list.
  const rawCode = error.code;
  const errorCode = typeof rawCode === "string" ? rawCode.toUpperCase() : undefined;
  if (errorCode && RETRYABLE_ERROR_CODES.includes(errorCode)) {
    return true;
  }
  // An explicit HTTP status in the message (e.g. "HTTP 503") takes precedence
  // over the keyword scan below.
  const httpStatusMatch = errorMessage.match(/http\s*(\d{3})/);
  if (httpStatusMatch) {
    const statusCode = Number.parseInt(httpStatusMatch[1], 10);
    if (NON_RETRYABLE_STATUS_CODES.includes(statusCode)) {
      return false;
    }
    if (RETRYABLE_STATUS_CODES.includes(statusCode)) {
      return true;
    }
  }
  // Fall back to message keywords that indicate a transient condition.
  const transientKeywords = [
    "timeout",
    "timed out",
    "connection reset",
    "econnreset",
    "etimedout",
    "network error",
    "socket hang up",
    "enotfound",
    "getaddrinfo",
    "unavailable",
    "service unavailable",
  ];
  return transientKeywords.some((keyword) => errorMessage.includes(keyword));
}
91
/**
 * Run an async operation, retrying with exponential backoff while it fails
 * with an error that isRetryableNetworkError() classifies as transient.
 *
 * The operation is always attempted at least once. `maxRetries` is floored to
 * a whole number and clamped at zero; a value that is not a number falls back
 * to DEFAULT_MAX_RETRIES. The wait before retry k (0-based) is
 * `retryDelay * 2 ** k` milliseconds.
 *
 * @param operation - Async function to execute
 * @param options - Optional `maxRetries` and `retryDelay` (milliseconds)
 * @returns Promise resolving to the operation result
 * @throws The operation's own error when it is non-retryable or when the
 *   retry budget is exhausted
 */
async function withRetry(operation, options = {}) {
  // Normalize the retry budget so the loop below cannot be skipped entirely
  // (negative / NaN) or run past its last attempt (fractional).
  const requestedRetries = Number(options.maxRetries ?? DEFAULT_MAX_RETRIES);
  const maxRetries = Number.isNaN(requestedRetries)
    ? DEFAULT_MAX_RETRIES
    : Math.max(0, Math.floor(requestedRetries));
  const retryDelay = options.retryDelay ?? DEFAULT_RETRY_DELAY;
  for (let attempt = 0; ; attempt += 1) {
    try {
      return await operation();
    } catch (error) {
      const isLastAttempt = attempt >= maxRetries;
      if (isLastAttempt || !isRetryableNetworkError(error)) {
        throw error;
      }
      // Exponential backoff: the delay doubles with every failed attempt.
      const delay = retryDelay * 2 ** attempt;
      logger.debug("Retrying network operation after transient error", {
        attempt: attempt + 1,
        maxRetries,
        delay,
        error: error instanceof Error ? error.message : String(error),
      });
      await new Promise((resolve) => setTimeout(resolve, delay));
    }
  }
}
126
/**
 * Report whether the text, once trimmed, is wrapped in matching `{}` or `[]`
 * delimiters and also parses successfully with JSON.parse.
 *
 * @param text - Candidate text
 * @returns True only for bracketed text that is valid JSON
 */
function hasJsonMarkers(text) {
  const candidate = text.trim();
  const opener = candidate.charAt(0);
  const closer = candidate.charAt(candidate.length - 1);
  const isObjectShaped = opener === "{" && closer === "}";
  const isArrayShaped = opener === "[" && closer === "]";
  // Empty text yields "" for both characters and is rejected here as well.
  if (!isObjectShaped && !isArrayShaped) {
    return false;
  }
  try {
    JSON.parse(candidate);
  } catch {
    return false;
  }
  return true;
}
149
/**
 * Render a byte count as a human-readable string.
 *
 * Values below 1024 are printed as "<n> bytes"; larger values are scaled to
 * KB, MB or GB (1024-based) with two decimal places. GB is the largest unit.
 *
 * @param bytes - Size in bytes
 * @returns Formatted size string
 */
function formatFileSize(bytes) {
  const KIB = 1024;
  const MIB = KIB * 1024;
  const GIB = MIB * 1024;
  if (bytes < KIB) {
    return `${bytes} bytes`;
  }
  let divisor = GIB;
  let unit = "GB";
  if (bytes < MIB) {
    divisor = KIB;
    unit = "KB";
  } else if (bytes < GIB) {
    divisor = MIB;
    unit = "MB";
  }
  return `${(bytes / divisor).toFixed(2)} ${unit}`;
}
164
- /**
165
- * Centralized file type detection and processing
166
- *
167
- * @example
168
- * ```typescript
169
- * // Auto-detect and process any file
170
- * const result = await FileDetector.detectAndProcess("data.csv");
171
- * logger.info(result.type); // 'csv'
172
- * ```
173
- */
174
- export class FileDetector {
175
- // FD-017: Replace hardcoded timeouts with constants.
176
- // These default ensure consistent timeout behavior across all file-detection logic.
177
- static DEFAULT_NETWORK_TIMEOUT = 30000; // 30 seconds
178
- static DEFAULT_HEAD_TIMEOUT = 5000; // 5 seconds
179
- /**
180
- * Auto-detect file type and process in one call
181
- *
182
- * Runs detection strategies in priority order:
183
- * 1. MagicBytesStrategy (95% confidence) - Binary file headers
184
- * 2. MimeTypeStrategy (85% confidence) - HTTP Content-Type for URLs
185
- * 3. ExtensionStrategy (70% confidence) - File extension
186
- * 4. ContentHeuristicStrategy (75% confidence) - Content analysis
187
- *
188
- * @param input - File path, URL, Buffer, or data URI
189
- * @param options - Detection and processing options
190
- * @returns Processed file result with type and content
191
- */
192
- static async detectAndProcess(input, options) {
193
- // Derive filename and size for tracing before detection runs
194
- const inputFilename = FileDetector.deriveInputFilename(input);
195
- const inputSizeBytes = FileDetector.deriveInputSize(input);
196
- return withSpan({
197
- name: "neurolink.file.detect_and_process",
198
- tracer: tracers.file,
199
- attributes: {
200
- [ATTR.FILE_NAME]: inputFilename,
201
- [ATTR.FILE_SIZE_BYTES]: inputSizeBytes,
202
- },
203
- }, async (span) => {
204
- const detection = await FileDetector.detect(input, options);
205
- span.setAttribute(ATTR.FILE_CATEGORY, detection.type);
206
- span.setAttribute(ATTR.FILE_MIMETYPE, detection.mimeType || "unknown");
207
- span.setAttribute(ATTR.FILE_CONFIDENCE, detection.metadata.confidence);
208
- logger.info(`[NEUROLINK] File detected: ${inputFilename} (${detection.mimeType || "unknown"}, ${formatFileSize(inputSizeBytes)}) → category: ${detection.type}`);
209
- // FD-018: Comprehensive fallback parsing for extension-less files
210
- if (options?.allowedTypes &&
211
- !options.allowedTypes.includes(detection.type)) {
212
- const content = await FileDetector.loadContent(input, detection, options);
213
- const errors = [];
214
- for (const allowedType of options.allowedTypes) {
215
- try {
216
- const result = await FileDetector.tryFallbackParsing(content, allowedType, options);
217
- if (result) {
218
- logger.info(`[FileDetector] ✅ ${allowedType.toUpperCase()} fallback successful`);
219
- const outputLength = typeof result.content === "string"
220
- ? result.content.length
221
- : result.content?.length || 0;
222
- span.setAttribute(ATTR.FILE_OUTPUT_LENGTH, outputLength);
223
- span.setAttribute(ATTR.FILE_SUCCESS, true);
224
- span.setAttribute(ATTR.FILE_PROCESSOR_USED, `fallback:${allowedType}`);
225
- logger.info(`[NEUROLINK] File processed: ${inputFilename} → ${outputLength} bytes output (fallback: ${allowedType})`);
226
- return result;
227
- }
228
- }
229
- catch (error) {
230
- const errorMsg = error instanceof Error ? error.message : String(error);
231
- errors.push(`${allowedType}: ${errorMsg}`);
232
- logger.debug(`[FileDetector] ${allowedType} fallback failed: ${errorMsg}`);
233
- }
234
- }
235
- logger.warn(`[FileDetector] All fallback parsing failed for type "${detection.type}". ` +
236
- `Attempted: ${options.allowedTypes.join(", ")}. Falling through to universal handler.`);
237
- const csvOptions = options?.csvOptions;
238
- const result = await FileDetector.processFile(content, detection, csvOptions, options?.provider);
239
- FileDetector.setFileResultSpanAttributes(span, result, inputFilename, detection.type);
240
- return result;
241
- }
242
- const content = await FileDetector.loadContent(input, detection, options);
243
- const csvOptions = options?.csvOptions;
244
- const result = await FileDetector.processFile(content, detection, csvOptions, options?.provider);
245
- FileDetector.setFileResultSpanAttributes(span, result, inputFilename, detection.type);
246
- return result;
247
- });
248
- }
249
- /**
250
- * Set span attributes and log after file processing completes.
251
- */
252
- static setFileResultSpanAttributes(span, result, filename, processorType) {
253
- const outputLength = typeof result.content === "string"
254
- ? result.content.length
255
- : result.content?.length || 0;
256
- const hasImages = Array.isArray(result.images)
257
- ? result.images.length > 0
258
- : false;
259
- const imageCount = Array.isArray(result.images)
260
- ? result.images.length
261
- : 0;
262
- span.setAttribute(ATTR.FILE_OUTPUT_LENGTH, outputLength);
263
- span.setAttribute(ATTR.FILE_SUCCESS, true);
264
- span.setAttribute(ATTR.FILE_PROCESSOR_USED, processorType);
265
- span.setAttribute(ATTR.FILE_HAS_IMAGES, hasImages);
266
- span.setAttribute(ATTR.FILE_IMAGE_COUNT, imageCount);
267
- logger.info(`[NEUROLINK] File processed: ${filename} → ${outputLength} bytes output` +
268
- (imageCount > 0 ? ` + ${imageCount} image(s)` : "") +
269
- ` (processor: ${processorType})`);
270
- }
271
- /**
272
- * Derive a human-readable filename from FileInput for tracing.
273
- */
274
- static deriveInputFilename(input) {
275
- if (typeof input === "string") {
276
- if (input.startsWith("data:")) {
277
- return "data-uri";
278
- }
279
- if (input.startsWith("http")) {
280
- try {
281
- return new URL(input).pathname.split("/").pop() || "url-file";
282
- }
283
- catch {
284
- return "url-file";
285
- }
286
- }
287
- // File path
288
- return input.split("/").pop() || input.split("\\").pop() || "file";
289
- }
290
- if (Buffer.isBuffer(input)) {
291
- return "buffer";
292
- }
293
- return "unknown-input";
294
- }
295
- /**
296
- * Derive byte size from FileInput for tracing.
297
- */
298
- static deriveInputSize(input) {
299
- if (Buffer.isBuffer(input)) {
300
- return input.length;
301
- }
302
- if (typeof input === "string") {
303
- if (input.startsWith("data:")) {
304
- // Rough estimate: base64 is ~4/3 of raw
305
- const base64Part = input.split(",")[1];
306
- return base64Part ? Math.floor((base64Part.length * 3) / 4) : 0;
307
- }
308
- return input.length; // path or URL string length (not file size)
309
- }
310
- return 0;
311
- }
312
- /**
313
- * Try fallback parsing for a specific file type
314
- * Used when file detection returns "unknown" but we want to try parsing anyway
315
- */
316
- static async tryFallbackParsing(content, fileType, options) {
317
- logger.info(`[FileDetector] Attempting ${fileType.toUpperCase()} fallback parsing`);
318
- switch (fileType) {
319
- case "csv": {
320
- // Try CSV parsing
321
- const csvOptions = options?.csvOptions;
322
- const result = await CSVProcessor.process(content, csvOptions);
323
- logger.info(`[FileDetector] CSV fallback: ${result.metadata?.rowCount || 0} rows, ${result.metadata?.columnCount || 0} columns`);
324
- return result;
325
- }
326
- case "text": {
327
- // Try text parsing - check if content is valid UTF-8 text
328
- const textContent = content.toString("utf-8");
329
- // Validate it's actually text (no null bytes, mostly printable)
330
- if (FileDetector.isValidText(textContent)) {
331
- return {
332
- type: "text",
333
- content: textContent,
334
- mimeType: FileDetector.guessTextMimeType(textContent),
335
- metadata: {
336
- confidence: 70,
337
- size: content.length,
338
- },
339
- };
340
- }
341
- throw new Error("Content does not appear to be valid text");
342
- }
343
- case "image": {
344
- // Image requires magic bytes - can't fallback without detection
345
- throw new Error("Image type requires binary detection, cannot fallback parse");
346
- }
347
- case "pdf": {
348
- // PDF requires magic bytes - can't fallback without detection
349
- throw new Error("PDF type requires binary detection, cannot fallback parse");
350
- }
351
- case "audio": {
352
- // Audio requires magic bytes - can't fallback without detection
353
- throw new Error("Audio type requires binary detection, cannot fallback parse");
354
- }
355
- case "video": {
356
- // Video requires magic bytes - can't fallback without detection
357
- throw new Error("Video type requires binary detection, cannot fallback parse");
358
- }
359
- case "archive": {
360
- // Archive requires magic bytes - can't fallback without detection
361
- throw new Error("Archive type requires binary detection, cannot fallback parse");
362
- }
363
- case "xlsx": {
364
- // Document formats require binary detection
365
- throw new Error("Excel type requires binary detection, cannot fallback parse");
366
- }
367
- case "docx": {
368
- throw new Error("Word type requires binary detection, cannot fallback parse");
369
- }
370
- case "pptx": {
371
- throw new Error("PowerPoint type requires binary detection, cannot fallback parse");
372
- }
373
- case "svg": {
374
- // SVG can be detected from text content
375
- const svgContent = content.toString("utf-8");
376
- if (svgContent.includes("<svg") && svgContent.includes("</svg>")) {
377
- return {
378
- type: "svg",
379
- content: svgContent,
380
- mimeType: "image/svg+xml",
381
- metadata: {
382
- confidence: 70,
383
- size: content.length,
384
- },
385
- };
386
- }
387
- throw new Error("Content does not appear to be valid SVG");
388
- }
389
- default:
390
- return null;
391
- }
392
- }
393
- /**
394
- * Check if content is valid text (UTF-8, mostly printable)
395
- */
396
- static isValidText(content) {
397
- // Check for null bytes which indicate binary content
398
- if (content.includes("\0")) {
399
- return false;
400
- }
401
- // Check if content has reasonable amount of printable characters
402
- let printableCount = 0;
403
- for (let i = 0; i < content.length; i++) {
404
- const code = content.charCodeAt(i);
405
- if ((code >= 32 && code < 127) || // ASCII printable
406
- code === 9 || // Tab
407
- code === 10 || // Newline
408
- code === 13 || // Carriage return
409
- code > 127 // Unicode (non-ASCII)
410
- ) {
411
- printableCount++;
412
- }
413
- }
414
- // At least 90% should be printable
415
- return printableCount / content.length >= 0.9;
416
- }
417
- /**
418
- * Guess the MIME type for text content based on content patterns
419
- */
420
- static guessTextMimeType(content) {
421
- const trimmed = content.trim();
422
- // Check for JSON
423
- if ((trimmed.startsWith("{") && trimmed.endsWith("}")) ||
424
- (trimmed.startsWith("[") && trimmed.endsWith("]"))) {
425
- try {
426
- JSON.parse(trimmed);
427
- return "application/json";
428
- }
429
- catch {
430
- // Not valid JSON, continue checking
431
- }
432
- }
433
- // Check for XML/HTML using stricter detection
434
- if (FileDetector.looksLikeXMLStrict(trimmed)) {
435
- const isHTML = trimmed.includes("<!DOCTYPE html") ||
436
- trimmed.toLowerCase().includes("<html") ||
437
- trimmed.includes("<head") ||
438
- trimmed.includes("<body");
439
- return isHTML ? "text/html" : "application/xml";
440
- }
441
- // Check for YAML using robust multi-indicator detection
442
- if (FileDetector.looksLikeYAMLStrict(trimmed)) {
443
- return "application/yaml";
444
- }
445
- // Default to plain text
446
- return "text/plain";
447
- }
448
- /**
449
- * Strict YAML detection for guessTextMimeType
450
- * Similar to ContentHeuristicStrategy but requires at least 2 indicators
451
- * to avoid false positives from simple key: value patterns
452
- */
453
- static looksLikeYAMLStrict(text) {
454
- if (text.length === 0) {
455
- return false;
456
- }
457
- const lines = text.split("\n");
458
- // For single-line content, only --- or ... qualify as YAML
459
- if (lines.length === 1) {
460
- return text === "---" || text === "...";
461
- }
462
- // Collect YAML indicators (requires at least 2 for positive detection)
463
- const indicators = [];
464
- // Indicator 1: Document start marker (---)
465
- indicators.push(text.startsWith("---"));
466
- // Indicator 2: Document end marker (...)
467
- indicators.push(/^\.\.\.$|[\n]\.\.\.$/.test(text));
468
- // Indicator 3: YAML list items (- followed by space)
469
- indicators.push(/^[\s]*-\s+[^-]/m.test(text));
470
- // Indicator 4: Multiple key-value pairs (at least 2)
471
- const keyValuePattern = /^[\s]*[a-zA-Z_][a-zA-Z0-9_-]*:\s*(.+)$/;
472
- const keyValueMatches = lines.filter((line) => keyValuePattern.test(line)).length;
473
- indicators.push(keyValueMatches >= 2);
474
- // Require at least 2 indicators for confident YAML detection
475
- const matchCount = indicators.filter(Boolean).length;
476
- return matchCount >= 2;
477
- }
478
- /**
479
- * Strict XML detection for guessTextMimeType
480
- * Ensures content has proper XML declaration or valid tag structure with closing tags
481
- * Prevents false positives from arbitrary content starting with <
482
- */
483
- static looksLikeXMLStrict(content) {
484
- // XML declaration is a definitive marker
485
- if (content.startsWith("<?xml")) {
486
- return true;
487
- }
488
- // Must start with < for XML/HTML
489
- if (!content.startsWith("<")) {
490
- return false;
491
- }
492
- // Check for HTML DOCTYPE declaration
493
- if (content.includes("<!DOCTYPE html")) {
494
- return true;
495
- }
496
- // Must have valid opening tag structure: <tagname
497
- // Not just any < character like "< something"
498
- const hasValidOpeningTag = /<[a-zA-Z][a-zA-Z0-9-]*(?:\s[^>]*)?>/;
499
- if (!hasValidOpeningTag.test(content)) {
500
- return false;
501
- }
502
- // Must have at least one closing tag or self-closing tag to be valid XML/HTML
503
- const hasClosingTag = /<\/[a-zA-Z][a-zA-Z0-9-]*>/.test(content);
504
- const hasSelfClosingTag = /<[a-zA-Z][a-zA-Z0-9-]*(?:\s[^>]*)?\s*\/\s*>/.test(content);
505
- return hasClosingTag || hasSelfClosingTag;
506
- }
507
- /**
508
- * Detect file type using multi-strategy approach
509
- * Stops at first strategy with confidence >= threshold (default: 80%)
510
- */
511
- static async detect(input, options) {
512
- const confidenceThreshold = options?.confidenceThreshold ?? 80;
513
- const strategies = [
514
- new MagicBytesStrategy(),
515
- new MimeTypeStrategy(),
516
- new ExtensionStrategy(),
517
- new ContentHeuristicStrategy(),
518
- ];
519
- let best = null;
520
- for (const strategy of strategies) {
521
- const result = await strategy.detect(input);
522
- if (!best || result.metadata.confidence > best.metadata.confidence) {
523
- best = result;
524
- }
525
- if (result.metadata.confidence >= confidenceThreshold) {
526
- logger.info(`[FileDetector] Type: ${result.type} (${result.metadata.confidence}%)`);
527
- return result;
528
- }
529
- }
530
- logger.warn(`[FileDetector] Low confidence: ${best?.type ?? "unknown"} (${best?.metadata.confidence ?? 0}%)`);
531
- return best;
532
- }
533
- /**
534
- * Load file content from various sources
535
- */
536
- static async loadContent(input, detection, options) {
537
- let source = detection.source;
538
- if (source === "buffer" && !Buffer.isBuffer(input)) {
539
- if (typeof input === "string") {
540
- if (input.startsWith("data:")) {
541
- source = "datauri";
542
- }
543
- else if (input.startsWith("http://") ||
544
- input.startsWith("https://")) {
545
- source = "url";
546
- }
547
- else {
548
- source = "path";
549
- }
550
- }
551
- }
552
- switch (source) {
553
- case "url":
554
- return await FileDetector.loadFromURL(input, options);
555
- case "path":
556
- return await FileDetector.loadFromPath(input, options);
557
- case "buffer":
558
- return input;
559
- case "datauri":
560
- return FileDetector.loadFromDataURI(input);
561
- default:
562
- throw new Error(`Unknown source: ${source}`);
563
- }
564
- }
565
- /**
566
- * SDK-8: Format an informative placeholder when a file processor fails.
567
- * Instead of bare "[Video file: name]" strings, include size, format, and
568
- * the reason for failure so the LLM can acknowledge the attachment.
569
- */
570
- static formatInformativePlaceholder(typeName, filename, content, detection, error) {
571
- const sizeStr = content.length < 1024
572
- ? `${content.length} bytes`
573
- : content.length < 1024 * 1024
574
- ? `${(content.length / 1024).toFixed(1)} KB`
575
- : `${(content.length / (1024 * 1024)).toFixed(1)} MB`;
576
- const errorMsg = error instanceof Error
577
- ? error.message
578
- : error
579
- ? String(error)
580
- : "Processing returned no usable content";
581
- return (`[${typeName} File: "${filename}"]\n` +
582
- `Size: ${sizeStr}\n` +
583
- `Format: ${detection.mimeType || "unknown"}\n` +
584
- `Error: Could not extract content (${errorMsg}).\n` +
585
- `The file was attached but could not be fully analyzed.`);
586
- }
587
- /**
588
- * Extract metadata and printable strings from an unrecognized binary file.
589
- * This is the "extract what you can" path for unknown file types.
590
- *
591
- * Extracts:
592
- * - File size (human-readable)
593
- * - MIME type / detected format
594
- * - First N bytes as hex dump (for identification)
595
- * - Printable ASCII/UTF-8 strings found in the binary (like `strings` command)
596
- * - Known file signatures that we don't have full processors for
597
- *
598
- * @param content Raw file buffer
599
- * @param detection Detection result (may be "unknown")
600
- * @param filename Original filename (if known)
601
- * @returns Formatted text summary suitable for LLM consumption
602
- */
603
- static extractBinaryMetadata(content, detection, filename) {
604
- const parts = [];
605
- // Header
606
- const ext = detection.extension
607
- ? `.${detection.extension}`
608
- : filename.includes(".")
609
- ? filename.slice(filename.lastIndexOf("."))
610
- : "";
611
- const typeLabel = ext
612
- ? `${ext.toUpperCase().slice(1)} file`
613
- : "Binary file";
614
- parts.push(`[${typeLabel}: "${filename}"]`);
615
- // Basic metadata
616
- const sizeStr = formatFileSize(content.length);
617
- parts.push(`Size: ${sizeStr}`);
618
- if (detection.mimeType &&
619
- detection.mimeType !== "application/octet-stream") {
620
- parts.push(`Format: ${detection.mimeType}`);
621
- }
622
- // Known binary signature identification (broader than our processing capabilities)
623
- const sigLabel = FileDetector.identifyBinarySignature(content);
624
- if (sigLabel) {
625
- parts.push(`Identified as: ${sigLabel}`);
626
- }
627
- // Hex dump of first 32 bytes for identification
628
- const hexPreview = content
629
- .subarray(0, Math.min(32, content.length))
630
- .toString("hex")
631
- .match(/.{1,2}/g)
632
- ?.join(" ");
633
- if (hexPreview) {
634
- parts.push(`Header bytes: ${hexPreview}`);
635
- }
636
- // Extract printable strings (similar to Unix `strings` command)
637
- const strings = FileDetector.extractPrintableStrings(content, 4, 50);
638
- if (strings.length > 0) {
639
- parts.push(`\nEmbedded text found (${strings.length} string${strings.length > 1 ? "s" : ""}):`);
640
- for (const s of strings) {
641
- parts.push(` "${s}"`);
642
- }
643
- }
644
- parts.push(`\nThis file was attached but its format is not fully supported for content extraction.`);
645
- parts.push(`The above metadata and any embedded text have been extracted for context.`);
646
- return parts.join("\n");
647
- }
648
- /**
649
- * Identify known binary file signatures beyond what we can process.
650
- * Returns a human-readable description, or null if unrecognized.
651
- */
652
- static identifyBinarySignature(buf) {
653
- if (buf.length < 4) {
654
- return null;
655
- }
656
- // SQLite: "SQLite format 3\0"
657
- if (buf.length >= 16 &&
658
- buf.subarray(0, 15).toString("ascii") === "SQLite format 3") {
659
- return "SQLite database";
660
- }
661
- // WOFF: "wOFF"
662
- if (buf[0] === 0x77 &&
663
- buf[1] === 0x4f &&
664
- buf[2] === 0x46 &&
665
- buf[3] === 0x46) {
666
- return "WOFF font";
667
- }
668
- // WOFF2: "wOF2"
669
- if (buf[0] === 0x77 &&
670
- buf[1] === 0x4f &&
671
- buf[2] === 0x46 &&
672
- buf[3] === 0x32) {
673
- return "WOFF2 font";
674
- }
675
- // TrueType/OpenType: starts with 0x00010000 or "OTTO"
676
- if ((buf[0] === 0x00 &&
677
- buf[1] === 0x01 &&
678
- buf[2] === 0x00 &&
679
- buf[3] === 0x00) ||
680
- (buf[0] === 0x4f && buf[1] === 0x54 && buf[2] === 0x54 && buf[3] === 0x4f)) {
681
- return "TrueType/OpenType font";
682
- }
683
- // ELF executable: \x7fELF
684
- if (buf[0] === 0x7f &&
685
- buf[1] === 0x45 &&
686
- buf[2] === 0x4c &&
687
- buf[3] === 0x46) {
688
- return "ELF executable/library";
689
- }
690
- // Mach-O: 0xFEEDFACE or 0xFEEDFACF (64-bit) or 0xCAFEBABE (universal)
691
- if ((buf[0] === 0xfe &&
692
- buf[1] === 0xed &&
693
- buf[2] === 0xfa &&
694
- buf[3] === 0xce) ||
695
- (buf[0] === 0xfe &&
696
- buf[1] === 0xed &&
697
- buf[2] === 0xfa &&
698
- buf[3] === 0xcf) ||
699
- (buf[0] === 0xca && buf[1] === 0xfe && buf[2] === 0xba && buf[3] === 0xbe)) {
700
- return "Mach-O executable/library";
701
- }
702
- // PE/Windows executable: "MZ"
703
- if (buf[0] === 0x4d && buf[1] === 0x5a) {
704
- return "Windows PE executable/DLL";
705
- }
706
- // WebAssembly: "\0asm"
707
- if (buf[0] === 0x00 &&
708
- buf[1] === 0x61 &&
709
- buf[2] === 0x73 &&
710
- buf[3] === 0x6d) {
711
- return "WebAssembly binary";
712
- }
713
- // DWG (AutoCAD): starts with "AC10"
714
- if (buf[0] === 0x41 &&
715
- buf[1] === 0x43 &&
716
- buf[2] === 0x31 &&
717
- buf[3] === 0x30) {
718
- return "AutoCAD DWG drawing";
719
- }
720
- // BZ2: "BZ" + 'h'
721
- if (buf[0] === 0x42 && buf[1] === 0x5a && buf[2] === 0x68) {
722
- return "BZip2 compressed archive";
723
- }
724
- // XZ: 0xFD + "7zXZ"
725
- if (buf.length >= 6 &&
726
- buf[0] === 0xfd &&
727
- buf[1] === 0x37 &&
728
- buf[2] === 0x7a &&
729
- buf[3] === 0x58 &&
730
- buf[4] === 0x5a &&
731
- buf[5] === 0x00) {
732
- return "XZ compressed archive";
733
- }
734
- // 7z: "7z" + BC AF 27 1C
735
- if (buf.length >= 6 &&
736
- buf[0] === 0x37 &&
737
- buf[1] === 0x7a &&
738
- buf[2] === 0xbc &&
739
- buf[3] === 0xaf &&
740
- buf[4] === 0x27 &&
741
- buf[5] === 0x1c) {
742
- return "7-Zip archive";
743
- }
744
- // ISO 9660: "CD001" at offset 32769
745
- if (buf.length > 32773 &&
746
- buf.subarray(32769, 32774).toString("ascii") === "CD001") {
747
- return "ISO 9660 disc image";
748
- }
749
- // Apache Parquet: "PAR1"
750
- if (buf[0] === 0x50 &&
751
- buf[1] === 0x41 &&
752
- buf[2] === 0x52 &&
753
- buf[3] === 0x31) {
754
- return "Apache Parquet data file";
755
- }
756
- // Protocol Buffers compiled: (no fixed magic, skip)
757
- // TIFF (already handled as image, but including for completeness)
758
- if ((buf[0] === 0x49 &&
759
- buf[1] === 0x49 &&
760
- buf[2] === 0x2a &&
761
- buf[3] === 0x00) ||
762
- (buf[0] === 0x4d && buf[1] === 0x4d && buf[2] === 0x00 && buf[3] === 0x2a)) {
763
- return "TIFF image";
764
- }
765
- // ICO: 00 00 01 00
766
- if (buf[0] === 0x00 &&
767
- buf[1] === 0x00 &&
768
- buf[2] === 0x01 &&
769
- buf[3] === 0x00) {
770
- return "ICO icon image";
771
- }
772
- return null;
773
- }
774
- /**
775
- * Extract printable ASCII strings from a binary buffer.
776
- * Similar to the Unix `strings` utility.
777
- *
778
- * @param buf Buffer to scan
779
- * @param minLength Minimum string length to include (default 4)
780
- * @param maxStrings Maximum number of strings to return (default 50)
781
- * @returns Array of printable strings found in the binary
782
- */
783
- static extractPrintableStrings(buf, minLength = 4, maxStrings = 50) {
784
- const strings = [];
785
- let current = "";
786
- // Only scan first 64KB to avoid huge processing time
787
- const scanLimit = Math.min(buf.length, 64 * 1024);
788
- for (let i = 0; i < scanLimit; i++) {
789
- const byte = buf[i];
790
- // Printable ASCII range (space through tilde) plus tab
791
- if ((byte >= 0x20 && byte <= 0x7e) || byte === 0x09) {
792
- current += String.fromCharCode(byte);
793
- }
794
- else {
795
- if (current.length >= minLength) {
796
- strings.push(current);
797
- if (strings.length >= maxStrings) {
798
- break;
799
- }
800
- }
801
- current = "";
802
- }
803
- }
804
- // Flush last string
805
- if (current.length >= minLength && strings.length < maxStrings) {
806
- strings.push(current);
807
- }
808
- return strings;
809
- }
810
- /**
811
- * Route to appropriate processor
812
- */
813
- static async processFile(content, detection, options, provider) {
814
- switch (detection.type) {
815
- case "csv":
816
- // Pass original extension through to CSV processor; if detection has none,
817
- // fall back to any extension provided in csvOptions.
818
- return await CSVProcessor.process(content, {
819
- ...options,
820
- extension: detection.extension ?? options?.extension,
821
- });
822
- case "image":
823
- return await ImageProcessor.process(content);
824
- case "pdf":
825
- return await PDFProcessor.process(content, { provider });
826
- case "svg":
827
- // SVG is processed as text content (sanitized XML markup)
828
- // AI providers don't support SVG as image format, so we extract text content
829
- return await FileDetector.processSvgAsText(content, detection);
830
- case "video":
831
- return await FileDetector.processVideoFile(content, detection);
832
- case "audio":
833
- return await FileDetector.processAudioFile(content, detection);
834
- case "archive":
835
- return await FileDetector.processArchiveFile(content, detection);
836
- case "xlsx":
837
- return await FileDetector.processXlsxFile(content, detection);
838
- case "docx":
839
- return await FileDetector.processDocxFile(content, detection);
840
- case "pptx":
841
- return await FileDetector.processPptxFile(content, detection);
842
- case "text":
843
- return {
844
- type: "text",
845
- content: content.toString("utf-8"),
846
- mimeType: detection.mimeType || "text/plain",
847
- metadata: detection.metadata,
848
- };
849
- default: {
850
- // Graceful degradation: try to treat unknown types as text if content is valid UTF-8
851
- const unknownContent = content.toString("utf-8");
852
- if (FileDetector.isValidText(unknownContent)) {
853
- logger.warn(`[FileDetector] Unknown type "${detection.type}", treating as text`);
854
- return {
855
- type: "text",
856
- content: unknownContent,
857
- mimeType: detection.mimeType || "text/plain",
858
- metadata: detection.metadata,
859
- };
860
- }
861
- // Binary file that we can't fully process — extract what we can
862
- // (metadata, printable strings, signature identification)
863
- const filename = detection.metadata.filename || "file";
864
- logger.warn(`[FileDetector] Unknown binary type "${detection.type}", extracting metadata for "${filename}"`);
865
- return {
866
- type: "unknown",
867
- content: FileDetector.extractBinaryMetadata(content, detection, filename),
868
- mimeType: detection.mimeType || "application/octet-stream",
869
- metadata: detection.metadata,
870
- };
871
- }
872
- }
873
- }
874
- /**
875
- * Process video file: extract metadata, keyframes, and subtitles via VideoProcessor
876
- */
877
- static async processVideoFile(content, detection) {
878
- const videoFilename = detection.metadata.filename || "video";
879
- try {
880
- const videoResult = await videoProcessor.processFile({
881
- id: videoFilename,
882
- name: videoFilename,
883
- mimetype: detection.mimeType || "video/mp4",
884
- size: content.length,
885
- buffer: content,
886
- });
887
- if (videoResult.success && videoResult.data) {
888
- return {
889
- type: "video",
890
- content: videoResult.data.textContent ||
891
- FileDetector.formatInformativePlaceholder("Video", videoFilename, content, detection),
892
- mimeType: detection.mimeType,
893
- images: videoResult.data.keyframes && videoResult.data.keyframes.length > 0
894
- ? videoResult.data.keyframes
895
- : undefined,
896
- metadata: {
897
- ...detection.metadata,
898
- frameCount: videoResult.data.frameCount,
899
- hasKeyframes: videoResult.data.hasKeyframes,
900
- },
901
- };
902
- }
903
- }
904
- catch (videoError) {
905
- logger.warn(`[FileDetector] VideoProcessor failed for ${videoFilename}, using fallback`, videoError instanceof Error ? videoError.message : String(videoError));
906
- return {
907
- type: "video",
908
- content: FileDetector.formatInformativePlaceholder("Video", videoFilename, content, detection, videoError),
909
- mimeType: detection.mimeType,
910
- metadata: detection.metadata,
911
- };
912
- }
913
- // Fallback if processor returned no data
914
- return {
915
- type: "video",
916
- content: FileDetector.formatInformativePlaceholder("Video", videoFilename, content, detection),
917
- mimeType: detection.mimeType,
918
- metadata: detection.metadata,
919
- };
920
- }
921
- /**
922
- * Process audio file: extract metadata, tags, and cover art via AudioProcessor
923
- */
924
- static async processAudioFile(content, detection) {
925
- const audioFilename = detection.metadata.filename || "audio";
926
- try {
927
- const audioResult = await audioProcessor.processFile({
928
- id: audioFilename,
929
- name: audioFilename,
930
- mimetype: detection.mimeType || "audio/mpeg",
931
- size: content.length,
932
- buffer: content,
933
- });
934
- if (audioResult.success && audioResult.data) {
935
- return {
936
- type: "audio",
937
- content: audioResult.data.textContent ||
938
- FileDetector.formatInformativePlaceholder("Audio", audioFilename, content, detection),
939
- mimeType: detection.mimeType,
940
- // Surface embedded cover art as an image content block
941
- images: audioResult.data.coverArt
942
- ? [audioResult.data.coverArt]
943
- : undefined,
944
- metadata: detection.metadata,
945
- };
946
- }
947
- }
948
- catch (audioError) {
949
- logger.warn(`[FileDetector] AudioProcessor failed for ${audioFilename}, using fallback`, audioError instanceof Error ? audioError.message : String(audioError));
950
- return {
951
- type: "audio",
952
- content: FileDetector.formatInformativePlaceholder("Audio", audioFilename, content, detection, audioError),
953
- mimeType: detection.mimeType,
954
- metadata: detection.metadata,
955
- };
956
- }
957
- // Fallback if processor returned no data
958
- return {
959
- type: "audio",
960
- content: FileDetector.formatInformativePlaceholder("Audio", audioFilename, content, detection),
961
- mimeType: detection.mimeType,
962
- metadata: detection.metadata,
963
- };
964
- }
965
- /**
966
- * Process archive file: list contents and extract metadata via ArchiveProcessor
967
- */
968
- static async processArchiveFile(content, detection) {
969
- const archiveFilename = detection.metadata.filename || "archive";
970
- try {
971
- const archiveResult = await archiveProcessor.processFile({
972
- id: archiveFilename,
973
- name: archiveFilename,
974
- mimetype: detection.mimeType || "application/zip",
975
- size: content.length,
976
- buffer: content,
977
- });
978
- if (archiveResult.success && archiveResult.data) {
979
- return {
980
- type: "archive",
981
- content: archiveResult.data.textContent ||
982
- FileDetector.formatInformativePlaceholder("Archive", archiveFilename, content, detection),
983
- mimeType: detection.mimeType,
984
- metadata: detection.metadata,
985
- };
986
- }
987
- }
988
- catch (archiveError) {
989
- logger.warn(`[FileDetector] ArchiveProcessor failed for ${archiveFilename}, using fallback`, archiveError instanceof Error
990
- ? archiveError.message
991
- : String(archiveError));
992
- return {
993
- type: "archive",
994
- content: FileDetector.formatInformativePlaceholder("Archive", archiveFilename, content, detection, archiveError),
995
- mimeType: detection.mimeType,
996
- metadata: detection.metadata,
997
- };
998
- }
999
- // Fallback if processor returned no data
1000
- return {
1001
- type: "archive",
1002
- content: FileDetector.formatInformativePlaceholder("Archive", archiveFilename, content, detection),
1003
- mimeType: detection.mimeType,
1004
- metadata: detection.metadata,
1005
- };
1006
- }
1007
- /**
1008
- * Process Excel/OpenDocument spreadsheet file via ExcelProcessor or OpenDocumentProcessor
1009
- */
1010
- static async processXlsxFile(content, detection) {
1011
- const xlsxFilename = detection.metadata.filename || "spreadsheet";
1012
- try {
1013
- const ext = detection.extension?.toLowerCase();
1014
- if (ext === "ods") {
1015
- const { openDocumentProcessor } = await import("../processors/document/OpenDocumentProcessor.js");
1016
- const odsResult = await openDocumentProcessor.processFile({
1017
- id: xlsxFilename,
1018
- name: xlsxFilename,
1019
- mimetype: detection.mimeType ||
1020
- "application/vnd.oasis.opendocument.spreadsheet",
1021
- size: content.length,
1022
- buffer: content,
1023
- });
1024
- if (odsResult.success && odsResult.data) {
1025
- return {
1026
- type: "xlsx",
1027
- content: odsResult.data.textContent ||
1028
- FileDetector.formatInformativePlaceholder("Spreadsheet", xlsxFilename, content, detection),
1029
- mimeType: detection.mimeType,
1030
- metadata: detection.metadata,
1031
- };
1032
- }
1033
- }
1034
- else {
1035
- const { excelProcessor } = await import("../processors/document/ExcelProcessor.js");
1036
- const xlsxResult = await excelProcessor.processFile({
1037
- id: xlsxFilename,
1038
- name: xlsxFilename,
1039
- mimetype: detection.mimeType ||
1040
- "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
1041
- size: content.length,
1042
- buffer: content,
1043
- });
1044
- if (xlsxResult.success && xlsxResult.data) {
1045
- // Build text content from worksheets
1046
- const sheets = xlsxResult.data.worksheets || [];
1047
- let textContent = `Spreadsheet: ${sheets.length} sheet(s), ${xlsxResult.data.totalRows} total rows\n`;
1048
- for (const sheet of sheets) {
1049
- textContent += `\n### Sheet: ${sheet.name}\n`;
1050
- textContent += `Columns (${sheet.columnCount}): ${sheet.headers.join(", ")}\n`;
1051
- textContent += `Rows: ${sheet.rowCount}\n`;
1052
- // Include first rows as sample data
1053
- const sampleRows = sheet.rows.slice(0, 20);
1054
- const rowText = sampleRows
1055
- .map((row) => row.map((c) => String(c ?? "")).join("\t"))
1056
- .join("\n");
1057
- if (!rowText) {
1058
- continue;
1059
- }
1060
- textContent += `\nData:\n${sheet.headers.join("\t")}\n${rowText}\n`;
1061
- const remaining = sheet.rowCount - 20;
1062
- if (remaining > 0) {
1063
- textContent += `... (${remaining} more rows)\n`;
1064
- }
1065
- }
1066
- return {
1067
- type: "xlsx",
1068
- content: textContent,
1069
- mimeType: detection.mimeType,
1070
- metadata: detection.metadata,
1071
- };
1072
- }
1073
- }
1074
- }
1075
- catch (xlsxError) {
1076
- logger.warn(`[FileDetector] ExcelProcessor failed for ${xlsxFilename}, using fallback`, xlsxError instanceof Error ? xlsxError.message : String(xlsxError));
1077
- return {
1078
- type: "xlsx",
1079
- content: FileDetector.formatInformativePlaceholder("Spreadsheet", xlsxFilename, content, detection, xlsxError),
1080
- mimeType: detection.mimeType,
1081
- metadata: detection.metadata,
1082
- };
1083
- }
1084
- // Fallback if processor returned no data
1085
- return {
1086
- type: "xlsx",
1087
- content: FileDetector.formatInformativePlaceholder("Spreadsheet", xlsxFilename, content, detection),
1088
- mimeType: detection.mimeType,
1089
- metadata: detection.metadata,
1090
- };
1091
- }
1092
- /**
1093
- * Process Word/OpenDocument/RTF document via WordProcessor, OpenDocumentProcessor, or RtfProcessor
1094
- */
1095
- static async processDocxFile(content, detection) {
1096
- const docxFilename = detection.metadata.filename || "document";
1097
- const ext = detection.extension?.toLowerCase();
1098
- try {
1099
- if (ext === "odt") {
1100
- const { openDocumentProcessor } = await import("../processors/document/OpenDocumentProcessor.js");
1101
- const odtResult = await openDocumentProcessor.processFile({
1102
- id: docxFilename,
1103
- name: docxFilename,
1104
- mimetype: detection.mimeType || "application/vnd.oasis.opendocument.text",
1105
- size: content.length,
1106
- buffer: content,
1107
- });
1108
- if (odtResult.success && odtResult.data) {
1109
- return {
1110
- type: "docx",
1111
- content: odtResult.data.textContent ||
1112
- FileDetector.formatInformativePlaceholder("Document", docxFilename, content, detection),
1113
- mimeType: detection.mimeType,
1114
- metadata: detection.metadata,
1115
- };
1116
- }
1117
- }
1118
- else if (ext === "rtf") {
1119
- const { rtfProcessor } = await import("../processors/document/RtfProcessor.js");
1120
- const rtfResult = await rtfProcessor.processFile({
1121
- id: docxFilename,
1122
- name: docxFilename,
1123
- mimetype: detection.mimeType || "application/rtf",
1124
- size: content.length,
1125
- buffer: content,
1126
- });
1127
- if (rtfResult.success && rtfResult.data) {
1128
- return {
1129
- type: "docx",
1130
- content: rtfResult.data.textContent ||
1131
- FileDetector.formatInformativePlaceholder("Document", docxFilename, content, detection),
1132
- mimeType: detection.mimeType,
1133
- metadata: detection.metadata,
1134
- };
1135
- }
1136
- }
1137
- else {
1138
- const { wordProcessor } = await import("../processors/document/WordProcessor.js");
1139
- const docxResult = await wordProcessor.processFile({
1140
- id: docxFilename,
1141
- name: docxFilename,
1142
- mimetype: detection.mimeType ||
1143
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
1144
- size: content.length,
1145
- buffer: content,
1146
- });
1147
- if (docxResult.success && docxResult.data) {
1148
- return {
1149
- type: "docx",
1150
- content: docxResult.data.textContent ||
1151
- FileDetector.formatInformativePlaceholder("Document", docxFilename, content, detection),
1152
- mimeType: detection.mimeType,
1153
- metadata: detection.metadata,
1154
- };
1155
- }
1156
- }
1157
- }
1158
- catch (docxError) {
1159
- logger.warn(`[FileDetector] Document processor failed for ${docxFilename}, using fallback`, docxError instanceof Error ? docxError.message : String(docxError));
1160
- return {
1161
- type: "docx",
1162
- content: FileDetector.formatInformativePlaceholder("Document", docxFilename, content, detection, docxError),
1163
- mimeType: detection.mimeType,
1164
- metadata: detection.metadata,
1165
- };
1166
- }
1167
- // Fallback if processor returned no data
1168
- return {
1169
- type: "docx",
1170
- content: FileDetector.formatInformativePlaceholder("Document", docxFilename, content, detection),
1171
- mimeType: detection.mimeType,
1172
- metadata: detection.metadata,
1173
- };
1174
- }
1175
- /**
1176
- * Process PowerPoint/OpenDocument presentation via PptxProcessor
1177
- */
1178
- static async processPptxFile(content, detection) {
1179
- const pptxFilename = detection.metadata.filename || "presentation";
1180
- try {
1181
- const { PptxProcessor } = await import("../processors/document/PptxProcessor.js");
1182
- const pptxResult = await PptxProcessor.extractText(content);
1183
- if (pptxResult) {
1184
- return {
1185
- type: "pptx",
1186
- content: pptxResult,
1187
- mimeType: detection.mimeType,
1188
- metadata: detection.metadata,
1189
- };
1190
- }
1191
- }
1192
- catch (pptxError) {
1193
- logger.warn(`[FileDetector] PptxProcessor failed for ${pptxFilename}, using fallback`, pptxError instanceof Error ? pptxError.message : String(pptxError));
1194
- return {
1195
- type: "pptx",
1196
- content: FileDetector.formatInformativePlaceholder("Presentation", pptxFilename, content, detection, pptxError),
1197
- mimeType: detection.mimeType,
1198
- metadata: detection.metadata,
1199
- };
1200
- }
1201
- // Fallback if processor returned no content
1202
- return {
1203
- type: "pptx",
1204
- content: FileDetector.formatInformativePlaceholder("Presentation", pptxFilename, content, detection),
1205
- mimeType: detection.mimeType,
1206
- metadata: detection.metadata,
1207
- };
1208
- }
1209
- /**
1210
- * Process SVG file as text content
1211
- * Uses SvgProcessor for security sanitization (removes XSS vectors)
1212
- * Returns sanitized SVG markup as text for AI analysis
1213
- */
1214
- static async processSvgAsText(content, detection) {
1215
- try {
1216
- // Dynamic import to avoid circular dependencies
1217
- const { processSvg } = await import("../processors/markup/SvgProcessor.js");
1218
- const result = await processSvg({
1219
- id: "svg-file",
1220
- name: detection.metadata.filename || "image.svg",
1221
- mimetype: "image/svg+xml",
1222
- size: content.length,
1223
- buffer: content,
1224
- });
1225
- if (result.success && result.data) {
1226
- logger.info(`[FileDetector] SVG processed as text: ${detection.metadata.filename || "image.svg"}`);
1227
- return {
1228
- type: "svg",
1229
- content: result.data.textContent, // Sanitized SVG content
1230
- mimeType: "image/svg+xml",
1231
- metadata: {
1232
- confidence: detection.metadata.confidence,
1233
- size: content.length,
1234
- filename: detection.metadata.filename,
1235
- extension: detection.extension,
1236
- },
1237
- };
1238
- }
1239
- else {
1240
- // Fail closed: return safe empty SVG instead of raw unsanitized content
1241
- logger.warn(`[FileDetector] SVG processor failed, returning safe empty SVG: ${result.error?.userMessage}`);
1242
- return {
1243
- type: "svg",
1244
- content: '<svg xmlns="http://www.w3.org/2000/svg"></svg>',
1245
- mimeType: "image/svg+xml",
1246
- metadata: {
1247
- confidence: detection.metadata.confidence,
1248
- size: content.length,
1249
- filename: detection.metadata.filename,
1250
- extension: detection.extension,
1251
- },
1252
- };
1253
- }
1254
- }
1255
- catch (error) {
1256
- // Fail closed: return safe empty SVG instead of raw unsanitized content
1257
- logger.warn(`[FileDetector] SVG processor not available, returning safe empty SVG: ${error instanceof Error ? error.message : String(error)}`);
1258
- return {
1259
- type: "svg",
1260
- content: '<svg xmlns="http://www.w3.org/2000/svg"></svg>',
1261
- mimeType: "image/svg+xml",
1262
- metadata: {
1263
- confidence: detection.metadata.confidence,
1264
- size: content.length,
1265
- filename: detection.metadata.filename,
1266
- extension: detection.extension,
1267
- },
1268
- };
1269
- }
1270
- }
1271
- /**
1272
- * Load file from URL with automatic retry on transient network errors
1273
- */
1274
- static async loadFromURL(url, options) {
1275
- const maxSize = options?.maxSize || 200 * 1024 * 1024; // 200MB default (matches Curator memory-safety cap)
1276
- const timeout = options?.timeout || FileDetector.DEFAULT_NETWORK_TIMEOUT;
1277
- const maxRetries = options?.maxRetries ?? DEFAULT_MAX_RETRIES;
1278
- const retryDelay = options?.retryDelay ?? DEFAULT_RETRY_DELAY;
1279
- return withRetry(async () => {
1280
- const response = await request(url, {
1281
- dispatcher: getGlobalDispatcher().compose(interceptors.redirect({ maxRedirections: 5 })),
1282
- method: "GET",
1283
- headersTimeout: timeout,
1284
- bodyTimeout: timeout,
1285
- });
1286
- if (response.statusCode !== 200) {
1287
- throw new Error(`HTTP ${response.statusCode}`);
1288
- }
1289
- const chunks = [];
1290
- let totalSize = 0;
1291
- for await (const chunk of response.body) {
1292
- totalSize += chunk.length;
1293
- if (totalSize > maxSize) {
1294
- throw new Error(`File too large: ${formatFileSize(totalSize)} (max: ${formatFileSize(maxSize)})`);
1295
- }
1296
- chunks.push(chunk);
1297
- }
1298
- return Buffer.concat(chunks);
1299
- }, { maxRetries, retryDelay });
1300
- }
1301
- /**
1302
- * Load file from filesystem path
1303
- */
1304
- static async loadFromPath(path, options) {
1305
- const maxSize = options?.maxSize || 200 * 1024 * 1024; // 200MB default (matches Curator memory-safety cap)
1306
- const statInfo = await stat(path);
1307
- if (!statInfo.isFile()) {
1308
- throw new Error("Not a file");
1309
- }
1310
- if (statInfo.size > maxSize) {
1311
- throw new Error(`File too large: ${formatFileSize(statInfo.size)} (max: ${formatFileSize(maxSize)})`);
1312
- }
1313
- return await readFile(path);
1314
- }
1315
- /**
1316
- * Load file from data URI
1317
- */
1318
- static loadFromDataURI(dataUri) {
1319
- const match = dataUri.match(/^data:([^;]+);base64,(.+)$/);
1320
- if (!match) {
1321
- throw new Error("Invalid data URI format");
1322
- }
1323
- return Buffer.from(match[2], "base64");
1324
- }
1325
- }
1326
/**
 * Strategy 1: Magic Bytes Detection (up to 95% confidence)
 * Detects file type from binary file headers.
 *
 * Only Buffer inputs are inspected; anything else yields the "unknown"
 * result. Signatures are tested in a fixed order and the first match wins.
 */
class MagicBytesStrategy {
    /**
     * @param {*} input - Candidate file content.
     * @returns {Promise<object>} Detection result; `type` is "unknown" with
     *   confidence 0 when no signature matches.
     */
    async detect(input) {
        if (!Buffer.isBuffer(input)) {
            return this.unknown();
        }
        if (this.isPNG(input)) {
            return this.result("image", "image/png", 95);
        }
        if (this.isJPEG(input)) {
            return this.result("image", "image/jpeg", 95);
        }
        if (this.isGIF(input)) {
            return this.result("image", "image/gif", 95);
        }
        if (this.isWebP(input)) {
            return this.result("image", "image/webp", 95);
        }
        if (this.isPDF(input)) {
            return this.result("pdf", "application/pdf", 95);
        }
        // ISO base media container: "ftyp" at offset 4. The container is
        // shared by MP4/MOV video, M4A audio and AVIF images, so the major
        // brand decides the reported type.
        if (this.hasBytes(input, 4, [0x66, 0x74, 0x79, 0x70])) {
            return this.detectFtypBrand(input);
        }
        // MKV/WebM: EBML header
        if (this.hasBytes(input, 0, [0x1a, 0x45, 0xdf, 0xa3])) {
            return this.result("video", "video/x-matroska", 90);
        }
        // AVI: "RIFF" + "AVI "
        if (this.isRiff(input, "AVI ")) {
            return this.result("video", "video/x-msvideo", 95);
        }
        // WAV: "RIFF" + "WAVE"
        if (this.isRiff(input, "WAVE")) {
            return this.result("audio", "audio/wav", 95);
        }
        // MP3: ID3 tag
        if (this.hasBytes(input, 0, [0x49, 0x44, 0x33])) {
            return this.result("audio", "audio/mpeg", 95);
        }
        // MP3: sync word (11 set bits). This is a weak signal, hence the
        // lower confidence.
        if (input.length >= 2 && input[0] === 0xff && (input[1] & 0xe0) === 0xe0) {
            return this.result("audio", "audio/mpeg", 80);
        }
        // FLAC: "fLaC"
        if (this.hasBytes(input, 0, [0x66, 0x4c, 0x61, 0x43])) {
            return this.result("audio", "audio/flac", 95);
        }
        // OGG: "OggS"
        if (this.hasBytes(input, 0, [0x4f, 0x67, 0x67, 0x53])) {
            return this.result("audio", "audio/ogg", 90);
        }
        // ZIP: "PK\x03\x04"
        // NOTE: Many document formats (OOXML: .xlsx, .docx, .pptx; ODF: .odt, .ods)
        // are internally ZIP archives and share these magic bytes. We return a lower
        // confidence (70%) so the ExtensionStrategy (85%) can override with the correct
        // document type when a file path with extension is available. For raw buffers
        // without path info, this falls through to archive as a safe default.
        if (this.hasBytes(input, 0, [0x50, 0x4b, 0x03, 0x04])) {
            return this.result("archive", "application/zip", 70);
        }
        // GZIP: 1F 8B
        if (this.hasBytes(input, 0, [0x1f, 0x8b])) {
            return this.result("archive", "application/gzip", 90);
        }
        // RAR: "Rar!"
        if (this.hasBytes(input, 0, [0x52, 0x61, 0x72, 0x21])) {
            return this.result("archive", "application/x-rar-compressed", 95);
        }
        return this.unknown();
    }
    /**
     * Classify an "ftyp" container by its 4-character major brand
     * (bytes 8-11). Unrecognised or missing brands keep the generic
     * MP4 video result.
     */
    detectFtypBrand(buf) {
        const brand = buf.length >= 12 ? buf.toString("latin1", 8, 12) : "";
        if (brand === "avif" || brand === "avis") {
            return this.result("image", "image/avif", 95);
        }
        if (brand === "M4A " || brand === "M4B ") {
            return this.result("audio", "audio/mp4", 95);
        }
        if (brand === "qt  ") {
            return this.result("video", "video/quicktime", 95);
        }
        return this.result("video", "video/mp4", 95);
    }
    /** True when `buf` contains exactly `bytes` starting at `offset`. */
    hasBytes(buf, offset, bytes) {
        return (buf.length >= offset + bytes.length &&
            bytes.every((byte, i) => buf[offset + i] === byte));
    }
    /** True for a RIFF container whose form type (bytes 8-11) is `formType`. */
    isRiff(buf, formType) {
        return (buf.length >= 12 &&
            buf.toString("latin1", 0, 4) === "RIFF" &&
            buf.toString("latin1", 8, 12) === formType);
    }
    isPNG(buf) {
        return this.hasBytes(buf, 0, [0x89, 0x50, 0x4e, 0x47]);
    }
    isJPEG(buf) {
        return this.hasBytes(buf, 0, [0xff, 0xd8, 0xff]);
    }
    isGIF(buf) {
        return this.hasBytes(buf, 0, [0x47, 0x49, 0x46, 0x38]);
    }
    isWebP(buf) {
        return this.isRiff(buf, "WEBP");
    }
    isPDF(buf) {
        return buf.length >= 5 && buf.toString("latin1", 0, 5) === "%PDF-";
    }
    /** Build a positive detection result for a buffer input. */
    result(type, mime, confidence) {
        return {
            type,
            mimeType: mime,
            extension: null,
            source: "buffer",
            metadata: { confidence },
        };
    }
    /** Build the "no match" result (confidence 0). */
    unknown() {
        return {
            type: "unknown",
            mimeType: "application/octet-stream",
            extension: null,
            source: "buffer",
            metadata: { confidence: 0 },
        };
    }
}
1488
/**
 * Exact MIME type → file type lookups used by MimeTypeStrategy.
 * Prefix rules (`image/`, `video/`, `audio/`, `text/`) are applied only
 * after this table, so entries here take precedence over them.
 */
const MIME_EXACT_FILE_TYPES = new Map([
    // CSV
    ["text/csv", "csv"],
    ["text/tab-separated-values", "csv"],
    // SVG is processed as text/markup, NOT as image
    ["image/svg+xml", "svg"],
    // PDF
    ["application/pdf", "pdf"],
    // Office documents — OOXML and legacy
    ["application/vnd.openxmlformats-officedocument.wordprocessingml.document", "docx"],
    ["application/msword", "docx"],
    ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "xlsx"],
    ["application/vnd.ms-excel", "xlsx"],
    ["application/vnd.openxmlformats-officedocument.presentationml.presentation", "pptx"],
    ["application/vnd.ms-powerpoint", "pptx"],
    // OpenDocument formats
    ["application/vnd.oasis.opendocument.text", "docx"],
    ["application/vnd.oasis.opendocument.spreadsheet", "xlsx"],
    ["application/vnd.oasis.opendocument.presentation", "pptx"],
    // RTF
    ["application/rtf", "docx"],
    ["text/rtf", "docx"],
    // Archive formats
    ["application/zip", "archive"],
    ["application/x-zip-compressed", "archive"],
    ["application/gzip", "archive"],
    ["application/x-gzip", "archive"],
    ["application/x-tar", "archive"],
    ["application/x-compressed-tar", "archive"],
    ["application/java-archive", "archive"],
    ["application/x-rar-compressed", "archive"],
    ["application/vnd.rar", "archive"],
    ["application/x-7z-compressed", "archive"],
    // Structured text served under application/*
    ["application/json", "text"],
    ["application/xml", "text"],
    ["application/yaml", "text"],
    ["application/x-yaml", "text"],
]);
/**
 * Strategy 2: MIME Type Detection (85% confidence)
 * Detects file type from HTTP Content-Type headers
 */
class MimeTypeStrategy {
    /**
     * Issue a HEAD request for an http(s) URL and classify its
     * Content-Type header.
     *
     * @param {*} input - Candidate input; only http(s) URL strings are probed.
     * @returns {Promise<object>} Detection result. Non-URL input and request
     *   failures yield the "unknown" result.
     */
    async detect(input) {
        if (typeof input !== "string" || !this.isURL(input)) {
            return this.unknown();
        }
        try {
            const response = await request(input, {
                dispatcher: getGlobalDispatcher().compose(interceptors.redirect({ maxRedirections: 5 })),
                method: "HEAD",
                headersTimeout: FileDetector.DEFAULT_HEAD_TIMEOUT,
                bodyTimeout: FileDetector.DEFAULT_HEAD_TIMEOUT,
            });
            // A repeated header may be exposed as an array; use its first value.
            const rawHeader = response.headers["content-type"];
            const contentType = (Array.isArray(rawHeader) ? rawHeader[0] : rawHeader) || "";
            const type = this.mimeToFileType(contentType);
            return {
                type,
                mimeType: contentType.split(";")[0].trim(),
                extension: null,
                source: "url",
                metadata: { confidence: type !== "unknown" ? 85 : 0 },
            };
        }
        catch {
            // Detection is best-effort: a failed probe simply means this
            // strategy has no opinion.
            return this.unknown();
        }
    }
    /**
     * Map a MIME type (parameters such as `; charset=...` are ignored,
     * matching is case-insensitive) to a file type.
     *
     * @param {string} mime - Content-Type value.
     * @returns {string} File type, or "unknown" when unrecognised or when
     *   `mime` is not a string.
     */
    mimeToFileType(mime) {
        if (typeof mime !== "string") {
            return "unknown";
        }
        const lower = mime.toLowerCase().split(";")[0].trim();
        const exact = MIME_EXACT_FILE_TYPES.get(lower);
        if (exact !== undefined) {
            return exact;
        }
        if (lower.startsWith("image/")) {
            return "image";
        }
        if (lower.startsWith("video/")) {
            return "video";
        }
        if (lower.startsWith("audio/")) {
            return "audio";
        }
        // Any remaining text/* type (plain, markdown, source code, ...).
        if (lower.startsWith("text/")) {
            return "text";
        }
        return "unknown";
    }
    /** True when `str` starts with "http://" or "https://". */
    isURL(str) {
        return str.startsWith("http://") || str.startsWith("https://");
    }
    /** Build the "no match" result (confidence 0). */
    unknown() {
        return {
            type: "unknown",
            mimeType: "application/octet-stream",
            extension: null,
            source: "buffer",
            metadata: { confidence: 0 },
        };
    }
}
1625
- /**
1626
- * Strategy 3: Extension Detection (85% confidence)
1627
- * Detects file type from file extension
1628
- */
1629
- class ExtensionStrategy {
1630
- async detect(input) {
1631
- if (typeof input !== "string") {
1632
- return this.unknown();
1633
- }
1634
- const ext = this.getExtension(input);
1635
- if (!ext) {
1636
- return this.unknown();
1637
- }
1638
- const typeMap = {
1639
- csv: "csv",
1640
- tsv: "csv",
1641
- jpg: "image",
1642
- jpeg: "image",
1643
- png: "image",
1644
- gif: "image",
1645
- webp: "image",
1646
- bmp: "image",
1647
- tiff: "image",
1648
- tif: "image",
1649
- // SVG is handled as text/markup, NOT as image
1650
- // AI providers don't support SVG format, so we process it as sanitized text
1651
- svg: "svg",
1652
- avif: "image",
1653
- pdf: "pdf",
1654
- // Video formats
1655
- mp4: "video",
1656
- mkv: "video",
1657
- mov: "video",
1658
- avi: "video",
1659
- webm: "video",
1660
- wmv: "video",
1661
- flv: "video",
1662
- // Audio formats
1663
- mp3: "audio",
1664
- wav: "audio",
1665
- ogg: "audio",
1666
- flac: "audio",
1667
- m4a: "audio",
1668
- aac: "audio",
1669
- wma: "audio",
1670
- opus: "audio",
1671
- // Archive formats
1672
- zip: "archive",
1673
- tar: "archive",
1674
- gz: "archive",
1675
- tgz: "archive",
1676
- rar: "archive",
1677
- "7z": "archive",
1678
- jar: "archive",
1679
- // Document formats (ZIP-based internally)
1680
- xlsx: "xlsx",
1681
- xls: "xlsx",
1682
- docx: "docx",
1683
- doc: "docx",
1684
- pptx: "pptx",
1685
- ppt: "pptx",
1686
- odt: "docx", // OpenDocument text → processed like docx
1687
- ods: "xlsx", // OpenDocument spreadsheet → processed like xlsx
1688
- odp: "pptx", // OpenDocument presentation → processed like pptx
1689
- rtf: "docx", // RTF → processed like docx (text extraction)
1690
- // Text/markup formats
1691
- txt: "text",
1692
- md: "text",
1693
- markdown: "text",
1694
- json: "text",
1695
- xml: "text",
1696
- yaml: "text",
1697
- yml: "text",
1698
- html: "text",
1699
- htm: "text",
1700
- css: "text",
1701
- log: "text",
1702
- conf: "text",
1703
- cfg: "text",
1704
- ini: "text",
1705
- env: "text",
1706
- toml: "text",
1707
- properties: "text",
1708
- gitignore: "text",
1709
- dockerignore: "text",
1710
- editorconfig: "text",
1711
- prettierrc: "text",
1712
- eslintrc: "text",
1713
- babelrc: "text",
1714
- // Source code formats
1715
- js: "text",
1716
- mjs: "text",
1717
- cjs: "text",
1718
- jsx: "text",
1719
- ts: "text",
1720
- tsx: "text",
1721
- py: "text",
1722
- java: "text",
1723
- go: "text",
1724
- rs: "text",
1725
- rb: "text",
1726
- php: "text",
1727
- c: "text",
1728
- cpp: "text",
1729
- cc: "text",
1730
- h: "text",
1731
- hpp: "text",
1732
- cs: "text",
1733
- swift: "text",
1734
- kt: "text",
1735
- kts: "text",
1736
- scala: "text",
1737
- sh: "text",
1738
- bash: "text",
1739
- zsh: "text",
1740
- ps1: "text",
1741
- sql: "text",
1742
- r: "text",
1743
- lua: "text",
1744
- pl: "text",
1745
- perl: "text",
1746
- dart: "text",
1747
- ex: "text",
1748
- exs: "text",
1749
- erl: "text",
1750
- hs: "text",
1751
- clj: "text",
1752
- lisp: "text",
1753
- vim: "text",
1754
- // Additional video/image
1755
- m4v: "video",
1756
- ico: "image",
1757
- };
1758
- const type = typeMap[ext.toLowerCase()];
1759
- return {
1760
- type: type || "unknown",
1761
- mimeType: this.getMimeType(ext),
1762
- extension: ext,
1763
- source: this.detectSource(input),
1764
- metadata: { confidence: type ? 85 : 0 },
1765
- };
1766
- }
1767
- getExtension(input) {
1768
- if (this.isURL(input)) {
1769
- const url = new URL(input);
1770
- const match = url.pathname.match(/\.([^.]+)$/);
1771
- return match ? match[1] : null;
1772
- }
1773
- const match = input.match(/\.([^.]+)$/);
1774
- return match ? match[1] : null;
1775
- }
1776
- isURL(str) {
1777
- return str.startsWith("http://") || str.startsWith("https://");
1778
- }
1779
- detectSource(input) {
1780
- if (input.startsWith("data:")) {
1781
- return "datauri";
1782
- }
1783
- if (this.isURL(input)) {
1784
- return "url";
1785
- }
1786
- return "path";
1787
- }
1788
- getMimeType(ext) {
1789
- const mimeMap = {
1790
- csv: "text/csv",
1791
- tsv: "text/tab-separated-values",
1792
- jpg: "image/jpeg",
1793
- jpeg: "image/jpeg",
1794
- png: "image/png",
1795
- gif: "image/gif",
1796
- webp: "image/webp",
1797
- bmp: "image/bmp",
1798
- tiff: "image/tiff",
1799
- tif: "image/tiff",
1800
- svg: "image/svg+xml",
1801
- avif: "image/avif",
1802
- pdf: "application/pdf",
1803
- // Video MIME types
1804
- mp4: "video/mp4",
1805
- mkv: "video/x-matroska",
1806
- mov: "video/quicktime",
1807
- avi: "video/x-msvideo",
1808
- webm: "video/webm",
1809
- wmv: "video/x-ms-wmv",
1810
- flv: "video/x-flv",
1811
- // Audio MIME types
1812
- mp3: "audio/mpeg",
1813
- wav: "audio/wav",
1814
- ogg: "audio/ogg",
1815
- flac: "audio/flac",
1816
- m4a: "audio/mp4",
1817
- aac: "audio/aac",
1818
- wma: "audio/x-ms-wma",
1819
- opus: "audio/opus",
1820
- // Archive MIME types
1821
- zip: "application/zip",
1822
- tar: "application/x-tar",
1823
- gz: "application/gzip",
1824
- tgz: "application/gzip",
1825
- rar: "application/x-rar-compressed",
1826
- "7z": "application/x-7z-compressed",
1827
- jar: "application/java-archive",
1828
- // Document MIME types
1829
- xlsx: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
1830
- xls: "application/vnd.ms-excel",
1831
- docx: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
1832
- doc: "application/msword",
1833
- pptx: "application/vnd.openxmlformats-officedocument.presentationml.presentation",
1834
- ppt: "application/vnd.ms-powerpoint",
1835
- odt: "application/vnd.oasis.opendocument.text",
1836
- ods: "application/vnd.oasis.opendocument.spreadsheet",
1837
- odp: "application/vnd.oasis.opendocument.presentation",
1838
- rtf: "application/rtf",
1839
- // Text/markup MIME types
1840
- txt: "text/plain",
1841
- md: "text/markdown",
1842
- markdown: "text/markdown",
1843
- json: "application/json",
1844
- xml: "application/xml",
1845
- yaml: "application/yaml",
1846
- yml: "application/yaml",
1847
- html: "text/html",
1848
- htm: "text/html",
1849
- css: "text/css",
1850
- log: "text/plain",
1851
- conf: "text/plain",
1852
- cfg: "text/plain",
1853
- ini: "text/plain",
1854
- env: "text/plain",
1855
- toml: "text/plain",
1856
- properties: "text/plain",
1857
- gitignore: "text/plain",
1858
- dockerignore: "text/plain",
1859
- editorconfig: "text/plain",
1860
- prettierrc: "application/json",
1861
- eslintrc: "application/json",
1862
- babelrc: "application/json",
1863
- // Source code MIME types
1864
- js: "text/javascript",
1865
- mjs: "text/javascript",
1866
- cjs: "text/javascript",
1867
- jsx: "text/javascript",
1868
- ts: "text/typescript",
1869
- tsx: "text/typescript",
1870
- py: "text/x-python",
1871
- java: "text/x-java-source",
1872
- go: "text/x-go",
1873
- rs: "text/x-rustsrc",
1874
- rb: "text/x-ruby",
1875
- php: "text/x-php",
1876
- c: "text/x-c",
1877
- cpp: "text/x-c++",
1878
- cc: "text/x-c++",
1879
- h: "text/x-c",
1880
- hpp: "text/x-c++",
1881
- cs: "text/x-csharp",
1882
- swift: "text/x-swift",
1883
- kt: "text/x-kotlin",
1884
- kts: "text/x-kotlin",
1885
- scala: "text/x-scala",
1886
- sh: "text/x-shellscript",
1887
- bash: "text/x-shellscript",
1888
- zsh: "text/x-shellscript",
1889
- ps1: "text/x-powershell",
1890
- sql: "text/x-sql",
1891
- r: "text/x-r",
1892
- lua: "text/x-lua",
1893
- pl: "text/x-perl",
1894
- perl: "text/x-perl",
1895
- dart: "text/x-dart",
1896
- ex: "text/x-elixir",
1897
- exs: "text/x-elixir",
1898
- erl: "text/x-erlang",
1899
- hs: "text/x-haskell",
1900
- clj: "text/x-clojure",
1901
- lisp: "text/x-lisp",
1902
- vim: "text/plain",
1903
- // Additional video/image
1904
- m4v: "video/mp4",
1905
- ico: "image/x-icon",
1906
- };
1907
- return mimeMap[ext.toLowerCase()] || "application/octet-stream";
1908
- }
1909
- unknown() {
1910
- return {
1911
- type: "unknown",
1912
- mimeType: "application/octet-stream",
1913
- extension: null,
1914
- source: "buffer",
1915
- metadata: { confidence: 0 },
1916
- };
1917
- }
1918
- }
1919
/**
 * Strategy 4: Content Heuristics (confidence 60-75 depending on the match)
 * Detects file type by analyzing content patterns in the first 2000 bytes.
 *
 * Checks run from most to least specific: JSON, CSV, XML/HTML, YAML,
 * plain text. Every result reports `source: "buffer"` and `extension: null`.
 */
class ContentHeuristicStrategy {
    /**
     * Detect the content type of a buffer, data URI, or file path.
     *
     * @param {Buffer|string} input - Raw bytes, a base64 data URI, or a file
     *   path. http(s) URLs are not fetched and yield the unknown result.
     * @returns {Promise<object>} A detection result
     *   `{ type, mimeType, extension, source, metadata: { confidence } }`.
     *   The unknown result (confidence 0) is returned for unsupported input,
     *   unreadable files, and content that matches no heuristic.
     */
    async detect(input) {
        let buffer;
        if (Buffer.isBuffer(input)) {
            buffer = input;
        }
        else if (typeof input === "string") {
            if (input.startsWith("data:")) {
                // Only base64-encoded data URIs are decoded.
                const match = input.match(/^data:([^;]+);base64,(.+)$/);
                if (!match) {
                    return this.unknown();
                }
                buffer = Buffer.from(match[2], "base64");
            }
            else if (input.startsWith("http://") || input.startsWith("https://")) {
                // URL - can't analyze without making an HTTP request here.
                return this.unknown();
            }
            else {
                // File path - a read failure is a deliberate best-effort miss.
                try {
                    buffer = await readFile(input);
                }
                catch {
                    return this.unknown();
                }
            }
        }
        else {
            return this.unknown();
        }
        const sample = buffer.toString("utf-8", 0, Math.min(2000, buffer.length));
        // Check for JSON first (more specific than CSV)
        if (this.looksLikeJSON(sample)) {
            return this.result("text", "application/json", 75);
        }
        // Check CSV after JSON (CSV is more generic)
        if (this.looksLikeCSV(sample)) {
            return this.result("csv", "text/csv", 75);
        }
        // Check for XML/HTML. The HTML decision uses the same case-insensitive
        // markers as looksLikeXML so "<HTML>" or "<!doctype html>" documents
        // are not mislabelled as application/xml.
        if (this.looksLikeXML(sample)) {
            const isHTML = this.looksLikeHTML(sample);
            return this.result("text", isHTML ? "text/html" : "application/xml", 70);
        }
        // Check for YAML
        if (this.looksLikeYAML(sample)) {
            return this.result("text", "application/yaml", 70);
        }
        // Check for plain text (if mostly printable characters)
        if (this.looksLikeText(sample)) {
            return this.result("text", "text/plain", 60);
        }
        return this.unknown();
    }
    /**
     * Report whether text contains an HTML doctype or an `<html` tag,
     * ignoring letter case.
     *
     * @param {string} text - Content sample.
     * @returns {boolean} True when an HTML marker is present.
     */
    looksLikeHTML(text) {
        return /<!doctype html|<html/i.test(text);
    }
    /**
     * Heuristically decide whether text is delimiter-separated data.
     *
     * With a delimiter (`,` `;` tab or `|`) in the first line, at least 80%
     * of lines must carry the same delimiter count as the first line. With
     * no delimiter, the text is treated as a single-column candidate and
     * must survive several exclusion checks.
     *
     * @param {string} text - Content sample.
     * @returns {boolean} True when the sample looks like CSV.
     */
    looksLikeCSV(text) {
        const lines = text.trim().split("\n");
        if (lines.length < 2) {
            return false;
        }
        // Detect delimiter from first line (first candidate present wins)
        const firstLine = lines[0];
        const delimiters = [",", ";", "\t", "|"];
        const delimiter = delimiters.find((d) => firstLine.includes(d));
        // Single-column CSV check (no delimiter)
        if (!delimiter) {
            // Exclude content that looks like other structured formats
            // YAML indicators
            if (text.startsWith("---") ||
                /^[\s]*-\s+/m.test(text) ||
                /^[\s]*[a-zA-Z_][a-zA-Z0-9_-]*:\s*/m.test(text)) {
                return false;
            }
            // XML/HTML indicators
            if (text.startsWith("<") || text.includes("<?xml")) {
                return false;
            }
            // JSON indicators
            if ((text.startsWith("{") && text.includes("}")) ||
                (text.startsWith("[") && text.includes("]"))) {
                return false;
            }
            // More than 4 words on any line suggests prose, not data
            const hasProsePattern = lines.some((line) => {
                const words = line.trim().split(/\s+/);
                return words.length > 4;
            });
            if (hasProsePattern) {
                return false;
            }
            // Every line must be non-empty and shorter than 1000 characters
            const hasReasonableLengths = lines.every((l) => l.length > 0 && l.length < 1000);
            const noBinaryChars = !text.includes("\0");
            // Single-column values (names, cities, emails, ...) may vary, but
            // their lengths should stay loosely uniform: the coefficient of
            // variation (stdDev / mean) must be below 0.75.
            const lengths = lines.map((l) => l.length);
            const avgLength = lengths.reduce((a, b) => a + b, 0) / lengths.length;
            const variance = lengths.reduce((sum, len) => sum + (len - avgLength) ** 2, 0) /
                lengths.length;
            const stdDev = Math.sqrt(variance);
            const hasUniformLengths = stdDev / avgLength < 0.75;
            return hasReasonableLengths && noBinaryChars && hasUniformLengths;
        }
        // Count delimiters per line and check consistency. Splitting on the
        // literal delimiter avoids building a RegExp from it (and the
        // escaping that "|" would need).
        const counts = lines.map((line) => line.split(delimiter).length - 1);
        const firstCount = counts[0];
        const consistentLines = counts.filter((c) => c === firstCount).length;
        return consistentLines / lines.length >= 0.8;
    }
    /**
     * Decide whether text is JSON by delegating to `hasJsonMarkers`.
     *
     * @param {string} text - Content sample.
     * @returns {boolean} Whatever `hasJsonMarkers` reports for the sample.
     */
    looksLikeJSON(text) {
        // NOTE(review): per the original comment, hasJsonMarkers performs full
        // validation including JSON.parse; it is defined outside this class.
        return hasJsonMarkers(text);
    }
    /**
     * Heuristically decide whether text is XML or HTML.
     *
     * @param {string} text - Content sample.
     * @returns {boolean} True for an XML declaration, an HTML marker, or
     *   content starting with `<` that has a well-formed opening tag plus a
     *   closing or self-closing tag.
     */
    looksLikeXML(text) {
        const trimmed = text.trim();
        // XML declaration is a definitive marker
        if (trimmed.startsWith("<?xml")) {
            return true;
        }
        // HTML doctype or <html tag, in any letter case
        if (this.looksLikeHTML(trimmed)) {
            return true;
        }
        // Strict validation for arbitrary content starting with <:
        // Must have proper tag structure with at least one closing tag
        if (!trimmed.startsWith("<")) {
            return false;
        }
        // Must have valid opening tag structure: <tagname followed by space or >
        const hasValidOpeningTag = /<[a-zA-Z][a-zA-Z0-9-]*(?:\s[^>]*)?>/;
        if (!hasValidOpeningTag.test(trimmed)) {
            return false;
        }
        // Must have at least one closing tag or self-closing tag
        const hasClosingTag = /<\/[a-zA-Z][a-zA-Z0-9-]*>/.test(trimmed);
        const hasSelfClosingTag = /<[a-zA-Z][a-zA-Z0-9-]*(?:\s[^>]*)?\s*\/\s*>/.test(trimmed);
        return hasClosingTag || hasSelfClosingTag;
    }
    /**
     * Heuristically decide whether text is YAML.
     *
     * Eight independent indicators are evaluated; at least two must hold.
     * A single-line sample only qualifies when it is exactly a document
     * marker (`---` or `...`).
     *
     * @param {string} text - Content sample.
     * @returns {boolean} True when the sample looks like YAML.
     */
    looksLikeYAML(text) {
        const trimmed = text.trim();
        if (trimmed.length === 0) {
            return false;
        }
        // For single-line content, be very conservative about YAML detection
        const lines = trimmed.split("\n");
        if (lines.length === 1) {
            return trimmed === "---" || trimmed === "...";
        }
        const indicators = [];
        // Indicator 1: Document start marker (---)
        indicators.push(trimmed.startsWith("---"));
        // Indicator 2: Document end marker (...) as the whole text or its last line
        indicators.push(/^\.\.\.$|[\n]\.\.\.$/.test(trimmed));
        // Indicator 3: YAML list items (- followed by space at line start)
        indicators.push(/^[\s]*-\s+[^-]/m.test(trimmed));
        // Indicator 4: Multiple key-value pairs (at least 2)
        // Allow hyphens and underscores in keys, support nested keys
        const keyValuePattern = /^[\s]*[a-zA-Z_][a-zA-Z0-9_-]*:\s*(.+)$/;
        const keyValueMatches = lines.filter((line) => keyValuePattern.test(line)).length;
        indicators.push(keyValueMatches >= 2);
        // Indicator 5: Within the first 10 lines, a line ending in ":" or "-"
        // followed by a more deeply indented line
        let hasNesting = false;
        const sampleLines = lines.slice(0, 10);
        for (let i = 0; i < sampleLines.length - 1; i++) {
            const currentLine = sampleLines[i].trim();
            const nextLine = sampleLines[i + 1];
            if (currentLine.length > 0 &&
                nextLine.length > 0 &&
                /[:-]$/.test(currentLine)) {
                const currentIndent = sampleLines[i].match(/^[\s]*/)?.[0].length ?? 0;
                const nextIndent = nextLine.match(/^[\s]*/)?.[0].length ?? 0;
                if (nextIndent > currentIndent) {
                    hasNesting = true;
                    break;
                }
            }
        }
        indicators.push(hasNesting);
        // Indicator 6: YAML comments (# followed by space)
        indicators.push(/^\s*#\s+/m.test(trimmed));
        // Indicator 7: List continuation (multiple items with - )
        const listItemCount = lines.filter((line) => /^[\s]*-[\s]/.test(line)).length;
        indicators.push(listItemCount >= 2);
        // Indicator 8: Inline maps or complex structures
        indicators.push(/{\s*[a-zA-Z_]/.test(trimmed) || /\[.*\]/.test(trimmed));
        // Require at least 2 indicators for confident YAML detection
        const matchCount = indicators.filter(Boolean).length;
        return matchCount >= 2;
    }
    /**
     * Decide whether text is mostly printable characters.
     *
     * @param {string} text - Content sample.
     * @returns {boolean} False for empty text or text containing a NUL
     *   character; otherwise true when at least 85% of UTF-16 code units are
     *   printable ASCII, tab, LF, CR, or above 127.
     */
    looksLikeText(text) {
        // Empty content carries no evidence; make that explicit instead of
        // relying on 0 / 0 evaluating to NaN.
        if (text.length === 0) {
            return false;
        }
        // A NUL character is treated as a binary indicator
        if (text.includes("\0")) {
            return false;
        }
        let printable = 0;
        for (let i = 0; i < text.length; i++) {
            const code = text.charCodeAt(i);
            if ((code >= 32 && code < 127) || // ASCII printable
                code === 9 || // Tab
                code === 10 || // Newline
                code === 13 || // Carriage return
                code > 127 // Non-ASCII
            ) {
                printable++;
            }
        }
        // At least 85% should be printable for text
        return printable / text.length >= 0.85;
    }
    /**
     * Build a positive detection result.
     *
     * @param {string} type - Detected type label (e.g. "text", "csv").
     * @param {string} mime - MIME type to report.
     * @param {number} confidence - Confidence score stored in metadata.
     * @returns {object} Result with `extension: null` and `source: "buffer"`.
     */
    result(type, mime, confidence) {
        return {
            type,
            mimeType: mime,
            extension: null,
            source: "buffer",
            metadata: { confidence },
        };
    }
    /**
     * Build the fallback result used when nothing matches.
     *
     * @returns {object} Result with type "unknown" and confidence 0.
     */
    unknown() {
        return {
            type: "unknown",
            mimeType: "application/octet-stream",
            extension: null,
            source: "buffer",
            metadata: { confidence: 0 },
        };
    }
}