@juspay/neurolink 9.32.0 → 9.32.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/dist/auth/anthropicOAuth.js +1 -1
- package/dist/cli/commands/proxy.js +18 -5
- package/dist/client/aiSdkAdapter.js +1 -1
- package/dist/client/index.js +137 -501
- package/dist/core/factory.js +0 -1
- package/dist/core/redisConversationMemoryManager.js +1 -1
- package/dist/features/ppt/slideGenerator.js +0 -1
- package/dist/features/ppt/utils.js +0 -1
- package/dist/lib/server/routes/claudeProxyRoutes.js +45 -9
- package/dist/mcp/elicitationProtocol.js +1 -1
- package/dist/mcp/servers/agent/directToolsServer.js +0 -1
- package/dist/providers/azureOpenai.js +1 -1
- package/dist/providers/huggingFace.js +0 -1
- package/dist/providers/openaiCompatible.js +0 -1
- package/dist/sdk/toolRegistration.js +0 -1
- package/dist/server/openapi/generator.js +1 -1
- package/dist/server/routes/claudeProxyRoutes.js +45 -9
- package/dist/types/configTypes.js +0 -5
- package/dist/types/modelTypes.js +0 -1
- package/dist/types/tools.js +0 -1
- package/dist/types/typeAliases.js +0 -1
- package/dist/types/utilities.js +1 -1
- package/dist/types/workflowTypes.js +0 -1
- package/dist/utils/providerRetry.js +0 -1
- package/dist/utils/providerUtils.js +0 -1
- package/package.json +2 -2
- package/dist/client/adapters/providerImageAdapter.js +0 -588
- package/dist/client/adapters/tts/googleTTSHandler.js +0 -344
- package/dist/client/adapters/video/directorPipeline.js +0 -516
- package/dist/client/adapters/video/ffmpegAdapter.js +0 -206
- package/dist/client/adapters/video/frameExtractor.js +0 -143
- package/dist/client/adapters/video/vertexVideoHandler.js +0 -763
- package/dist/client/adapters/video/videoAnalyzer.js +0 -238
- package/dist/client/adapters/video/videoMerger.js +0 -171
- package/dist/client/agent/directTools.js +0 -840
- package/dist/client/auth/AuthProviderFactory.js +0 -111
- package/dist/client/auth/AuthProviderRegistry.js +0 -190
- package/dist/client/auth/RequestContext.js +0 -78
- package/dist/client/auth/accountPool.js +0 -178
- package/dist/client/auth/anthropicOAuth.js +0 -974
- package/dist/client/auth/authContext.js +0 -314
- package/dist/client/auth/errors.js +0 -39
- package/dist/client/auth/index.js +0 -61
- package/dist/client/auth/middleware/AuthMiddleware.js +0 -519
- package/dist/client/auth/middleware/rateLimitByUser.js +0 -554
- package/dist/client/auth/providers/BaseAuthProvider.js +0 -723
- package/dist/client/auth/providers/CognitoProvider.js +0 -304
- package/dist/client/auth/providers/KeycloakProvider.js +0 -393
- package/dist/client/auth/providers/auth0.js +0 -274
- package/dist/client/auth/providers/betterAuth.js +0 -182
- package/dist/client/auth/providers/clerk.js +0 -317
- package/dist/client/auth/providers/custom.js +0 -112
- package/dist/client/auth/providers/firebase.js +0 -226
- package/dist/client/auth/providers/jwt.js +0 -212
- package/dist/client/auth/providers/oauth2.js +0 -303
- package/dist/client/auth/providers/supabase.js +0 -259
- package/dist/client/auth/providers/workos.js +0 -284
- package/dist/client/auth/serverBridge.js +0 -25
- package/dist/client/auth/sessionManager.js +0 -437
- package/dist/client/auth/tokenStore.js +0 -799
- package/dist/client/client/aiSdkAdapter.js +0 -487
- package/dist/client/client/auth.js +0 -473
- package/dist/client/client/errors.js +0 -552
- package/dist/client/client/httpClient.js +0 -837
- package/dist/client/client/index.js +0 -172
- package/dist/client/client/interceptors.js +0 -601
- package/dist/client/client/sseClient.js +0 -545
- package/dist/client/client/streamingClient.js +0 -917
- package/dist/client/client/wsClient.js +0 -369
- package/dist/client/config/configManager.js +0 -303
- package/dist/client/config/conversationMemory.js +0 -86
- package/dist/client/config/taskClassificationConfig.js +0 -148
- package/dist/client/constants/contextWindows.js +0 -295
- package/dist/client/constants/enums.js +0 -853
- package/dist/client/constants/index.js +0 -207
- package/dist/client/constants/performance.js +0 -389
- package/dist/client/constants/retry.js +0 -266
- package/dist/client/constants/timeouts.js +0 -182
- package/dist/client/constants/tokens.js +0 -380
- package/dist/client/constants/videoErrors.js +0 -46
- package/dist/client/context/budgetChecker.js +0 -98
- package/dist/client/context/contextCompactor.js +0 -205
- package/dist/client/context/emergencyTruncation.js +0 -88
- package/dist/client/context/errorDetection.js +0 -171
- package/dist/client/context/errors.js +0 -21
- package/dist/client/context/fileTokenBudget.js +0 -127
- package/dist/client/context/prompts/summarizationPrompt.js +0 -117
- package/dist/client/context/stages/fileReadDeduplicator.js +0 -66
- package/dist/client/context/stages/slidingWindowTruncator.js +0 -190
- package/dist/client/context/stages/structuredSummarizer.js +0 -99
- package/dist/client/context/stages/toolOutputPruner.js +0 -52
- package/dist/client/context/summarizationEngine.js +0 -136
- package/dist/client/context/toolOutputLimits.js +0 -78
- package/dist/client/context/toolPairRepair.js +0 -66
- package/dist/client/core/analytics.js +0 -88
- package/dist/client/core/baseProvider.js +0 -1385
- package/dist/client/core/constants.js +0 -140
- package/dist/client/core/conversationMemoryFactory.js +0 -141
- package/dist/client/core/conversationMemoryInitializer.js +0 -128
- package/dist/client/core/conversationMemoryManager.js +0 -344
- package/dist/client/core/dynamicModels.js +0 -358
- package/dist/client/core/evaluation.js +0 -309
- package/dist/client/core/evaluationProviders.js +0 -248
- package/dist/client/core/factory.js +0 -412
- package/dist/client/core/infrastructure/baseError.js +0 -22
- package/dist/client/core/infrastructure/baseFactory.js +0 -54
- package/dist/client/core/infrastructure/baseRegistry.js +0 -53
- package/dist/client/core/infrastructure/index.js +0 -5
- package/dist/client/core/infrastructure/retry.js +0 -20
- package/dist/client/core/infrastructure/typedEventEmitter.js +0 -23
- package/dist/client/core/modelConfiguration.js +0 -851
- package/dist/client/core/modules/GenerationHandler.js +0 -588
- package/dist/client/core/modules/MessageBuilder.js +0 -273
- package/dist/client/core/modules/StreamHandler.js +0 -185
- package/dist/client/core/modules/TelemetryHandler.js +0 -203
- package/dist/client/core/modules/ToolsManager.js +0 -499
- package/dist/client/core/modules/Utilities.js +0 -331
- package/dist/client/core/redisConversationMemoryManager.js +0 -1435
- package/dist/client/core/streamAnalytics.js +0 -131
- package/dist/client/evaluation/contextBuilder.js +0 -134
- package/dist/client/evaluation/index.js +0 -61
- package/dist/client/evaluation/prompts.js +0 -73
- package/dist/client/evaluation/ragasEvaluator.js +0 -110
- package/dist/client/evaluation/retryManager.js +0 -78
- package/dist/client/evaluation/scoring.js +0 -61
- package/dist/client/factories/providerFactory.js +0 -166
- package/dist/client/factories/providerRegistry.js +0 -166
- package/dist/client/features/ppt/constants.js +0 -896
- package/dist/client/features/ppt/contentPlanner.js +0 -529
- package/dist/client/features/ppt/presentationOrchestrator.js +0 -236
- package/dist/client/features/ppt/slideGenerator.js +0 -532
- package/dist/client/features/ppt/slideRenderers.js +0 -2383
- package/dist/client/features/ppt/slideTypeInference.js +0 -405
- package/dist/client/features/ppt/types.js +0 -13
- package/dist/client/features/ppt/utils.js +0 -443
- package/dist/client/files/fileReferenceRegistry.js +0 -1543
- package/dist/client/files/fileTools.js +0 -450
- package/dist/client/files/streamingReader.js +0 -321
- package/dist/client/files/types.js +0 -23
- package/dist/client/hitl/hitlErrors.js +0 -54
- package/dist/client/hitl/hitlManager.js +0 -460
- package/dist/client/mcp/agentExposure.js +0 -356
- package/dist/client/mcp/auth/index.js +0 -11
- package/dist/client/mcp/auth/oauthClientProvider.js +0 -325
- package/dist/client/mcp/auth/tokenStorage.js +0 -134
- package/dist/client/mcp/batching/index.js +0 -10
- package/dist/client/mcp/batching/requestBatcher.js +0 -441
- package/dist/client/mcp/caching/index.js +0 -10
- package/dist/client/mcp/caching/toolCache.js +0 -433
- package/dist/client/mcp/elicitation/elicitationManager.js +0 -376
- package/dist/client/mcp/elicitation/index.js +0 -11
- package/dist/client/mcp/elicitation/types.js +0 -10
- package/dist/client/mcp/elicitationProtocol.js +0 -375
- package/dist/client/mcp/enhancedToolDiscovery.js +0 -481
- package/dist/client/mcp/externalServerManager.js +0 -1478
- package/dist/client/mcp/factory.js +0 -161
- package/dist/client/mcp/flexibleToolValidator.js +0 -161
- package/dist/client/mcp/httpRateLimiter.js +0 -391
- package/dist/client/mcp/httpRetryHandler.js +0 -178
- package/dist/client/mcp/index.js +0 -74
- package/dist/client/mcp/mcpCircuitBreaker.js +0 -427
- package/dist/client/mcp/mcpClientFactory.js +0 -708
- package/dist/client/mcp/mcpRegistryClient.js +0 -488
- package/dist/client/mcp/mcpServerBase.js +0 -373
- package/dist/client/mcp/multiServerManager.js +0 -579
- package/dist/client/mcp/registry.js +0 -158
- package/dist/client/mcp/routing/index.js +0 -10
- package/dist/client/mcp/routing/toolRouter.js +0 -416
- package/dist/client/mcp/serverCapabilities.js +0 -502
- package/dist/client/mcp/servers/agent/directToolsServer.js +0 -150
- package/dist/client/mcp/toolAnnotations.js +0 -239
- package/dist/client/mcp/toolConverter.js +0 -258
- package/dist/client/mcp/toolDiscoveryService.js +0 -798
- package/dist/client/mcp/toolIntegration.js +0 -334
- package/dist/client/mcp/toolRegistry.js +0 -729
- package/dist/client/memory/hippocampusInitializer.js +0 -19
- package/dist/client/memory/memoryRetrievalTools.js +0 -166
- package/dist/client/middleware/builtin/analytics.js +0 -132
- package/dist/client/middleware/builtin/autoEvaluation.js +0 -203
- package/dist/client/middleware/builtin/guardrails.js +0 -109
- package/dist/client/middleware/builtin/lifecycle.js +0 -168
- package/dist/client/middleware/factory.js +0 -327
- package/dist/client/middleware/registry.js +0 -295
- package/dist/client/middleware/utils/guardrailsUtils.js +0 -396
- package/dist/client/models/anthropicModels.js +0 -527
- package/dist/client/neurolink.js +0 -8233
- package/dist/client/observability/exporterRegistry.js +0 -413
- package/dist/client/observability/exporters/arizeExporter.js +0 -138
- package/dist/client/observability/exporters/baseExporter.js +0 -190
- package/dist/client/observability/exporters/braintrustExporter.js +0 -154
- package/dist/client/observability/exporters/datadogExporter.js +0 -196
- package/dist/client/observability/exporters/laminarExporter.js +0 -302
- package/dist/client/observability/exporters/langfuseExporter.js +0 -209
- package/dist/client/observability/exporters/langsmithExporter.js +0 -143
- package/dist/client/observability/exporters/otelExporter.js +0 -164
- package/dist/client/observability/exporters/posthogExporter.js +0 -287
- package/dist/client/observability/exporters/sentryExporter.js +0 -165
- package/dist/client/observability/index.js +0 -31
- package/dist/client/observability/metricsAggregator.js +0 -556
- package/dist/client/observability/otelBridge.js +0 -131
- package/dist/client/observability/retryPolicy.js +0 -383
- package/dist/client/observability/sampling/samplers.js +0 -216
- package/dist/client/observability/spanProcessor.js +0 -303
- package/dist/client/observability/tokenTracker.js +0 -413
- package/dist/client/observability/types/exporterTypes.js +0 -5
- package/dist/client/observability/types/index.js +0 -4
- package/dist/client/observability/types/spanTypes.js +0 -92
- package/dist/client/observability/utils/safeMetadata.js +0 -25
- package/dist/client/observability/utils/spanSerializer.js +0 -292
- package/dist/client/processors/archive/ArchiveProcessor.js +0 -1308
- package/dist/client/processors/base/BaseFileProcessor.js +0 -614
- package/dist/client/processors/base/types.js +0 -82
- package/dist/client/processors/config/fileTypes.js +0 -520
- package/dist/client/processors/config/index.js +0 -92
- package/dist/client/processors/config/languageMap.js +0 -410
- package/dist/client/processors/config/mimeTypes.js +0 -363
- package/dist/client/processors/config/sizeLimits.js +0 -258
- package/dist/client/processors/document/ExcelProcessor.js +0 -590
- package/dist/client/processors/document/OpenDocumentProcessor.js +0 -212
- package/dist/client/processors/document/PptxProcessor.js +0 -157
- package/dist/client/processors/document/RtfProcessor.js +0 -361
- package/dist/client/processors/document/WordProcessor.js +0 -353
- package/dist/client/processors/errors/FileErrorCode.js +0 -255
- package/dist/client/processors/errors/errorHelpers.js +0 -386
- package/dist/client/processors/errors/errorSerializer.js +0 -507
- package/dist/client/processors/errors/index.js +0 -49
- package/dist/client/processors/markup/SvgProcessor.js +0 -240
- package/dist/client/processors/media/AudioProcessor.js +0 -707
- package/dist/client/processors/media/VideoProcessor.js +0 -1045
- package/dist/client/providers/amazonBedrock.js +0 -1512
- package/dist/client/providers/amazonSagemaker.js +0 -162
- package/dist/client/providers/anthropic.js +0 -831
- package/dist/client/providers/azureOpenai.js +0 -143
- package/dist/client/providers/googleAiStudio.js +0 -1200
- package/dist/client/providers/googleNativeGemini3.js +0 -543
- package/dist/client/providers/googleVertex.js +0 -2936
- package/dist/client/providers/huggingFace.js +0 -315
- package/dist/client/providers/litellm.js +0 -488
- package/dist/client/providers/mistral.js +0 -157
- package/dist/client/providers/ollama.js +0 -1579
- package/dist/client/providers/openAI.js +0 -627
- package/dist/client/providers/openRouter.js +0 -543
- package/dist/client/providers/openaiCompatible.js +0 -290
- package/dist/client/providers/providerTypeUtils.js +0 -46
- package/dist/client/providers/sagemaker/adaptive-semaphore.js +0 -215
- package/dist/client/providers/sagemaker/client.js +0 -472
- package/dist/client/providers/sagemaker/config.js +0 -317
- package/dist/client/providers/sagemaker/detection.js +0 -606
- package/dist/client/providers/sagemaker/error-constants.js +0 -227
- package/dist/client/providers/sagemaker/errors.js +0 -299
- package/dist/client/providers/sagemaker/language-model.js +0 -775
- package/dist/client/providers/sagemaker/parsers.js +0 -634
- package/dist/client/providers/sagemaker/streaming.js +0 -331
- package/dist/client/providers/sagemaker/structured-parser.js +0 -625
- package/dist/client/proxy/accountQuota.js +0 -162
- package/dist/client/proxy/claudeFormat.js +0 -595
- package/dist/client/proxy/modelRouter.js +0 -29
- package/dist/client/proxy/oauthFetch.js +0 -367
- package/dist/client/proxy/proxyFetch.js +0 -586
- package/dist/client/proxy/requestLogger.js +0 -207
- package/dist/client/proxy/tokenRefresh.js +0 -124
- package/dist/client/proxy/usageStats.js +0 -74
- package/dist/client/proxy/utils/noProxyUtils.js +0 -149
- package/dist/client/rag/ChunkerFactory.js +0 -320
- package/dist/client/rag/ChunkerRegistry.js +0 -421
- package/dist/client/rag/chunkers/BaseChunker.js +0 -143
- package/dist/client/rag/chunkers/CharacterChunker.js +0 -28
- package/dist/client/rag/chunkers/HTMLChunker.js +0 -38
- package/dist/client/rag/chunkers/JSONChunker.js +0 -68
- package/dist/client/rag/chunkers/LaTeXChunker.js +0 -63
- package/dist/client/rag/chunkers/MarkdownChunker.js +0 -306
- package/dist/client/rag/chunkers/RecursiveChunker.js +0 -139
- package/dist/client/rag/chunkers/SemanticMarkdownChunker.js +0 -138
- package/dist/client/rag/chunkers/SentenceChunker.js +0 -66
- package/dist/client/rag/chunkers/TokenChunker.js +0 -61
- package/dist/client/rag/chunkers/index.js +0 -15
- package/dist/client/rag/chunking/characterChunker.js +0 -142
- package/dist/client/rag/chunking/chunkerRegistry.js +0 -194
- package/dist/client/rag/chunking/htmlChunker.js +0 -247
- package/dist/client/rag/chunking/index.js +0 -17
- package/dist/client/rag/chunking/jsonChunker.js +0 -281
- package/dist/client/rag/chunking/latexChunker.js +0 -251
- package/dist/client/rag/chunking/markdownChunker.js +0 -373
- package/dist/client/rag/chunking/recursiveChunker.js +0 -148
- package/dist/client/rag/chunking/semanticChunker.js +0 -306
- package/dist/client/rag/chunking/sentenceChunker.js +0 -230
- package/dist/client/rag/chunking/tokenChunker.js +0 -183
- package/dist/client/rag/document/MDocument.js +0 -392
- package/dist/client/rag/document/index.js +0 -5
- package/dist/client/rag/document/loaders.js +0 -500
- package/dist/client/rag/errors/RAGError.js +0 -274
- package/dist/client/rag/errors/index.js +0 -6
- package/dist/client/rag/graphRag/graphRAG.js +0 -401
- package/dist/client/rag/graphRag/index.js +0 -4
- package/dist/client/rag/index.js +0 -141
- package/dist/client/rag/metadata/MetadataExtractorFactory.js +0 -418
- package/dist/client/rag/metadata/MetadataExtractorRegistry.js +0 -362
- package/dist/client/rag/metadata/index.js +0 -9
- package/dist/client/rag/metadata/metadataExtractor.js +0 -280
- package/dist/client/rag/pipeline/RAGPipeline.js +0 -436
- package/dist/client/rag/pipeline/contextAssembly.js +0 -341
- package/dist/client/rag/pipeline/index.js +0 -5
- package/dist/client/rag/ragIntegration.js +0 -321
- package/dist/client/rag/reranker/RerankerFactory.js +0 -430
- package/dist/client/rag/reranker/RerankerRegistry.js +0 -402
- package/dist/client/rag/reranker/index.js +0 -9
- package/dist/client/rag/reranker/reranker.js +0 -277
- package/dist/client/rag/resilience/CircuitBreaker.js +0 -431
- package/dist/client/rag/resilience/RetryHandler.js +0 -304
- package/dist/client/rag/resilience/index.js +0 -7
- package/dist/client/rag/retrieval/hybridSearch.js +0 -335
- package/dist/client/rag/retrieval/index.js +0 -5
- package/dist/client/rag/retrieval/vectorQueryTool.js +0 -307
- package/dist/client/rag/types.js +0 -8
- package/dist/client/sdk/toolRegistration.js +0 -377
- package/dist/client/server/abstract/baseServerAdapter.js +0 -575
- package/dist/client/server/adapters/expressAdapter.js +0 -486
- package/dist/client/server/adapters/fastifyAdapter.js +0 -472
- package/dist/client/server/adapters/honoAdapter.js +0 -632
- package/dist/client/server/adapters/koaAdapter.js +0 -510
- package/dist/client/server/errors.js +0 -486
- package/dist/client/server/factory/serverAdapterFactory.js +0 -160
- package/dist/client/server/index.js +0 -108
- package/dist/client/server/middleware/abortSignal.js +0 -111
- package/dist/client/server/middleware/auth.js +0 -388
- package/dist/client/server/middleware/cache.js +0 -359
- package/dist/client/server/middleware/common.js +0 -281
- package/dist/client/server/middleware/deprecation.js +0 -190
- package/dist/client/server/middleware/mcpBodyAttachment.js +0 -63
- package/dist/client/server/middleware/rateLimit.js +0 -227
- package/dist/client/server/middleware/validation.js +0 -388
- package/dist/client/server/openapi/generator.js +0 -398
- package/dist/client/server/openapi/index.js +0 -36
- package/dist/client/server/openapi/schemas.js +0 -695
- package/dist/client/server/openapi/templates.js +0 -374
- package/dist/client/server/routes/agentRoutes.js +0 -189
- package/dist/client/server/routes/claudeProxyRoutes.js +0 -1600
- package/dist/client/server/routes/healthRoutes.js +0 -187
- package/dist/client/server/routes/index.js +0 -57
- package/dist/client/server/routes/mcpRoutes.js +0 -342
- package/dist/client/server/routes/memoryRoutes.js +0 -350
- package/dist/client/server/routes/openApiRoutes.js +0 -126
- package/dist/client/server/routes/toolRoutes.js +0 -199
- package/dist/client/server/streaming/dataStream.js +0 -486
- package/dist/client/server/streaming/index.js +0 -11
- package/dist/client/server/types.js +0 -67
- package/dist/client/server/utils/redaction.js +0 -334
- package/dist/client/server/utils/validation.js +0 -243
- package/dist/client/server/websocket/WebSocketHandler.js +0 -383
- package/dist/client/server/websocket/index.js +0 -4
- package/dist/client/services/server/ai/observability/instrumentation.js +0 -808
- package/dist/client/telemetry/attributes.js +0 -100
- package/dist/client/telemetry/index.js +0 -26
- package/dist/client/telemetry/telemetryService.js +0 -308
- package/dist/client/telemetry/tracers.js +0 -17
- package/dist/client/telemetry/withSpan.js +0 -34
- package/dist/client/types/actionTypes.js +0 -6
- package/dist/client/types/analytics.js +0 -5
- package/dist/client/types/authTypes.js +0 -9
- package/dist/client/types/circuitBreakerErrors.js +0 -34
- package/dist/client/types/cli.js +0 -21
- package/dist/client/types/clientTypes.js +0 -10
- package/dist/client/types/common.js +0 -51
- package/dist/client/types/configTypes.js +0 -49
- package/dist/client/types/content.js +0 -19
- package/dist/client/types/contextTypes.js +0 -400
- package/dist/client/types/conversation.js +0 -47
- package/dist/client/types/conversationMemoryInterface.js +0 -6
- package/dist/client/types/domainTypes.js +0 -5
- package/dist/client/types/errors.js +0 -167
- package/dist/client/types/evaluation.js +0 -5
- package/dist/client/types/evaluationProviders.js +0 -5
- package/dist/client/types/evaluationTypes.js +0 -1
- package/dist/client/types/externalMcp.js +0 -6
- package/dist/client/types/fileReferenceTypes.js +0 -8
- package/dist/client/types/fileTypes.js +0 -4
- package/dist/client/types/generateTypes.js +0 -1
- package/dist/client/types/guardrails.js +0 -1
- package/dist/client/types/hitlTypes.js +0 -8
- package/dist/client/types/index.js +0 -57
- package/dist/client/types/mcpTypes.js +0 -5
- package/dist/client/types/middlewareTypes.js +0 -1
- package/dist/client/types/modelTypes.js +0 -30
- package/dist/client/types/multimodal.js +0 -135
- package/dist/client/types/observability.js +0 -6
- package/dist/client/types/pptTypes.js +0 -82
- package/dist/client/types/providers.js +0 -111
- package/dist/client/types/proxyTypes.js +0 -16
- package/dist/client/types/ragTypes.js +0 -7
- package/dist/client/types/sdkTypes.js +0 -8
- package/dist/client/types/serviceTypes.js +0 -5
- package/dist/client/types/streamTypes.js +0 -1
- package/dist/client/types/subscriptionTypes.js +0 -9
- package/dist/client/types/taskClassificationTypes.js +0 -5
- package/dist/client/types/tools.js +0 -24
- package/dist/client/types/ttsTypes.js +0 -57
- package/dist/client/types/typeAliases.js +0 -48
- package/dist/client/types/utilities.js +0 -4
- package/dist/client/types/workflowTypes.js +0 -30
- package/dist/client/utils/async/withTimeout.js +0 -98
- package/dist/client/utils/asyncMutex.js +0 -60
- package/dist/client/utils/conversationMemory.js +0 -431
- package/dist/client/utils/csvProcessor.js +0 -846
- package/dist/client/utils/errorHandling.js +0 -936
- package/dist/client/utils/evaluationUtils.js +0 -131
- package/dist/client/utils/factoryProcessing.js +0 -589
- package/dist/client/utils/fileDetector.js +0 -2161
- package/dist/client/utils/imageCache.js +0 -376
- package/dist/client/utils/imageProcessor.js +0 -704
- package/dist/client/utils/logger.js +0 -491
- package/dist/client/utils/mcpDefaults.js +0 -134
- package/dist/client/utils/messageBuilder.js +0 -1653
- package/dist/client/utils/modelAliasResolver.js +0 -54
- package/dist/client/utils/modelDetection.js +0 -80
- package/dist/client/utils/modelRouter.js +0 -292
- package/dist/client/utils/multimodalOptionsBuilder.js +0 -65
- package/dist/client/utils/observabilityHelpers.js +0 -47
- package/dist/client/utils/parameterValidation.js +0 -966
- package/dist/client/utils/pdfProcessor.js +0 -410
- package/dist/client/utils/performance.js +0 -222
- package/dist/client/utils/pricing.js +0 -340
- package/dist/client/utils/promptRedaction.js +0 -62
- package/dist/client/utils/providerConfig.js +0 -1009
- package/dist/client/utils/providerHealth.js +0 -1237
- package/dist/client/utils/providerRetry.js +0 -112
- package/dist/client/utils/providerUtils.js +0 -434
- package/dist/client/utils/rateLimiter.js +0 -200
- package/dist/client/utils/redis.js +0 -368
- package/dist/client/utils/retryHandler.js +0 -269
- package/dist/client/utils/retryability.js +0 -22
- package/dist/client/utils/sanitizers/svg.js +0 -481
- package/dist/client/utils/schemaConversion.js +0 -255
- package/dist/client/utils/taskClassificationUtils.js +0 -149
- package/dist/client/utils/taskClassifier.js +0 -94
- package/dist/client/utils/thinkingConfig.js +0 -104
- package/dist/client/utils/timeout.js +0 -359
- package/dist/client/utils/tokenEstimation.js +0 -142
- package/dist/client/utils/tokenLimits.js +0 -125
- package/dist/client/utils/tokenUtils.js +0 -239
- package/dist/client/utils/toolUtils.js +0 -75
- package/dist/client/utils/transformationUtils.js +0 -554
- package/dist/client/utils/ttsProcessor.js +0 -286
- package/dist/client/utils/typeUtils.js +0 -97
- package/dist/client/utils/videoAnalysisProcessor.js +0 -67
- package/dist/client/workflow/config.js +0 -398
- package/dist/client/workflow/core/ensembleExecutor.js +0 -407
- package/dist/client/workflow/core/judgeScorer.js +0 -544
- package/dist/client/workflow/core/responseConditioner.js +0 -225
- package/dist/client/workflow/core/types/conditionerTypes.js +0 -7
- package/dist/client/workflow/core/types/ensembleTypes.js +0 -7
- package/dist/client/workflow/core/types/index.js +0 -7
- package/dist/client/workflow/core/types/judgeTypes.js +0 -7
- package/dist/client/workflow/core/types/layerTypes.js +0 -7
- package/dist/client/workflow/core/types/registryTypes.js +0 -7
- package/dist/client/workflow/core/workflowRegistry.js +0 -304
- package/dist/client/workflow/core/workflowRunner.js +0 -586
- package/dist/client/workflow/index.js +0 -50
- package/dist/client/workflow/types.js +0 -9
- package/dist/client/workflow/utils/types/index.js +0 -7
- package/dist/client/workflow/utils/workflowMetrics.js +0 -311
- package/dist/client/workflow/utils/workflowValidation.js +0 -420
- package/dist/client/workflow/workflows/adaptiveWorkflow.js +0 -366
- package/dist/client/workflow/workflows/consensusWorkflow.js +0 -192
- package/dist/client/workflow/workflows/fallbackWorkflow.js +0 -225
- package/dist/client/workflow/workflows/multiJudgeWorkflow.js +0 -351
- /package/dist/client/{client/reactHooks.js → reactHooks.js} +0 -0
|
@@ -1,2161 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* File Type Detection Utility
|
|
3
|
-
* Centralized file detection for all multimodal file types
|
|
4
|
-
* Uses multi-strategy approach for reliable type identification
|
|
5
|
-
*/
|
|
6
|
-
import { readFile, stat } from "fs/promises";
|
|
7
|
-
import { getGlobalDispatcher, interceptors, request } from "undici";
|
|
8
|
-
import { archiveProcessor } from "../processors/archive/ArchiveProcessor.js";
|
|
9
|
-
import { audioProcessor } from "../processors/media/AudioProcessor.js";
|
|
10
|
-
import { videoProcessor } from "../processors/media/VideoProcessor.js";
|
|
11
|
-
import { tracers, ATTR, withSpan } from "../telemetry/index.js";
|
|
12
|
-
import { CSVProcessor } from "./csvProcessor.js";
|
|
13
|
-
import { ImageProcessor } from "./imageProcessor.js";
|
|
14
|
-
import { logger } from "./logger.js";
|
|
15
|
-
import { PDFProcessor } from "./pdfProcessor.js";
|
|
16
|
-
/**
 * Retry defaults used by withRetry() when the caller supplies no override.
 */
const DEFAULT_MAX_RETRIES = 3;
const DEFAULT_RETRY_DELAY = 1000; // milliseconds
/**
 * Error codes (Node.js / undici) that mark a failure as a retryable
 * network error.
 */
const RETRYABLE_ERROR_CODES = [
    // Node.js socket / DNS level codes
    "ETIMEDOUT", "ECONNRESET", "ECONNREFUSED", "ENOTFOUND",
    "ENETUNREACH", "EAI_AGAIN", "EPIPE", "ECONNABORTED",
    // undici specific codes
    "UND_ERR_CONNECT_TIMEOUT", "UND_ERR_HEADERS_TIMEOUT",
    "UND_ERR_BODY_TIMEOUT", "UND_ERR_SOCKET",
];
/**
 * HTTP status codes that are never retried (client errors).
 */
const NON_RETRYABLE_STATUS_CODES = [
    400,
    401,
    403,
    404,
    405,
];
/**
 * HTTP status codes that are retried (rate limiting + server errors).
 */
const RETRYABLE_STATUS_CODES = [
    429,
    500,
    502,
    503,
    504,
];
|
|
46
|
-
/**
 * Decide whether an error looks like a transient network failure that is
 * worth retrying.
 *
 * Checks are applied in this order:
 *  1. Anything that is not an `Error` instance is not retryable.
 *  2. A string `code` listed in RETRYABLE_ERROR_CODES is retryable.
 *  3. An "HTTP <status>" fragment in the message is classified through
 *     NON_RETRYABLE_STATUS_CODES / RETRYABLE_STATUS_CODES.
 *  4. Otherwise the message is scanned for transient-failure keywords.
 *
 * @param error - Error to check
 * @returns True if error is retryable (transient network issue)
 */
function isRetryableNetworkError(error) {
    if (!(error instanceof Error)) {
        return false;
    }
    const errorMessage = error.message.toLowerCase();
    // `code` is not guaranteed to be a string: DOMException-based errors
    // (e.g. "TimeoutError" / "AbortError") are Error instances with a numeric
    // legacy `code`. Calling toUpperCase() on a number would throw a TypeError
    // and hide the original failure, so only string codes are normalized.
    const rawCode = error.code;
    const errorCode = typeof rawCode === "string" ? rawCode.toUpperCase() : undefined;
    // Check for retryable network error codes
    if (errorCode && RETRYABLE_ERROR_CODES.includes(errorCode)) {
        return true;
    }
    // Check HTTP status code if present in error message (e.g., "HTTP 503")
    const httpStatusMatch = errorMessage.match(/http\s*(\d{3})/);
    if (httpStatusMatch) {
        const statusCode = Number.parseInt(httpStatusMatch[1], 10);
        if (NON_RETRYABLE_STATUS_CODES.includes(statusCode)) {
            return false;
        }
        if (RETRYABLE_STATUS_CODES.includes(statusCode)) {
            return true;
        }
        // A status in neither list falls through to the keyword scan below.
    }
    // Check error message for transient issues
    const transientKeywords = [
        "timeout",
        "timed out",
        "connection reset",
        "econnreset",
        "etimedout",
        "network error",
        "socket hang up",
        "enotfound",
        "getaddrinfo",
        "unavailable",
        "service unavailable",
    ];
    return transientKeywords.some((keyword) => errorMessage.includes(keyword));
}
|
|
91
|
-
/**
 * Run an async operation, retrying with exponential backoff while it keeps
 * failing with a retryable network error.
 *
 * The operation is always attempted at least once. It is attempted at most
 * `maxRetries + 1` times; the wait before retry number k (1-based) is
 * `retryDelay * 2 ** (k - 1)` milliseconds.
 *
 * @param operation - Async function to execute
 * @param options - Retry configuration options
 * @param options.maxRetries - Retries after the first attempt (defaults to
 *   DEFAULT_MAX_RETRIES). Negative values are treated as 0, fractional values
 *   are rounded down, and values that are not numbers fall back to the default.
 * @param options.retryDelay - Base delay in milliseconds (defaults to
 *   DEFAULT_RETRY_DELAY)
 * @returns Promise resolving to the operation result
 * @throws The operation's own error if it is non-retryable or all attempts fail
 */
async function withRetry(operation, options = {}) {
    // Normalize maxRetries: previously a negative/NaN value skipped the loop
    // entirely (the operation never ran), and a fractional or string value
    // never matched `attempt === maxRetries`, so the original error was
    // replaced by the generic "unexpectedly" error below.
    const requestedRetries = Number(options?.maxRetries ?? DEFAULT_MAX_RETRIES);
    const maxRetries = Number.isNaN(requestedRetries)
        ? DEFAULT_MAX_RETRIES
        : Math.max(0, Math.floor(requestedRetries));
    const retryDelay = options?.retryDelay ?? DEFAULT_RETRY_DELAY;
    for (let attempt = 0; attempt <= maxRetries; attempt++) {
        try {
            return await operation();
        }
        catch (error) {
            const isRetryable = isRetryableNetworkError(error);
            const isLastAttempt = attempt === maxRetries;
            if (!isRetryable || isLastAttempt) {
                // Rethrow the original error so callers see the real cause.
                throw error;
            }
            // Calculate exponential backoff delay
            const delay = retryDelay * 2 ** attempt;
            logger.debug("Retrying network operation after transient error", {
                attempt: attempt + 1,
                maxRetries,
                delay,
                error: error instanceof Error ? error.message : String(error),
            });
            await new Promise((resolve) => setTimeout(resolve, delay));
        }
    }
    // Safeguard only: with a normalized maxRetries the loop always returns or throws.
    throw new Error("Retry logic failed unexpectedly");
}
|
|
126
|
-
/**
 * Report whether the given text is a JSON object or array.
 *
 * After trimming surrounding whitespace the text must open with `{` or `[`,
 * close with the matching `}` or `]`, and parse successfully with JSON.parse.
 *
 * @param text - Text to inspect
 * @returns True when the trimmed text is a parseable JSON object/array
 */
function hasJsonMarkers(text) {
    const candidate = text.trim();
    if (candidate.length === 0) {
        return false;
    }
    const opener = candidate.charAt(0);
    const closer = candidate.charAt(candidate.length - 1);
    // Derive the only closing bracket that is acceptable for this opener.
    let expectedCloser = null;
    if (opener === "{") {
        expectedCloser = "}";
    }
    else if (opener === "[") {
        expectedCloser = "]";
    }
    if (closer !== expectedCloser) {
        return false;
    }
    // Bracket shape alone is not enough; the content must really be JSON.
    try {
        JSON.parse(candidate);
    }
    catch {
        return false;
    }
    return true;
}
|
|
149
|
-
/**
 * Render a byte count as a human-readable size string.
 *
 * Values below 1024 are printed verbatim with a "bytes" suffix; larger values
 * are scaled to KB, MB or GB (binary multiples) with two decimal places.
 */
function formatFileSize(bytes) {
    const KB = 1024;
    const MB = KB * 1024;
    const GB = MB * 1024;
    if (bytes < KB) {
        return `${bytes} bytes`;
    }
    // Pick the largest unit that keeps the scaled value below 1024 (GB is the ceiling).
    const [divisor, unit] = bytes < MB ? [KB, "KB"] : bytes < GB ? [MB, "MB"] : [GB, "GB"];
    return `${(bytes / divisor).toFixed(2)} ${unit}`;
}
|
|
164
|
-
/**
|
|
165
|
-
* Centralized file type detection and processing
|
|
166
|
-
*
|
|
167
|
-
* @example
|
|
168
|
-
* ```typescript
|
|
169
|
-
* // Auto-detect and process any file
|
|
170
|
-
* const result = await FileDetector.detectAndProcess("data.csv");
|
|
171
|
-
* logger.info(result.type); // 'csv'
|
|
172
|
-
* ```
|
|
173
|
-
*/
|
|
174
|
-
export class FileDetector {
|
|
175
|
-
// FD-017: Replace hardcoded timeouts with constants.
|
|
176
|
-
// These defaults ensure consistent timeout behavior across all file-detection logic.
|
|
177
|
-
static DEFAULT_NETWORK_TIMEOUT = 30000; // 30 seconds
|
|
178
|
-
static DEFAULT_HEAD_TIMEOUT = 5000; // 5 seconds
|
|
179
|
-
/**
|
|
180
|
-
* Auto-detect file type and process in one call
|
|
181
|
-
*
|
|
182
|
-
* Runs detection strategies in priority order:
|
|
183
|
-
* 1. MagicBytesStrategy (95% confidence) - Binary file headers
|
|
184
|
-
* 2. MimeTypeStrategy (85% confidence) - HTTP Content-Type for URLs
|
|
185
|
-
* 3. ExtensionStrategy (70% confidence) - File extension
|
|
186
|
-
* 4. ContentHeuristicStrategy (75% confidence) - Content analysis
|
|
187
|
-
*
|
|
188
|
-
* @param input - File path, URL, Buffer, or data URI
|
|
189
|
-
* @param options - Detection and processing options
|
|
190
|
-
* @returns Processed file result with type and content
|
|
191
|
-
*/
|
|
192
|
-
static async detectAndProcess(input, options) {
|
|
193
|
-
// Derive filename and size for tracing before detection runs
|
|
194
|
-
const inputFilename = FileDetector.deriveInputFilename(input);
|
|
195
|
-
const inputSizeBytes = FileDetector.deriveInputSize(input);
|
|
196
|
-
return withSpan({
|
|
197
|
-
name: "neurolink.file.detect_and_process",
|
|
198
|
-
tracer: tracers.file,
|
|
199
|
-
attributes: {
|
|
200
|
-
[ATTR.FILE_NAME]: inputFilename,
|
|
201
|
-
[ATTR.FILE_SIZE_BYTES]: inputSizeBytes,
|
|
202
|
-
},
|
|
203
|
-
}, async (span) => {
|
|
204
|
-
const detection = await FileDetector.detect(input, options);
|
|
205
|
-
span.setAttribute(ATTR.FILE_CATEGORY, detection.type);
|
|
206
|
-
span.setAttribute(ATTR.FILE_MIMETYPE, detection.mimeType || "unknown");
|
|
207
|
-
span.setAttribute(ATTR.FILE_CONFIDENCE, detection.metadata.confidence);
|
|
208
|
-
logger.info(`[NEUROLINK] File detected: ${inputFilename} (${detection.mimeType || "unknown"}, ${formatFileSize(inputSizeBytes)}) → category: ${detection.type}`);
|
|
209
|
-
// FD-018: Comprehensive fallback parsing for extension-less files
|
|
210
|
-
if (options?.allowedTypes &&
|
|
211
|
-
!options.allowedTypes.includes(detection.type)) {
|
|
212
|
-
const content = await FileDetector.loadContent(input, detection, options);
|
|
213
|
-
const errors = [];
|
|
214
|
-
for (const allowedType of options.allowedTypes) {
|
|
215
|
-
try {
|
|
216
|
-
const result = await FileDetector.tryFallbackParsing(content, allowedType, options);
|
|
217
|
-
if (result) {
|
|
218
|
-
logger.info(`[FileDetector] ✅ ${allowedType.toUpperCase()} fallback successful`);
|
|
219
|
-
const outputLength = typeof result.content === "string"
|
|
220
|
-
? result.content.length
|
|
221
|
-
: result.content?.length || 0;
|
|
222
|
-
span.setAttribute(ATTR.FILE_OUTPUT_LENGTH, outputLength);
|
|
223
|
-
span.setAttribute(ATTR.FILE_SUCCESS, true);
|
|
224
|
-
span.setAttribute(ATTR.FILE_PROCESSOR_USED, `fallback:${allowedType}`);
|
|
225
|
-
logger.info(`[NEUROLINK] File processed: ${inputFilename} → ${outputLength} bytes output (fallback: ${allowedType})`);
|
|
226
|
-
return result;
|
|
227
|
-
}
|
|
228
|
-
}
|
|
229
|
-
catch (error) {
|
|
230
|
-
const errorMsg = error instanceof Error ? error.message : String(error);
|
|
231
|
-
errors.push(`${allowedType}: ${errorMsg}`);
|
|
232
|
-
logger.debug(`[FileDetector] ${allowedType} fallback failed: ${errorMsg}`);
|
|
233
|
-
}
|
|
234
|
-
}
|
|
235
|
-
logger.warn(`[FileDetector] All fallback parsing failed for type "${detection.type}". ` +
|
|
236
|
-
`Attempted: ${options.allowedTypes.join(", ")}. Falling through to universal handler.`);
|
|
237
|
-
const csvOptions = options?.csvOptions;
|
|
238
|
-
const result = await FileDetector.processFile(content, detection, csvOptions, options?.provider);
|
|
239
|
-
FileDetector.setFileResultSpanAttributes(span, result, inputFilename, detection.type);
|
|
240
|
-
return result;
|
|
241
|
-
}
|
|
242
|
-
const content = await FileDetector.loadContent(input, detection, options);
|
|
243
|
-
const csvOptions = options?.csvOptions;
|
|
244
|
-
const result = await FileDetector.processFile(content, detection, csvOptions, options?.provider);
|
|
245
|
-
FileDetector.setFileResultSpanAttributes(span, result, inputFilename, detection.type);
|
|
246
|
-
return result;
|
|
247
|
-
});
|
|
248
|
-
}
|
|
249
|
-
/**
|
|
250
|
-
* Set span attributes and log after file processing completes.
|
|
251
|
-
*/
|
|
252
|
-
static setFileResultSpanAttributes(span, result, filename, processorType) {
|
|
253
|
-
const outputLength = typeof result.content === "string"
|
|
254
|
-
? result.content.length
|
|
255
|
-
: result.content?.length || 0;
|
|
256
|
-
const hasImages = Array.isArray(result.images)
|
|
257
|
-
? result.images.length > 0
|
|
258
|
-
: false;
|
|
259
|
-
const imageCount = Array.isArray(result.images)
|
|
260
|
-
? result.images.length
|
|
261
|
-
: 0;
|
|
262
|
-
span.setAttribute(ATTR.FILE_OUTPUT_LENGTH, outputLength);
|
|
263
|
-
span.setAttribute(ATTR.FILE_SUCCESS, true);
|
|
264
|
-
span.setAttribute(ATTR.FILE_PROCESSOR_USED, processorType);
|
|
265
|
-
span.setAttribute(ATTR.FILE_HAS_IMAGES, hasImages);
|
|
266
|
-
span.setAttribute(ATTR.FILE_IMAGE_COUNT, imageCount);
|
|
267
|
-
logger.info(`[NEUROLINK] File processed: ${filename} → ${outputLength} bytes output` +
|
|
268
|
-
(imageCount > 0 ? ` + ${imageCount} image(s)` : "") +
|
|
269
|
-
` (processor: ${processorType})`);
|
|
270
|
-
}
|
|
271
|
-
/**
|
|
272
|
-
* Derive a human-readable filename from FileInput for tracing.
|
|
273
|
-
*/
|
|
274
|
-
static deriveInputFilename(input) {
|
|
275
|
-
if (typeof input === "string") {
|
|
276
|
-
if (input.startsWith("data:")) {
|
|
277
|
-
return "data-uri";
|
|
278
|
-
}
|
|
279
|
-
if (input.startsWith("http")) {
|
|
280
|
-
try {
|
|
281
|
-
return new URL(input).pathname.split("/").pop() || "url-file";
|
|
282
|
-
}
|
|
283
|
-
catch {
|
|
284
|
-
return "url-file";
|
|
285
|
-
}
|
|
286
|
-
}
|
|
287
|
-
// File path
|
|
288
|
-
return input.split("/").pop() || input.split("\\").pop() || "file";
|
|
289
|
-
}
|
|
290
|
-
if (Buffer.isBuffer(input)) {
|
|
291
|
-
return "buffer";
|
|
292
|
-
}
|
|
293
|
-
return "unknown-input";
|
|
294
|
-
}
|
|
295
|
-
/**
|
|
296
|
-
* Derive byte size from FileInput for tracing.
|
|
297
|
-
*/
|
|
298
|
-
static deriveInputSize(input) {
|
|
299
|
-
if (Buffer.isBuffer(input)) {
|
|
300
|
-
return input.length;
|
|
301
|
-
}
|
|
302
|
-
if (typeof input === "string") {
|
|
303
|
-
if (input.startsWith("data:")) {
|
|
304
|
-
// Rough estimate: base64 is ~4/3 of raw
|
|
305
|
-
const base64Part = input.split(",")[1];
|
|
306
|
-
return base64Part ? Math.floor((base64Part.length * 3) / 4) : 0;
|
|
307
|
-
}
|
|
308
|
-
return input.length; // path or URL string length (not file size)
|
|
309
|
-
}
|
|
310
|
-
return 0;
|
|
311
|
-
}
|
|
312
|
-
/**
|
|
313
|
-
* Try fallback parsing for a specific file type
|
|
314
|
-
* Used when file detection returns "unknown" but we want to try parsing anyway
|
|
315
|
-
*/
|
|
316
|
-
static async tryFallbackParsing(content, fileType, options) {
|
|
317
|
-
logger.info(`[FileDetector] Attempting ${fileType.toUpperCase()} fallback parsing`);
|
|
318
|
-
switch (fileType) {
|
|
319
|
-
case "csv": {
|
|
320
|
-
// Try CSV parsing
|
|
321
|
-
const csvOptions = options?.csvOptions;
|
|
322
|
-
const result = await CSVProcessor.process(content, csvOptions);
|
|
323
|
-
logger.info(`[FileDetector] CSV fallback: ${result.metadata?.rowCount || 0} rows, ${result.metadata?.columnCount || 0} columns`);
|
|
324
|
-
return result;
|
|
325
|
-
}
|
|
326
|
-
case "text": {
|
|
327
|
-
// Try text parsing - check if content is valid UTF-8 text
|
|
328
|
-
const textContent = content.toString("utf-8");
|
|
329
|
-
// Validate it's actually text (no null bytes, mostly printable)
|
|
330
|
-
if (FileDetector.isValidText(textContent)) {
|
|
331
|
-
return {
|
|
332
|
-
type: "text",
|
|
333
|
-
content: textContent,
|
|
334
|
-
mimeType: FileDetector.guessTextMimeType(textContent),
|
|
335
|
-
metadata: {
|
|
336
|
-
confidence: 70,
|
|
337
|
-
size: content.length,
|
|
338
|
-
},
|
|
339
|
-
};
|
|
340
|
-
}
|
|
341
|
-
throw new Error("Content does not appear to be valid text");
|
|
342
|
-
}
|
|
343
|
-
case "image": {
|
|
344
|
-
// Image requires magic bytes - can't fallback without detection
|
|
345
|
-
throw new Error("Image type requires binary detection, cannot fallback parse");
|
|
346
|
-
}
|
|
347
|
-
case "pdf": {
|
|
348
|
-
// PDF requires magic bytes - can't fallback without detection
|
|
349
|
-
throw new Error("PDF type requires binary detection, cannot fallback parse");
|
|
350
|
-
}
|
|
351
|
-
case "audio": {
|
|
352
|
-
// Audio requires magic bytes - can't fallback without detection
|
|
353
|
-
throw new Error("Audio type requires binary detection, cannot fallback parse");
|
|
354
|
-
}
|
|
355
|
-
case "video": {
|
|
356
|
-
// Video requires magic bytes - can't fallback without detection
|
|
357
|
-
throw new Error("Video type requires binary detection, cannot fallback parse");
|
|
358
|
-
}
|
|
359
|
-
case "archive": {
|
|
360
|
-
// Archive requires magic bytes - can't fallback without detection
|
|
361
|
-
throw new Error("Archive type requires binary detection, cannot fallback parse");
|
|
362
|
-
}
|
|
363
|
-
case "xlsx": {
|
|
364
|
-
// Document formats require binary detection
|
|
365
|
-
throw new Error("Excel type requires binary detection, cannot fallback parse");
|
|
366
|
-
}
|
|
367
|
-
case "docx": {
|
|
368
|
-
throw new Error("Word type requires binary detection, cannot fallback parse");
|
|
369
|
-
}
|
|
370
|
-
case "pptx": {
|
|
371
|
-
throw new Error("PowerPoint type requires binary detection, cannot fallback parse");
|
|
372
|
-
}
|
|
373
|
-
case "svg": {
|
|
374
|
-
// SVG can be detected from text content
|
|
375
|
-
const svgContent = content.toString("utf-8");
|
|
376
|
-
if (svgContent.includes("<svg") && svgContent.includes("</svg>")) {
|
|
377
|
-
return {
|
|
378
|
-
type: "svg",
|
|
379
|
-
content: svgContent,
|
|
380
|
-
mimeType: "image/svg+xml",
|
|
381
|
-
metadata: {
|
|
382
|
-
confidence: 70,
|
|
383
|
-
size: content.length,
|
|
384
|
-
},
|
|
385
|
-
};
|
|
386
|
-
}
|
|
387
|
-
throw new Error("Content does not appear to be valid SVG");
|
|
388
|
-
}
|
|
389
|
-
default:
|
|
390
|
-
return null;
|
|
391
|
-
}
|
|
392
|
-
}
|
|
393
|
-
/**
|
|
394
|
-
* Check if content is valid text (UTF-8, mostly printable)
|
|
395
|
-
*/
|
|
396
|
-
static isValidText(content) {
|
|
397
|
-
// Check for null bytes which indicate binary content
|
|
398
|
-
if (content.includes("\0")) {
|
|
399
|
-
return false;
|
|
400
|
-
}
|
|
401
|
-
// Check if content has reasonable amount of printable characters
|
|
402
|
-
let printableCount = 0;
|
|
403
|
-
for (let i = 0; i < content.length; i++) {
|
|
404
|
-
const code = content.charCodeAt(i);
|
|
405
|
-
if ((code >= 32 && code < 127) || // ASCII printable
|
|
406
|
-
code === 9 || // Tab
|
|
407
|
-
code === 10 || // Newline
|
|
408
|
-
code === 13 || // Carriage return
|
|
409
|
-
code > 127 // Unicode (non-ASCII)
|
|
410
|
-
) {
|
|
411
|
-
printableCount++;
|
|
412
|
-
}
|
|
413
|
-
}
|
|
414
|
-
// At least 90% should be printable
|
|
415
|
-
return printableCount / content.length >= 0.9;
|
|
416
|
-
}
|
|
417
|
-
/**
|
|
418
|
-
* Guess the MIME type for text content based on content patterns
|
|
419
|
-
*/
|
|
420
|
-
static guessTextMimeType(content) {
|
|
421
|
-
const trimmed = content.trim();
|
|
422
|
-
// Check for JSON
|
|
423
|
-
if ((trimmed.startsWith("{") && trimmed.endsWith("}")) ||
|
|
424
|
-
(trimmed.startsWith("[") && trimmed.endsWith("]"))) {
|
|
425
|
-
try {
|
|
426
|
-
JSON.parse(trimmed);
|
|
427
|
-
return "application/json";
|
|
428
|
-
}
|
|
429
|
-
catch {
|
|
430
|
-
// Not valid JSON, continue checking
|
|
431
|
-
}
|
|
432
|
-
}
|
|
433
|
-
// Check for XML/HTML using stricter detection
|
|
434
|
-
if (FileDetector.looksLikeXMLStrict(trimmed)) {
|
|
435
|
-
const isHTML = trimmed.includes("<!DOCTYPE html") ||
|
|
436
|
-
trimmed.toLowerCase().includes("<html") ||
|
|
437
|
-
trimmed.includes("<head") ||
|
|
438
|
-
trimmed.includes("<body");
|
|
439
|
-
return isHTML ? "text/html" : "application/xml";
|
|
440
|
-
}
|
|
441
|
-
// Check for YAML using robust multi-indicator detection
|
|
442
|
-
if (FileDetector.looksLikeYAMLStrict(trimmed)) {
|
|
443
|
-
return "application/yaml";
|
|
444
|
-
}
|
|
445
|
-
// Default to plain text
|
|
446
|
-
return "text/plain";
|
|
447
|
-
}
|
|
448
|
-
/**
|
|
449
|
-
* Strict YAML detection for guessTextMimeType
|
|
450
|
-
* Similar to ContentHeuristicStrategy but requires at least 2 indicators
|
|
451
|
-
* to avoid false positives from simple key: value patterns
|
|
452
|
-
*/
|
|
453
|
-
static looksLikeYAMLStrict(text) {
|
|
454
|
-
if (text.length === 0) {
|
|
455
|
-
return false;
|
|
456
|
-
}
|
|
457
|
-
const lines = text.split("\n");
|
|
458
|
-
// For single-line content, only --- or ... qualify as YAML
|
|
459
|
-
if (lines.length === 1) {
|
|
460
|
-
return text === "---" || text === "...";
|
|
461
|
-
}
|
|
462
|
-
// Collect YAML indicators (requires at least 2 for positive detection)
|
|
463
|
-
const indicators = [];
|
|
464
|
-
// Indicator 1: Document start marker (---)
|
|
465
|
-
indicators.push(text.startsWith("---"));
|
|
466
|
-
// Indicator 2: Document end marker (...)
|
|
467
|
-
indicators.push(/^\.\.\.$|[\n]\.\.\.$/.test(text));
|
|
468
|
-
// Indicator 3: YAML list items (- followed by space)
|
|
469
|
-
indicators.push(/^[\s]*-\s+[^-]/m.test(text));
|
|
470
|
-
// Indicator 4: Multiple key-value pairs (at least 2)
|
|
471
|
-
const keyValuePattern = /^[\s]*[a-zA-Z_][a-zA-Z0-9_-]*:\s*(.+)$/;
|
|
472
|
-
const keyValueMatches = lines.filter((line) => keyValuePattern.test(line)).length;
|
|
473
|
-
indicators.push(keyValueMatches >= 2);
|
|
474
|
-
// Require at least 2 indicators for confident YAML detection
|
|
475
|
-
const matchCount = indicators.filter(Boolean).length;
|
|
476
|
-
return matchCount >= 2;
|
|
477
|
-
}
|
|
478
|
-
/**
|
|
479
|
-
* Strict XML detection for guessTextMimeType
|
|
480
|
-
* Ensures content has proper XML declaration or valid tag structure with closing tags
|
|
481
|
-
* Prevents false positives from arbitrary content starting with <
|
|
482
|
-
*/
|
|
483
|
-
static looksLikeXMLStrict(content) {
|
|
484
|
-
// XML declaration is a definitive marker
|
|
485
|
-
if (content.startsWith("<?xml")) {
|
|
486
|
-
return true;
|
|
487
|
-
}
|
|
488
|
-
// Must start with < for XML/HTML
|
|
489
|
-
if (!content.startsWith("<")) {
|
|
490
|
-
return false;
|
|
491
|
-
}
|
|
492
|
-
// Check for HTML DOCTYPE declaration
|
|
493
|
-
if (content.includes("<!DOCTYPE html")) {
|
|
494
|
-
return true;
|
|
495
|
-
}
|
|
496
|
-
// Must have valid opening tag structure: <tagname
|
|
497
|
-
// Not just any < character like "< something"
|
|
498
|
-
const hasValidOpeningTag = /<[a-zA-Z][a-zA-Z0-9-]*(?:\s[^>]*)?>/;
|
|
499
|
-
if (!hasValidOpeningTag.test(content)) {
|
|
500
|
-
return false;
|
|
501
|
-
}
|
|
502
|
-
// Must have at least one closing tag or self-closing tag to be valid XML/HTML
|
|
503
|
-
const hasClosingTag = /<\/[a-zA-Z][a-zA-Z0-9-]*>/.test(content);
|
|
504
|
-
const hasSelfClosingTag = /<[a-zA-Z][a-zA-Z0-9-]*(?:\s[^>]*)?\s*\/\s*>/.test(content);
|
|
505
|
-
return hasClosingTag || hasSelfClosingTag;
|
|
506
|
-
}
|
|
507
|
-
/**
|
|
508
|
-
* Detect file type using multi-strategy approach
|
|
509
|
-
* Stops at first strategy with confidence >= threshold (default: 80%)
|
|
510
|
-
*/
|
|
511
|
-
static async detect(input, options) {
|
|
512
|
-
const confidenceThreshold = options?.confidenceThreshold ?? 80;
|
|
513
|
-
const strategies = [
|
|
514
|
-
new MagicBytesStrategy(),
|
|
515
|
-
new MimeTypeStrategy(),
|
|
516
|
-
new ExtensionStrategy(),
|
|
517
|
-
new ContentHeuristicStrategy(),
|
|
518
|
-
];
|
|
519
|
-
let best = null;
|
|
520
|
-
for (const strategy of strategies) {
|
|
521
|
-
const result = await strategy.detect(input);
|
|
522
|
-
if (!best || result.metadata.confidence > best.metadata.confidence) {
|
|
523
|
-
best = result;
|
|
524
|
-
}
|
|
525
|
-
if (result.metadata.confidence >= confidenceThreshold) {
|
|
526
|
-
logger.info(`[FileDetector] Type: ${result.type} (${result.metadata.confidence}%)`);
|
|
527
|
-
return result;
|
|
528
|
-
}
|
|
529
|
-
}
|
|
530
|
-
logger.warn(`[FileDetector] Low confidence: ${best?.type ?? "unknown"} (${best?.metadata.confidence ?? 0}%)`);
|
|
531
|
-
return best;
|
|
532
|
-
}
|
|
533
|
-
/**
|
|
534
|
-
* Load file content from various sources
|
|
535
|
-
*/
|
|
536
|
-
static async loadContent(input, detection, options) {
|
|
537
|
-
let source = detection.source;
|
|
538
|
-
if (source === "buffer" && !Buffer.isBuffer(input)) {
|
|
539
|
-
if (typeof input === "string") {
|
|
540
|
-
if (input.startsWith("data:")) {
|
|
541
|
-
source = "datauri";
|
|
542
|
-
}
|
|
543
|
-
else if (input.startsWith("http://") ||
|
|
544
|
-
input.startsWith("https://")) {
|
|
545
|
-
source = "url";
|
|
546
|
-
}
|
|
547
|
-
else {
|
|
548
|
-
source = "path";
|
|
549
|
-
}
|
|
550
|
-
}
|
|
551
|
-
}
|
|
552
|
-
switch (source) {
|
|
553
|
-
case "url":
|
|
554
|
-
return await FileDetector.loadFromURL(input, options);
|
|
555
|
-
case "path":
|
|
556
|
-
return await FileDetector.loadFromPath(input, options);
|
|
557
|
-
case "buffer":
|
|
558
|
-
return input;
|
|
559
|
-
case "datauri":
|
|
560
|
-
return FileDetector.loadFromDataURI(input);
|
|
561
|
-
default:
|
|
562
|
-
throw new Error(`Unknown source: ${source}`);
|
|
563
|
-
}
|
|
564
|
-
}
|
|
565
|
-
/**
|
|
566
|
-
* SDK-8: Format an informative placeholder when a file processor fails.
|
|
567
|
-
* Instead of bare "[Video file: name]" strings, include size, format, and
|
|
568
|
-
* the reason for failure so the LLM can acknowledge the attachment.
|
|
569
|
-
*/
|
|
570
|
-
static formatInformativePlaceholder(typeName, filename, content, detection, error) {
|
|
571
|
-
const sizeStr = content.length < 1024
|
|
572
|
-
? `${content.length} bytes`
|
|
573
|
-
: content.length < 1024 * 1024
|
|
574
|
-
? `${(content.length / 1024).toFixed(1)} KB`
|
|
575
|
-
: `${(content.length / (1024 * 1024)).toFixed(1)} MB`;
|
|
576
|
-
const errorMsg = error instanceof Error
|
|
577
|
-
? error.message
|
|
578
|
-
: error
|
|
579
|
-
? String(error)
|
|
580
|
-
: "Processing returned no usable content";
|
|
581
|
-
return (`[${typeName} File: "${filename}"]\n` +
|
|
582
|
-
`Size: ${sizeStr}\n` +
|
|
583
|
-
`Format: ${detection.mimeType || "unknown"}\n` +
|
|
584
|
-
`Error: Could not extract content (${errorMsg}).\n` +
|
|
585
|
-
`The file was attached but could not be fully analyzed.`);
|
|
586
|
-
}
|
|
587
|
-
/**
|
|
588
|
-
* Extract metadata and printable strings from an unrecognized binary file.
|
|
589
|
-
* This is the "extract what you can" path for unknown file types.
|
|
590
|
-
*
|
|
591
|
-
* Extracts:
|
|
592
|
-
* - File size (human-readable)
|
|
593
|
-
* - MIME type / detected format
|
|
594
|
-
* - First N bytes as hex dump (for identification)
|
|
595
|
-
* - Printable ASCII/UTF-8 strings found in the binary (like `strings` command)
|
|
596
|
-
* - Known file signatures that we don't have full processors for
|
|
597
|
-
*
|
|
598
|
-
* @param content Raw file buffer
|
|
599
|
-
* @param detection Detection result (may be "unknown")
|
|
600
|
-
* @param filename Original filename (if known)
|
|
601
|
-
* @returns Formatted text summary suitable for LLM consumption
|
|
602
|
-
*/
|
|
603
|
-
static extractBinaryMetadata(content, detection, filename) {
|
|
604
|
-
const parts = [];
|
|
605
|
-
// Header
|
|
606
|
-
const ext = detection.extension
|
|
607
|
-
? `.${detection.extension}`
|
|
608
|
-
: filename.includes(".")
|
|
609
|
-
? filename.slice(filename.lastIndexOf("."))
|
|
610
|
-
: "";
|
|
611
|
-
const typeLabel = ext
|
|
612
|
-
? `${ext.toUpperCase().slice(1)} file`
|
|
613
|
-
: "Binary file";
|
|
614
|
-
parts.push(`[${typeLabel}: "${filename}"]`);
|
|
615
|
-
// Basic metadata
|
|
616
|
-
const sizeStr = formatFileSize(content.length);
|
|
617
|
-
parts.push(`Size: ${sizeStr}`);
|
|
618
|
-
if (detection.mimeType &&
|
|
619
|
-
detection.mimeType !== "application/octet-stream") {
|
|
620
|
-
parts.push(`Format: ${detection.mimeType}`);
|
|
621
|
-
}
|
|
622
|
-
// Known binary signature identification (broader than our processing capabilities)
|
|
623
|
-
const sigLabel = FileDetector.identifyBinarySignature(content);
|
|
624
|
-
if (sigLabel) {
|
|
625
|
-
parts.push(`Identified as: ${sigLabel}`);
|
|
626
|
-
}
|
|
627
|
-
// Hex dump of first 32 bytes for identification
|
|
628
|
-
const hexPreview = content
|
|
629
|
-
.subarray(0, Math.min(32, content.length))
|
|
630
|
-
.toString("hex")
|
|
631
|
-
.match(/.{1,2}/g)
|
|
632
|
-
?.join(" ");
|
|
633
|
-
if (hexPreview) {
|
|
634
|
-
parts.push(`Header bytes: ${hexPreview}`);
|
|
635
|
-
}
|
|
636
|
-
// Extract printable strings (similar to Unix `strings` command)
|
|
637
|
-
const strings = FileDetector.extractPrintableStrings(content, 4, 50);
|
|
638
|
-
if (strings.length > 0) {
|
|
639
|
-
parts.push(`\nEmbedded text found (${strings.length} string${strings.length > 1 ? "s" : ""}):`);
|
|
640
|
-
for (const s of strings) {
|
|
641
|
-
parts.push(` "${s}"`);
|
|
642
|
-
}
|
|
643
|
-
}
|
|
644
|
-
parts.push(`\nThis file was attached but its format is not fully supported for content extraction.`);
|
|
645
|
-
parts.push(`The above metadata and any embedded text have been extracted for context.`);
|
|
646
|
-
return parts.join("\n");
|
|
647
|
-
}
|
|
648
|
-
/**
|
|
649
|
-
* Identify known binary file signatures beyond what we can process.
|
|
650
|
-
* Returns a human-readable description, or null if unrecognized.
|
|
651
|
-
*/
|
|
652
|
-
static identifyBinarySignature(buf) {
|
|
653
|
-
if (buf.length < 4) {
|
|
654
|
-
return null;
|
|
655
|
-
}
|
|
656
|
-
// SQLite: "SQLite format 3\0"
|
|
657
|
-
if (buf.length >= 16 &&
|
|
658
|
-
buf.subarray(0, 15).toString("ascii") === "SQLite format 3") {
|
|
659
|
-
return "SQLite database";
|
|
660
|
-
}
|
|
661
|
-
// WOFF: "wOFF"
|
|
662
|
-
if (buf[0] === 0x77 &&
|
|
663
|
-
buf[1] === 0x4f &&
|
|
664
|
-
buf[2] === 0x46 &&
|
|
665
|
-
buf[3] === 0x46) {
|
|
666
|
-
return "WOFF font";
|
|
667
|
-
}
|
|
668
|
-
// WOFF2: "wOF2"
|
|
669
|
-
if (buf[0] === 0x77 &&
|
|
670
|
-
buf[1] === 0x4f &&
|
|
671
|
-
buf[2] === 0x46 &&
|
|
672
|
-
buf[3] === 0x32) {
|
|
673
|
-
return "WOFF2 font";
|
|
674
|
-
}
|
|
675
|
-
// TrueType/OpenType: starts with 0x00010000 or "OTTO"
|
|
676
|
-
if ((buf[0] === 0x00 &&
|
|
677
|
-
buf[1] === 0x01 &&
|
|
678
|
-
buf[2] === 0x00 &&
|
|
679
|
-
buf[3] === 0x00) ||
|
|
680
|
-
(buf[0] === 0x4f && buf[1] === 0x54 && buf[2] === 0x54 && buf[3] === 0x4f)) {
|
|
681
|
-
return "TrueType/OpenType font";
|
|
682
|
-
}
|
|
683
|
-
// ELF executable: \x7fELF
|
|
684
|
-
if (buf[0] === 0x7f &&
|
|
685
|
-
buf[1] === 0x45 &&
|
|
686
|
-
buf[2] === 0x4c &&
|
|
687
|
-
buf[3] === 0x46) {
|
|
688
|
-
return "ELF executable/library";
|
|
689
|
-
}
|
|
690
|
-
// Mach-O: 0xFEEDFACE or 0xFEEDFACF (64-bit) or 0xCAFEBABE (universal)
|
|
691
|
-
if ((buf[0] === 0xfe &&
|
|
692
|
-
buf[1] === 0xed &&
|
|
693
|
-
buf[2] === 0xfa &&
|
|
694
|
-
buf[3] === 0xce) ||
|
|
695
|
-
(buf[0] === 0xfe &&
|
|
696
|
-
buf[1] === 0xed &&
|
|
697
|
-
buf[2] === 0xfa &&
|
|
698
|
-
buf[3] === 0xcf) ||
|
|
699
|
-
(buf[0] === 0xca && buf[1] === 0xfe && buf[2] === 0xba && buf[3] === 0xbe)) {
|
|
700
|
-
return "Mach-O executable/library";
|
|
701
|
-
}
|
|
702
|
-
// PE/Windows executable: "MZ"
|
|
703
|
-
if (buf[0] === 0x4d && buf[1] === 0x5a) {
|
|
704
|
-
return "Windows PE executable/DLL";
|
|
705
|
-
}
|
|
706
|
-
// WebAssembly: "\0asm"
|
|
707
|
-
if (buf[0] === 0x00 &&
|
|
708
|
-
buf[1] === 0x61 &&
|
|
709
|
-
buf[2] === 0x73 &&
|
|
710
|
-
buf[3] === 0x6d) {
|
|
711
|
-
return "WebAssembly binary";
|
|
712
|
-
}
|
|
713
|
-
// DWG (AutoCAD): starts with "AC10"
|
|
714
|
-
if (buf[0] === 0x41 &&
|
|
715
|
-
buf[1] === 0x43 &&
|
|
716
|
-
buf[2] === 0x31 &&
|
|
717
|
-
buf[3] === 0x30) {
|
|
718
|
-
return "AutoCAD DWG drawing";
|
|
719
|
-
}
|
|
720
|
-
// BZ2: "BZ" + 'h'
|
|
721
|
-
if (buf[0] === 0x42 && buf[1] === 0x5a && buf[2] === 0x68) {
|
|
722
|
-
return "BZip2 compressed archive";
|
|
723
|
-
}
|
|
724
|
-
// XZ: 0xFD + "7zXZ"
|
|
725
|
-
if (buf.length >= 6 &&
|
|
726
|
-
buf[0] === 0xfd &&
|
|
727
|
-
buf[1] === 0x37 &&
|
|
728
|
-
buf[2] === 0x7a &&
|
|
729
|
-
buf[3] === 0x58 &&
|
|
730
|
-
buf[4] === 0x5a &&
|
|
731
|
-
buf[5] === 0x00) {
|
|
732
|
-
return "XZ compressed archive";
|
|
733
|
-
}
|
|
734
|
-
// 7z: "7z" + BC AF 27 1C
|
|
735
|
-
if (buf.length >= 6 &&
|
|
736
|
-
buf[0] === 0x37 &&
|
|
737
|
-
buf[1] === 0x7a &&
|
|
738
|
-
buf[2] === 0xbc &&
|
|
739
|
-
buf[3] === 0xaf &&
|
|
740
|
-
buf[4] === 0x27 &&
|
|
741
|
-
buf[5] === 0x1c) {
|
|
742
|
-
return "7-Zip archive";
|
|
743
|
-
}
|
|
744
|
-
// ISO 9660: "CD001" at offset 32769
|
|
745
|
-
if (buf.length > 32773 &&
|
|
746
|
-
buf.subarray(32769, 32774).toString("ascii") === "CD001") {
|
|
747
|
-
return "ISO 9660 disc image";
|
|
748
|
-
}
|
|
749
|
-
// Apache Parquet: "PAR1"
|
|
750
|
-
if (buf[0] === 0x50 &&
|
|
751
|
-
buf[1] === 0x41 &&
|
|
752
|
-
buf[2] === 0x52 &&
|
|
753
|
-
buf[3] === 0x31) {
|
|
754
|
-
return "Apache Parquet data file";
|
|
755
|
-
}
|
|
756
|
-
// Protocol Buffers compiled: (no fixed magic, skip)
|
|
757
|
-
// TIFF (already handled as image, but including for completeness)
|
|
758
|
-
if ((buf[0] === 0x49 &&
|
|
759
|
-
buf[1] === 0x49 &&
|
|
760
|
-
buf[2] === 0x2a &&
|
|
761
|
-
buf[3] === 0x00) ||
|
|
762
|
-
(buf[0] === 0x4d && buf[1] === 0x4d && buf[2] === 0x00 && buf[3] === 0x2a)) {
|
|
763
|
-
return "TIFF image";
|
|
764
|
-
}
|
|
765
|
-
// ICO: 00 00 01 00
|
|
766
|
-
if (buf[0] === 0x00 &&
|
|
767
|
-
buf[1] === 0x00 &&
|
|
768
|
-
buf[2] === 0x01 &&
|
|
769
|
-
buf[3] === 0x00) {
|
|
770
|
-
return "ICO icon image";
|
|
771
|
-
}
|
|
772
|
-
return null;
|
|
773
|
-
}
|
|
774
|
-
/**
|
|
775
|
-
* Extract printable ASCII strings from a binary buffer.
|
|
776
|
-
* Similar to the Unix `strings` utility.
|
|
777
|
-
*
|
|
778
|
-
* @param buf Buffer to scan
|
|
779
|
-
* @param minLength Minimum string length to include (default 4)
|
|
780
|
-
* @param maxStrings Maximum number of strings to return (default 50)
|
|
781
|
-
* @returns Array of printable strings found in the binary
|
|
782
|
-
*/
|
|
783
|
-
static extractPrintableStrings(buf, minLength = 4, maxStrings = 50) {
|
|
784
|
-
const strings = [];
|
|
785
|
-
let current = "";
|
|
786
|
-
// Only scan first 64KB to avoid huge processing time
|
|
787
|
-
const scanLimit = Math.min(buf.length, 64 * 1024);
|
|
788
|
-
for (let i = 0; i < scanLimit; i++) {
|
|
789
|
-
const byte = buf[i];
|
|
790
|
-
// Printable ASCII range (space through tilde) plus tab
|
|
791
|
-
if ((byte >= 0x20 && byte <= 0x7e) || byte === 0x09) {
|
|
792
|
-
current += String.fromCharCode(byte);
|
|
793
|
-
}
|
|
794
|
-
else {
|
|
795
|
-
if (current.length >= minLength) {
|
|
796
|
-
strings.push(current);
|
|
797
|
-
if (strings.length >= maxStrings) {
|
|
798
|
-
break;
|
|
799
|
-
}
|
|
800
|
-
}
|
|
801
|
-
current = "";
|
|
802
|
-
}
|
|
803
|
-
}
|
|
804
|
-
// Flush last string
|
|
805
|
-
if (current.length >= minLength && strings.length < maxStrings) {
|
|
806
|
-
strings.push(current);
|
|
807
|
-
}
|
|
808
|
-
return strings;
|
|
809
|
-
}
|
|
810
|
-
/**
|
|
811
|
-
* Route to appropriate processor
|
|
812
|
-
*/
|
|
813
|
-
static async processFile(content, detection, options, provider) {
|
|
814
|
-
switch (detection.type) {
|
|
815
|
-
case "csv":
|
|
816
|
-
// Pass original extension through to CSV processor; if detection has none,
|
|
817
|
-
// fall back to any extension provided in csvOptions.
|
|
818
|
-
return await CSVProcessor.process(content, {
|
|
819
|
-
...options,
|
|
820
|
-
extension: detection.extension ?? options?.extension,
|
|
821
|
-
});
|
|
822
|
-
case "image":
|
|
823
|
-
return await ImageProcessor.process(content);
|
|
824
|
-
case "pdf":
|
|
825
|
-
return await PDFProcessor.process(content, { provider });
|
|
826
|
-
case "svg":
|
|
827
|
-
// SVG is processed as text content (sanitized XML markup)
|
|
828
|
-
// AI providers don't support SVG as image format, so we extract text content
|
|
829
|
-
return await FileDetector.processSvgAsText(content, detection);
|
|
830
|
-
case "video":
|
|
831
|
-
return await FileDetector.processVideoFile(content, detection);
|
|
832
|
-
case "audio":
|
|
833
|
-
return await FileDetector.processAudioFile(content, detection);
|
|
834
|
-
case "archive":
|
|
835
|
-
return await FileDetector.processArchiveFile(content, detection);
|
|
836
|
-
case "xlsx":
|
|
837
|
-
return await FileDetector.processXlsxFile(content, detection);
|
|
838
|
-
case "docx":
|
|
839
|
-
return await FileDetector.processDocxFile(content, detection);
|
|
840
|
-
case "pptx":
|
|
841
|
-
return await FileDetector.processPptxFile(content, detection);
|
|
842
|
-
case "text":
|
|
843
|
-
return {
|
|
844
|
-
type: "text",
|
|
845
|
-
content: content.toString("utf-8"),
|
|
846
|
-
mimeType: detection.mimeType || "text/plain",
|
|
847
|
-
metadata: detection.metadata,
|
|
848
|
-
};
|
|
849
|
-
default: {
|
|
850
|
-
// Graceful degradation: try to treat unknown types as text if content is valid UTF-8
|
|
851
|
-
const unknownContent = content.toString("utf-8");
|
|
852
|
-
if (FileDetector.isValidText(unknownContent)) {
|
|
853
|
-
logger.warn(`[FileDetector] Unknown type "${detection.type}", treating as text`);
|
|
854
|
-
return {
|
|
855
|
-
type: "text",
|
|
856
|
-
content: unknownContent,
|
|
857
|
-
mimeType: detection.mimeType || "text/plain",
|
|
858
|
-
metadata: detection.metadata,
|
|
859
|
-
};
|
|
860
|
-
}
|
|
861
|
-
// Binary file that we can't fully process — extract what we can
|
|
862
|
-
// (metadata, printable strings, signature identification)
|
|
863
|
-
const filename = detection.metadata.filename || "file";
|
|
864
|
-
logger.warn(`[FileDetector] Unknown binary type "${detection.type}", extracting metadata for "${filename}"`);
|
|
865
|
-
return {
|
|
866
|
-
type: "unknown",
|
|
867
|
-
content: FileDetector.extractBinaryMetadata(content, detection, filename),
|
|
868
|
-
mimeType: detection.mimeType || "application/octet-stream",
|
|
869
|
-
metadata: detection.metadata,
|
|
870
|
-
};
|
|
871
|
-
}
|
|
872
|
-
}
|
|
873
|
-
}
|
|
874
|
-
/**
|
|
875
|
-
* Process video file: extract metadata, keyframes, and subtitles via VideoProcessor
|
|
876
|
-
*/
|
|
877
|
-
static async processVideoFile(content, detection) {
|
|
878
|
-
const videoFilename = detection.metadata.filename || "video";
|
|
879
|
-
try {
|
|
880
|
-
const videoResult = await videoProcessor.processFile({
|
|
881
|
-
id: videoFilename,
|
|
882
|
-
name: videoFilename,
|
|
883
|
-
mimetype: detection.mimeType || "video/mp4",
|
|
884
|
-
size: content.length,
|
|
885
|
-
buffer: content,
|
|
886
|
-
});
|
|
887
|
-
if (videoResult.success && videoResult.data) {
|
|
888
|
-
return {
|
|
889
|
-
type: "video",
|
|
890
|
-
content: videoResult.data.textContent ||
|
|
891
|
-
FileDetector.formatInformativePlaceholder("Video", videoFilename, content, detection),
|
|
892
|
-
mimeType: detection.mimeType,
|
|
893
|
-
images: videoResult.data.keyframes && videoResult.data.keyframes.length > 0
|
|
894
|
-
? videoResult.data.keyframes
|
|
895
|
-
: undefined,
|
|
896
|
-
metadata: {
|
|
897
|
-
...detection.metadata,
|
|
898
|
-
frameCount: videoResult.data.frameCount,
|
|
899
|
-
hasKeyframes: videoResult.data.hasKeyframes,
|
|
900
|
-
},
|
|
901
|
-
};
|
|
902
|
-
}
|
|
903
|
-
}
|
|
904
|
-
catch (videoError) {
|
|
905
|
-
logger.warn(`[FileDetector] VideoProcessor failed for ${videoFilename}, using fallback`, videoError instanceof Error ? videoError.message : String(videoError));
|
|
906
|
-
return {
|
|
907
|
-
type: "video",
|
|
908
|
-
content: FileDetector.formatInformativePlaceholder("Video", videoFilename, content, detection, videoError),
|
|
909
|
-
mimeType: detection.mimeType,
|
|
910
|
-
metadata: detection.metadata,
|
|
911
|
-
};
|
|
912
|
-
}
|
|
913
|
-
// Fallback if processor returned no data
|
|
914
|
-
return {
|
|
915
|
-
type: "video",
|
|
916
|
-
content: FileDetector.formatInformativePlaceholder("Video", videoFilename, content, detection),
|
|
917
|
-
mimeType: detection.mimeType,
|
|
918
|
-
metadata: detection.metadata,
|
|
919
|
-
};
|
|
920
|
-
}
|
|
921
|
-
/**
|
|
922
|
-
* Process audio file: extract metadata, tags, and cover art via AudioProcessor
|
|
923
|
-
*/
|
|
924
|
-
static async processAudioFile(content, detection) {
|
|
925
|
-
const audioFilename = detection.metadata.filename || "audio";
|
|
926
|
-
try {
|
|
927
|
-
const audioResult = await audioProcessor.processFile({
|
|
928
|
-
id: audioFilename,
|
|
929
|
-
name: audioFilename,
|
|
930
|
-
mimetype: detection.mimeType || "audio/mpeg",
|
|
931
|
-
size: content.length,
|
|
932
|
-
buffer: content,
|
|
933
|
-
});
|
|
934
|
-
if (audioResult.success && audioResult.data) {
|
|
935
|
-
return {
|
|
936
|
-
type: "audio",
|
|
937
|
-
content: audioResult.data.textContent ||
|
|
938
|
-
FileDetector.formatInformativePlaceholder("Audio", audioFilename, content, detection),
|
|
939
|
-
mimeType: detection.mimeType,
|
|
940
|
-
// Surface embedded cover art as an image content block
|
|
941
|
-
images: audioResult.data.coverArt
|
|
942
|
-
? [audioResult.data.coverArt]
|
|
943
|
-
: undefined,
|
|
944
|
-
metadata: detection.metadata,
|
|
945
|
-
};
|
|
946
|
-
}
|
|
947
|
-
}
|
|
948
|
-
catch (audioError) {
|
|
949
|
-
logger.warn(`[FileDetector] AudioProcessor failed for ${audioFilename}, using fallback`, audioError instanceof Error ? audioError.message : String(audioError));
|
|
950
|
-
return {
|
|
951
|
-
type: "audio",
|
|
952
|
-
content: FileDetector.formatInformativePlaceholder("Audio", audioFilename, content, detection, audioError),
|
|
953
|
-
mimeType: detection.mimeType,
|
|
954
|
-
metadata: detection.metadata,
|
|
955
|
-
};
|
|
956
|
-
}
|
|
957
|
-
// Fallback if processor returned no data
|
|
958
|
-
return {
|
|
959
|
-
type: "audio",
|
|
960
|
-
content: FileDetector.formatInformativePlaceholder("Audio", audioFilename, content, detection),
|
|
961
|
-
mimeType: detection.mimeType,
|
|
962
|
-
metadata: detection.metadata,
|
|
963
|
-
};
|
|
964
|
-
}
|
|
965
|
-
/**
|
|
966
|
-
* Process archive file: list contents and extract metadata via ArchiveProcessor
|
|
967
|
-
*/
|
|
968
|
-
static async processArchiveFile(content, detection) {
|
|
969
|
-
const archiveFilename = detection.metadata.filename || "archive";
|
|
970
|
-
try {
|
|
971
|
-
const archiveResult = await archiveProcessor.processFile({
|
|
972
|
-
id: archiveFilename,
|
|
973
|
-
name: archiveFilename,
|
|
974
|
-
mimetype: detection.mimeType || "application/zip",
|
|
975
|
-
size: content.length,
|
|
976
|
-
buffer: content,
|
|
977
|
-
});
|
|
978
|
-
if (archiveResult.success && archiveResult.data) {
|
|
979
|
-
return {
|
|
980
|
-
type: "archive",
|
|
981
|
-
content: archiveResult.data.textContent ||
|
|
982
|
-
FileDetector.formatInformativePlaceholder("Archive", archiveFilename, content, detection),
|
|
983
|
-
mimeType: detection.mimeType,
|
|
984
|
-
metadata: detection.metadata,
|
|
985
|
-
};
|
|
986
|
-
}
|
|
987
|
-
}
|
|
988
|
-
catch (archiveError) {
|
|
989
|
-
logger.warn(`[FileDetector] ArchiveProcessor failed for ${archiveFilename}, using fallback`, archiveError instanceof Error
|
|
990
|
-
? archiveError.message
|
|
991
|
-
: String(archiveError));
|
|
992
|
-
return {
|
|
993
|
-
type: "archive",
|
|
994
|
-
content: FileDetector.formatInformativePlaceholder("Archive", archiveFilename, content, detection, archiveError),
|
|
995
|
-
mimeType: detection.mimeType,
|
|
996
|
-
metadata: detection.metadata,
|
|
997
|
-
};
|
|
998
|
-
}
|
|
999
|
-
// Fallback if processor returned no data
|
|
1000
|
-
return {
|
|
1001
|
-
type: "archive",
|
|
1002
|
-
content: FileDetector.formatInformativePlaceholder("Archive", archiveFilename, content, detection),
|
|
1003
|
-
mimeType: detection.mimeType,
|
|
1004
|
-
metadata: detection.metadata,
|
|
1005
|
-
};
|
|
1006
|
-
}
|
|
1007
|
-
/**
|
|
1008
|
-
* Process Excel/OpenDocument spreadsheet file via ExcelProcessor or OpenDocumentProcessor
|
|
1009
|
-
*/
|
|
1010
|
-
static async processXlsxFile(content, detection) {
|
|
1011
|
-
const xlsxFilename = detection.metadata.filename || "spreadsheet";
|
|
1012
|
-
try {
|
|
1013
|
-
const ext = detection.extension?.toLowerCase();
|
|
1014
|
-
if (ext === "ods") {
|
|
1015
|
-
const { openDocumentProcessor } = await import("../processors/document/OpenDocumentProcessor.js");
|
|
1016
|
-
const odsResult = await openDocumentProcessor.processFile({
|
|
1017
|
-
id: xlsxFilename,
|
|
1018
|
-
name: xlsxFilename,
|
|
1019
|
-
mimetype: detection.mimeType ||
|
|
1020
|
-
"application/vnd.oasis.opendocument.spreadsheet",
|
|
1021
|
-
size: content.length,
|
|
1022
|
-
buffer: content,
|
|
1023
|
-
});
|
|
1024
|
-
if (odsResult.success && odsResult.data) {
|
|
1025
|
-
return {
|
|
1026
|
-
type: "xlsx",
|
|
1027
|
-
content: odsResult.data.textContent ||
|
|
1028
|
-
FileDetector.formatInformativePlaceholder("Spreadsheet", xlsxFilename, content, detection),
|
|
1029
|
-
mimeType: detection.mimeType,
|
|
1030
|
-
metadata: detection.metadata,
|
|
1031
|
-
};
|
|
1032
|
-
}
|
|
1033
|
-
}
|
|
1034
|
-
else {
|
|
1035
|
-
const { excelProcessor } = await import("../processors/document/ExcelProcessor.js");
|
|
1036
|
-
const xlsxResult = await excelProcessor.processFile({
|
|
1037
|
-
id: xlsxFilename,
|
|
1038
|
-
name: xlsxFilename,
|
|
1039
|
-
mimetype: detection.mimeType ||
|
|
1040
|
-
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
1041
|
-
size: content.length,
|
|
1042
|
-
buffer: content,
|
|
1043
|
-
});
|
|
1044
|
-
if (xlsxResult.success && xlsxResult.data) {
|
|
1045
|
-
// Build text content from worksheets
|
|
1046
|
-
const sheets = xlsxResult.data.worksheets || [];
|
|
1047
|
-
let textContent = `Spreadsheet: ${sheets.length} sheet(s), ${xlsxResult.data.totalRows} total rows\n`;
|
|
1048
|
-
for (const sheet of sheets) {
|
|
1049
|
-
textContent += `\n### Sheet: ${sheet.name}\n`;
|
|
1050
|
-
textContent += `Columns (${sheet.columnCount}): ${sheet.headers.join(", ")}\n`;
|
|
1051
|
-
textContent += `Rows: ${sheet.rowCount}\n`;
|
|
1052
|
-
// Include first rows as sample data
|
|
1053
|
-
const sampleRows = sheet.rows.slice(0, 20);
|
|
1054
|
-
const rowText = sampleRows
|
|
1055
|
-
.map((row) => row.map((c) => String(c ?? "")).join("\t"))
|
|
1056
|
-
.join("\n");
|
|
1057
|
-
if (!rowText) {
|
|
1058
|
-
continue;
|
|
1059
|
-
}
|
|
1060
|
-
textContent += `\nData:\n${sheet.headers.join("\t")}\n${rowText}\n`;
|
|
1061
|
-
const remaining = sheet.rowCount - 20;
|
|
1062
|
-
if (remaining > 0) {
|
|
1063
|
-
textContent += `... (${remaining} more rows)\n`;
|
|
1064
|
-
}
|
|
1065
|
-
}
|
|
1066
|
-
return {
|
|
1067
|
-
type: "xlsx",
|
|
1068
|
-
content: textContent,
|
|
1069
|
-
mimeType: detection.mimeType,
|
|
1070
|
-
metadata: detection.metadata,
|
|
1071
|
-
};
|
|
1072
|
-
}
|
|
1073
|
-
}
|
|
1074
|
-
}
|
|
1075
|
-
catch (xlsxError) {
|
|
1076
|
-
logger.warn(`[FileDetector] ExcelProcessor failed for ${xlsxFilename}, using fallback`, xlsxError instanceof Error ? xlsxError.message : String(xlsxError));
|
|
1077
|
-
return {
|
|
1078
|
-
type: "xlsx",
|
|
1079
|
-
content: FileDetector.formatInformativePlaceholder("Spreadsheet", xlsxFilename, content, detection, xlsxError),
|
|
1080
|
-
mimeType: detection.mimeType,
|
|
1081
|
-
metadata: detection.metadata,
|
|
1082
|
-
};
|
|
1083
|
-
}
|
|
1084
|
-
// Fallback if processor returned no data
|
|
1085
|
-
return {
|
|
1086
|
-
type: "xlsx",
|
|
1087
|
-
content: FileDetector.formatInformativePlaceholder("Spreadsheet", xlsxFilename, content, detection),
|
|
1088
|
-
mimeType: detection.mimeType,
|
|
1089
|
-
metadata: detection.metadata,
|
|
1090
|
-
};
|
|
1091
|
-
}
|
|
1092
|
-
/**
|
|
1093
|
-
* Process Word/OpenDocument/RTF document via WordProcessor, OpenDocumentProcessor, or RtfProcessor
|
|
1094
|
-
*/
|
|
1095
|
-
static async processDocxFile(content, detection) {
|
|
1096
|
-
const docxFilename = detection.metadata.filename || "document";
|
|
1097
|
-
const ext = detection.extension?.toLowerCase();
|
|
1098
|
-
try {
|
|
1099
|
-
if (ext === "odt") {
|
|
1100
|
-
const { openDocumentProcessor } = await import("../processors/document/OpenDocumentProcessor.js");
|
|
1101
|
-
const odtResult = await openDocumentProcessor.processFile({
|
|
1102
|
-
id: docxFilename,
|
|
1103
|
-
name: docxFilename,
|
|
1104
|
-
mimetype: detection.mimeType || "application/vnd.oasis.opendocument.text",
|
|
1105
|
-
size: content.length,
|
|
1106
|
-
buffer: content,
|
|
1107
|
-
});
|
|
1108
|
-
if (odtResult.success && odtResult.data) {
|
|
1109
|
-
return {
|
|
1110
|
-
type: "docx",
|
|
1111
|
-
content: odtResult.data.textContent ||
|
|
1112
|
-
FileDetector.formatInformativePlaceholder("Document", docxFilename, content, detection),
|
|
1113
|
-
mimeType: detection.mimeType,
|
|
1114
|
-
metadata: detection.metadata,
|
|
1115
|
-
};
|
|
1116
|
-
}
|
|
1117
|
-
}
|
|
1118
|
-
else if (ext === "rtf") {
|
|
1119
|
-
const { rtfProcessor } = await import("../processors/document/RtfProcessor.js");
|
|
1120
|
-
const rtfResult = await rtfProcessor.processFile({
|
|
1121
|
-
id: docxFilename,
|
|
1122
|
-
name: docxFilename,
|
|
1123
|
-
mimetype: detection.mimeType || "application/rtf",
|
|
1124
|
-
size: content.length,
|
|
1125
|
-
buffer: content,
|
|
1126
|
-
});
|
|
1127
|
-
if (rtfResult.success && rtfResult.data) {
|
|
1128
|
-
return {
|
|
1129
|
-
type: "docx",
|
|
1130
|
-
content: rtfResult.data.textContent ||
|
|
1131
|
-
FileDetector.formatInformativePlaceholder("Document", docxFilename, content, detection),
|
|
1132
|
-
mimeType: detection.mimeType,
|
|
1133
|
-
metadata: detection.metadata,
|
|
1134
|
-
};
|
|
1135
|
-
}
|
|
1136
|
-
}
|
|
1137
|
-
else {
|
|
1138
|
-
const { wordProcessor } = await import("../processors/document/WordProcessor.js");
|
|
1139
|
-
const docxResult = await wordProcessor.processFile({
|
|
1140
|
-
id: docxFilename,
|
|
1141
|
-
name: docxFilename,
|
|
1142
|
-
mimetype: detection.mimeType ||
|
|
1143
|
-
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
1144
|
-
size: content.length,
|
|
1145
|
-
buffer: content,
|
|
1146
|
-
});
|
|
1147
|
-
if (docxResult.success && docxResult.data) {
|
|
1148
|
-
return {
|
|
1149
|
-
type: "docx",
|
|
1150
|
-
content: docxResult.data.textContent ||
|
|
1151
|
-
FileDetector.formatInformativePlaceholder("Document", docxFilename, content, detection),
|
|
1152
|
-
mimeType: detection.mimeType,
|
|
1153
|
-
metadata: detection.metadata,
|
|
1154
|
-
};
|
|
1155
|
-
}
|
|
1156
|
-
}
|
|
1157
|
-
}
|
|
1158
|
-
catch (docxError) {
|
|
1159
|
-
logger.warn(`[FileDetector] Document processor failed for ${docxFilename}, using fallback`, docxError instanceof Error ? docxError.message : String(docxError));
|
|
1160
|
-
return {
|
|
1161
|
-
type: "docx",
|
|
1162
|
-
content: FileDetector.formatInformativePlaceholder("Document", docxFilename, content, detection, docxError),
|
|
1163
|
-
mimeType: detection.mimeType,
|
|
1164
|
-
metadata: detection.metadata,
|
|
1165
|
-
};
|
|
1166
|
-
}
|
|
1167
|
-
// Fallback if processor returned no data
|
|
1168
|
-
return {
|
|
1169
|
-
type: "docx",
|
|
1170
|
-
content: FileDetector.formatInformativePlaceholder("Document", docxFilename, content, detection),
|
|
1171
|
-
mimeType: detection.mimeType,
|
|
1172
|
-
metadata: detection.metadata,
|
|
1173
|
-
};
|
|
1174
|
-
}
|
|
1175
|
-
/**
|
|
1176
|
-
* Process PowerPoint/OpenDocument presentation via PptxProcessor
|
|
1177
|
-
*/
|
|
1178
|
-
static async processPptxFile(content, detection) {
|
|
1179
|
-
const pptxFilename = detection.metadata.filename || "presentation";
|
|
1180
|
-
try {
|
|
1181
|
-
const { PptxProcessor } = await import("../processors/document/PptxProcessor.js");
|
|
1182
|
-
const pptxResult = await PptxProcessor.extractText(content);
|
|
1183
|
-
if (pptxResult) {
|
|
1184
|
-
return {
|
|
1185
|
-
type: "pptx",
|
|
1186
|
-
content: pptxResult,
|
|
1187
|
-
mimeType: detection.mimeType,
|
|
1188
|
-
metadata: detection.metadata,
|
|
1189
|
-
};
|
|
1190
|
-
}
|
|
1191
|
-
}
|
|
1192
|
-
catch (pptxError) {
|
|
1193
|
-
logger.warn(`[FileDetector] PptxProcessor failed for ${pptxFilename}, using fallback`, pptxError instanceof Error ? pptxError.message : String(pptxError));
|
|
1194
|
-
return {
|
|
1195
|
-
type: "pptx",
|
|
1196
|
-
content: FileDetector.formatInformativePlaceholder("Presentation", pptxFilename, content, detection, pptxError),
|
|
1197
|
-
mimeType: detection.mimeType,
|
|
1198
|
-
metadata: detection.metadata,
|
|
1199
|
-
};
|
|
1200
|
-
}
|
|
1201
|
-
// Fallback if processor returned no content
|
|
1202
|
-
return {
|
|
1203
|
-
type: "pptx",
|
|
1204
|
-
content: FileDetector.formatInformativePlaceholder("Presentation", pptxFilename, content, detection),
|
|
1205
|
-
mimeType: detection.mimeType,
|
|
1206
|
-
metadata: detection.metadata,
|
|
1207
|
-
};
|
|
1208
|
-
}
|
|
1209
|
-
/**
|
|
1210
|
-
* Process SVG file as text content
|
|
1211
|
-
* Uses SvgProcessor for security sanitization (removes XSS vectors)
|
|
1212
|
-
* Returns sanitized SVG markup as text for AI analysis
|
|
1213
|
-
*/
|
|
1214
|
-
static async processSvgAsText(content, detection) {
|
|
1215
|
-
try {
|
|
1216
|
-
// Dynamic import to avoid circular dependencies
|
|
1217
|
-
const { processSvg } = await import("../processors/markup/SvgProcessor.js");
|
|
1218
|
-
const result = await processSvg({
|
|
1219
|
-
id: "svg-file",
|
|
1220
|
-
name: detection.metadata.filename || "image.svg",
|
|
1221
|
-
mimetype: "image/svg+xml",
|
|
1222
|
-
size: content.length,
|
|
1223
|
-
buffer: content,
|
|
1224
|
-
});
|
|
1225
|
-
if (result.success && result.data) {
|
|
1226
|
-
logger.info(`[FileDetector] SVG processed as text: ${detection.metadata.filename || "image.svg"}`);
|
|
1227
|
-
return {
|
|
1228
|
-
type: "svg",
|
|
1229
|
-
content: result.data.textContent, // Sanitized SVG content
|
|
1230
|
-
mimeType: "image/svg+xml",
|
|
1231
|
-
metadata: {
|
|
1232
|
-
confidence: detection.metadata.confidence,
|
|
1233
|
-
size: content.length,
|
|
1234
|
-
filename: detection.metadata.filename,
|
|
1235
|
-
extension: detection.extension,
|
|
1236
|
-
},
|
|
1237
|
-
};
|
|
1238
|
-
}
|
|
1239
|
-
else {
|
|
1240
|
-
// Fail closed: return safe empty SVG instead of raw unsanitized content
|
|
1241
|
-
logger.warn(`[FileDetector] SVG processor failed, returning safe empty SVG: ${result.error?.userMessage}`);
|
|
1242
|
-
return {
|
|
1243
|
-
type: "svg",
|
|
1244
|
-
content: '<svg xmlns="http://www.w3.org/2000/svg"></svg>',
|
|
1245
|
-
mimeType: "image/svg+xml",
|
|
1246
|
-
metadata: {
|
|
1247
|
-
confidence: detection.metadata.confidence,
|
|
1248
|
-
size: content.length,
|
|
1249
|
-
filename: detection.metadata.filename,
|
|
1250
|
-
extension: detection.extension,
|
|
1251
|
-
},
|
|
1252
|
-
};
|
|
1253
|
-
}
|
|
1254
|
-
}
|
|
1255
|
-
catch (error) {
|
|
1256
|
-
// Fail closed: return safe empty SVG instead of raw unsanitized content
|
|
1257
|
-
logger.warn(`[FileDetector] SVG processor not available, returning safe empty SVG: ${error instanceof Error ? error.message : String(error)}`);
|
|
1258
|
-
return {
|
|
1259
|
-
type: "svg",
|
|
1260
|
-
content: '<svg xmlns="http://www.w3.org/2000/svg"></svg>',
|
|
1261
|
-
mimeType: "image/svg+xml",
|
|
1262
|
-
metadata: {
|
|
1263
|
-
confidence: detection.metadata.confidence,
|
|
1264
|
-
size: content.length,
|
|
1265
|
-
filename: detection.metadata.filename,
|
|
1266
|
-
extension: detection.extension,
|
|
1267
|
-
},
|
|
1268
|
-
};
|
|
1269
|
-
}
|
|
1270
|
-
}
|
|
1271
|
-
/**
|
|
1272
|
-
* Load file from URL with automatic retry on transient network errors
|
|
1273
|
-
*/
|
|
1274
|
-
static async loadFromURL(url, options) {
|
|
1275
|
-
const maxSize = options?.maxSize || 200 * 1024 * 1024; // 200MB default (matches Curator memory-safety cap)
|
|
1276
|
-
const timeout = options?.timeout || FileDetector.DEFAULT_NETWORK_TIMEOUT;
|
|
1277
|
-
const maxRetries = options?.maxRetries ?? DEFAULT_MAX_RETRIES;
|
|
1278
|
-
const retryDelay = options?.retryDelay ?? DEFAULT_RETRY_DELAY;
|
|
1279
|
-
return withRetry(async () => {
|
|
1280
|
-
const response = await request(url, {
|
|
1281
|
-
dispatcher: getGlobalDispatcher().compose(interceptors.redirect({ maxRedirections: 5 })),
|
|
1282
|
-
method: "GET",
|
|
1283
|
-
headersTimeout: timeout,
|
|
1284
|
-
bodyTimeout: timeout,
|
|
1285
|
-
});
|
|
1286
|
-
if (response.statusCode !== 200) {
|
|
1287
|
-
throw new Error(`HTTP ${response.statusCode}`);
|
|
1288
|
-
}
|
|
1289
|
-
const chunks = [];
|
|
1290
|
-
let totalSize = 0;
|
|
1291
|
-
for await (const chunk of response.body) {
|
|
1292
|
-
totalSize += chunk.length;
|
|
1293
|
-
if (totalSize > maxSize) {
|
|
1294
|
-
throw new Error(`File too large: ${formatFileSize(totalSize)} (max: ${formatFileSize(maxSize)})`);
|
|
1295
|
-
}
|
|
1296
|
-
chunks.push(chunk);
|
|
1297
|
-
}
|
|
1298
|
-
return Buffer.concat(chunks);
|
|
1299
|
-
}, { maxRetries, retryDelay });
|
|
1300
|
-
}
|
|
1301
|
-
/**
|
|
1302
|
-
* Load file from filesystem path
|
|
1303
|
-
*/
|
|
1304
|
-
static async loadFromPath(path, options) {
|
|
1305
|
-
const maxSize = options?.maxSize || 200 * 1024 * 1024; // 200MB default (matches Curator memory-safety cap)
|
|
1306
|
-
const statInfo = await stat(path);
|
|
1307
|
-
if (!statInfo.isFile()) {
|
|
1308
|
-
throw new Error("Not a file");
|
|
1309
|
-
}
|
|
1310
|
-
if (statInfo.size > maxSize) {
|
|
1311
|
-
throw new Error(`File too large: ${formatFileSize(statInfo.size)} (max: ${formatFileSize(maxSize)})`);
|
|
1312
|
-
}
|
|
1313
|
-
return await readFile(path);
|
|
1314
|
-
}
|
|
1315
|
-
/**
|
|
1316
|
-
* Load file from data URI
|
|
1317
|
-
*/
|
|
1318
|
-
static loadFromDataURI(dataUri) {
|
|
1319
|
-
const match = dataUri.match(/^data:([^;]+);base64,(.+)$/);
|
|
1320
|
-
if (!match) {
|
|
1321
|
-
throw new Error("Invalid data URI format");
|
|
1322
|
-
}
|
|
1323
|
-
return Buffer.from(match[2], "base64");
|
|
1324
|
-
}
|
|
1325
|
-
}
|
|
1326
|
-
/**
 * Strategy 1: Magic Bytes Detection
 *
 * Identifies a file type by comparing the leading bytes of a Buffer against
 * known format signatures. Confidence is assigned per signature (70-95);
 * non-Buffer input and unrecognized content produce the `unknown()` result
 * with confidence 0.
 */
class MagicBytesStrategy {
    /**
     * Classify a buffer by its leading bytes.
     *
     * Checks run in a fixed order and the first match wins.
     *
     * @param {*} input - Anything; only Buffers are inspected.
     * @returns {Promise<object>} Detection record from `result()` or `unknown()`.
     */
    async detect(input) {
        if (!Buffer.isBuffer(input)) {
            return this.unknown();
        }
        const hit = (offset, ...bytes) => this.matchesAt(input, offset, bytes);
        if (this.isPNG(input)) {
            return this.result("image", "image/png", 95);
        }
        if (this.isJPEG(input)) {
            return this.result("image", "image/jpeg", 95);
        }
        if (this.isGIF(input)) {
            return this.result("image", "image/gif", 95);
        }
        if (this.isWebP(input)) {
            return this.result("image", "image/webp", 95);
        }
        if (this.isPDF(input)) {
            return this.result("pdf", "application/pdf", 95);
        }
        // ISO base media container (MP4/MOV/M4A/AVIF...): "ftyp" at offset 4.
        // The container alone does not say what is inside, so the major brand
        // decides between video, audio and image.
        if (hit(4, 0x66, 0x74, 0x79, 0x70)) {
            return this.detectIsoMedia(input);
        }
        // MKV/WebM: EBML header
        if (hit(0, 0x1a, 0x45, 0xdf, 0xa3)) {
            return this.result("video", "video/x-matroska", 90);
        }
        // AVI: "RIFF" + "AVI "
        if (hit(0, 0x52, 0x49, 0x46, 0x46) && hit(8, 0x41, 0x56, 0x49, 0x20)) {
            return this.result("video", "video/x-msvideo", 95);
        }
        // WAV: "RIFF" + "WAVE"
        if (hit(0, 0x52, 0x49, 0x46, 0x46) && hit(8, 0x57, 0x41, 0x56, 0x45)) {
            return this.result("audio", "audio/wav", 95);
        }
        // MP3: ID3 tag
        if (hit(0, 0x49, 0x44, 0x33)) {
            return this.result("audio", "audio/mpeg", 95);
        }
        // MP3: frame sync word (0xFF followed by three set bits); a weak
        // two-byte signal, hence the lower confidence.
        if (input.length >= 2 && input[0] === 0xff && (input[1] & 0xe0) === 0xe0) {
            return this.result("audio", "audio/mpeg", 80);
        }
        // FLAC: "fLaC"
        if (hit(0, 0x66, 0x4c, 0x61, 0x43)) {
            return this.result("audio", "audio/flac", 95);
        }
        // OGG: "OggS"
        if (hit(0, 0x4f, 0x67, 0x67, 0x53)) {
            return this.result("audio", "audio/ogg", 90);
        }
        // ZIP: "PK\x03\x04"
        // NOTE: Many document formats (OOXML: .xlsx, .docx, .pptx; ODF: .odt, .ods)
        // are internally ZIP archives and share these magic bytes. We return a lower
        // confidence (70%) so the ExtensionStrategy (85%) can override with the correct
        // document type when a file path with extension is available. For raw buffers
        // without path info, this falls through to archive as a safe default.
        if (hit(0, 0x50, 0x4b, 0x03, 0x04)) {
            return this.result("archive", "application/zip", 70);
        }
        // GZIP: 1F 8B
        if (hit(0, 0x1f, 0x8b)) {
            return this.result("archive", "application/gzip", 90);
        }
        // RAR: "Rar!"
        if (hit(0, 0x52, 0x61, 0x72, 0x21)) {
            return this.result("archive", "application/x-rar-compressed", 95);
        }
        return this.unknown();
    }
    /**
     * Classify an ISO base media file by its major brand (bytes 8-11).
     *
     * AVIF brands map to images and M4A/M4B brands to audio; every other
     * brand, or a buffer too short to hold one, keeps the `video/mp4` default.
     *
     * @param {Buffer} buf - Buffer already known to carry "ftyp" at offset 4.
     * @returns {object} Detection record.
     */
    detectIsoMedia(buf) {
        const brand = buf.length >= 12 ? buf.toString("latin1", 8, 12) : "";
        if (brand === "avif" || brand === "avis") {
            return this.result("image", "image/avif", 95);
        }
        if (brand === "M4A " || brand === "M4B ") {
            return this.result("audio", "audio/mp4", 95);
        }
        return this.result("video", "video/mp4", 95);
    }
    /**
     * Whether `buf` holds exactly `bytes` starting at `offset`.
     * Buffers too short to contain the whole signature never match.
     *
     * @param {Buffer} buf
     * @param {number} offset
     * @param {number[]} bytes
     * @returns {boolean}
     */
    matchesAt(buf, offset, bytes) {
        if (buf.length < offset + bytes.length) {
            return false;
        }
        return bytes.every((byte, i) => buf[offset + i] === byte);
    }
    /** @param {Buffer} buf @returns {boolean} True for the "\x89PNG" prefix. */
    isPNG(buf) {
        return this.matchesAt(buf, 0, [0x89, 0x50, 0x4e, 0x47]);
    }
    /** @param {Buffer} buf @returns {boolean} True for the FF D8 FF prefix. */
    isJPEG(buf) {
        return this.matchesAt(buf, 0, [0xff, 0xd8, 0xff]);
    }
    /** @param {Buffer} buf @returns {boolean} True for the "GIF8" prefix. */
    isGIF(buf) {
        return this.matchesAt(buf, 0, [0x47, 0x49, 0x46, 0x38]);
    }
    /** @param {Buffer} buf @returns {boolean} True for "RIFF" at 0 plus "WEBP" at 8. */
    isWebP(buf) {
        return (this.matchesAt(buf, 0, [0x52, 0x49, 0x46, 0x46]) &&
            this.matchesAt(buf, 8, [0x57, 0x45, 0x42, 0x50]));
    }
    /** @param {Buffer} buf @returns {boolean} True for the "%PDF-" prefix. */
    isPDF(buf) {
        return this.matchesAt(buf, 0, [0x25, 0x50, 0x44, 0x46, 0x2d]);
    }
    /**
     * Build a positive detection record.
     *
     * @param {string} type - File-type label, e.g. "image".
     * @param {string} mime - MIME type to report.
     * @param {number} confidence - Confidence score stored in the metadata.
     * @returns {object}
     */
    result(type, mime, confidence) {
        return {
            type,
            mimeType: mime,
            extension: null,
            source: "buffer",
            metadata: { confidence },
        };
    }
    /** @returns {object} The "nothing recognized" record (confidence 0). */
    unknown() {
        return {
            type: "unknown",
            mimeType: "application/octet-stream",
            extension: null,
            source: "buffer",
            metadata: { confidence: 0 },
        };
    }
}
|
|
1488
|
-
/**
 * Strategy 2: MIME Type Detection (85% confidence)
 *
 * For http(s) URLs, issues a HEAD request and maps the `content-type`
 * response header to a file-type label. Any other input, and any request
 * failure, yields the `unknown()` record.
 */
class MimeTypeStrategy {
    /**
     * @param {*} input - Candidate URL; non-strings and non-http(s) strings are not probed.
     * @returns {Promise<object>} Detection record; confidence is 85 for a
     *   recognized content type and 0 otherwise.
     */
    async detect(input) {
        const probeable = typeof input === "string" && this.isURL(input);
        if (!probeable) {
            return this.unknown();
        }
        try {
            const headTimeout = {
                headersTimeout: FileDetector.DEFAULT_HEAD_TIMEOUT,
                bodyTimeout: FileDetector.DEFAULT_HEAD_TIMEOUT,
            };
            const { headers } = await request(input, {
                dispatcher: getGlobalDispatcher().compose(interceptors.redirect({ maxRedirections: 5 })),
                method: "HEAD",
                ...headTimeout,
            });
            const contentType = headers["content-type"] || "";
            const type = this.mimeToFileType(contentType);
            // Report the bare media type, without parameters such as charset.
            const [bareType] = contentType.split(";");
            const confidence = type === "unknown" ? 0 : 85;
            return {
                type,
                mimeType: bareType.trim(),
                extension: null,
                source: "url",
                metadata: { confidence },
            };
        }
        catch {
            // Detection is best-effort: a failed probe means "unknown".
            return this.unknown();
        }
    }
    /**
     * Map a Content-Type value to a file-type label.
     *
     * Parameters after `;` are ignored and matching is case-insensitive.
     * Exact media types are resolved first; the `image/`, `video/`, `audio/`
     * and `text/` families act as fallbacks for anything not listed.
     *
     * @param {string} mime - Raw Content-Type header value.
     * @returns {string} One of "csv", "svg", "image", "pdf", "video", "audio",
     *   "docx", "xlsx", "pptx", "archive", "text" or "unknown".
     */
    mimeToFileType(mime) {
        const [head] = mime.toLowerCase().split(";");
        const lower = head.trim();
        switch (lower) {
            case "text/csv":
            case "text/tab-separated-values":
                return "csv";
            // SVG is processed as text/markup, NOT as image, so it must be
            // resolved before the generic image/ family below.
            case "image/svg+xml":
                return "svg";
            case "application/pdf":
                return "pdf";
            // Word-processing documents: OOXML, legacy Word, OpenDocument, RTF.
            case "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            case "application/msword":
            case "application/vnd.oasis.opendocument.text":
            case "application/rtf":
            case "text/rtf":
                return "docx";
            // Spreadsheets: OOXML, legacy Excel, OpenDocument.
            case "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
            case "application/vnd.ms-excel":
            case "application/vnd.oasis.opendocument.spreadsheet":
                return "xlsx";
            // Presentations: OOXML, legacy PowerPoint, OpenDocument.
            case "application/vnd.openxmlformats-officedocument.presentationml.presentation":
            case "application/vnd.ms-powerpoint":
            case "application/vnd.oasis.opendocument.presentation":
                return "pptx";
            case "application/zip":
            case "application/x-zip-compressed":
            case "application/gzip":
            case "application/x-gzip":
            case "application/x-tar":
            case "application/x-compressed-tar":
            case "application/java-archive":
            case "application/x-rar-compressed":
            case "application/vnd.rar":
            case "application/x-7z-compressed":
                return "archive";
            // Structured-text types that live outside the text/ family.
            case "application/json":
            case "application/xml":
            case "application/yaml":
            case "application/x-yaml":
                return "text";
            default:
                break;
        }
        // Family fallbacks for media types not listed above.
        const families = [
            ["image/", "image"],
            ["video/", "video"],
            ["audio/", "audio"],
            ["text/", "text"],
        ];
        for (const [prefix, label] of families) {
            if (lower.startsWith(prefix)) {
                return label;
            }
        }
        return "unknown";
    }
    /**
     * @param {string} str
     * @returns {boolean} True when `str` begins with `http://` or `https://`.
     */
    isURL(str) {
        return ["http://", "https://"].some((scheme) => str.startsWith(scheme));
    }
    /** @returns {object} The "nothing recognized" record (confidence 0). */
    unknown() {
        return {
            type: "unknown",
            mimeType: "application/octet-stream",
            extension: null,
            source: "buffer",
            metadata: { confidence: 0 },
        };
    }
}
|
|
1625
|
-
/**
 * Strategy 3: Extension Detection
 * Detects file type from the file extension of a path or http(s) URL.
 * A recognised extension is reported with confidence 85; anything else
 * yields type "unknown" with confidence 0.
 */
class ExtensionStrategy {
    /**
     * Classifies a string input by its file extension.
     * @param {*} input - File path, http(s) URL or data URI. Non-string input is not inspected.
     * @returns {Promise<{type: string, mimeType: string, extension: (string|null), source: string, metadata: {confidence: number}}>}
     *   The detection result; the `unknown()` fallback when no extension can be extracted.
     */
    async detect(input) {
        if (typeof input !== "string") {
            return this.unknown();
        }
        const ext = this.getExtension(input);
        if (!ext) {
            return this.unknown();
        }
        const typeMap = {
            csv: "csv",
            tsv: "csv",
            jpg: "image",
            jpeg: "image",
            png: "image",
            gif: "image",
            webp: "image",
            bmp: "image",
            tiff: "image",
            tif: "image",
            // SVG is handled as text/markup, NOT as image
            // AI providers don't support SVG format, so we process it as sanitized text
            svg: "svg",
            avif: "image",
            pdf: "pdf",
            // Video formats
            mp4: "video",
            mkv: "video",
            mov: "video",
            avi: "video",
            webm: "video",
            wmv: "video",
            flv: "video",
            // Audio formats
            mp3: "audio",
            wav: "audio",
            ogg: "audio",
            flac: "audio",
            m4a: "audio",
            aac: "audio",
            wma: "audio",
            opus: "audio",
            // Archive formats
            zip: "archive",
            tar: "archive",
            gz: "archive",
            tgz: "archive",
            rar: "archive",
            "7z": "archive",
            jar: "archive",
            // Document formats (ZIP-based internally)
            xlsx: "xlsx",
            xls: "xlsx",
            docx: "docx",
            doc: "docx",
            pptx: "pptx",
            ppt: "pptx",
            odt: "docx", // OpenDocument text → processed like docx
            ods: "xlsx", // OpenDocument spreadsheet → processed like xlsx
            odp: "pptx", // OpenDocument presentation → processed like pptx
            rtf: "docx", // RTF → processed like docx (text extraction)
            // Text/markup formats
            txt: "text",
            md: "text",
            markdown: "text",
            json: "text",
            xml: "text",
            yaml: "text",
            yml: "text",
            html: "text",
            htm: "text",
            css: "text",
            log: "text",
            conf: "text",
            cfg: "text",
            ini: "text",
            env: "text",
            toml: "text",
            properties: "text",
            gitignore: "text",
            dockerignore: "text",
            editorconfig: "text",
            prettierrc: "text",
            eslintrc: "text",
            babelrc: "text",
            // Source code formats
            js: "text",
            mjs: "text",
            cjs: "text",
            jsx: "text",
            ts: "text",
            tsx: "text",
            py: "text",
            java: "text",
            go: "text",
            rs: "text",
            rb: "text",
            php: "text",
            c: "text",
            cpp: "text",
            cc: "text",
            h: "text",
            hpp: "text",
            cs: "text",
            swift: "text",
            kt: "text",
            kts: "text",
            scala: "text",
            sh: "text",
            bash: "text",
            zsh: "text",
            ps1: "text",
            sql: "text",
            r: "text",
            lua: "text",
            pl: "text",
            perl: "text",
            dart: "text",
            ex: "text",
            exs: "text",
            erl: "text",
            hs: "text",
            clj: "text",
            lisp: "text",
            vim: "text",
            // Additional video/image
            m4v: "video",
            ico: "image",
        };
        const key = ext.toLowerCase();
        // Own-property lookup only: a plain `typeMap[key]` would resolve inherited
        // members such as "constructor" or "__proto__" and report them as a type.
        const type = Object.prototype.hasOwnProperty.call(typeMap, key)
            ? typeMap[key]
            : undefined;
        return {
            type: type || "unknown",
            mimeType: this.getMimeType(ext),
            extension: ext,
            source: this.detectSource(input),
            metadata: { confidence: type ? 85 : 0 },
        };
    }

    /**
     * Extracts the text after the last dot of the final path segment.
     * For http(s) URLs only the pathname is inspected, so query strings and
     * fragments are ignored.
     * @param {string} input - File path or URL.
     * @returns {string|null} The extension in its original case, or null when
     *   there is none or the URL cannot be parsed.
     */
    getExtension(input) {
        let target = input;
        if (this.isURL(input)) {
            try {
                target = new URL(input).pathname;
            }
            catch {
                // Malformed URL (e.g. "http://"): no extension can be derived.
                return null;
            }
        }
        // Separators are excluded so a dot in a directory name ("app.v2/README")
        // is not mistaken for the start of an extension.
        const match = target.match(/\.([^./\\]+)$/);
        return match ? match[1] : null;
    }

    /**
     * @param {string} str - Candidate input.
     * @returns {boolean} True when the string starts with "http://" or "https://".
     */
    isURL(str) {
        return str.startsWith("http://") || str.startsWith("https://");
    }

    /**
     * Labels where the input string points.
     * @param {string} input - The string passed to `detect`.
     * @returns {string} "datauri", "url" or "path".
     */
    detectSource(input) {
        if (input.startsWith("data:")) {
            return "datauri";
        }
        if (this.isURL(input)) {
            return "url";
        }
        return "path";
    }

    /**
     * Maps an extension (case-insensitive) to a MIME type.
     * @param {string} ext - Extension without the leading dot.
     * @returns {string} The MIME type, or "application/octet-stream" when unmapped.
     */
    getMimeType(ext) {
        const mimeMap = {
            csv: "text/csv",
            tsv: "text/tab-separated-values",
            jpg: "image/jpeg",
            jpeg: "image/jpeg",
            png: "image/png",
            gif: "image/gif",
            webp: "image/webp",
            bmp: "image/bmp",
            tiff: "image/tiff",
            tif: "image/tiff",
            svg: "image/svg+xml",
            avif: "image/avif",
            pdf: "application/pdf",
            // Video MIME types
            mp4: "video/mp4",
            mkv: "video/x-matroska",
            mov: "video/quicktime",
            avi: "video/x-msvideo",
            webm: "video/webm",
            wmv: "video/x-ms-wmv",
            flv: "video/x-flv",
            // Audio MIME types
            mp3: "audio/mpeg",
            wav: "audio/wav",
            ogg: "audio/ogg",
            flac: "audio/flac",
            m4a: "audio/mp4",
            aac: "audio/aac",
            wma: "audio/x-ms-wma",
            opus: "audio/opus",
            // Archive MIME types
            zip: "application/zip",
            tar: "application/x-tar",
            gz: "application/gzip",
            tgz: "application/gzip",
            rar: "application/x-rar-compressed",
            "7z": "application/x-7z-compressed",
            jar: "application/java-archive",
            // Document MIME types
            xlsx: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            xls: "application/vnd.ms-excel",
            docx: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            doc: "application/msword",
            pptx: "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            ppt: "application/vnd.ms-powerpoint",
            odt: "application/vnd.oasis.opendocument.text",
            ods: "application/vnd.oasis.opendocument.spreadsheet",
            odp: "application/vnd.oasis.opendocument.presentation",
            rtf: "application/rtf",
            // Text/markup MIME types
            txt: "text/plain",
            md: "text/markdown",
            markdown: "text/markdown",
            json: "application/json",
            xml: "application/xml",
            yaml: "application/yaml",
            yml: "application/yaml",
            html: "text/html",
            htm: "text/html",
            css: "text/css",
            log: "text/plain",
            conf: "text/plain",
            cfg: "text/plain",
            ini: "text/plain",
            env: "text/plain",
            toml: "text/plain",
            properties: "text/plain",
            gitignore: "text/plain",
            dockerignore: "text/plain",
            editorconfig: "text/plain",
            prettierrc: "application/json",
            eslintrc: "application/json",
            babelrc: "application/json",
            // Source code MIME types
            js: "text/javascript",
            mjs: "text/javascript",
            cjs: "text/javascript",
            jsx: "text/javascript",
            ts: "text/typescript",
            tsx: "text/typescript",
            py: "text/x-python",
            java: "text/x-java-source",
            go: "text/x-go",
            rs: "text/x-rustsrc",
            rb: "text/x-ruby",
            php: "text/x-php",
            c: "text/x-c",
            cpp: "text/x-c++",
            cc: "text/x-c++",
            h: "text/x-c",
            hpp: "text/x-c++",
            cs: "text/x-csharp",
            swift: "text/x-swift",
            kt: "text/x-kotlin",
            kts: "text/x-kotlin",
            scala: "text/x-scala",
            sh: "text/x-shellscript",
            bash: "text/x-shellscript",
            zsh: "text/x-shellscript",
            ps1: "text/x-powershell",
            sql: "text/x-sql",
            r: "text/x-r",
            lua: "text/x-lua",
            pl: "text/x-perl",
            perl: "text/x-perl",
            dart: "text/x-dart",
            ex: "text/x-elixir",
            exs: "text/x-elixir",
            erl: "text/x-erlang",
            hs: "text/x-haskell",
            clj: "text/x-clojure",
            lisp: "text/x-lisp",
            vim: "text/plain",
            // Additional video/image
            m4v: "video/mp4",
            ico: "image/x-icon",
        };
        const key = ext.toLowerCase();
        // Own-property lookup only, so inherited Object.prototype members are
        // never returned as a MIME type.
        return Object.prototype.hasOwnProperty.call(mimeMap, key)
            ? mimeMap[key]
            : "application/octet-stream";
    }

    /**
     * @returns {{type: string, mimeType: string, extension: null, source: string, metadata: {confidence: number}}}
     *   The zero-confidence fallback result.
     */
    unknown() {
        return {
            type: "unknown",
            mimeType: "application/octet-stream",
            extension: null,
            source: "buffer",
            metadata: { confidence: 0 },
        };
    }
}
|
|
1919
|
-
/**
 * Strategy 4: Content Heuristics
 * Detects file type by analyzing content patterns in the first 2000 bytes
 * of the input. Reported confidence depends on the match: 75 for JSON and
 * CSV, 70 for XML/HTML and YAML, 60 for plain text, 0 when nothing matches.
 */
class ContentHeuristicStrategy {
    /**
     * Classifies a Buffer, base64 data URI or file path by its content.
     * http(s) URLs are not fetched and yield the unknown result, as do
     * unreadable files, non-base64 data URIs and non-string/non-Buffer input.
     * @param {*} input - Buffer, data URI, URL or file path.
     * @returns {Promise<{type: string, mimeType: string, extension: null, source: string, metadata: {confidence: number}}>}
     */
    async detect(input) {
        let buffer;
        if (Buffer.isBuffer(input)) {
            buffer = input;
        }
        else if (typeof input === "string") {
            if (input.startsWith("data:")) {
                // Data URI: only the base64 form is supported.
                const match = input.match(/^data:([^;]+);base64,(.+)$/);
                if (!match) {
                    return this.unknown();
                }
                buffer = Buffer.from(match[2], "base64");
            }
            else if (input.startsWith("http://") || input.startsWith("https://")) {
                // URL - can't analyze without making HTTP request in ContentHeuristic
                return this.unknown();
            }
            else {
                // File path - best effort: an unreadable path is simply "unknown".
                try {
                    buffer = await readFile(input);
                }
                catch {
                    return this.unknown();
                }
            }
        }
        else {
            return this.unknown();
        }
        const sample = buffer.toString("utf-8", 0, Math.min(2000, buffer.length));
        // Check for JSON first (more specific than CSV)
        if (this.looksLikeJSON(sample)) {
            return this.result("text", "application/json", 75);
        }
        // Check CSV after JSON (CSV is more generic)
        if (this.looksLikeCSV(sample)) {
            return this.result("csv", "text/csv", 75);
        }
        // Check for XML/HTML
        if (this.looksLikeXML(sample)) {
            // Case-insensitive, matching looksLikeXML, so "<HTML>" documents are
            // reported as HTML rather than generic XML.
            const lowered = sample.toLowerCase();
            const isHTML = lowered.includes("<!doctype html") || lowered.includes("<html");
            return this.result("text", isHTML ? "text/html" : "application/xml", 70);
        }
        // Check for YAML
        if (this.looksLikeYAML(sample)) {
            return this.result("text", "application/yaml", 70);
        }
        // Check for plain text (if mostly printable characters)
        if (this.looksLikeText(sample)) {
            return this.result("text", "text/plain", 60);
        }
        return this.unknown();
    }

    /**
     * Heuristic CSV check. With a delimiter (",", ";", tab or "|") in the first
     * line, at least 80% of lines must contain the same number of delimiters as
     * the first line. Without one, the text is accepted as single-column data
     * only if it does not resemble YAML, XML, JSON or prose and its line
     * lengths are reasonably uniform.
     * @param {string} text - Content sample.
     * @returns {boolean}
     */
    looksLikeCSV(text) {
        const trimmed = text.trim();
        const lines = trimmed.split("\n");
        if (lines.length < 2) {
            return false;
        }
        // Detect delimiter from first line
        const firstLine = lines[0];
        const delimiters = [",", ";", "\t", "|"];
        const delimiter = delimiters.find((d) => firstLine.includes(d));
        // Single-column CSV check (no delimiter)
        if (!delimiter) {
            // Exclude content that looks like other structured formats.
            // Prefix checks run on the trimmed text so leading whitespace
            // cannot hide a structured document.
            // YAML indicators
            if (trimmed.startsWith("---") ||
                /^[\s]*-\s+/m.test(text) ||
                /^[\s]*[a-zA-Z_][a-zA-Z0-9_-]*:\s*/m.test(text)) {
                return false;
            }
            // XML/HTML indicators
            if (trimmed.startsWith("<") || text.includes("<?xml")) {
                return false;
            }
            // JSON indicators
            if ((trimmed.startsWith("{") && trimmed.includes("}")) ||
                (trimmed.startsWith("[") && trimmed.includes("]"))) {
                return false;
            }
            // Exclude prose: more than 4 words on any line suggests sentences, not data
            const hasProsePattern = lines.some((line) => line.trim().split(/\s+/).length > 4);
            if (hasProsePattern) {
                return false;
            }
            // Check for consistent line structure (not binary, reasonable lengths)
            const hasReasonableLengths = lines.every((l) => l.length > 0 && l.length < 1000);
            const noBinaryChars = !text.includes("\0");
            // Single-column CSVs can contain varied data (names, cities, emails, etc.)
            // but should still show some consistency compared to random text:
            // the coefficient of variation of line lengths must stay below 0.75.
            const lengths = lines.map((l) => l.length);
            const avgLength = lengths.reduce((a, b) => a + b, 0) / lengths.length;
            const variance = lengths.reduce((sum, len) => sum + (len - avgLength) ** 2, 0) /
                lengths.length;
            const stdDev = Math.sqrt(variance);
            const hasUniformLengths = stdDev / avgLength < 0.75;
            return hasReasonableLengths && noBinaryChars && hasUniformLengths;
        }
        // Count delimiters per line and check consistency.
        // split() counts literal occurrences, so no regex escaping is needed.
        const counts = lines.map((line) => line.split(delimiter).length - 1);
        const firstCount = counts[0];
        const consistentLines = counts.filter((c) => c === firstCount).length;
        return consistentLines / lines.length >= 0.8;
    }

    /**
     * Delegates to the file-level `hasJsonMarkers` helper.
     * @param {string} text - Content sample.
     * @returns {boolean}
     */
    looksLikeJSON(text) {
        // hasJsonMarkers now does full validation including JSON.parse
        return hasJsonMarkers(text);
    }

    /**
     * Heuristic XML/HTML check: an XML declaration, an HTML doctype or
     * `<html` tag anywhere, or content starting with "<" that contains a
     * well-formed opening tag plus a closing or self-closing tag.
     * @param {string} text - Content sample.
     * @returns {boolean}
     */
    looksLikeXML(text) {
        const trimmed = text.trim();
        // XML declaration is a definitive marker
        if (trimmed.startsWith("<?xml")) {
            return true;
        }
        // Check for HTML DOCTYPE or tags (case-insensitive)
        const lowered = trimmed.toLowerCase();
        if (lowered.includes("<!doctype html") || lowered.includes("<html")) {
            return true;
        }
        // Strict validation for arbitrary content starting with <:
        // Must have proper tag structure with at least one closing tag
        if (!trimmed.startsWith("<")) {
            return false;
        }
        // Must have valid opening tag structure: <tagname followed by space or >
        // Not just any < character
        const hasValidOpeningTag = /<[a-zA-Z][a-zA-Z0-9-]*(?:\s[^>]*)?>/;
        if (!hasValidOpeningTag.test(trimmed)) {
            return false;
        }
        // Must have at least one closing tag or self-closing tag to be valid XML/HTML
        const hasClosingTag = /<\/[a-zA-Z][a-zA-Z0-9-]*>/.test(trimmed);
        const hasSelfClosingTag = /<[a-zA-Z][a-zA-Z0-9-]*(?:\s[^>]*)?\s*\/\s*>/.test(trimmed);
        return hasClosingTag || hasSelfClosingTag;
    }

    /**
     * Heuristic YAML check. Single-line content qualifies only if it is a
     * document marker; multi-line content needs at least two of eight
     * independent YAML indicators.
     * @param {string} text - Content sample.
     * @returns {boolean}
     */
    looksLikeYAML(text) {
        const trimmed = text.trim();
        if (trimmed.length === 0) {
            return false;
        }
        // For single-line content, be very conservative about YAML detection
        const lines = trimmed.split("\n");
        if (lines.length === 1) {
            // Single line can only be YAML if it's a document marker
            return trimmed === "---" || trimmed === "...";
        }
        // Collect YAML indicators (requires at least 2 for positive detection)
        const indicators = [];
        // Indicator 1: Document start marker (---)
        indicators.push(trimmed.startsWith("---"));
        // Indicator 2: Document end marker (...) or appears within content
        indicators.push(/^\.\.\.$|[\n]\.\.\.$/.test(trimmed));
        // Indicator 3: YAML list items (- followed by space at line start)
        indicators.push(/^[\s]*-\s+[^-]/m.test(trimmed));
        // Indicator 4: Multiple key-value pairs (at least 2)
        // Allow hyphens and underscores in keys, support nested keys
        const keyValuePattern = /^[\s]*[a-zA-Z_][a-zA-Z0-9_-]*:\s*(.+)$/;
        const keyValueMatches = lines.filter((line) => keyValuePattern.test(line)).length;
        indicators.push(keyValueMatches >= 2);
        // Indicator 5: Nested indentation pattern (common in YAML objects/lists):
        // within the first 10 lines, a line ending in ":" or "-" followed by a
        // more deeply indented line.
        let hasNesting = false;
        const sampleLines = lines.slice(0, 10);
        for (let i = 0; i < sampleLines.length - 1; i++) {
            const currentLine = sampleLines[i].trim();
            const nextLine = sampleLines[i + 1];
            if (currentLine.length > 0 &&
                nextLine.length > 0 &&
                /[:-]$/.test(currentLine)) {
                const currentIndent = sampleLines[i].match(/^[\s]*/)?.[0].length ?? 0;
                const nextIndent = nextLine.match(/^[\s]*/)?.[0].length ?? 0;
                if (nextIndent > currentIndent) {
                    hasNesting = true;
                    break;
                }
            }
        }
        indicators.push(hasNesting);
        // Indicator 6: YAML comments (# followed by space)
        indicators.push(/^\s*#\s+/m.test(trimmed));
        // Indicator 7: List continuation (multiple items with - )
        const listItemCount = lines.filter((line) => /^[\s]*-[\s]/.test(line)).length;
        indicators.push(listItemCount >= 2);
        // Indicator 8: Inline maps or complex structures
        indicators.push(/{\s*[a-zA-Z_]/.test(trimmed) || /\[.*\]/.test(trimmed));
        // Require at least 2 indicators for confident YAML detection
        const matchCount = indicators.filter(Boolean).length;
        return matchCount >= 2;
    }

    /**
     * Heuristic plain-text check: no NUL characters and at least 85% of the
     * UTF-16 code units are printable ASCII, tab, newline, carriage return or
     * above 127. Empty text is not considered text.
     * @param {string} text - Content sample.
     * @returns {boolean}
     */
    looksLikeText(text) {
        // Check if content has null bytes (binary indicator)
        if (text.includes("\0")) {
            return false;
        }
        // Count printable characters
        let printable = 0;
        for (let i = 0; i < text.length; i++) {
            const code = text.charCodeAt(i);
            if ((code >= 32 && code < 127) || // ASCII printable
                code === 9 || // Tab
                code === 10 || // Newline
                code === 13 || // Carriage return
                code > 127 // Unicode
            ) {
                printable++;
            }
        }
        // At least 85% should be printable for text
        // (for empty text this is NaN >= 0.85, i.e. false)
        return printable / text.length >= 0.85;
    }

    /**
     * Builds a positive detection result.
     * @param {string} type - Detected file type label.
     * @param {string} mime - Detected MIME type.
     * @param {number} confidence - Confidence score for the match.
     * @returns {{type: string, mimeType: string, extension: null, source: string, metadata: {confidence: number}}}
     */
    result(type, mime, confidence) {
        return {
            type,
            mimeType: mime,
            extension: null,
            source: "buffer",
            metadata: { confidence },
        };
    }

    /**
     * @returns {{type: string, mimeType: string, extension: null, source: string, metadata: {confidence: number}}}
     *   The zero-confidence fallback result.
     */
    unknown() {
        return {
            type: "unknown",
            mimeType: "application/octet-stream",
            extension: null,
            source: "buffer",
            metadata: { confidence: 0 },
        };
    }
}
|