@juspay/neurolink 9.5.2 → 9.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +16 -0
- package/README.md +29 -25
- package/dist/agent/directTools.d.ts +5 -5
- package/dist/cli/commands/config.d.ts +9 -9
- package/dist/cli/commands/serve.d.ts +37 -0
- package/dist/cli/commands/serve.js +302 -229
- package/dist/cli/commands/setup-anthropic.d.ts +2 -2
- package/dist/cli/commands/setup-azure.d.ts +2 -2
- package/dist/cli/commands/setup-bedrock.d.ts +2 -2
- package/dist/cli/commands/setup-gcp.d.ts +2 -2
- package/dist/cli/commands/setup-google-ai.d.ts +2 -2
- package/dist/cli/commands/setup-huggingface.d.ts +2 -2
- package/dist/cli/commands/setup-mistral.d.ts +2 -2
- package/dist/cli/commands/setup-openai.d.ts +2 -2
- package/dist/cli/commands/setup.d.ts +2 -2
- package/dist/cli/factories/commandFactory.js +16 -2
- package/dist/cli/loop/optionsSchema.d.ts +2 -2
- package/dist/cli/loop/session.d.ts +4 -0
- package/dist/cli/loop/session.js +49 -4
- package/dist/cli/utils/interactiveSetup.d.ts +4 -4
- package/dist/config/conversationMemory.d.ts +2 -0
- package/dist/config/conversationMemory.js +5 -5
- package/dist/constants/contextWindows.d.ts +46 -0
- package/dist/constants/contextWindows.js +156 -0
- package/dist/context/budgetChecker.d.ts +18 -0
- package/dist/context/budgetChecker.js +71 -0
- package/dist/context/contextCompactor.d.ts +22 -0
- package/dist/context/contextCompactor.js +106 -0
- package/dist/context/effectiveHistory.d.ts +52 -0
- package/dist/context/effectiveHistory.js +105 -0
- package/dist/context/errorDetection.d.ts +14 -0
- package/dist/context/errorDetection.js +124 -0
- package/dist/context/fileSummarizationService.d.ts +54 -0
- package/dist/context/fileSummarizationService.js +255 -0
- package/dist/context/fileSummarizer.d.ts +56 -0
- package/dist/context/fileSummarizer.js +145 -0
- package/dist/context/fileTokenBudget.d.ts +53 -0
- package/dist/context/fileTokenBudget.js +127 -0
- package/dist/context/prompts/summarizationPrompt.d.ts +17 -0
- package/dist/context/prompts/summarizationPrompt.js +110 -0
- package/dist/context/stages/fileReadDeduplicator.d.ts +10 -0
- package/dist/context/stages/fileReadDeduplicator.js +66 -0
- package/dist/context/stages/slidingWindowTruncator.d.ts +11 -0
- package/dist/context/stages/slidingWindowTruncator.js +42 -0
- package/dist/context/stages/structuredSummarizer.d.ts +10 -0
- package/dist/context/stages/structuredSummarizer.js +49 -0
- package/dist/context/stages/toolOutputPruner.d.ts +10 -0
- package/dist/context/stages/toolOutputPruner.js +52 -0
- package/dist/context/summarizationEngine.d.ts +45 -0
- package/dist/context/summarizationEngine.js +110 -0
- package/dist/context/toolOutputLimits.d.ts +17 -0
- package/dist/context/toolOutputLimits.js +84 -0
- package/dist/context/toolPairRepair.d.ts +16 -0
- package/dist/context/toolPairRepair.js +66 -0
- package/dist/core/conversationMemoryManager.d.ts +5 -15
- package/dist/core/conversationMemoryManager.js +15 -75
- package/dist/core/modules/MessageBuilder.d.ts +1 -1
- package/dist/core/modules/MessageBuilder.js +2 -0
- package/dist/core/modules/TelemetryHandler.d.ts +2 -3
- package/dist/core/modules/TelemetryHandler.js +3 -3
- package/dist/core/modules/ToolsManager.d.ts +2 -2
- package/dist/core/redisConversationMemoryManager.d.ts +8 -14
- package/dist/core/redisConversationMemoryManager.js +69 -78
- package/dist/factories/providerFactory.d.ts +2 -2
- package/dist/files/fileReferenceRegistry.d.ts +276 -0
- package/dist/files/fileReferenceRegistry.js +1543 -0
- package/dist/files/fileTools.d.ts +423 -0
- package/dist/files/fileTools.js +449 -0
- package/dist/files/index.d.ts +14 -0
- package/dist/files/index.js +13 -0
- package/dist/files/streamingReader.d.ts +93 -0
- package/dist/files/streamingReader.js +321 -0
- package/dist/files/types.d.ts +23 -0
- package/dist/files/types.js +23 -0
- package/dist/image-gen/imageGenTools.d.ts +2 -2
- package/dist/image-gen/types.d.ts +12 -12
- package/dist/lib/agent/directTools.d.ts +7 -7
- package/dist/lib/config/conversationMemory.d.ts +2 -0
- package/dist/lib/config/conversationMemory.js +5 -5
- package/dist/lib/constants/contextWindows.d.ts +46 -0
- package/dist/lib/constants/contextWindows.js +157 -0
- package/dist/lib/context/budgetChecker.d.ts +18 -0
- package/dist/lib/context/budgetChecker.js +72 -0
- package/dist/lib/context/contextCompactor.d.ts +22 -0
- package/dist/lib/context/contextCompactor.js +107 -0
- package/dist/lib/context/effectiveHistory.d.ts +52 -0
- package/dist/lib/context/effectiveHistory.js +106 -0
- package/dist/lib/context/errorDetection.d.ts +14 -0
- package/dist/lib/context/errorDetection.js +125 -0
- package/dist/lib/context/fileSummarizationService.d.ts +54 -0
- package/dist/lib/context/fileSummarizationService.js +256 -0
- package/dist/lib/context/fileSummarizer.d.ts +56 -0
- package/dist/lib/context/fileSummarizer.js +146 -0
- package/dist/lib/context/fileTokenBudget.d.ts +53 -0
- package/dist/lib/context/fileTokenBudget.js +128 -0
- package/dist/lib/context/prompts/summarizationPrompt.d.ts +17 -0
- package/dist/lib/context/prompts/summarizationPrompt.js +111 -0
- package/dist/lib/context/stages/fileReadDeduplicator.d.ts +10 -0
- package/dist/lib/context/stages/fileReadDeduplicator.js +67 -0
- package/dist/lib/context/stages/slidingWindowTruncator.d.ts +11 -0
- package/dist/lib/context/stages/slidingWindowTruncator.js +43 -0
- package/dist/lib/context/stages/structuredSummarizer.d.ts +10 -0
- package/dist/lib/context/stages/structuredSummarizer.js +50 -0
- package/dist/lib/context/stages/toolOutputPruner.d.ts +10 -0
- package/dist/lib/context/stages/toolOutputPruner.js +53 -0
- package/dist/lib/context/summarizationEngine.d.ts +45 -0
- package/dist/lib/context/summarizationEngine.js +111 -0
- package/dist/lib/context/toolOutputLimits.d.ts +17 -0
- package/dist/lib/context/toolOutputLimits.js +85 -0
- package/dist/lib/context/toolPairRepair.d.ts +16 -0
- package/dist/lib/context/toolPairRepair.js +67 -0
- package/dist/lib/core/conversationMemoryManager.d.ts +5 -15
- package/dist/lib/core/conversationMemoryManager.js +15 -75
- package/dist/lib/core/modules/MessageBuilder.d.ts +1 -1
- package/dist/lib/core/modules/MessageBuilder.js +2 -0
- package/dist/lib/core/modules/TelemetryHandler.d.ts +2 -3
- package/dist/lib/core/modules/TelemetryHandler.js +3 -3
- package/dist/lib/core/modules/ToolsManager.d.ts +2 -2
- package/dist/lib/core/redisConversationMemoryManager.d.ts +8 -14
- package/dist/lib/core/redisConversationMemoryManager.js +69 -78
- package/dist/lib/factories/providerFactory.d.ts +2 -2
- package/dist/lib/files/fileReferenceRegistry.d.ts +276 -0
- package/dist/lib/files/fileReferenceRegistry.js +1544 -0
- package/dist/lib/files/fileTools.d.ts +423 -0
- package/dist/lib/files/fileTools.js +450 -0
- package/dist/lib/files/index.d.ts +14 -0
- package/dist/lib/files/index.js +14 -0
- package/dist/lib/files/streamingReader.d.ts +93 -0
- package/dist/lib/files/streamingReader.js +322 -0
- package/dist/lib/files/types.d.ts +23 -0
- package/dist/lib/files/types.js +24 -0
- package/dist/lib/image-gen/imageGenTools.d.ts +2 -2
- package/dist/lib/image-gen/types.d.ts +12 -12
- package/dist/lib/memory/mem0Initializer.d.ts +2 -2
- package/dist/lib/neurolink.d.ts +61 -2
- package/dist/lib/neurolink.js +619 -307
- package/dist/lib/processors/archive/ArchiveProcessor.d.ts +327 -0
- package/dist/lib/processors/archive/ArchiveProcessor.js +1309 -0
- package/dist/lib/processors/archive/index.d.ts +33 -0
- package/dist/lib/processors/archive/index.js +43 -0
- package/dist/lib/processors/base/types.d.ts +70 -64
- package/dist/lib/processors/base/types.js +6 -0
- package/dist/lib/processors/cli/fileProcessorCli.d.ts +8 -8
- package/dist/lib/processors/cli/fileProcessorCli.js +5 -5
- package/dist/lib/processors/config/mimeTypes.js +25 -0
- package/dist/lib/processors/config/sizeLimits.d.ts +52 -40
- package/dist/lib/processors/config/sizeLimits.js +56 -44
- package/dist/lib/processors/document/ExcelProcessor.d.ts +14 -0
- package/dist/lib/processors/document/ExcelProcessor.js +72 -1
- package/dist/lib/processors/document/PptxProcessor.d.ts +63 -0
- package/dist/lib/processors/document/PptxProcessor.js +158 -0
- package/dist/lib/processors/document/index.d.ts +1 -0
- package/dist/lib/processors/document/index.js +6 -0
- package/dist/lib/processors/errors/FileErrorCode.d.ts +2 -2
- package/dist/lib/processors/errors/errorHelpers.d.ts +2 -2
- package/dist/lib/processors/errors/errorSerializer.d.ts +4 -4
- package/dist/lib/processors/index.d.ts +8 -2
- package/dist/lib/processors/index.js +5 -2
- package/dist/lib/processors/integration/FileProcessorIntegration.d.ts +8 -8
- package/dist/lib/processors/integration/FileProcessorIntegration.js +7 -7
- package/dist/lib/processors/media/AudioProcessor.d.ts +328 -0
- package/dist/lib/processors/media/AudioProcessor.js +708 -0
- package/dist/lib/processors/media/VideoProcessor.d.ts +350 -0
- package/dist/lib/processors/media/VideoProcessor.js +992 -0
- package/dist/lib/processors/media/index.d.ts +27 -0
- package/dist/lib/processors/media/index.js +37 -0
- package/dist/lib/processors/registry/ProcessorRegistry.d.ts +19 -5
- package/dist/lib/processors/registry/ProcessorRegistry.js +103 -8
- package/dist/lib/processors/registry/index.d.ts +1 -1
- package/dist/lib/processors/registry/index.js +1 -1
- package/dist/lib/processors/registry/types.d.ts +2 -2
- package/dist/lib/providers/googleAiStudio.d.ts +34 -0
- package/dist/lib/providers/googleAiStudio.js +267 -397
- package/dist/lib/providers/googleVertex.d.ts +55 -1
- package/dist/lib/providers/googleVertex.js +452 -719
- package/dist/lib/providers/sagemaker/detection.d.ts +6 -6
- package/dist/lib/providers/sagemaker/diagnostics.d.ts +4 -4
- package/dist/lib/providers/sagemaker/parsers.d.ts +4 -4
- package/dist/lib/rag/chunkers/RecursiveChunker.js +2 -2
- package/dist/lib/rag/document/loaders.d.ts +6 -71
- package/dist/lib/rag/document/loaders.js +5 -5
- package/dist/lib/rag/graphRag/graphRAG.js +26 -9
- package/dist/lib/rag/metadata/MetadataExtractorFactory.d.ts +5 -55
- package/dist/lib/rag/metadata/metadataExtractor.js +6 -3
- package/dist/lib/rag/pipeline/RAGPipeline.d.ts +8 -126
- package/dist/lib/rag/pipeline/RAGPipeline.js +11 -11
- package/dist/lib/rag/pipeline/contextAssembly.d.ts +3 -42
- package/dist/lib/rag/pipeline/contextAssembly.js +6 -3
- package/dist/lib/rag/reranker/RerankerFactory.d.ts +5 -60
- package/dist/lib/rag/resilience/CircuitBreaker.d.ts +3 -33
- package/dist/lib/rag/resilience/RetryHandler.d.ts +2 -21
- package/dist/lib/rag/retrieval/hybridSearch.d.ts +3 -41
- package/dist/lib/rag/retrieval/vectorQueryTool.d.ts +2 -13
- package/dist/lib/rag/retrieval/vectorQueryTool.js +4 -3
- package/dist/lib/rag/types.d.ts +3 -3
- package/dist/lib/sdk/toolRegistration.d.ts +2 -2
- package/dist/lib/server/middleware/cache.d.ts +2 -2
- package/dist/lib/server/middleware/rateLimit.d.ts +2 -2
- package/dist/lib/server/routes/mcpRoutes.js +277 -249
- package/dist/lib/server/routes/memoryRoutes.js +287 -281
- package/dist/lib/server/utils/validation.d.ts +10 -10
- package/dist/lib/session/globalSessionState.d.ts +2 -2
- package/dist/lib/telemetry/telemetryService.d.ts +2 -2
- package/dist/lib/types/common.d.ts +39 -0
- package/dist/lib/types/contextTypes.d.ts +255 -0
- package/dist/lib/types/contextTypes.js +0 -2
- package/dist/lib/types/conversation.d.ts +62 -0
- package/dist/lib/types/conversationMemoryInterface.d.ts +27 -0
- package/dist/lib/types/conversationMemoryInterface.js +7 -0
- package/dist/lib/types/fileReferenceTypes.d.ts +222 -0
- package/dist/lib/types/fileReferenceTypes.js +9 -0
- package/dist/lib/types/fileTypes.d.ts +26 -3
- package/dist/lib/types/generateTypes.d.ts +22 -1
- package/dist/lib/types/index.d.ts +4 -5
- package/dist/lib/types/index.js +8 -10
- package/dist/lib/types/modelTypes.d.ts +2 -2
- package/dist/lib/types/processorTypes.d.ts +597 -0
- package/dist/lib/types/processorTypes.js +91 -0
- package/dist/lib/types/ragTypes.d.ts +481 -0
- package/dist/lib/types/ragTypes.js +8 -0
- package/dist/lib/types/sdkTypes.d.ts +17 -18
- package/dist/lib/types/streamTypes.d.ts +11 -1
- package/dist/lib/utils/async/retry.d.ts +2 -2
- package/dist/lib/utils/async/withTimeout.js +3 -1
- package/dist/lib/utils/conversationMemory.d.ts +12 -6
- package/dist/lib/utils/conversationMemory.js +76 -36
- package/dist/lib/utils/fileDetector.d.ts +62 -0
- package/dist/lib/utils/fileDetector.js +1014 -14
- package/dist/lib/utils/json/safeParse.d.ts +2 -2
- package/dist/lib/utils/messageBuilder.js +806 -153
- package/dist/lib/utils/modelChoices.d.ts +2 -2
- package/dist/lib/utils/multimodalOptionsBuilder.d.ts +2 -1
- package/dist/lib/utils/multimodalOptionsBuilder.js +1 -0
- package/dist/lib/utils/rateLimiter.d.ts +2 -2
- package/dist/lib/utils/sanitizers/filename.d.ts +4 -4
- package/dist/lib/utils/sanitizers/svg.d.ts +2 -2
- package/dist/lib/utils/thinkingConfig.d.ts +6 -6
- package/dist/lib/utils/tokenEstimation.d.ts +68 -0
- package/dist/lib/utils/tokenEstimation.js +113 -0
- package/dist/lib/utils/tokenUtils.d.ts +4 -4
- package/dist/lib/utils/ttsProcessor.d.ts +2 -2
- package/dist/lib/workflow/config.d.ts +150 -150
- package/dist/memory/mem0Initializer.d.ts +2 -2
- package/dist/neurolink.d.ts +61 -2
- package/dist/neurolink.js +619 -307
- package/dist/processors/archive/ArchiveProcessor.d.ts +327 -0
- package/dist/processors/archive/ArchiveProcessor.js +1308 -0
- package/dist/processors/archive/index.d.ts +33 -0
- package/dist/processors/archive/index.js +42 -0
- package/dist/processors/base/types.d.ts +70 -64
- package/dist/processors/base/types.js +6 -0
- package/dist/processors/cli/fileProcessorCli.d.ts +8 -8
- package/dist/processors/cli/fileProcessorCli.js +5 -5
- package/dist/processors/config/mimeTypes.js +25 -0
- package/dist/processors/config/sizeLimits.d.ts +52 -40
- package/dist/processors/config/sizeLimits.js +56 -44
- package/dist/processors/document/ExcelProcessor.d.ts +14 -0
- package/dist/processors/document/ExcelProcessor.js +72 -1
- package/dist/processors/document/PptxProcessor.d.ts +63 -0
- package/dist/processors/document/PptxProcessor.js +157 -0
- package/dist/processors/document/index.d.ts +1 -0
- package/dist/processors/document/index.js +6 -0
- package/dist/processors/errors/FileErrorCode.d.ts +2 -2
- package/dist/processors/errors/errorHelpers.d.ts +2 -2
- package/dist/processors/errors/errorSerializer.d.ts +4 -4
- package/dist/processors/index.d.ts +8 -2
- package/dist/processors/index.js +5 -2
- package/dist/processors/integration/FileProcessorIntegration.d.ts +8 -8
- package/dist/processors/integration/FileProcessorIntegration.js +7 -7
- package/dist/processors/media/AudioProcessor.d.ts +328 -0
- package/dist/processors/media/AudioProcessor.js +707 -0
- package/dist/processors/media/VideoProcessor.d.ts +350 -0
- package/dist/processors/media/VideoProcessor.js +991 -0
- package/dist/processors/media/ffprobe-static.d.ts +4 -0
- package/dist/processors/media/index.d.ts +27 -0
- package/dist/processors/media/index.js +36 -0
- package/dist/processors/registry/ProcessorRegistry.d.ts +19 -5
- package/dist/processors/registry/ProcessorRegistry.js +103 -8
- package/dist/processors/registry/index.d.ts +1 -1
- package/dist/processors/registry/index.js +1 -1
- package/dist/processors/registry/types.d.ts +2 -2
- package/dist/providers/googleAiStudio.d.ts +34 -0
- package/dist/providers/googleAiStudio.js +267 -397
- package/dist/providers/googleVertex.d.ts +55 -1
- package/dist/providers/googleVertex.js +452 -719
- package/dist/providers/sagemaker/detection.d.ts +6 -6
- package/dist/providers/sagemaker/diagnostics.d.ts +4 -4
- package/dist/providers/sagemaker/parsers.d.ts +4 -4
- package/dist/rag/chunkers/RecursiveChunker.js +2 -2
- package/dist/rag/document/loaders.d.ts +6 -71
- package/dist/rag/document/loaders.js +5 -5
- package/dist/rag/graphRag/graphRAG.js +26 -9
- package/dist/rag/metadata/MetadataExtractorFactory.d.ts +5 -55
- package/dist/rag/metadata/metadataExtractor.js +6 -3
- package/dist/rag/pipeline/RAGPipeline.d.ts +8 -126
- package/dist/rag/pipeline/RAGPipeline.js +11 -11
- package/dist/rag/pipeline/contextAssembly.d.ts +3 -42
- package/dist/rag/pipeline/contextAssembly.js +6 -3
- package/dist/rag/reranker/RerankerFactory.d.ts +5 -60
- package/dist/rag/resilience/CircuitBreaker.d.ts +3 -33
- package/dist/rag/resilience/RetryHandler.d.ts +2 -21
- package/dist/rag/retrieval/hybridSearch.d.ts +3 -41
- package/dist/rag/retrieval/vectorQueryTool.d.ts +2 -13
- package/dist/rag/retrieval/vectorQueryTool.js +4 -3
- package/dist/rag/types.d.ts +3 -3
- package/dist/sdk/toolRegistration.d.ts +2 -2
- package/dist/server/middleware/cache.d.ts +2 -2
- package/dist/server/middleware/rateLimit.d.ts +2 -2
- package/dist/server/routes/mcpRoutes.js +277 -249
- package/dist/server/routes/memoryRoutes.js +287 -281
- package/dist/server/utils/validation.d.ts +4 -4
- package/dist/session/globalSessionState.d.ts +2 -2
- package/dist/telemetry/telemetryService.d.ts +2 -2
- package/dist/types/common.d.ts +39 -0
- package/dist/types/contextTypes.d.ts +255 -0
- package/dist/types/contextTypes.js +0 -2
- package/dist/types/conversation.d.ts +62 -0
- package/dist/types/conversationMemoryInterface.d.ts +27 -0
- package/dist/types/conversationMemoryInterface.js +6 -0
- package/dist/types/fileReferenceTypes.d.ts +222 -0
- package/dist/types/fileReferenceTypes.js +8 -0
- package/dist/types/fileTypes.d.ts +26 -3
- package/dist/types/generateTypes.d.ts +22 -1
- package/dist/types/index.d.ts +4 -5
- package/dist/types/index.js +8 -10
- package/dist/types/processorTypes.d.ts +597 -0
- package/dist/types/processorTypes.js +90 -0
- package/dist/types/ragTypes.d.ts +481 -0
- package/dist/types/ragTypes.js +7 -0
- package/dist/types/sdkTypes.d.ts +17 -18
- package/dist/types/streamTypes.d.ts +11 -1
- package/dist/utils/async/retry.d.ts +2 -2
- package/dist/utils/async/withTimeout.js +3 -1
- package/dist/utils/conversationMemory.d.ts +12 -6
- package/dist/utils/conversationMemory.js +76 -36
- package/dist/utils/fileDetector.d.ts +62 -0
- package/dist/utils/fileDetector.js +1014 -14
- package/dist/utils/json/safeParse.d.ts +2 -2
- package/dist/utils/messageBuilder.js +806 -153
- package/dist/utils/modelChoices.d.ts +2 -2
- package/dist/utils/multimodalOptionsBuilder.d.ts +2 -1
- package/dist/utils/multimodalOptionsBuilder.js +1 -0
- package/dist/utils/rateLimiter.d.ts +2 -2
- package/dist/utils/sanitizers/filename.d.ts +4 -4
- package/dist/utils/sanitizers/svg.d.ts +2 -2
- package/dist/utils/thinkingConfig.d.ts +6 -6
- package/dist/utils/tokenEstimation.d.ts +68 -0
- package/dist/utils/tokenEstimation.js +112 -0
- package/dist/utils/tokenUtils.d.ts +4 -4
- package/dist/utils/ttsProcessor.d.ts +2 -2
- package/dist/workflow/config.d.ts +104 -104
- package/package.json +18 -6
- package/dist/lib/utils/conversationMemoryUtils.d.ts +0 -25
- package/dist/lib/utils/conversationMemoryUtils.js +0 -138
- package/dist/utils/conversationMemoryUtils.d.ts +0 -25
- package/dist/utils/conversationMemoryUtils.js +0 -137
|
@@ -0,0 +1,1308 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Archive Processor
|
|
3
|
+
*
|
|
4
|
+
* Handles downloading, validating, and processing archive files (ZIP, TAR, TAR.GZ, GZ).
|
|
5
|
+
* Extracts file listings with metadata for AI consumption without recursively
|
|
6
|
+
* processing individual entries through other processors (Phase 1).
|
|
7
|
+
*
|
|
8
|
+
* Key features:
|
|
9
|
+
* - ZIP support via adm-zip (dynamic import)
|
|
10
|
+
* - TAR / TAR.GZ support via tar-stream (dynamic import)
|
|
11
|
+
* - Plain GZ support via Node zlib
|
|
12
|
+
* - Comprehensive security validation (path traversal, zip bombs, symlinks, encryption)
|
|
13
|
+
* - In-memory extraction with configurable size limits
|
|
14
|
+
* - Structured text output for LLM consumption
|
|
15
|
+
*
|
|
16
|
+
* @module processors/archive/ArchiveProcessor
|
|
17
|
+
*
|
|
18
|
+
* @example
|
|
19
|
+
* ```typescript
|
|
20
|
+
* import { archiveProcessor, processArchive, isArchiveFile } from "./ArchiveProcessor.js";
|
|
21
|
+
*
|
|
22
|
+
* // Check if a file is an archive
|
|
23
|
+
* if (isArchiveFile(fileInfo.mimetype, fileInfo.name)) {
|
|
24
|
+
* const result = await processArchive(fileInfo, {
|
|
25
|
+
* authHeaders: { Authorization: "Bearer token" },
|
|
26
|
+
* });
|
|
27
|
+
*
|
|
28
|
+
* if (result.success) {
|
|
29
|
+
* console.log(`Format: ${result.data.archiveMetadata.format}`);
|
|
30
|
+
* console.log(`Entries: ${result.data.archiveMetadata.totalEntries}`);
|
|
31
|
+
* for (const entry of result.data.entries) {
|
|
32
|
+
* console.log(` ${entry.name} (${entry.uncompressedSize} bytes)`);
|
|
33
|
+
* }
|
|
34
|
+
* }
|
|
35
|
+
* }
|
|
36
|
+
* ```
|
|
37
|
+
*/
|
|
38
|
+
import * as path from "path";
|
|
39
|
+
import { BaseFileProcessor } from "../base/BaseFileProcessor.js";
|
|
40
|
+
import { SIZE_LIMITS_MB } from "../config/index.js";
|
|
41
|
+
import { FileErrorCode } from "../errors/index.js";
|
|
42
|
+
// =============================================================================
|
|
43
|
+
// SECURITY CONFIGURATION
|
|
44
|
+
// =============================================================================
|
|
45
|
+
/**
 * Hard safety limits applied while inspecting an archive.
 *
 * The numbers are deliberately strict: they exist to stop resource
 * exhaustion and the usual archive-borne attacks (zip bombs, path
 * traversal, and similar).
 */
const ARCHIVE_SECURITY = {
    // Upper bound on the number of entries a single archive may hold.
    MAX_ENTRIES: 1_000,
    // Ceiling for the combined decompressed size of all entries: 100 MB.
    MAX_DECOMPRESSED_SIZE: 100 * 1024 ** 2,
    // Ceiling for any one file inside the archive: 20 MB.
    MAX_SINGLE_FILE_SIZE: 20 * 1024 ** 2,
    // Compression ratios above this are treated as a possible zip bomb.
    MAX_COMPRESSION_RATIO: 100,
    // Allowed nesting depth. Phase 1 only lists contents and never
    // extracts recursively, hence a depth of 1.
    MAX_NESTING_DEPTH: 1,
    // Longest entry name (path) that is accepted.
    MAX_PATH_LENGTH: 255,
    // Encrypted entries are not permitted.
    ALLOW_ENCRYPTED: false,
    // Symbolic-link entries are not permitted.
    ALLOW_SYMLINKS: false,
};
|
|
71
|
+
/**
 * Tunable constants for the archive processor.
 */
const ARCHIVE_CONFIG = {
    // Largest archive accepted, in MB; sourced from the shared size-limit table.
    MAX_SIZE_MB: SIZE_LIMITS_MB.ARCHIVE_MAX_MB,
    // Processing timeout in milliseconds (60 seconds).
    TIMEOUT_MS: 60_000,
    // How many entries may have their content extracted (Phase 2 sub-processing).
    MAX_EXTRACT_ENTRIES: 20,
    // Largest single entry eligible for content extraction: 1 MB.
    MAX_EXTRACT_ENTRY_SIZE: 1024 ** 2,
    // Budget for extracted content summed over all entries: 5 MB.
    MAX_TOTAL_EXTRACT_SIZE: 5 * 1024 ** 2,
    // Extensions whose content may be extracted from inside an archive.
    // Insertion order matches the original declaration.
    EXTRACTABLE_EXTENSIONS: new Set([
        ".ts", ".js", ".tsx", ".jsx", ".py", ".java", ".go", ".rs",
        ".rb", ".php", ".c", ".cpp", ".h", ".hpp", ".cs", ".swift",
        ".kt", ".scala", ".sh", ".bash", ".txt", ".md", ".json", ".yaml",
        ".yml", ".xml", ".html", ".css", ".sql", ".toml", ".ini", ".cfg",
        ".env", ".csv", ".log", ".conf", ".dockerfile", ".makefile",
        ".gitignore", ".editorconfig",
    ]),
};
|
|
129
|
+
// =============================================================================
|
|
130
|
+
// SUPPORTED FORMATS
|
|
131
|
+
// =============================================================================
|
|
132
|
+
/**
 * MIME types that are treated as archive formats.
 * Order matches the original declaration.
 */
const SUPPORTED_ARCHIVE_MIME_TYPES = [
    // zip variants
    "application/zip", "application/x-zip-compressed", "application/x-zip",
    // tar variants
    "application/x-tar", "application/x-gtar",
    // gzip variants and gzip-compressed tar
    "application/gzip", "application/x-gzip", "application/x-compressed-tar",
    // bzip2
    "application/x-bzip2",
    // jar
    "application/java-archive",
];
|
|
145
|
+
/**
 * File extensions (leading dot included) that are treated as archive formats.
 */
const SUPPORTED_ARCHIVE_EXTENSIONS = [
    ".zip",
    ".tar",
    ".gz",
    ".tgz",
    ".bz2",
    ".tbz2",
    ".jar",
];
|
|
147
|
+
// =============================================================================
|
|
148
|
+
// MAGIC BYTE SIGNATURES
|
|
149
|
+
// =============================================================================
|
|
150
|
+
/**
 * Leading-byte signatures used to recognise archive formats.
 * Intended to complement the file extension when identifying a format.
 */
const MAGIC_BYTES = (() => {
    // Every ZIP-family signature starts with "PK" (0x50 0x4b); only the
    // last two bytes differ. Each call returns a fresh array.
    const pk = (third, fourth) => [0x50, 0x4b, third, fourth];
    return {
        // ZIP/JAR: PK\x03\x04
        ZIP: pk(0x03, 0x04),
        // ZIP empty archive: PK\x05\x06
        ZIP_EMPTY: pk(0x05, 0x06),
        // ZIP spanned: PK\x07\x08
        ZIP_SPANNED: pk(0x07, 0x08),
        // GZIP: \x1f\x8b
        GZIP: [0x1f, 0x8b],
        // BZIP2: "BZ"
        BZIP2: [0x42, 0x5a],
        // RAR: "Rar!" followed by \x1a\x07
        RAR: [0x52, 0x61, 0x72, 0x21, 0x1a, 0x07],
        // 7-Zip: "7z" followed by \xbc\xaf\x27\x1c
        SEVEN_ZIP: [0x37, 0x7a, 0xbc, 0xaf, 0x27, 0x1c],
    };
})();
|
|
170
|
+
// =============================================================================
|
|
171
|
+
// ARCHIVE PROCESSOR CLASS
|
|
172
|
+
// =============================================================================
|
|
173
|
+
/**
|
|
174
|
+
* Archive Processor - handles ZIP, TAR, TAR.GZ, and plain GZ files.
|
|
175
|
+
*
|
|
176
|
+
* Overrides the base `processFile()` to implement a custom pipeline:
|
|
177
|
+
* 1. Validate file type and size
|
|
178
|
+
* 2. Obtain the archive buffer (from provided buffer or URL download)
|
|
179
|
+
* 3. Detect the archive format via magic bytes and file extension
|
|
180
|
+
* 4. Run security validation (path traversal, zip bombs, encryption, symlinks)
|
|
181
|
+
* 5. Extract entry metadata (no recursive file processing in Phase 1)
|
|
182
|
+
* 6. Build LLM-friendly text content with file listing
|
|
183
|
+
*
|
|
184
|
+
* RAR and 7z formats are detected but not yet supported for extraction.
|
|
185
|
+
*
|
|
186
|
+
* @example
|
|
187
|
+
* ```typescript
|
|
188
|
+
* const processor = new ArchiveProcessor();
|
|
189
|
+
*
|
|
190
|
+
* const result = await processor.processFile(fileInfo, {
|
|
191
|
+
* authHeaders: { Authorization: "Bearer token" },
|
|
192
|
+
* });
|
|
193
|
+
*
|
|
194
|
+
* if (result.success) {
|
|
195
|
+
* console.log(`Format: ${result.data.archiveMetadata.format}`);
|
|
196
|
+
* console.log(`Entries: ${result.data.entries.length}`);
|
|
197
|
+
* console.log(result.data.textContent);
|
|
198
|
+
* }
|
|
199
|
+
* ```
|
|
200
|
+
*/
|
|
201
|
+
export class ArchiveProcessor extends BaseFileProcessor {
|
|
202
|
+
constructor() {
|
|
203
|
+
super({
|
|
204
|
+
maxSizeMB: ARCHIVE_CONFIG.MAX_SIZE_MB,
|
|
205
|
+
timeoutMs: ARCHIVE_CONFIG.TIMEOUT_MS,
|
|
206
|
+
supportedMimeTypes: [...SUPPORTED_ARCHIVE_MIME_TYPES],
|
|
207
|
+
supportedExtensions: [...SUPPORTED_ARCHIVE_EXTENSIONS],
|
|
208
|
+
fileTypeName: "archive",
|
|
209
|
+
defaultFilename: "archive.zip",
|
|
210
|
+
});
|
|
211
|
+
}
|
|
212
|
+
// ===========================================================================
|
|
213
|
+
// ABSTRACT METHOD IMPLEMENTATION
|
|
214
|
+
// ===========================================================================
|
|
215
|
+
/**
|
|
216
|
+
* Build a stub processed result.
|
|
217
|
+
* The actual work is done in the `processFile()` override; this method
|
|
218
|
+
* satisfies the abstract contract from `BaseFileProcessor`.
|
|
219
|
+
*
|
|
220
|
+
* @param buffer - Raw archive buffer
|
|
221
|
+
* @param fileInfo - Original file information
|
|
222
|
+
* @returns Empty ProcessedArchive scaffold
|
|
223
|
+
*/
|
|
224
|
+
buildProcessedResult(buffer, fileInfo) {
|
|
225
|
+
return {
|
|
226
|
+
buffer,
|
|
227
|
+
mimetype: fileInfo.mimetype || "application/octet-stream",
|
|
228
|
+
size: buffer.length,
|
|
229
|
+
filename: this.getFilename(fileInfo),
|
|
230
|
+
textContent: "",
|
|
231
|
+
archiveMetadata: {
|
|
232
|
+
format: "zip",
|
|
233
|
+
totalEntries: 0,
|
|
234
|
+
totalUncompressedSize: 0,
|
|
235
|
+
totalCompressedSize: 0,
|
|
236
|
+
},
|
|
237
|
+
entries: [],
|
|
238
|
+
securityWarnings: [],
|
|
239
|
+
};
|
|
240
|
+
}
|
|
241
|
+
// ===========================================================================
|
|
242
|
+
// MAIN PROCESSING PIPELINE (override)
|
|
243
|
+
// ===========================================================================
|
|
244
|
+
/**
|
|
245
|
+
* Process an archive file through the full extraction pipeline.
|
|
246
|
+
*
|
|
247
|
+
* @param fileInfo - File information (can include URL or buffer)
|
|
248
|
+
* @param options - Optional processing options (auth headers, timeout, etc.)
|
|
249
|
+
* @returns Processing result with archive metadata and entry listing, or error
|
|
250
|
+
*/
|
|
251
|
+
async processFile(fileInfo, options) {
|
|
252
|
+
try {
|
|
253
|
+
// Step 1: Validate file type and size
|
|
254
|
+
const validationResult = this.validateFileWithResult(fileInfo);
|
|
255
|
+
if (!validationResult.success) {
|
|
256
|
+
return { success: false, error: validationResult.error };
|
|
257
|
+
}
|
|
258
|
+
// Step 2: Get file buffer
|
|
259
|
+
let buffer;
|
|
260
|
+
if (fileInfo.buffer) {
|
|
261
|
+
buffer = fileInfo.buffer;
|
|
262
|
+
}
|
|
263
|
+
else if (fileInfo.url) {
|
|
264
|
+
const downloadResult = await this.downloadFileWithRetry(fileInfo, options);
|
|
265
|
+
if (!downloadResult.success) {
|
|
266
|
+
return { success: false, error: downloadResult.error };
|
|
267
|
+
}
|
|
268
|
+
if (!downloadResult.data) {
|
|
269
|
+
return {
|
|
270
|
+
success: false,
|
|
271
|
+
error: this.createError(FileErrorCode.DOWNLOAD_FAILED, {
|
|
272
|
+
reason: "Download succeeded but returned no data",
|
|
273
|
+
}),
|
|
274
|
+
};
|
|
275
|
+
}
|
|
276
|
+
buffer = downloadResult.data;
|
|
277
|
+
// Validate actual downloaded size against limit
|
|
278
|
+
if (!this.validateFileSize(buffer.length)) {
|
|
279
|
+
return {
|
|
280
|
+
success: false,
|
|
281
|
+
error: this.createError(FileErrorCode.FILE_TOO_LARGE, {
|
|
282
|
+
sizeMB: (buffer.length / (1024 * 1024)).toFixed(2),
|
|
283
|
+
maxMB: this.config.maxSizeMB,
|
|
284
|
+
type: this.config.fileTypeName,
|
|
285
|
+
}),
|
|
286
|
+
};
|
|
287
|
+
}
|
|
288
|
+
}
|
|
289
|
+
else {
|
|
290
|
+
return {
|
|
291
|
+
success: false,
|
|
292
|
+
error: this.createError(FileErrorCode.DOWNLOAD_FAILED, {
|
|
293
|
+
reason: "No buffer or URL provided for file",
|
|
294
|
+
}),
|
|
295
|
+
};
|
|
296
|
+
}
|
|
297
|
+
// Step 3: Detect archive format
|
|
298
|
+
const filename = this.getFilename(fileInfo);
|
|
299
|
+
const format = this.detectArchiveFormat(buffer, filename);
|
|
300
|
+
if (!format) {
|
|
301
|
+
return {
|
|
302
|
+
success: false,
|
|
303
|
+
error: this.createError(FileErrorCode.INVALID_FORMAT, {
|
|
304
|
+
reason: "Unable to detect archive format from magic bytes or file extension",
|
|
305
|
+
}),
|
|
306
|
+
};
|
|
307
|
+
}
|
|
308
|
+
// Step 4: Check for unsupported formats (RAR, 7z)
|
|
309
|
+
if (format === "rar" || format === "7z") {
|
|
310
|
+
return {
|
|
311
|
+
success: false,
|
|
312
|
+
error: this.createError(FileErrorCode.UNSUPPORTED_TYPE, {
|
|
313
|
+
format,
|
|
314
|
+
reason: `${format.toUpperCase()} archives are not yet supported. Please convert to ZIP or TAR format.`,
|
|
315
|
+
supportedFormats: "ZIP, TAR, TAR.GZ, GZ",
|
|
316
|
+
}),
|
|
317
|
+
};
|
|
318
|
+
}
|
|
319
|
+
// Step 5: Extract entries based on format
|
|
320
|
+
const extractionResult = await this.extractEntries(buffer, format);
|
|
321
|
+
if (!extractionResult.success) {
|
|
322
|
+
return {
|
|
323
|
+
success: false,
|
|
324
|
+
error: extractionResult.error,
|
|
325
|
+
};
|
|
326
|
+
}
|
|
327
|
+
const { entries, securityWarnings } = extractionResult;
|
|
328
|
+
// Step 6: Compute aggregate metadata
|
|
329
|
+
const totalUncompressedSize = entries.reduce((sum, e) => sum + e.uncompressedSize, 0);
|
|
330
|
+
const totalCompressedSize = entries.reduce((sum, e) => sum + e.compressedSize, 0);
|
|
331
|
+
// Step 7: Security check - overall compression ratio
|
|
332
|
+
if (buffer.length > 0 && totalUncompressedSize > 0) {
|
|
333
|
+
const overallRatio = totalUncompressedSize / buffer.length;
|
|
334
|
+
if (overallRatio > ARCHIVE_SECURITY.MAX_COMPRESSION_RATIO) {
|
|
335
|
+
return {
|
|
336
|
+
success: false,
|
|
337
|
+
error: this.createError(FileErrorCode.ZIP_BOMB_DETECTED, {
|
|
338
|
+
compressionRatio: overallRatio.toFixed(1),
|
|
339
|
+
maxRatio: ARCHIVE_SECURITY.MAX_COMPRESSION_RATIO,
|
|
340
|
+
}),
|
|
341
|
+
};
|
|
342
|
+
}
|
|
343
|
+
}
|
|
344
|
+
// Step 8: Security check - total decompressed size
|
|
345
|
+
if (totalUncompressedSize > ARCHIVE_SECURITY.MAX_DECOMPRESSED_SIZE) {
|
|
346
|
+
return {
|
|
347
|
+
success: false,
|
|
348
|
+
error: this.createError(FileErrorCode.SECURITY_VALIDATION_FAILED, {
|
|
349
|
+
reason: `Total decompressed size (${this.formatSizeMB(totalUncompressedSize)} MB) exceeds limit (${this.formatSizeMB(ARCHIVE_SECURITY.MAX_DECOMPRESSED_SIZE)} MB)`,
|
|
350
|
+
}),
|
|
351
|
+
};
|
|
352
|
+
}
|
|
353
|
+
// Step 9: Extract content from text-based entries (Phase 2 sub-processing)
|
|
354
|
+
// For ZIP archives, extract and include content from small text-based files.
|
|
355
|
+
// Skips nested archives and binary files for safety.
|
|
356
|
+
let extractedContents = new Map();
|
|
357
|
+
if (format === "zip") {
|
|
358
|
+
extractedContents = await this.extractEntryContents(buffer, entries);
|
|
359
|
+
}
|
|
360
|
+
// Step 10: Build text content for LLM
|
|
361
|
+
const archiveMetadata = {
|
|
362
|
+
format,
|
|
363
|
+
totalEntries: entries.length,
|
|
364
|
+
totalUncompressedSize,
|
|
365
|
+
totalCompressedSize,
|
|
366
|
+
};
|
|
367
|
+
const textContent = this.buildTextContent(filename, archiveMetadata, entries, securityWarnings, extractedContents);
|
|
368
|
+
// Step 10: Build final result
|
|
369
|
+
return {
|
|
370
|
+
success: true,
|
|
371
|
+
data: {
|
|
372
|
+
buffer,
|
|
373
|
+
mimetype: fileInfo.mimetype || "application/octet-stream",
|
|
374
|
+
size: buffer.length,
|
|
375
|
+
filename,
|
|
376
|
+
textContent,
|
|
377
|
+
archiveMetadata,
|
|
378
|
+
entries,
|
|
379
|
+
securityWarnings,
|
|
380
|
+
},
|
|
381
|
+
};
|
|
382
|
+
}
|
|
383
|
+
catch (error) {
|
|
384
|
+
return {
|
|
385
|
+
success: false,
|
|
386
|
+
error: this.createError(FileErrorCode.PROCESSING_FAILED, {
|
|
387
|
+
fileType: "archive",
|
|
388
|
+
error: error instanceof Error ? error.message : String(error),
|
|
389
|
+
}, error instanceof Error ? error : undefined),
|
|
390
|
+
};
|
|
391
|
+
}
|
|
392
|
+
}
|
|
393
|
+
// ===========================================================================
|
|
394
|
+
// FORMAT DETECTION
|
|
395
|
+
// ===========================================================================
|
|
396
|
+
/**
|
|
397
|
+
* Detect the archive format using magic bytes and file extension.
|
|
398
|
+
* Magic bytes take precedence over extension when available.
|
|
399
|
+
*
|
|
400
|
+
* @param buffer - Raw archive buffer
|
|
401
|
+
* @param filename - Original filename for extension-based fallback
|
|
402
|
+
* @returns Detected archive format, or null if unrecognized
|
|
403
|
+
*/
|
|
404
|
+
detectArchiveFormat(buffer, filename) {
|
|
405
|
+
// Try magic bytes first (most reliable)
|
|
406
|
+
const magicFormat = this.detectFormatFromMagicBytes(buffer);
|
|
407
|
+
if (magicFormat) {
|
|
408
|
+
// For GZIP, check if it wraps a TAR archive
|
|
409
|
+
if (magicFormat === "gz") {
|
|
410
|
+
const ext = filename.toLowerCase();
|
|
411
|
+
if (ext.endsWith(".tar.gz") || ext.endsWith(".tgz") || ext.endsWith(".tbz2")) {
|
|
412
|
+
return "tar.gz";
|
|
413
|
+
}
|
|
414
|
+
// Could still be a tar.gz without the extension - we'll detect during extraction
|
|
415
|
+
return "gz";
|
|
416
|
+
}
|
|
417
|
+
return magicFormat;
|
|
418
|
+
}
|
|
419
|
+
// Fallback to extension-based detection
|
|
420
|
+
return this.detectFormatFromExtension(filename);
|
|
421
|
+
}
|
|
422
|
+
/**
|
|
423
|
+
* Detect archive format from magic bytes at the start of the buffer.
|
|
424
|
+
*
|
|
425
|
+
* @param buffer - Raw archive buffer
|
|
426
|
+
* @returns Detected format, or null if magic bytes don't match any known format
|
|
427
|
+
*/
|
|
428
|
+
detectFormatFromMagicBytes(buffer) {
|
|
429
|
+
if (buffer.length < 2) {
|
|
430
|
+
return null;
|
|
431
|
+
}
|
|
432
|
+
// Check for 7-Zip (6 bytes)
|
|
433
|
+
if (buffer.length >= 6 && this.matchesMagic(buffer, MAGIC_BYTES.SEVEN_ZIP)) {
|
|
434
|
+
return "7z";
|
|
435
|
+
}
|
|
436
|
+
// Check for RAR (6+ bytes)
|
|
437
|
+
if (buffer.length >= 6 && this.matchesMagic(buffer, MAGIC_BYTES.RAR)) {
|
|
438
|
+
return "rar";
|
|
439
|
+
}
|
|
440
|
+
// Check for ZIP/JAR (4 bytes)
|
|
441
|
+
if (buffer.length >= 4 &&
|
|
442
|
+
(this.matchesMagic(buffer, MAGIC_BYTES.ZIP) ||
|
|
443
|
+
this.matchesMagic(buffer, MAGIC_BYTES.ZIP_EMPTY) ||
|
|
444
|
+
this.matchesMagic(buffer, MAGIC_BYTES.ZIP_SPANNED))) {
|
|
445
|
+
return "zip";
|
|
446
|
+
}
|
|
447
|
+
// Check for GZIP (2 bytes)
|
|
448
|
+
if (this.matchesMagic(buffer, MAGIC_BYTES.GZIP)) {
|
|
449
|
+
return "gz";
|
|
450
|
+
}
|
|
451
|
+
// Check for BZIP2 (2 bytes)
|
|
452
|
+
if (this.matchesMagic(buffer, MAGIC_BYTES.BZIP2)) {
|
|
453
|
+
return "tar.bz2";
|
|
454
|
+
}
|
|
455
|
+
return null;
|
|
456
|
+
}
|
|
457
|
+
/**
|
|
458
|
+
* Detect archive format from file extension.
|
|
459
|
+
*
|
|
460
|
+
* @param filename - Filename to extract extension from
|
|
461
|
+
* @returns Detected format, or null if extension is unrecognized
|
|
462
|
+
*/
|
|
463
|
+
detectFormatFromExtension(filename) {
|
|
464
|
+
const lowerFilename = filename.toLowerCase();
|
|
465
|
+
if (lowerFilename.endsWith(".tar.gz") || lowerFilename.endsWith(".tgz")) {
|
|
466
|
+
return "tar.gz";
|
|
467
|
+
}
|
|
468
|
+
if (lowerFilename.endsWith(".tar.bz2") || lowerFilename.endsWith(".tbz2")) {
|
|
469
|
+
return "tar.bz2";
|
|
470
|
+
}
|
|
471
|
+
if (lowerFilename.endsWith(".tar")) {
|
|
472
|
+
return "tar";
|
|
473
|
+
}
|
|
474
|
+
if (lowerFilename.endsWith(".gz")) {
|
|
475
|
+
return "gz";
|
|
476
|
+
}
|
|
477
|
+
if (lowerFilename.endsWith(".bz2")) {
|
|
478
|
+
return "tar.bz2";
|
|
479
|
+
}
|
|
480
|
+
if (lowerFilename.endsWith(".zip") || lowerFilename.endsWith(".jar")) {
|
|
481
|
+
return "zip";
|
|
482
|
+
}
|
|
483
|
+
if (lowerFilename.endsWith(".rar")) {
|
|
484
|
+
return "rar";
|
|
485
|
+
}
|
|
486
|
+
if (lowerFilename.endsWith(".7z")) {
|
|
487
|
+
return "7z";
|
|
488
|
+
}
|
|
489
|
+
return null;
|
|
490
|
+
}
|
|
491
|
+
/**
|
|
492
|
+
* Check if a buffer starts with the given magic byte sequence.
|
|
493
|
+
*
|
|
494
|
+
* @param buffer - Buffer to check
|
|
495
|
+
* @param magic - Expected byte sequence
|
|
496
|
+
* @returns true if the buffer starts with the magic bytes
|
|
497
|
+
*/
|
|
498
|
+
matchesMagic(buffer, magic) {
|
|
499
|
+
for (let i = 0; i < magic.length; i++) {
|
|
500
|
+
if (buffer[i] !== magic[i]) {
|
|
501
|
+
return false;
|
|
502
|
+
}
|
|
503
|
+
}
|
|
504
|
+
return true;
|
|
505
|
+
}
|
|
506
|
+
// ===========================================================================
|
|
507
|
+
// ENTRY EXTRACTION
|
|
508
|
+
// ===========================================================================
|
|
509
|
+
/**
|
|
510
|
+
* Extract entry metadata from the archive.
|
|
511
|
+
* Delegates to format-specific extraction methods.
|
|
512
|
+
*
|
|
513
|
+
* @param buffer - Raw archive buffer
|
|
514
|
+
* @param format - Detected archive format
|
|
515
|
+
* @returns Extraction result with entries and security warnings, or error
|
|
516
|
+
*/
|
|
517
|
+
async extractEntries(buffer, format) {
|
|
518
|
+
switch (format) {
|
|
519
|
+
case "zip":
|
|
520
|
+
return this.extractZipEntries(buffer);
|
|
521
|
+
case "tar":
|
|
522
|
+
return this.extractTarEntries(buffer);
|
|
523
|
+
case "tar.gz":
|
|
524
|
+
return this.extractTarGzEntries(buffer);
|
|
525
|
+
case "tar.bz2":
|
|
526
|
+
return {
|
|
527
|
+
success: false,
|
|
528
|
+
entries: [],
|
|
529
|
+
securityWarnings: [],
|
|
530
|
+
error: this.createError(FileErrorCode.UNSUPPORTED_TYPE, {
|
|
531
|
+
format: "tar.bz2",
|
|
532
|
+
reason: "TAR.BZ2 archives are not yet supported. Please convert to ZIP or TAR.GZ format.",
|
|
533
|
+
supportedFormats: "ZIP, TAR, TAR.GZ, GZ",
|
|
534
|
+
}),
|
|
535
|
+
};
|
|
536
|
+
case "gz":
|
|
537
|
+
return this.extractGzEntries(buffer);
|
|
538
|
+
default:
|
|
539
|
+
return {
|
|
540
|
+
success: false,
|
|
541
|
+
entries: [],
|
|
542
|
+
securityWarnings: [],
|
|
543
|
+
error: this.createError(FileErrorCode.UNSUPPORTED_TYPE, {
|
|
544
|
+
format,
|
|
545
|
+
reason: `${format} archives are not supported`,
|
|
546
|
+
supportedFormats: "ZIP, TAR, TAR.GZ, GZ",
|
|
547
|
+
}),
|
|
548
|
+
};
|
|
549
|
+
}
|
|
550
|
+
}
|
|
551
|
+
// ===========================================================================
|
|
552
|
+
// ZIP EXTRACTION
|
|
553
|
+
// ===========================================================================
|
|
554
|
+
/**
|
|
555
|
+
* Extract entry metadata from a ZIP archive.
|
|
556
|
+
* Validates each entry for path traversal, encryption, symlinks, and size limits.
|
|
557
|
+
*
|
|
558
|
+
* @param buffer - Raw ZIP buffer
|
|
559
|
+
* @returns Extraction result with entries, security warnings, or error
|
|
560
|
+
*/
|
|
561
|
+
async extractZipEntries(buffer) {
|
|
562
|
+
const entries = [];
|
|
563
|
+
const securityWarnings = [];
|
|
564
|
+
try {
|
|
565
|
+
const AdmZip = (await import("adm-zip")).default;
|
|
566
|
+
const zip = new AdmZip(buffer);
|
|
567
|
+
const zipEntries = zip.getEntries();
|
|
568
|
+
// Check entry count limit
|
|
569
|
+
if (zipEntries.length > ARCHIVE_SECURITY.MAX_ENTRIES) {
|
|
570
|
+
return {
|
|
571
|
+
success: false,
|
|
572
|
+
entries: [],
|
|
573
|
+
securityWarnings: [],
|
|
574
|
+
error: this.createError(FileErrorCode.SECURITY_VALIDATION_FAILED, {
|
|
575
|
+
reason: `Archive contains ${zipEntries.length} entries, exceeding the limit of ${ARCHIVE_SECURITY.MAX_ENTRIES}`,
|
|
576
|
+
}),
|
|
577
|
+
};
|
|
578
|
+
}
|
|
579
|
+
let cumulativeUncompressedSize = 0;
|
|
580
|
+
for (const entry of zipEntries) {
|
|
581
|
+
const entryName = entry.entryName;
|
|
582
|
+
// Security: path traversal check
|
|
583
|
+
if (this.hasPathTraversal(entryName)) {
|
|
584
|
+
securityWarnings.push(`Path traversal detected in entry: "${entryName}" - entry skipped`);
|
|
585
|
+
continue;
|
|
586
|
+
}
|
|
587
|
+
// Security: path length check
|
|
588
|
+
if (entryName.length > ARCHIVE_SECURITY.MAX_PATH_LENGTH) {
|
|
589
|
+
securityWarnings.push(`Entry name exceeds maximum path length (${ARCHIVE_SECURITY.MAX_PATH_LENGTH}): "${entryName.substring(0, 50)}..." - entry skipped`);
|
|
590
|
+
continue;
|
|
591
|
+
}
|
|
592
|
+
// Security: encrypted entry check
|
|
593
|
+
if (entry.header.flags & 0x01) {
|
|
594
|
+
if (!ARCHIVE_SECURITY.ALLOW_ENCRYPTED) {
|
|
595
|
+
securityWarnings.push(`Encrypted entry detected: "${entryName}" - entry skipped`);
|
|
596
|
+
continue;
|
|
597
|
+
}
|
|
598
|
+
}
|
|
599
|
+
// Security: symlink check (ZIP external attributes)
|
|
600
|
+
const externalAttr = entry.header.attr >>> 16;
|
|
601
|
+
const isSymlink = (externalAttr & 0xa000) === 0xa000;
|
|
602
|
+
if (isSymlink && !ARCHIVE_SECURITY.ALLOW_SYMLINKS) {
|
|
603
|
+
securityWarnings.push(`Symbolic link detected: "${entryName}" - entry skipped`);
|
|
604
|
+
continue;
|
|
605
|
+
}
|
|
606
|
+
const isDirectory = entry.isDirectory;
|
|
607
|
+
const uncompressedSize = entry.header.size;
|
|
608
|
+
const compressedSize = entry.header.compressedSize;
|
|
609
|
+
// Security: single file size check
|
|
610
|
+
if (!isDirectory && uncompressedSize > ARCHIVE_SECURITY.MAX_SINGLE_FILE_SIZE) {
|
|
611
|
+
securityWarnings.push(`Entry "${entryName}" exceeds single file size limit (${this.formatSizeMB(uncompressedSize)} MB > ${this.formatSizeMB(ARCHIVE_SECURITY.MAX_SINGLE_FILE_SIZE)} MB) - entry listed but flagged`);
|
|
612
|
+
}
|
|
613
|
+
// Security: per-entry compression ratio check
|
|
614
|
+
if (compressedSize > 0 && !isDirectory) {
|
|
615
|
+
const ratio = uncompressedSize / compressedSize;
|
|
616
|
+
if (ratio > ARCHIVE_SECURITY.MAX_COMPRESSION_RATIO) {
|
|
617
|
+
return {
|
|
618
|
+
success: false,
|
|
619
|
+
entries: [],
|
|
620
|
+
securityWarnings: [],
|
|
621
|
+
error: this.createError(FileErrorCode.ZIP_BOMB_DETECTED, {
|
|
622
|
+
entryName,
|
|
623
|
+
compressionRatio: ratio.toFixed(1),
|
|
624
|
+
maxRatio: ARCHIVE_SECURITY.MAX_COMPRESSION_RATIO,
|
|
625
|
+
}),
|
|
626
|
+
};
|
|
627
|
+
}
|
|
628
|
+
}
|
|
629
|
+
// Cumulative decompressed size check
|
|
630
|
+
cumulativeUncompressedSize += uncompressedSize;
|
|
631
|
+
if (cumulativeUncompressedSize > ARCHIVE_SECURITY.MAX_DECOMPRESSED_SIZE) {
|
|
632
|
+
return {
|
|
633
|
+
success: false,
|
|
634
|
+
entries: [],
|
|
635
|
+
securityWarnings: [],
|
|
636
|
+
error: this.createError(FileErrorCode.SECURITY_VALIDATION_FAILED, {
|
|
637
|
+
reason: `Cumulative decompressed size exceeds limit of ${this.formatSizeMB(ARCHIVE_SECURITY.MAX_DECOMPRESSED_SIZE)} MB`,
|
|
638
|
+
}),
|
|
639
|
+
};
|
|
640
|
+
}
|
|
641
|
+
entries.push({
|
|
642
|
+
name: entryName,
|
|
643
|
+
uncompressedSize,
|
|
644
|
+
compressedSize,
|
|
645
|
+
isDirectory,
|
|
646
|
+
});
|
|
647
|
+
}
|
|
648
|
+
return { success: true, entries, securityWarnings };
|
|
649
|
+
}
|
|
650
|
+
catch (error) {
|
|
651
|
+
return {
|
|
652
|
+
success: false,
|
|
653
|
+
entries: [],
|
|
654
|
+
securityWarnings: [],
|
|
655
|
+
error: this.createError(FileErrorCode.CORRUPTED_FILE, {
|
|
656
|
+
reason: `Failed to read ZIP archive: ${error instanceof Error ? error.message : String(error)}`,
|
|
657
|
+
}, error instanceof Error ? error : undefined),
|
|
658
|
+
};
|
|
659
|
+
}
|
|
660
|
+
}
|
|
661
|
+
// ===========================================================================
|
|
662
|
+
// TAR EXTRACTION
|
|
663
|
+
// ===========================================================================
|
|
664
|
+
/**
|
|
665
|
+
* Extract entry metadata from a plain TAR archive.
|
|
666
|
+
*
|
|
667
|
+
* @param buffer - Raw TAR buffer
|
|
668
|
+
* @returns Extraction result with entries and security warnings, or error
|
|
669
|
+
*/
|
|
670
|
+
async extractTarEntries(buffer) {
|
|
671
|
+
try {
|
|
672
|
+
const tarStream = await import("tar-stream");
|
|
673
|
+
return await this.parseTarStream(tarStream, buffer);
|
|
674
|
+
}
|
|
675
|
+
catch (error) {
|
|
676
|
+
return {
|
|
677
|
+
success: false,
|
|
678
|
+
entries: [],
|
|
679
|
+
securityWarnings: [],
|
|
680
|
+
error: this.createError(FileErrorCode.CORRUPTED_FILE, {
|
|
681
|
+
reason: `Failed to read TAR archive: ${error instanceof Error ? error.message : String(error)}`,
|
|
682
|
+
}, error instanceof Error ? error : undefined),
|
|
683
|
+
};
|
|
684
|
+
}
|
|
685
|
+
}
|
|
686
|
+
/**
|
|
687
|
+
* Extract entry metadata from a GZIP-compressed TAR archive.
|
|
688
|
+
* First decompresses with zlib, then parses as TAR.
|
|
689
|
+
*
|
|
690
|
+
* @param buffer - Raw TAR.GZ buffer
|
|
691
|
+
* @returns Extraction result with entries and security warnings, or error
|
|
692
|
+
*/
|
|
693
|
+
async extractTarGzEntries(buffer) {
|
|
694
|
+
try {
|
|
695
|
+
const zlib = await import("zlib");
|
|
696
|
+
const { promisify } = await import("util");
|
|
697
|
+
const gunzip = promisify(zlib.gunzip);
|
|
698
|
+
const decompressed = await gunzip(buffer);
|
|
699
|
+
const tarBuffer = Buffer.from(decompressed);
|
|
700
|
+
// Security: check decompressed size
|
|
701
|
+
if (tarBuffer.length > ARCHIVE_SECURITY.MAX_DECOMPRESSED_SIZE) {
|
|
702
|
+
return {
|
|
703
|
+
success: false,
|
|
704
|
+
entries: [],
|
|
705
|
+
securityWarnings: [],
|
|
706
|
+
error: this.createError(FileErrorCode.SECURITY_VALIDATION_FAILED, {
|
|
707
|
+
reason: `Decompressed TAR size (${this.formatSizeMB(tarBuffer.length)} MB) exceeds limit (${this.formatSizeMB(ARCHIVE_SECURITY.MAX_DECOMPRESSED_SIZE)} MB)`,
|
|
708
|
+
}),
|
|
709
|
+
};
|
|
710
|
+
}
|
|
711
|
+
// Security: check compression ratio
|
|
712
|
+
if (buffer.length > 0) {
|
|
713
|
+
const ratio = tarBuffer.length / buffer.length;
|
|
714
|
+
if (ratio > ARCHIVE_SECURITY.MAX_COMPRESSION_RATIO) {
|
|
715
|
+
return {
|
|
716
|
+
success: false,
|
|
717
|
+
entries: [],
|
|
718
|
+
securityWarnings: [],
|
|
719
|
+
error: this.createError(FileErrorCode.ZIP_BOMB_DETECTED, {
|
|
720
|
+
compressionRatio: ratio.toFixed(1),
|
|
721
|
+
maxRatio: ARCHIVE_SECURITY.MAX_COMPRESSION_RATIO,
|
|
722
|
+
}),
|
|
723
|
+
};
|
|
724
|
+
}
|
|
725
|
+
}
|
|
726
|
+
const tarStream = await import("tar-stream");
|
|
727
|
+
return await this.parseTarStream(tarStream, tarBuffer);
|
|
728
|
+
}
|
|
729
|
+
catch (error) {
|
|
730
|
+
// Check if the error is one we already created (security validation)
|
|
731
|
+
if (error &&
|
|
732
|
+
typeof error === "object" &&
|
|
733
|
+
"code" in error &&
|
|
734
|
+
typeof error.code === "string") {
|
|
735
|
+
// Re-throw our structured errors
|
|
736
|
+
return {
|
|
737
|
+
success: false,
|
|
738
|
+
entries: [],
|
|
739
|
+
securityWarnings: [],
|
|
740
|
+
error: this.createError(FileErrorCode.DECOMPRESSION_FAILED, {
|
|
741
|
+
reason: `Failed to decompress TAR.GZ archive: ${error instanceof Error ? error.message : String(error)}`,
|
|
742
|
+
}, error instanceof Error ? error : undefined),
|
|
743
|
+
};
|
|
744
|
+
}
|
|
745
|
+
return {
|
|
746
|
+
success: false,
|
|
747
|
+
entries: [],
|
|
748
|
+
securityWarnings: [],
|
|
749
|
+
error: this.createError(FileErrorCode.DECOMPRESSION_FAILED, {
|
|
750
|
+
reason: `Failed to decompress TAR.GZ archive: ${error instanceof Error ? error.message : String(error)}`,
|
|
751
|
+
}, error instanceof Error ? error : undefined),
|
|
752
|
+
};
|
|
753
|
+
}
|
|
754
|
+
}
|
|
755
|
+
/**
|
|
756
|
+
* Parse a TAR stream and extract entry metadata.
|
|
757
|
+
* Shared between plain TAR and decompressed TAR.GZ processing.
|
|
758
|
+
*
|
|
759
|
+
* @param tarStream - The imported tar-stream module
|
|
760
|
+
* @param buffer - Raw (decompressed) TAR buffer
|
|
761
|
+
* @returns Extraction result with entries and security warnings, or error
|
|
762
|
+
*/
|
|
763
|
+
async parseTarStream(tarStream, buffer) {
|
|
764
|
+
return new Promise((resolve) => {
|
|
765
|
+
const entries = [];
|
|
766
|
+
const securityWarnings = [];
|
|
767
|
+
let entryCount = 0;
|
|
768
|
+
let cumulativeSize = 0;
|
|
769
|
+
let earlyError = null;
|
|
770
|
+
const extract = tarStream.extract();
|
|
771
|
+
extract.on("entry", (header, stream, next) => {
|
|
772
|
+
entryCount++;
|
|
773
|
+
// Security: entry count limit
|
|
774
|
+
if (entryCount > ARCHIVE_SECURITY.MAX_ENTRIES) {
|
|
775
|
+
earlyError = this.createError(FileErrorCode.SECURITY_VALIDATION_FAILED, {
|
|
776
|
+
reason: `Archive contains more than ${ARCHIVE_SECURITY.MAX_ENTRIES} entries`,
|
|
777
|
+
});
|
|
778
|
+
stream.resume();
|
|
779
|
+
extract.destroy();
|
|
780
|
+
return;
|
|
781
|
+
}
|
|
782
|
+
const entryName = header.name || "";
|
|
783
|
+
const entrySize = header.size || 0;
|
|
784
|
+
const entryType = header.type || "file";
|
|
785
|
+
// Security: path traversal
|
|
786
|
+
if (this.hasPathTraversal(entryName)) {
|
|
787
|
+
securityWarnings.push(`Path traversal detected in entry: "${entryName}" - entry skipped`);
|
|
788
|
+
stream.resume();
|
|
789
|
+
next();
|
|
790
|
+
return;
|
|
791
|
+
}
|
|
792
|
+
// Security: path length
|
|
793
|
+
if (entryName.length > ARCHIVE_SECURITY.MAX_PATH_LENGTH) {
|
|
794
|
+
securityWarnings.push(`Entry name exceeds maximum path length (${ARCHIVE_SECURITY.MAX_PATH_LENGTH}): "${entryName.substring(0, 50)}..." - entry skipped`);
|
|
795
|
+
stream.resume();
|
|
796
|
+
next();
|
|
797
|
+
return;
|
|
798
|
+
}
|
|
799
|
+
// Security: symlinks
|
|
800
|
+
if ((entryType === "symlink" || entryType === "link") && !ARCHIVE_SECURITY.ALLOW_SYMLINKS) {
|
|
801
|
+
securityWarnings.push(`Symbolic/hard link detected: "${entryName}" - entry skipped`);
|
|
802
|
+
stream.resume();
|
|
803
|
+
next();
|
|
804
|
+
return;
|
|
805
|
+
}
|
|
806
|
+
const isDirectory = entryType === "directory";
|
|
807
|
+
// Security: single file size
|
|
808
|
+
if (!isDirectory && entrySize > ARCHIVE_SECURITY.MAX_SINGLE_FILE_SIZE) {
|
|
809
|
+
securityWarnings.push(`Entry "${entryName}" exceeds single file size limit (${this.formatSizeMB(entrySize)} MB > ${this.formatSizeMB(ARCHIVE_SECURITY.MAX_SINGLE_FILE_SIZE)} MB) - entry listed but flagged`);
|
|
810
|
+
}
|
|
811
|
+
// Security: cumulative size
|
|
812
|
+
cumulativeSize += entrySize;
|
|
813
|
+
if (cumulativeSize > ARCHIVE_SECURITY.MAX_DECOMPRESSED_SIZE) {
|
|
814
|
+
earlyError = this.createError(FileErrorCode.SECURITY_VALIDATION_FAILED, {
|
|
815
|
+
reason: `Cumulative entry size exceeds limit of ${this.formatSizeMB(ARCHIVE_SECURITY.MAX_DECOMPRESSED_SIZE)} MB`,
|
|
816
|
+
});
|
|
817
|
+
stream.resume();
|
|
818
|
+
extract.destroy();
|
|
819
|
+
return;
|
|
820
|
+
}
|
|
821
|
+
entries.push({
|
|
822
|
+
name: entryName,
|
|
823
|
+
uncompressedSize: entrySize,
|
|
824
|
+
compressedSize: 0, // TAR doesn't compress individual entries
|
|
825
|
+
isDirectory,
|
|
826
|
+
});
|
|
827
|
+
// Consume the stream without buffering (we only need metadata)
|
|
828
|
+
stream.resume();
|
|
829
|
+
next();
|
|
830
|
+
});
|
|
831
|
+
extract.on("finish", () => {
|
|
832
|
+
if (earlyError) {
|
|
833
|
+
resolve({
|
|
834
|
+
success: false,
|
|
835
|
+
entries: [],
|
|
836
|
+
securityWarnings: [],
|
|
837
|
+
error: earlyError,
|
|
838
|
+
});
|
|
839
|
+
}
|
|
840
|
+
else {
|
|
841
|
+
resolve({ success: true, entries, securityWarnings });
|
|
842
|
+
}
|
|
843
|
+
});
|
|
844
|
+
extract.on("error", (err) => {
|
|
845
|
+
if (earlyError) {
|
|
846
|
+
resolve({
|
|
847
|
+
success: false,
|
|
848
|
+
entries: [],
|
|
849
|
+
securityWarnings: [],
|
|
850
|
+
error: earlyError,
|
|
851
|
+
});
|
|
852
|
+
}
|
|
853
|
+
else {
|
|
854
|
+
resolve({
|
|
855
|
+
success: false,
|
|
856
|
+
entries: [],
|
|
857
|
+
securityWarnings: [],
|
|
858
|
+
error: this.createError(FileErrorCode.CORRUPTED_FILE, {
|
|
859
|
+
reason: `Failed to parse TAR archive: ${err.message}`,
|
|
860
|
+
}, err),
|
|
861
|
+
});
|
|
862
|
+
}
|
|
863
|
+
});
|
|
864
|
+
// Feed the buffer into the extract stream
|
|
865
|
+
extract.end(buffer);
|
|
866
|
+
});
|
|
867
|
+
}
|
|
868
|
+
// ===========================================================================
|
|
869
|
+
// GZIP EXTRACTION (plain, non-TAR)
|
|
870
|
+
// ===========================================================================
|
|
871
|
+
/**
|
|
872
|
+
* Extract metadata from a plain GZIP file (single compressed file, not a TAR).
|
|
873
|
+
* Since plain GZ wraps a single file, we create a single entry using the
|
|
874
|
+
* original filename minus the .gz extension.
|
|
875
|
+
*
|
|
876
|
+
* @param buffer - Raw GZIP buffer
|
|
877
|
+
* @returns Extraction result with a single entry and security warnings, or error
|
|
878
|
+
*/
|
|
879
|
+
async extractGzEntries(buffer) {
|
|
880
|
+
try {
|
|
881
|
+
const zlib = await import("zlib");
|
|
882
|
+
const { promisify } = await import("util");
|
|
883
|
+
const gunzip = promisify(zlib.gunzip);
|
|
884
|
+
const decompressed = await gunzip(buffer);
|
|
885
|
+
// Security: check decompressed size
|
|
886
|
+
if (decompressed.length > ARCHIVE_SECURITY.MAX_DECOMPRESSED_SIZE) {
|
|
887
|
+
return {
|
|
888
|
+
success: false,
|
|
889
|
+
entries: [],
|
|
890
|
+
securityWarnings: [],
|
|
891
|
+
error: this.createError(FileErrorCode.SECURITY_VALIDATION_FAILED, {
|
|
892
|
+
reason: `Decompressed size (${this.formatSizeMB(decompressed.length)} MB) exceeds limit (${this.formatSizeMB(ARCHIVE_SECURITY.MAX_DECOMPRESSED_SIZE)} MB)`,
|
|
893
|
+
}),
|
|
894
|
+
};
|
|
895
|
+
}
|
|
896
|
+
// Security: compression ratio
|
|
897
|
+
if (buffer.length > 0) {
|
|
898
|
+
const ratio = decompressed.length / buffer.length;
|
|
899
|
+
if (ratio > ARCHIVE_SECURITY.MAX_COMPRESSION_RATIO) {
|
|
900
|
+
return {
|
|
901
|
+
success: false,
|
|
902
|
+
entries: [],
|
|
903
|
+
securityWarnings: [],
|
|
904
|
+
error: this.createError(FileErrorCode.ZIP_BOMB_DETECTED, {
|
|
905
|
+
compressionRatio: ratio.toFixed(1),
|
|
906
|
+
maxRatio: ARCHIVE_SECURITY.MAX_COMPRESSION_RATIO,
|
|
907
|
+
}),
|
|
908
|
+
};
|
|
909
|
+
}
|
|
910
|
+
}
|
|
911
|
+
// Check if the decompressed content is actually a TAR
|
|
912
|
+
if (this.looksLikeTar(decompressed)) {
|
|
913
|
+
// It's actually a tar.gz; re-route through TAR extraction
|
|
914
|
+
const tarStream = await import("tar-stream");
|
|
915
|
+
return await this.parseTarStream(tarStream, Buffer.from(decompressed));
|
|
916
|
+
}
|
|
917
|
+
// Plain GZ - single entry
|
|
918
|
+
// Derive the inner filename by removing the .gz extension
|
|
919
|
+
const innerFilename = "decompressed-content";
|
|
920
|
+
const securityWarnings = [];
|
|
921
|
+
const entries = [
|
|
922
|
+
{
|
|
923
|
+
name: innerFilename,
|
|
924
|
+
uncompressedSize: decompressed.length,
|
|
925
|
+
compressedSize: buffer.length,
|
|
926
|
+
isDirectory: false,
|
|
927
|
+
},
|
|
928
|
+
];
|
|
929
|
+
return { success: true, entries, securityWarnings };
|
|
930
|
+
}
|
|
931
|
+
catch (error) {
|
|
932
|
+
return {
|
|
933
|
+
success: false,
|
|
934
|
+
entries: [],
|
|
935
|
+
securityWarnings: [],
|
|
936
|
+
error: this.createError(FileErrorCode.DECOMPRESSION_FAILED, {
|
|
937
|
+
reason: `Failed to decompress GZIP file: ${error instanceof Error ? error.message : String(error)}`,
|
|
938
|
+
}, error instanceof Error ? error : undefined),
|
|
939
|
+
};
|
|
940
|
+
}
|
|
941
|
+
}
|
|
942
|
+
/**
|
|
943
|
+
* Heuristic check to determine if a buffer looks like a TAR archive.
|
|
944
|
+
* TAR archives have a "ustar" magic string at byte offset 257.
|
|
945
|
+
*
|
|
946
|
+
* @param buffer - Decompressed buffer to check
|
|
947
|
+
* @returns true if the buffer appears to be a TAR archive
|
|
948
|
+
*/
|
|
949
|
+
looksLikeTar(buffer) {
|
|
950
|
+
if (buffer.length < 263) {
|
|
951
|
+
return false;
|
|
952
|
+
}
|
|
953
|
+
// "ustar" at offset 257
|
|
954
|
+
const magic = Buffer.from(buffer.slice(257, 263)).toString("ascii");
|
|
955
|
+
return magic.startsWith("ustar");
|
|
956
|
+
}
|
|
957
|
+
// ===========================================================================
|
|
958
|
+
// SECURITY VALIDATION
|
|
959
|
+
// ===========================================================================
|
|
960
|
+
/**
|
|
961
|
+
* Check if an entry name contains path traversal sequences.
|
|
962
|
+
* Detects `../`, absolute paths, and other traversal vectors.
|
|
963
|
+
*
|
|
964
|
+
* @param entryName - Archive entry name/path to validate
|
|
965
|
+
* @returns true if path traversal is detected
|
|
966
|
+
*/
|
|
967
|
+
hasPathTraversal(entryName) {
|
|
968
|
+
// Normalize separators
|
|
969
|
+
const normalized = entryName.replace(/\\/g, "/");
|
|
970
|
+
// Check for parent directory traversal
|
|
971
|
+
if (normalized.includes("../") || normalized.includes("/..")) {
|
|
972
|
+
return true;
|
|
973
|
+
}
|
|
974
|
+
// Check for absolute paths
|
|
975
|
+
if (normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized)) {
|
|
976
|
+
return true;
|
|
977
|
+
}
|
|
978
|
+
// Check resolved path doesn't escape root
|
|
979
|
+
const resolved = path.posix.normalize(normalized);
|
|
980
|
+
if (resolved.startsWith("../") || resolved === "..") {
|
|
981
|
+
return true;
|
|
982
|
+
}
|
|
983
|
+
return false;
|
|
984
|
+
}
|
|
985
|
+
// ===========================================================================
|
|
986
|
+
// CONTENT EXTRACTION (Phase 2 sub-processing)
|
|
987
|
+
// ===========================================================================
|
|
988
|
+
/**
|
|
989
|
+
* Extract text content from eligible ZIP entries for LLM consumption.
|
|
990
|
+
*
|
|
991
|
+
* Selects small, text-based files from the archive and extracts their
|
|
992
|
+
* content. Files are sorted by relevance (config files, source code, docs).
|
|
993
|
+
* Binary files, nested archives, and files exceeding size limits are skipped.
|
|
994
|
+
*
|
|
995
|
+
* @param buffer - Raw ZIP archive buffer
|
|
996
|
+
* @param entries - Previously extracted entry metadata
|
|
997
|
+
* @returns Map of entry name to extracted text content
|
|
998
|
+
*/
|
|
999
|
+
async extractEntryContents(buffer, entries) {
|
|
1000
|
+
const contents = new Map();
|
|
1001
|
+
try {
|
|
1002
|
+
const AdmZip = (await import("adm-zip")).default;
|
|
1003
|
+
const zip = new AdmZip(buffer);
|
|
1004
|
+
// Filter to extractable text-based entries within size limits
|
|
1005
|
+
const candidates = entries
|
|
1006
|
+
.filter((e) => {
|
|
1007
|
+
if (e.isDirectory) {
|
|
1008
|
+
return false;
|
|
1009
|
+
}
|
|
1010
|
+
if (e.uncompressedSize > ARCHIVE_CONFIG.MAX_EXTRACT_ENTRY_SIZE) {
|
|
1011
|
+
return false;
|
|
1012
|
+
}
|
|
1013
|
+
if (e.uncompressedSize === 0) {
|
|
1014
|
+
return false;
|
|
1015
|
+
}
|
|
1016
|
+
const ext = path.extname(e.name).toLowerCase();
|
|
1017
|
+
// Check by extension
|
|
1018
|
+
if (ARCHIVE_CONFIG.EXTRACTABLE_EXTENSIONS.has(ext)) {
|
|
1019
|
+
return true;
|
|
1020
|
+
}
|
|
1021
|
+
// Check for common extensionless config files
|
|
1022
|
+
const basename = path.basename(e.name).toLowerCase();
|
|
1023
|
+
if (basename === "readme" || basename === "license" || basename === "makefile" || basename === "dockerfile") {
|
|
1024
|
+
return true;
|
|
1025
|
+
}
|
|
1026
|
+
return false;
|
|
1027
|
+
})
|
|
1028
|
+
// Sort: smaller files first (more likely to fit), then by name
|
|
1029
|
+
.sort((a, b) => a.uncompressedSize - b.uncompressedSize);
|
|
1030
|
+
let totalExtracted = 0;
|
|
1031
|
+
let extractCount = 0;
|
|
1032
|
+
for (const entry of candidates) {
|
|
1033
|
+
if (extractCount >= ARCHIVE_CONFIG.MAX_EXTRACT_ENTRIES) {
|
|
1034
|
+
break;
|
|
1035
|
+
}
|
|
1036
|
+
if (totalExtracted + entry.uncompressedSize > ARCHIVE_CONFIG.MAX_TOTAL_EXTRACT_SIZE) {
|
|
1037
|
+
break;
|
|
1038
|
+
}
|
|
1039
|
+
try {
|
|
1040
|
+
const zipEntry = zip.getEntry(entry.name);
|
|
1041
|
+
if (!zipEntry) {
|
|
1042
|
+
continue;
|
|
1043
|
+
}
|
|
1044
|
+
const data = zipEntry.getData();
|
|
1045
|
+
if (!data || data.length === 0) {
|
|
1046
|
+
continue;
|
|
1047
|
+
}
|
|
1048
|
+
// Simple binary detection: check for null bytes in first 512 bytes
|
|
1049
|
+
const sample = data.slice(0, Math.min(512, data.length));
|
|
1050
|
+
if (sample.includes(0)) {
|
|
1051
|
+
continue;
|
|
1052
|
+
}
|
|
1053
|
+
const text = data.toString("utf-8");
|
|
1054
|
+
// Sanity check: skip if too many replacement characters (likely binary)
|
|
1055
|
+
const replacementCount = (text.match(/\ufffd/g) || []).length;
|
|
1056
|
+
if (replacementCount > text.length * 0.05) {
|
|
1057
|
+
continue;
|
|
1058
|
+
}
|
|
1059
|
+
contents.set(entry.name, text);
|
|
1060
|
+
totalExtracted += data.length;
|
|
1061
|
+
extractCount++;
|
|
1062
|
+
}
|
|
1063
|
+
catch {
|
|
1064
|
+
// Skip entries that fail to extract (binary, corrupt, etc.)
|
|
1065
|
+
}
|
|
1066
|
+
}
|
|
1067
|
+
}
|
|
1068
|
+
catch {
|
|
1069
|
+
// If ZIP re-parsing fails, return empty — listing is still available
|
|
1070
|
+
}
|
|
1071
|
+
return contents;
|
|
1072
|
+
}
|
|
1073
|
+
// ===========================================================================
|
|
1074
|
+
// TEXT CONTENT BUILDING
|
|
1075
|
+
// ===========================================================================
|
|
1076
|
+
/**
|
|
1077
|
+
* Build a structured text description of the archive for LLM consumption.
|
|
1078
|
+
* Includes archive metadata, file listing with sizes, and security warnings.
|
|
1079
|
+
*
|
|
1080
|
+
* @param filename - Original archive filename
|
|
1081
|
+
* @param metadata - Aggregate archive metadata
|
|
1082
|
+
* @param entries - Individual entry metadata
|
|
1083
|
+
* @param securityWarnings - Security warnings encountered during processing
|
|
1084
|
+
* @param extractedContents - Map of entry name to extracted text content (Phase 2)
|
|
1085
|
+
* @returns Formatted text content string
|
|
1086
|
+
*/
|
|
1087
|
+
buildTextContent(filename, metadata, entries, securityWarnings, extractedContents) {
|
|
1088
|
+
const lines = [];
|
|
1089
|
+
// Header
|
|
1090
|
+
lines.push(`## Archive: ${filename}`);
|
|
1091
|
+
lines.push("");
|
|
1092
|
+
// Metadata
|
|
1093
|
+
lines.push("### Metadata");
|
|
1094
|
+
lines.push(`- **Format:** ${metadata.format.toUpperCase()}`);
|
|
1095
|
+
lines.push(`- **Total entries:** ${metadata.totalEntries}`);
|
|
1096
|
+
lines.push(`- **Total uncompressed size:** ${this.formatHumanReadableSize(metadata.totalUncompressedSize)}`);
|
|
1097
|
+
if (metadata.totalCompressedSize > 0) {
|
|
1098
|
+
lines.push(`- **Total compressed size:** ${this.formatHumanReadableSize(metadata.totalCompressedSize)}`);
|
|
1099
|
+
}
|
|
1100
|
+
lines.push("");
|
|
1101
|
+
// Security warnings
|
|
1102
|
+
if (securityWarnings.length > 0) {
|
|
1103
|
+
lines.push("### Security Warnings");
|
|
1104
|
+
for (const warning of securityWarnings) {
|
|
1105
|
+
lines.push(`- ${warning}`);
|
|
1106
|
+
}
|
|
1107
|
+
lines.push("");
|
|
1108
|
+
}
|
|
1109
|
+
// File listing
|
|
1110
|
+
lines.push("### Contents");
|
|
1111
|
+
lines.push("");
|
|
1112
|
+
// Separate directories and files
|
|
1113
|
+
const directories = entries.filter((e) => e.isDirectory);
|
|
1114
|
+
const files = entries.filter((e) => !e.isDirectory);
|
|
1115
|
+
if (directories.length > 0) {
|
|
1116
|
+
lines.push(`**Directories (${directories.length}):**`);
|
|
1117
|
+
for (const dir of directories) {
|
|
1118
|
+
lines.push(` ${dir.name}`);
|
|
1119
|
+
}
|
|
1120
|
+
lines.push("");
|
|
1121
|
+
}
|
|
1122
|
+
if (files.length > 0) {
|
|
1123
|
+
lines.push(`**Files (${files.length}):**`);
|
|
1124
|
+
// Sort files by path for readability
|
|
1125
|
+
const sortedFiles = [...files].sort((a, b) => a.name.localeCompare(b.name));
|
|
1126
|
+
for (const file of sortedFiles) {
|
|
1127
|
+
const sizeStr = this.formatHumanReadableSize(file.uncompressedSize);
|
|
1128
|
+
lines.push(` ${file.name} (${sizeStr})`);
|
|
1129
|
+
}
|
|
1130
|
+
lines.push("");
|
|
1131
|
+
}
|
|
1132
|
+
if (entries.length === 0) {
|
|
1133
|
+
lines.push("*Archive is empty.*");
|
|
1134
|
+
lines.push("");
|
|
1135
|
+
}
|
|
1136
|
+
// Extracted file contents (Phase 2 sub-processing)
|
|
1137
|
+
if (extractedContents && extractedContents.size > 0) {
|
|
1138
|
+
lines.push("### Extracted File Contents");
|
|
1139
|
+
lines.push("");
|
|
1140
|
+
extractedContents.forEach((content, entryName) => {
|
|
1141
|
+
const ext = path.extname(entryName).replace(".", "");
|
|
1142
|
+
const langHint = ext || "";
|
|
1143
|
+
lines.push(`#### ${entryName}`);
|
|
1144
|
+
lines.push(`\`\`\`${langHint}`);
|
|
1145
|
+
// Truncate very long file contents to avoid excessive token usage
|
|
1146
|
+
if (content.length > 10000) {
|
|
1147
|
+
lines.push(content.slice(0, 8000));
|
|
1148
|
+
lines.push(`\n... [truncated ${content.length - 8000} characters] ...`);
|
|
1149
|
+
lines.push(content.slice(-1000));
|
|
1150
|
+
}
|
|
1151
|
+
else {
|
|
1152
|
+
lines.push(content);
|
|
1153
|
+
}
|
|
1154
|
+
lines.push("```");
|
|
1155
|
+
lines.push("");
|
|
1156
|
+
});
|
|
1157
|
+
}
|
|
1158
|
+
return lines.join("\n");
|
|
1159
|
+
}
|
|
1160
|
+
/**
|
|
1161
|
+
* Format a byte count as a human-readable size string.
|
|
1162
|
+
*
|
|
1163
|
+
* @param bytes - Size in bytes
|
|
1164
|
+
* @returns Formatted string (e.g., "1.5 MB", "256 KB", "128 B")
|
|
1165
|
+
*/
|
|
1166
|
+
formatHumanReadableSize(bytes) {
|
|
1167
|
+
if (bytes === 0) {
|
|
1168
|
+
return "0 B";
|
|
1169
|
+
}
|
|
1170
|
+
const units = ["B", "KB", "MB", "GB"];
|
|
1171
|
+
const k = 1024;
|
|
1172
|
+
const i = Math.floor(Math.log(bytes) / Math.log(k));
|
|
1173
|
+
const idx = Math.min(i, units.length - 1);
|
|
1174
|
+
return `${parseFloat((bytes / k ** idx).toFixed(2))} ${units[idx]}`;
|
|
1175
|
+
}
|
|
1176
|
+
// ===========================================================================
|
|
1177
|
+
// TARGETED EXTRACTION API
|
|
1178
|
+
// ===========================================================================
|
|
1179
|
+
/**
|
|
1180
|
+
* Extract a specific file from a ZIP archive and return its text content.
|
|
1181
|
+
*
|
|
1182
|
+
* Called by the `extract_file_content` tool for targeted access to files
|
|
1183
|
+
* inside archives. Only supports ZIP archives (the most common format).
|
|
1184
|
+
* Applies security checks (path traversal, size limits).
|
|
1185
|
+
*
|
|
1186
|
+
* @param buffer - Archive file buffer
|
|
1187
|
+
* @param entryPath - Path of the entry within the archive (e.g., "src/index.ts")
|
|
1188
|
+
* @returns Text content of the extracted file, or error message
|
|
1189
|
+
*/
|
|
1190
|
+
async extractEntry(buffer, entryPath) {
|
|
1191
|
+
try {
|
|
1192
|
+
const AdmZip = (await import("adm-zip")).default;
|
|
1193
|
+
const zip = new AdmZip(buffer);
|
|
1194
|
+
const entries = zip.getEntries();
|
|
1195
|
+
// Security: check for path traversal
|
|
1196
|
+
if (this.hasPathTraversal(entryPath)) {
|
|
1197
|
+
return `Security error: entry path "${entryPath}" contains path traversal.`;
|
|
1198
|
+
}
|
|
1199
|
+
// Find the matching entry (case-insensitive fallback)
|
|
1200
|
+
let targetEntry = entries.find((e) => e.entryName === entryPath);
|
|
1201
|
+
if (!targetEntry) {
|
|
1202
|
+
targetEntry = entries.find((e) => e.entryName.toLowerCase() === entryPath.toLowerCase());
|
|
1203
|
+
}
|
|
1204
|
+
if (!targetEntry) {
|
|
1205
|
+
// List available entries to help the LLM
|
|
1206
|
+
const available = entries
|
|
1207
|
+
.filter((e) => !e.isDirectory)
|
|
1208
|
+
.slice(0, 20)
|
|
1209
|
+
.map((e) => ` - ${e.entryName} (${this.formatHumanReadableSize(e.header.size)})`)
|
|
1210
|
+
.join("\n");
|
|
1211
|
+
return `Entry "${entryPath}" not found in archive.\n\nAvailable entries (first 20):\n${available}`;
|
|
1212
|
+
}
|
|
1213
|
+
if (targetEntry.isDirectory) {
|
|
1214
|
+
return `"${entryPath}" is a directory, not a file.`;
|
|
1215
|
+
}
|
|
1216
|
+
// Security: size check
|
|
1217
|
+
const maxSize = 5 * 1024 * 1024; // 5 MB
|
|
1218
|
+
if (targetEntry.header.size > maxSize) {
|
|
1219
|
+
return `Entry "${entryPath}" is too large (${this.formatHumanReadableSize(targetEntry.header.size)}). Maximum extraction size is 5 MB.`;
|
|
1220
|
+
}
|
|
1221
|
+
const data = targetEntry.getData();
|
|
1222
|
+
// Check if it looks like text
|
|
1223
|
+
const sampleSize = Math.min(data.length, 512);
|
|
1224
|
+
let printable = 0;
|
|
1225
|
+
for (let i = 0; i < sampleSize; i++) {
|
|
1226
|
+
const b = data[i];
|
|
1227
|
+
if ((b >= 0x20 && b <= 0x7e) || b === 0x09 || b === 0x0a || b === 0x0d || b >= 0x80) {
|
|
1228
|
+
printable++;
|
|
1229
|
+
}
|
|
1230
|
+
}
|
|
1231
|
+
if (sampleSize > 0 && printable / sampleSize < 0.8) {
|
|
1232
|
+
return `Entry "${entryPath}" appears to be a binary file (${this.formatHumanReadableSize(data.length)}). Cannot display as text.`;
|
|
1233
|
+
}
|
|
1234
|
+
return data.toString("utf-8");
|
|
1235
|
+
}
|
|
1236
|
+
catch (err) {
|
|
1237
|
+
return `Failed to extract entry "${entryPath}": ${err instanceof Error ? err.message : String(err)}`;
|
|
1238
|
+
}
|
|
1239
|
+
}
|
|
1240
|
+
}
|
|
1241
|
+
// =============================================================================
|
|
1242
|
+
// SINGLETON INSTANCE
|
|
1243
|
+
// =============================================================================
|
|
1244
|
+
/**
 * Shared, module-level `ArchiveProcessor` instance.
 *
 * The convenience helpers exported from this module (`isArchiveFile`,
 * `processArchive`) delegate to this instance; import it directly when the
 * processor's methods are needed.
 *
 * @example
 * ```typescript
 * import { archiveProcessor } from "./ArchiveProcessor.js";
 *
 * const result = await archiveProcessor.processFile(fileInfo);
 * ```
 */
export const archiveProcessor = new ArchiveProcessor();
|
|
1256
|
+
// =============================================================================
|
|
1257
|
+
// HELPER FUNCTIONS
|
|
1258
|
+
// =============================================================================
|
|
1259
|
+
/**
 * Report whether a file is a recognized archive.
 *
 * Thin wrapper around `archiveProcessor.isFileSupported`, which receives both
 * the MIME type and the filename, so a file can be recognized even when one
 * of the two is missing or unhelpful (see the examples below).
 *
 * @param mimetype - MIME type of the file
 * @param filename - Filename (for extension-based detection)
 * @returns true if the file is a recognized archive format
 *
 * @example
 * ```typescript
 * if (isArchiveFile("application/zip", "backup.zip")) {
 *   // Process as archive
 * }
 *
 * if (isArchiveFile("", "data.tar.gz")) {
 *   // Also matches by extension
 * }
 * ```
 */
export function isArchiveFile(mimetype, filename) {
    const supported = archiveProcessor.isFileSupported(mimetype, filename);
    return supported;
}
|
|
1281
|
+
/**
 * Process a single archive file with the shared singleton processor.
 *
 * Equivalent to calling `archiveProcessor.processFile(fileInfo, options)`;
 * both arguments are forwarded unchanged.
 *
 * @param fileInfo - File information (can include URL or buffer)
 * @param options - Optional processing options (auth headers, timeout, etc.)
 * @returns Processing result with archive metadata and entry listing, or error
 *
 * @example
 * ```typescript
 * import { processArchive } from "./ArchiveProcessor.js";
 *
 * const result = await processArchive(fileInfo, {
 *   authHeaders: { Authorization: "Bearer token" },
 * });
 *
 * if (result.success) {
 *   const { archiveMetadata, entries, textContent } = result.data;
 *   console.log(`Found ${entries.length} entries in ${archiveMetadata.format} archive`);
 *   console.log(textContent);
 * } else {
 *   console.error(`Processing failed: ${result.error?.userMessage}`);
 * }
 * ```
 */
export async function processArchive(fileInfo, options) {
    const pendingResult = archiveProcessor.processFile(fileInfo, options);
    return pendingResult;
}
|