@juspay/neurolink 9.5.2 → 9.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +16 -0
- package/README.md +29 -25
- package/dist/agent/directTools.d.ts +5 -5
- package/dist/cli/commands/config.d.ts +9 -9
- package/dist/cli/commands/serve.d.ts +37 -0
- package/dist/cli/commands/serve.js +302 -229
- package/dist/cli/commands/setup-anthropic.d.ts +2 -2
- package/dist/cli/commands/setup-azure.d.ts +2 -2
- package/dist/cli/commands/setup-bedrock.d.ts +2 -2
- package/dist/cli/commands/setup-gcp.d.ts +2 -2
- package/dist/cli/commands/setup-google-ai.d.ts +2 -2
- package/dist/cli/commands/setup-huggingface.d.ts +2 -2
- package/dist/cli/commands/setup-mistral.d.ts +2 -2
- package/dist/cli/commands/setup-openai.d.ts +2 -2
- package/dist/cli/commands/setup.d.ts +2 -2
- package/dist/cli/factories/commandFactory.js +16 -2
- package/dist/cli/loop/optionsSchema.d.ts +2 -2
- package/dist/cli/loop/session.d.ts +4 -0
- package/dist/cli/loop/session.js +49 -4
- package/dist/cli/utils/interactiveSetup.d.ts +4 -4
- package/dist/config/conversationMemory.d.ts +2 -0
- package/dist/config/conversationMemory.js +5 -5
- package/dist/constants/contextWindows.d.ts +46 -0
- package/dist/constants/contextWindows.js +156 -0
- package/dist/context/budgetChecker.d.ts +18 -0
- package/dist/context/budgetChecker.js +71 -0
- package/dist/context/contextCompactor.d.ts +22 -0
- package/dist/context/contextCompactor.js +106 -0
- package/dist/context/effectiveHistory.d.ts +52 -0
- package/dist/context/effectiveHistory.js +105 -0
- package/dist/context/errorDetection.d.ts +14 -0
- package/dist/context/errorDetection.js +124 -0
- package/dist/context/fileSummarizationService.d.ts +54 -0
- package/dist/context/fileSummarizationService.js +255 -0
- package/dist/context/fileSummarizer.d.ts +56 -0
- package/dist/context/fileSummarizer.js +145 -0
- package/dist/context/fileTokenBudget.d.ts +53 -0
- package/dist/context/fileTokenBudget.js +127 -0
- package/dist/context/prompts/summarizationPrompt.d.ts +17 -0
- package/dist/context/prompts/summarizationPrompt.js +110 -0
- package/dist/context/stages/fileReadDeduplicator.d.ts +10 -0
- package/dist/context/stages/fileReadDeduplicator.js +66 -0
- package/dist/context/stages/slidingWindowTruncator.d.ts +11 -0
- package/dist/context/stages/slidingWindowTruncator.js +42 -0
- package/dist/context/stages/structuredSummarizer.d.ts +10 -0
- package/dist/context/stages/structuredSummarizer.js +49 -0
- package/dist/context/stages/toolOutputPruner.d.ts +10 -0
- package/dist/context/stages/toolOutputPruner.js +52 -0
- package/dist/context/summarizationEngine.d.ts +45 -0
- package/dist/context/summarizationEngine.js +110 -0
- package/dist/context/toolOutputLimits.d.ts +17 -0
- package/dist/context/toolOutputLimits.js +84 -0
- package/dist/context/toolPairRepair.d.ts +16 -0
- package/dist/context/toolPairRepair.js +66 -0
- package/dist/core/conversationMemoryManager.d.ts +5 -15
- package/dist/core/conversationMemoryManager.js +15 -75
- package/dist/core/modules/MessageBuilder.d.ts +1 -1
- package/dist/core/modules/MessageBuilder.js +2 -0
- package/dist/core/modules/TelemetryHandler.d.ts +2 -3
- package/dist/core/modules/TelemetryHandler.js +3 -3
- package/dist/core/modules/ToolsManager.d.ts +2 -2
- package/dist/core/redisConversationMemoryManager.d.ts +8 -14
- package/dist/core/redisConversationMemoryManager.js +69 -78
- package/dist/factories/providerFactory.d.ts +2 -2
- package/dist/files/fileReferenceRegistry.d.ts +276 -0
- package/dist/files/fileReferenceRegistry.js +1543 -0
- package/dist/files/fileTools.d.ts +423 -0
- package/dist/files/fileTools.js +449 -0
- package/dist/files/index.d.ts +14 -0
- package/dist/files/index.js +13 -0
- package/dist/files/streamingReader.d.ts +93 -0
- package/dist/files/streamingReader.js +321 -0
- package/dist/files/types.d.ts +23 -0
- package/dist/files/types.js +23 -0
- package/dist/image-gen/imageGenTools.d.ts +2 -2
- package/dist/image-gen/types.d.ts +12 -12
- package/dist/lib/agent/directTools.d.ts +7 -7
- package/dist/lib/config/conversationMemory.d.ts +2 -0
- package/dist/lib/config/conversationMemory.js +5 -5
- package/dist/lib/constants/contextWindows.d.ts +46 -0
- package/dist/lib/constants/contextWindows.js +157 -0
- package/dist/lib/context/budgetChecker.d.ts +18 -0
- package/dist/lib/context/budgetChecker.js +72 -0
- package/dist/lib/context/contextCompactor.d.ts +22 -0
- package/dist/lib/context/contextCompactor.js +107 -0
- package/dist/lib/context/effectiveHistory.d.ts +52 -0
- package/dist/lib/context/effectiveHistory.js +106 -0
- package/dist/lib/context/errorDetection.d.ts +14 -0
- package/dist/lib/context/errorDetection.js +125 -0
- package/dist/lib/context/fileSummarizationService.d.ts +54 -0
- package/dist/lib/context/fileSummarizationService.js +256 -0
- package/dist/lib/context/fileSummarizer.d.ts +56 -0
- package/dist/lib/context/fileSummarizer.js +146 -0
- package/dist/lib/context/fileTokenBudget.d.ts +53 -0
- package/dist/lib/context/fileTokenBudget.js +128 -0
- package/dist/lib/context/prompts/summarizationPrompt.d.ts +17 -0
- package/dist/lib/context/prompts/summarizationPrompt.js +111 -0
- package/dist/lib/context/stages/fileReadDeduplicator.d.ts +10 -0
- package/dist/lib/context/stages/fileReadDeduplicator.js +67 -0
- package/dist/lib/context/stages/slidingWindowTruncator.d.ts +11 -0
- package/dist/lib/context/stages/slidingWindowTruncator.js +43 -0
- package/dist/lib/context/stages/structuredSummarizer.d.ts +10 -0
- package/dist/lib/context/stages/structuredSummarizer.js +50 -0
- package/dist/lib/context/stages/toolOutputPruner.d.ts +10 -0
- package/dist/lib/context/stages/toolOutputPruner.js +53 -0
- package/dist/lib/context/summarizationEngine.d.ts +45 -0
- package/dist/lib/context/summarizationEngine.js +111 -0
- package/dist/lib/context/toolOutputLimits.d.ts +17 -0
- package/dist/lib/context/toolOutputLimits.js +85 -0
- package/dist/lib/context/toolPairRepair.d.ts +16 -0
- package/dist/lib/context/toolPairRepair.js +67 -0
- package/dist/lib/core/conversationMemoryManager.d.ts +5 -15
- package/dist/lib/core/conversationMemoryManager.js +15 -75
- package/dist/lib/core/modules/MessageBuilder.d.ts +1 -1
- package/dist/lib/core/modules/MessageBuilder.js +2 -0
- package/dist/lib/core/modules/TelemetryHandler.d.ts +2 -3
- package/dist/lib/core/modules/TelemetryHandler.js +3 -3
- package/dist/lib/core/modules/ToolsManager.d.ts +2 -2
- package/dist/lib/core/redisConversationMemoryManager.d.ts +8 -14
- package/dist/lib/core/redisConversationMemoryManager.js +69 -78
- package/dist/lib/factories/providerFactory.d.ts +2 -2
- package/dist/lib/files/fileReferenceRegistry.d.ts +276 -0
- package/dist/lib/files/fileReferenceRegistry.js +1544 -0
- package/dist/lib/files/fileTools.d.ts +423 -0
- package/dist/lib/files/fileTools.js +450 -0
- package/dist/lib/files/index.d.ts +14 -0
- package/dist/lib/files/index.js +14 -0
- package/dist/lib/files/streamingReader.d.ts +93 -0
- package/dist/lib/files/streamingReader.js +322 -0
- package/dist/lib/files/types.d.ts +23 -0
- package/dist/lib/files/types.js +24 -0
- package/dist/lib/image-gen/imageGenTools.d.ts +2 -2
- package/dist/lib/image-gen/types.d.ts +12 -12
- package/dist/lib/memory/mem0Initializer.d.ts +2 -2
- package/dist/lib/neurolink.d.ts +61 -2
- package/dist/lib/neurolink.js +619 -307
- package/dist/lib/processors/archive/ArchiveProcessor.d.ts +327 -0
- package/dist/lib/processors/archive/ArchiveProcessor.js +1309 -0
- package/dist/lib/processors/archive/index.d.ts +33 -0
- package/dist/lib/processors/archive/index.js +43 -0
- package/dist/lib/processors/base/types.d.ts +70 -64
- package/dist/lib/processors/base/types.js +6 -0
- package/dist/lib/processors/cli/fileProcessorCli.d.ts +8 -8
- package/dist/lib/processors/cli/fileProcessorCli.js +5 -5
- package/dist/lib/processors/config/mimeTypes.js +25 -0
- package/dist/lib/processors/config/sizeLimits.d.ts +52 -40
- package/dist/lib/processors/config/sizeLimits.js +56 -44
- package/dist/lib/processors/document/ExcelProcessor.d.ts +14 -0
- package/dist/lib/processors/document/ExcelProcessor.js +72 -1
- package/dist/lib/processors/document/PptxProcessor.d.ts +63 -0
- package/dist/lib/processors/document/PptxProcessor.js +158 -0
- package/dist/lib/processors/document/index.d.ts +1 -0
- package/dist/lib/processors/document/index.js +6 -0
- package/dist/lib/processors/errors/FileErrorCode.d.ts +2 -2
- package/dist/lib/processors/errors/errorHelpers.d.ts +2 -2
- package/dist/lib/processors/errors/errorSerializer.d.ts +4 -4
- package/dist/lib/processors/index.d.ts +8 -2
- package/dist/lib/processors/index.js +5 -2
- package/dist/lib/processors/integration/FileProcessorIntegration.d.ts +8 -8
- package/dist/lib/processors/integration/FileProcessorIntegration.js +7 -7
- package/dist/lib/processors/media/AudioProcessor.d.ts +328 -0
- package/dist/lib/processors/media/AudioProcessor.js +708 -0
- package/dist/lib/processors/media/VideoProcessor.d.ts +350 -0
- package/dist/lib/processors/media/VideoProcessor.js +992 -0
- package/dist/lib/processors/media/index.d.ts +27 -0
- package/dist/lib/processors/media/index.js +37 -0
- package/dist/lib/processors/registry/ProcessorRegistry.d.ts +19 -5
- package/dist/lib/processors/registry/ProcessorRegistry.js +103 -8
- package/dist/lib/processors/registry/index.d.ts +1 -1
- package/dist/lib/processors/registry/index.js +1 -1
- package/dist/lib/processors/registry/types.d.ts +2 -2
- package/dist/lib/providers/googleAiStudio.d.ts +34 -0
- package/dist/lib/providers/googleAiStudio.js +267 -397
- package/dist/lib/providers/googleVertex.d.ts +55 -1
- package/dist/lib/providers/googleVertex.js +452 -719
- package/dist/lib/providers/sagemaker/detection.d.ts +6 -6
- package/dist/lib/providers/sagemaker/diagnostics.d.ts +4 -4
- package/dist/lib/providers/sagemaker/parsers.d.ts +4 -4
- package/dist/lib/rag/chunkers/RecursiveChunker.js +2 -2
- package/dist/lib/rag/document/loaders.d.ts +6 -71
- package/dist/lib/rag/document/loaders.js +5 -5
- package/dist/lib/rag/graphRag/graphRAG.js +26 -9
- package/dist/lib/rag/metadata/MetadataExtractorFactory.d.ts +5 -55
- package/dist/lib/rag/metadata/metadataExtractor.js +6 -3
- package/dist/lib/rag/pipeline/RAGPipeline.d.ts +8 -126
- package/dist/lib/rag/pipeline/RAGPipeline.js +11 -11
- package/dist/lib/rag/pipeline/contextAssembly.d.ts +3 -42
- package/dist/lib/rag/pipeline/contextAssembly.js +6 -3
- package/dist/lib/rag/reranker/RerankerFactory.d.ts +5 -60
- package/dist/lib/rag/resilience/CircuitBreaker.d.ts +3 -33
- package/dist/lib/rag/resilience/RetryHandler.d.ts +2 -21
- package/dist/lib/rag/retrieval/hybridSearch.d.ts +3 -41
- package/dist/lib/rag/retrieval/vectorQueryTool.d.ts +2 -13
- package/dist/lib/rag/retrieval/vectorQueryTool.js +4 -3
- package/dist/lib/rag/types.d.ts +3 -3
- package/dist/lib/sdk/toolRegistration.d.ts +2 -2
- package/dist/lib/server/middleware/cache.d.ts +2 -2
- package/dist/lib/server/middleware/rateLimit.d.ts +2 -2
- package/dist/lib/server/routes/mcpRoutes.js +277 -249
- package/dist/lib/server/routes/memoryRoutes.js +287 -281
- package/dist/lib/server/utils/validation.d.ts +10 -10
- package/dist/lib/session/globalSessionState.d.ts +2 -2
- package/dist/lib/telemetry/telemetryService.d.ts +2 -2
- package/dist/lib/types/common.d.ts +39 -0
- package/dist/lib/types/contextTypes.d.ts +255 -0
- package/dist/lib/types/contextTypes.js +0 -2
- package/dist/lib/types/conversation.d.ts +62 -0
- package/dist/lib/types/conversationMemoryInterface.d.ts +27 -0
- package/dist/lib/types/conversationMemoryInterface.js +7 -0
- package/dist/lib/types/fileReferenceTypes.d.ts +222 -0
- package/dist/lib/types/fileReferenceTypes.js +9 -0
- package/dist/lib/types/fileTypes.d.ts +26 -3
- package/dist/lib/types/generateTypes.d.ts +22 -1
- package/dist/lib/types/index.d.ts +4 -5
- package/dist/lib/types/index.js +8 -10
- package/dist/lib/types/modelTypes.d.ts +2 -2
- package/dist/lib/types/processorTypes.d.ts +597 -0
- package/dist/lib/types/processorTypes.js +91 -0
- package/dist/lib/types/ragTypes.d.ts +481 -0
- package/dist/lib/types/ragTypes.js +8 -0
- package/dist/lib/types/sdkTypes.d.ts +17 -18
- package/dist/lib/types/streamTypes.d.ts +11 -1
- package/dist/lib/utils/async/retry.d.ts +2 -2
- package/dist/lib/utils/async/withTimeout.js +3 -1
- package/dist/lib/utils/conversationMemory.d.ts +12 -6
- package/dist/lib/utils/conversationMemory.js +76 -36
- package/dist/lib/utils/fileDetector.d.ts +62 -0
- package/dist/lib/utils/fileDetector.js +1014 -14
- package/dist/lib/utils/json/safeParse.d.ts +2 -2
- package/dist/lib/utils/messageBuilder.js +806 -153
- package/dist/lib/utils/modelChoices.d.ts +2 -2
- package/dist/lib/utils/multimodalOptionsBuilder.d.ts +2 -1
- package/dist/lib/utils/multimodalOptionsBuilder.js +1 -0
- package/dist/lib/utils/rateLimiter.d.ts +2 -2
- package/dist/lib/utils/sanitizers/filename.d.ts +4 -4
- package/dist/lib/utils/sanitizers/svg.d.ts +2 -2
- package/dist/lib/utils/thinkingConfig.d.ts +6 -6
- package/dist/lib/utils/tokenEstimation.d.ts +68 -0
- package/dist/lib/utils/tokenEstimation.js +113 -0
- package/dist/lib/utils/tokenUtils.d.ts +4 -4
- package/dist/lib/utils/ttsProcessor.d.ts +2 -2
- package/dist/lib/workflow/config.d.ts +150 -150
- package/dist/memory/mem0Initializer.d.ts +2 -2
- package/dist/neurolink.d.ts +61 -2
- package/dist/neurolink.js +619 -307
- package/dist/processors/archive/ArchiveProcessor.d.ts +327 -0
- package/dist/processors/archive/ArchiveProcessor.js +1308 -0
- package/dist/processors/archive/index.d.ts +33 -0
- package/dist/processors/archive/index.js +42 -0
- package/dist/processors/base/types.d.ts +70 -64
- package/dist/processors/base/types.js +6 -0
- package/dist/processors/cli/fileProcessorCli.d.ts +8 -8
- package/dist/processors/cli/fileProcessorCli.js +5 -5
- package/dist/processors/config/mimeTypes.js +25 -0
- package/dist/processors/config/sizeLimits.d.ts +52 -40
- package/dist/processors/config/sizeLimits.js +56 -44
- package/dist/processors/document/ExcelProcessor.d.ts +14 -0
- package/dist/processors/document/ExcelProcessor.js +72 -1
- package/dist/processors/document/PptxProcessor.d.ts +63 -0
- package/dist/processors/document/PptxProcessor.js +157 -0
- package/dist/processors/document/index.d.ts +1 -0
- package/dist/processors/document/index.js +6 -0
- package/dist/processors/errors/FileErrorCode.d.ts +2 -2
- package/dist/processors/errors/errorHelpers.d.ts +2 -2
- package/dist/processors/errors/errorSerializer.d.ts +4 -4
- package/dist/processors/index.d.ts +8 -2
- package/dist/processors/index.js +5 -2
- package/dist/processors/integration/FileProcessorIntegration.d.ts +8 -8
- package/dist/processors/integration/FileProcessorIntegration.js +7 -7
- package/dist/processors/media/AudioProcessor.d.ts +328 -0
- package/dist/processors/media/AudioProcessor.js +707 -0
- package/dist/processors/media/VideoProcessor.d.ts +350 -0
- package/dist/processors/media/VideoProcessor.js +991 -0
- package/dist/processors/media/ffprobe-static.d.ts +4 -0
- package/dist/processors/media/index.d.ts +27 -0
- package/dist/processors/media/index.js +36 -0
- package/dist/processors/registry/ProcessorRegistry.d.ts +19 -5
- package/dist/processors/registry/ProcessorRegistry.js +103 -8
- package/dist/processors/registry/index.d.ts +1 -1
- package/dist/processors/registry/index.js +1 -1
- package/dist/processors/registry/types.d.ts +2 -2
- package/dist/providers/googleAiStudio.d.ts +34 -0
- package/dist/providers/googleAiStudio.js +267 -397
- package/dist/providers/googleVertex.d.ts +55 -1
- package/dist/providers/googleVertex.js +452 -719
- package/dist/providers/sagemaker/detection.d.ts +6 -6
- package/dist/providers/sagemaker/diagnostics.d.ts +4 -4
- package/dist/providers/sagemaker/parsers.d.ts +4 -4
- package/dist/rag/chunkers/RecursiveChunker.js +2 -2
- package/dist/rag/document/loaders.d.ts +6 -71
- package/dist/rag/document/loaders.js +5 -5
- package/dist/rag/graphRag/graphRAG.js +26 -9
- package/dist/rag/metadata/MetadataExtractorFactory.d.ts +5 -55
- package/dist/rag/metadata/metadataExtractor.js +6 -3
- package/dist/rag/pipeline/RAGPipeline.d.ts +8 -126
- package/dist/rag/pipeline/RAGPipeline.js +11 -11
- package/dist/rag/pipeline/contextAssembly.d.ts +3 -42
- package/dist/rag/pipeline/contextAssembly.js +6 -3
- package/dist/rag/reranker/RerankerFactory.d.ts +5 -60
- package/dist/rag/resilience/CircuitBreaker.d.ts +3 -33
- package/dist/rag/resilience/RetryHandler.d.ts +2 -21
- package/dist/rag/retrieval/hybridSearch.d.ts +3 -41
- package/dist/rag/retrieval/vectorQueryTool.d.ts +2 -13
- package/dist/rag/retrieval/vectorQueryTool.js +4 -3
- package/dist/rag/types.d.ts +3 -3
- package/dist/sdk/toolRegistration.d.ts +2 -2
- package/dist/server/middleware/cache.d.ts +2 -2
- package/dist/server/middleware/rateLimit.d.ts +2 -2
- package/dist/server/routes/mcpRoutes.js +277 -249
- package/dist/server/routes/memoryRoutes.js +287 -281
- package/dist/server/utils/validation.d.ts +4 -4
- package/dist/session/globalSessionState.d.ts +2 -2
- package/dist/telemetry/telemetryService.d.ts +2 -2
- package/dist/types/common.d.ts +39 -0
- package/dist/types/contextTypes.d.ts +255 -0
- package/dist/types/contextTypes.js +0 -2
- package/dist/types/conversation.d.ts +62 -0
- package/dist/types/conversationMemoryInterface.d.ts +27 -0
- package/dist/types/conversationMemoryInterface.js +6 -0
- package/dist/types/fileReferenceTypes.d.ts +222 -0
- package/dist/types/fileReferenceTypes.js +8 -0
- package/dist/types/fileTypes.d.ts +26 -3
- package/dist/types/generateTypes.d.ts +22 -1
- package/dist/types/index.d.ts +4 -5
- package/dist/types/index.js +8 -10
- package/dist/types/processorTypes.d.ts +597 -0
- package/dist/types/processorTypes.js +90 -0
- package/dist/types/ragTypes.d.ts +481 -0
- package/dist/types/ragTypes.js +7 -0
- package/dist/types/sdkTypes.d.ts +17 -18
- package/dist/types/streamTypes.d.ts +11 -1
- package/dist/utils/async/retry.d.ts +2 -2
- package/dist/utils/async/withTimeout.js +3 -1
- package/dist/utils/conversationMemory.d.ts +12 -6
- package/dist/utils/conversationMemory.js +76 -36
- package/dist/utils/fileDetector.d.ts +62 -0
- package/dist/utils/fileDetector.js +1014 -14
- package/dist/utils/json/safeParse.d.ts +2 -2
- package/dist/utils/messageBuilder.js +806 -153
- package/dist/utils/modelChoices.d.ts +2 -2
- package/dist/utils/multimodalOptionsBuilder.d.ts +2 -1
- package/dist/utils/multimodalOptionsBuilder.js +1 -0
- package/dist/utils/rateLimiter.d.ts +2 -2
- package/dist/utils/sanitizers/filename.d.ts +4 -4
- package/dist/utils/sanitizers/svg.d.ts +2 -2
- package/dist/utils/thinkingConfig.d.ts +6 -6
- package/dist/utils/tokenEstimation.d.ts +68 -0
- package/dist/utils/tokenEstimation.js +112 -0
- package/dist/utils/tokenUtils.d.ts +4 -4
- package/dist/utils/ttsProcessor.d.ts +2 -2
- package/dist/workflow/config.d.ts +104 -104
- package/package.json +18 -6
- package/dist/lib/utils/conversationMemoryUtils.d.ts +0 -25
- package/dist/lib/utils/conversationMemoryUtils.js +0 -138
- package/dist/utils/conversationMemoryUtils.d.ts +0 -25
- package/dist/utils/conversationMemoryUtils.js +0 -137
|
@@ -0,0 +1,1543 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* File Reference Registry
|
|
3
|
+
*
|
|
4
|
+
* Central registry for managing file references in on-demand processing mode.
|
|
5
|
+
* Files are registered with lightweight metadata and previews. Full content
|
|
6
|
+
* is processed on-demand when the LLM requests it via tools.
|
|
7
|
+
*
|
|
8
|
+
* This module is the core of the file reference architecture, replacing
|
|
9
|
+
* the previous "load everything upfront" pattern for files that exceed
|
|
10
|
+
* the tiny/small size tiers.
|
|
11
|
+
*
|
|
12
|
+
* @module files/fileReferenceRegistry
|
|
13
|
+
*/
|
|
14
|
+
import { randomUUID } from "node:crypto";
|
|
15
|
+
import { mkdir, readFile, stat, unlink, writeFile } from "node:fs/promises";
|
|
16
|
+
import { tmpdir } from "node:os";
|
|
17
|
+
import { basename, extname, join } from "node:path";
|
|
18
|
+
import { estimatePostProcessingTokens } from "../context/fileTokenBudget.js";
|
|
19
|
+
import { logger } from "../utils/logger.js";
|
|
20
|
+
import { StreamingReader } from "./streamingReader.js";
|
|
21
|
+
import { SIZE_TIER_THRESHOLDS } from "./types.js";
|
|
22
|
+
/** Upper bound on registered files; reaching it triggers LRU eviction. */
const DEFAULT_MAX_FILES = 100;

/** Upper bound on bytes persisted to the temp directory: 1 GB (2^30 bytes). */
const DEFAULT_MAX_TEMP_BYTES = 2 ** 30;

/** Number of characters kept in a file preview unless overridden. */
const DEFAULT_PREVIEW_CHARS = 2_000;

/** Largest file the registry accepts: 2 GB (2^31 bytes). */
const MAX_ACCEPTED_SIZE = 2 ** 31;
|
|
30
|
+
/**
|
|
31
|
+
* Registry for managing file references with on-demand processing.
|
|
32
|
+
*
|
|
33
|
+
* Design decisions:
|
|
34
|
+
* - One instance per NeuroLink SDK instance (not global singleton)
|
|
35
|
+
* - File buffers persisted to temp dir for later streaming access
|
|
36
|
+
* - LRU eviction when maxFiles exceeded
|
|
37
|
+
* - Thread-safe via sequential async operations (Node.js single-threaded)
|
|
38
|
+
*
|
|
39
|
+
* @example
|
|
40
|
+
* ```typescript
|
|
41
|
+
* const registry = new FileReferenceRegistry();
|
|
42
|
+
* const ref = await registry.register(buffer, {
|
|
43
|
+
* filename: 'report.xlsx',
|
|
44
|
+
* });
|
|
45
|
+
* console.log(ref.sizeTier); // 'medium'
|
|
46
|
+
* console.log(ref.preview); // First 2000 chars of processed content
|
|
47
|
+
* console.log(ref.estimatedTokens); // Type-aware estimate
|
|
48
|
+
*
|
|
49
|
+
* // Later, LLM requests specific section
|
|
50
|
+
* const section = await registry.readSection(ref.id, 1, 50, 5000);
|
|
51
|
+
* ```
|
|
52
|
+
*/
|
|
53
|
+
export class FileReferenceRegistry {
|
|
54
|
+
files = new Map();
|
|
55
|
+
tempDir;
|
|
56
|
+
maxFiles;
|
|
57
|
+
maxTempBytes;
|
|
58
|
+
defaultPreviewChars;
|
|
59
|
+
currentTempBytes = 0;
|
|
60
|
+
tempDirCreated = false;
|
|
61
|
+
constructor(options = {}) {
|
|
62
|
+
this.tempDir =
|
|
63
|
+
options.tempDir || join(tmpdir(), "neurolink-files", randomUUID());
|
|
64
|
+
this.maxFiles = options.maxFiles ?? DEFAULT_MAX_FILES;
|
|
65
|
+
this.maxTempBytes = options.maxTempBytes ?? DEFAULT_MAX_TEMP_BYTES;
|
|
66
|
+
this.defaultPreviewChars =
|
|
67
|
+
options.defaultPreviewChars ?? DEFAULT_PREVIEW_CHARS;
|
|
68
|
+
}
|
|
69
|
+
/**
|
|
70
|
+
* Register a file from a Buffer.
|
|
71
|
+
*
|
|
72
|
+
* This is the primary registration method. It performs lightweight analysis:
|
|
73
|
+
* 1. Detect file type from magic bytes (first 1KB)
|
|
74
|
+
* 2. Determine size tier
|
|
75
|
+
* 3. Extract preview (first N chars of text, or metadata for binary)
|
|
76
|
+
* 4. Persist buffer to temp directory for later streaming access
|
|
77
|
+
*
|
|
78
|
+
* Total time: ~1-5ms for most files (no full processing).
|
|
79
|
+
*
|
|
80
|
+
* @param buffer - File content as Buffer
|
|
81
|
+
* @param source - How the file was provided ('buffer', 'url', 'path', 'datauri')
|
|
82
|
+
* @param options - Registration options
|
|
83
|
+
* @returns FileReference with metadata and preview
|
|
84
|
+
*/
|
|
85
|
+
async register(buffer, source = "buffer", options = {}) {
|
|
86
|
+
const sizeBytes = buffer.length;
|
|
87
|
+
// Reject oversized files
|
|
88
|
+
if (sizeBytes > MAX_ACCEPTED_SIZE) {
|
|
89
|
+
const sizeMB = (sizeBytes / (1024 * 1024)).toFixed(1);
|
|
90
|
+
throw new Error(`File too large (${sizeMB} MB). Maximum accepted size is 2 GB.`);
|
|
91
|
+
}
|
|
92
|
+
// Detect file type from magic bytes and extension.
|
|
93
|
+
// If the provided filename has no extension, append one guessed from magic bytes
|
|
94
|
+
// so downstream processors (e.g., VideoProcessor) can validate by extension.
|
|
95
|
+
let filename = options.filename || `file-${Date.now()}${this.guessExtension(buffer)}`;
|
|
96
|
+
if (!extname(filename)) {
|
|
97
|
+
const guessedExt = this.guessExtension(buffer);
|
|
98
|
+
if (guessedExt) {
|
|
99
|
+
filename = `${filename}${guessedExt}`;
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
const ext = extname(filename).toLowerCase().replace(".", "");
|
|
103
|
+
const detectedType = options.fileType || this.detectType(buffer, ext);
|
|
104
|
+
const mimeType = this.guessMimeType(detectedType, ext);
|
|
105
|
+
const sizeTier = FileReferenceRegistry.classifySizeTier(sizeBytes);
|
|
106
|
+
// Generate preview (fast — only reads first N chars)
|
|
107
|
+
const preview = this.extractPreview(buffer, detectedType, options.maxPreviewChars ?? this.defaultPreviewChars);
|
|
108
|
+
// Estimate post-processing tokens (type-aware)
|
|
109
|
+
const estimatedTokens = estimatePostProcessingTokens(sizeBytes, detectedType);
|
|
110
|
+
// Create reference
|
|
111
|
+
const ref = {
|
|
112
|
+
id: randomUUID(),
|
|
113
|
+
source,
|
|
114
|
+
filename,
|
|
115
|
+
sizeBytes,
|
|
116
|
+
detectedType,
|
|
117
|
+
mimeType,
|
|
118
|
+
sizeTier,
|
|
119
|
+
estimatedTokens,
|
|
120
|
+
preview,
|
|
121
|
+
status: "registered",
|
|
122
|
+
registeredAt: Date.now(),
|
|
123
|
+
lastAccessedAt: Date.now(),
|
|
124
|
+
extension: ext || undefined,
|
|
125
|
+
};
|
|
126
|
+
// Persist buffer to temp directory (unless skipped or tiny)
|
|
127
|
+
if (!options.skipTempPersist && sizeTier !== "tiny") {
|
|
128
|
+
try {
|
|
129
|
+
const tempPath = await this.persistToTemp(ref.id, buffer, ext);
|
|
130
|
+
ref.tempPath = tempPath;
|
|
131
|
+
}
|
|
132
|
+
catch (err) {
|
|
133
|
+
logger.warn(`[FileReferenceRegistry] Failed to persist ${filename} to temp: ${err instanceof Error ? err.message : String(err)}`);
|
|
134
|
+
// Continue without temp persistence — buffer-based access still works
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
// For tiny files, store the processed content inline
|
|
138
|
+
if (sizeTier === "tiny") {
|
|
139
|
+
ref.processedContent = this.isTextType(detectedType, buffer)
|
|
140
|
+
? buffer.toString("utf-8")
|
|
141
|
+
: preview;
|
|
142
|
+
ref.status = "processed";
|
|
143
|
+
}
|
|
144
|
+
else {
|
|
145
|
+
ref.status = "previewed";
|
|
146
|
+
}
|
|
147
|
+
// Evict LRU entries if at capacity
|
|
148
|
+
if (this.files.size >= this.maxFiles) {
|
|
149
|
+
this.evictLRU();
|
|
150
|
+
}
|
|
151
|
+
this.files.set(ref.id, ref);
|
|
152
|
+
logger.info(`[FileReferenceRegistry] Registered "${filename}" (${this.formatSize(sizeBytes)}, ` +
|
|
153
|
+
`tier=${sizeTier}, type=${detectedType}, ~${estimatedTokens} tokens)`);
|
|
154
|
+
return ref;
|
|
155
|
+
}
|
|
156
|
+
/**
|
|
157
|
+
* Register a file from a file path on disk.
|
|
158
|
+
*
|
|
159
|
+
* Does NOT read the entire file — only reads the first 1KB for type detection
|
|
160
|
+
* and preview. The file path is stored for later streaming access.
|
|
161
|
+
*
|
|
162
|
+
* @param filePath - Absolute path to the file
|
|
163
|
+
* @param options - Registration options
|
|
164
|
+
* @returns FileReference with metadata and preview
|
|
165
|
+
*/
|
|
166
|
+
async registerFromPath(filePath, options = {}) {
|
|
167
|
+
const fileStat = await stat(filePath);
|
|
168
|
+
const sizeBytes = fileStat.size;
|
|
169
|
+
if (sizeBytes > MAX_ACCEPTED_SIZE) {
|
|
170
|
+
const sizeMB = (sizeBytes / (1024 * 1024)).toFixed(1);
|
|
171
|
+
throw new Error(`File too large (${sizeMB} MB). Maximum accepted size is 2 GB.`);
|
|
172
|
+
}
|
|
173
|
+
const filename = options.filename || basename(filePath);
|
|
174
|
+
const ext = extname(filename).toLowerCase().replace(".", "");
|
|
175
|
+
const detectedType = options.fileType || this.detectTypeFromExtension(ext);
|
|
176
|
+
const mimeType = this.guessMimeType(detectedType, ext);
|
|
177
|
+
const sizeTier = FileReferenceRegistry.classifySizeTier(sizeBytes);
|
|
178
|
+
const estimatedTokens = estimatePostProcessingTokens(sizeBytes, detectedType);
|
|
179
|
+
// Read preview from file (streaming — only first N bytes)
|
|
180
|
+
let preview;
|
|
181
|
+
try {
|
|
182
|
+
preview = await StreamingReader.readPreview(filePath, options.maxPreviewChars ?? this.defaultPreviewChars);
|
|
183
|
+
}
|
|
184
|
+
catch {
|
|
185
|
+
preview = `[File: ${filename}, ${this.formatSize(sizeBytes)}, type: ${detectedType}]`;
|
|
186
|
+
}
|
|
187
|
+
const ref = {
|
|
188
|
+
id: randomUUID(),
|
|
189
|
+
source: "path",
|
|
190
|
+
originalPath: filePath,
|
|
191
|
+
filename,
|
|
192
|
+
sizeBytes,
|
|
193
|
+
detectedType,
|
|
194
|
+
mimeType,
|
|
195
|
+
sizeTier,
|
|
196
|
+
estimatedTokens,
|
|
197
|
+
preview,
|
|
198
|
+
status: "previewed",
|
|
199
|
+
registeredAt: Date.now(),
|
|
200
|
+
lastAccessedAt: Date.now(),
|
|
201
|
+
extension: ext || undefined,
|
|
202
|
+
};
|
|
203
|
+
// For path-based files, no need to persist — we already have the path
|
|
204
|
+
// Store the original path as the access point
|
|
205
|
+
ref.tempPath = filePath;
|
|
206
|
+
if (this.files.size >= this.maxFiles) {
|
|
207
|
+
this.evictLRU();
|
|
208
|
+
}
|
|
209
|
+
this.files.set(ref.id, ref);
|
|
210
|
+
logger.info(`[FileReferenceRegistry] Registered from path "${filename}" ` +
|
|
211
|
+
`(${this.formatSize(sizeBytes)}, tier=${sizeTier}, type=${detectedType})`);
|
|
212
|
+
return ref;
|
|
213
|
+
}
|
|
214
|
+
/**
|
|
215
|
+
* Get a file reference by ID.
|
|
216
|
+
* Updates lastAccessedAt for LRU tracking.
|
|
217
|
+
*/
|
|
218
|
+
get(id) {
|
|
219
|
+
const ref = this.files.get(id);
|
|
220
|
+
if (ref) {
|
|
221
|
+
ref.lastAccessedAt = Date.now();
|
|
222
|
+
}
|
|
223
|
+
return ref;
|
|
224
|
+
}
|
|
225
|
+
/**
|
|
226
|
+
* Get a file reference by ID or filename.
|
|
227
|
+
* Tries ID lookup first, then falls back to filename match.
|
|
228
|
+
* This handles the common case where an LLM uses the filename
|
|
229
|
+
* instead of the UUID when calling file tools.
|
|
230
|
+
*
|
|
231
|
+
* @param idOrName - UUID or filename to search for
|
|
232
|
+
* @returns File reference if found, undefined otherwise
|
|
233
|
+
*/
|
|
234
|
+
getByIdOrFilename(idOrName) {
|
|
235
|
+
// Try direct ID lookup first (most common, O(1))
|
|
236
|
+
const byId = this.get(idOrName);
|
|
237
|
+
if (byId) {
|
|
238
|
+
return byId;
|
|
239
|
+
}
|
|
240
|
+
// Fallback: search by filename (case-insensitive)
|
|
241
|
+
const lowerName = idOrName.toLowerCase();
|
|
242
|
+
for (const ref of this.files.values()) {
|
|
243
|
+
if (ref.filename.toLowerCase() === lowerName) {
|
|
244
|
+
ref.lastAccessedAt = Date.now();
|
|
245
|
+
return ref;
|
|
246
|
+
}
|
|
247
|
+
}
|
|
248
|
+
// Fallback: search by basename (without path)
|
|
249
|
+
for (const ref of this.files.values()) {
|
|
250
|
+
const refBasename = ref.filename.split("/").pop()?.toLowerCase() ?? "";
|
|
251
|
+
if (refBasename === lowerName) {
|
|
252
|
+
ref.lastAccessedAt = Date.now();
|
|
253
|
+
return ref;
|
|
254
|
+
}
|
|
255
|
+
}
|
|
256
|
+
return undefined;
|
|
257
|
+
}
|
|
258
|
+
/**
|
|
259
|
+
* Ensure a file has been processed (binary content extracted to text).
|
|
260
|
+
*
|
|
261
|
+
* For text files this is a no-op. For binary files (PDF, XLSX, video, etc.)
|
|
262
|
+
* this triggers on-demand processing if it hasn't happened yet. After this
|
|
263
|
+
* call, ref.processedContent and ref.preview contain extracted text.
|
|
264
|
+
*
|
|
265
|
+
* Used by file tools (get_file_preview) to ensure the preview contains
|
|
266
|
+
* real content instead of placeholder metadata strings.
|
|
267
|
+
*/
|
|
268
|
+
async ensureProcessed(fileId) {
|
|
269
|
+
const ref = this.get(fileId);
|
|
270
|
+
if (!ref) {
|
|
271
|
+
return;
|
|
272
|
+
}
|
|
273
|
+
if (!ref.processedContent && !this.isTextType(ref.detectedType)) {
|
|
274
|
+
await this.processFileOnDemand(ref);
|
|
275
|
+
}
|
|
276
|
+
}
|
|
277
|
+
/**
|
|
278
|
+
* Extract targeted content from a registered file.
|
|
279
|
+
*
|
|
280
|
+
* This is the core dispatch method for the `extract_file_content` tool.
|
|
281
|
+
* Routes extraction to the appropriate processor based on file type and
|
|
282
|
+
* the parameters provided.
|
|
283
|
+
*
|
|
284
|
+
* @param params - Extraction parameters (file_id + type-specific options)
|
|
285
|
+
* @returns Extraction result with text and/or images
|
|
286
|
+
*/
|
|
287
|
+
async extractContent(params) {
|
|
288
|
+
const ref = this.getByIdOrFilename(params.file_id);
|
|
289
|
+
if (!ref) {
|
|
290
|
+
return {
|
|
291
|
+
success: false,
|
|
292
|
+
error: `File not found: "${params.file_id}". Use list_attached_files to see available files.`,
|
|
293
|
+
};
|
|
294
|
+
}
|
|
295
|
+
try {
|
|
296
|
+
// Text-like types don't need raw buffer — they use readSection
|
|
297
|
+
// which works from processedContent (tiny files) or tempPath (larger files)
|
|
298
|
+
if (this.isTextType(ref.detectedType) ||
|
|
299
|
+
ref.detectedType === "csv" ||
|
|
300
|
+
ref.detectedType === "svg" ||
|
|
301
|
+
ref.detectedType === "unknown") {
|
|
302
|
+
return await this.extractTextTargeted(ref, params);
|
|
303
|
+
}
|
|
304
|
+
// Binary types need the raw buffer for processor-specific extraction
|
|
305
|
+
const buffer = ref.tempPath ? await readFile(ref.tempPath) : null;
|
|
306
|
+
if (!buffer) {
|
|
307
|
+
return {
|
|
308
|
+
success: false,
|
|
309
|
+
error: `No file data available for "${ref.filename}". The file may have been evicted from cache.`,
|
|
310
|
+
};
|
|
311
|
+
}
|
|
312
|
+
switch (ref.detectedType) {
|
|
313
|
+
case "video":
|
|
314
|
+
return await this.extractVideoTargeted(buffer, ref, params);
|
|
315
|
+
case "pdf":
|
|
316
|
+
return await this.extractPdfTargeted(buffer, ref, params);
|
|
317
|
+
case "xlsx":
|
|
318
|
+
return await this.extractExcelTargeted(buffer, ref, params);
|
|
319
|
+
case "pptx":
|
|
320
|
+
return await this.extractPptxTargeted(buffer, ref, params);
|
|
321
|
+
case "archive":
|
|
322
|
+
return await this.extractArchiveTargeted(buffer, ref, params);
|
|
323
|
+
case "audio":
|
|
324
|
+
return await this.extractAudioTargeted(buffer, ref, params);
|
|
325
|
+
default:
|
|
326
|
+
// Fallback for any unrecognized binary type
|
|
327
|
+
return await this.extractTextTargeted(ref, params);
|
|
328
|
+
}
|
|
329
|
+
}
|
|
330
|
+
catch (err) {
|
|
331
|
+
return {
|
|
332
|
+
success: false,
|
|
333
|
+
error: `Extraction failed for "${ref.filename}": ${err instanceof Error ? err.message : String(err)}`,
|
|
334
|
+
};
|
|
335
|
+
}
|
|
336
|
+
}
|
|
337
|
+
// ─── Targeted Extraction Dispatchers ──────────────────────────────
|
|
338
|
+
async extractVideoTargeted(buffer, ref, params) {
|
|
339
|
+
const { videoProcessor } = await import("../processors/media/VideoProcessor.js");
|
|
340
|
+
// If time range specified, extract frames from that range
|
|
341
|
+
if (params.start_time !== undefined && params.end_time !== undefined) {
|
|
342
|
+
const frames = await videoProcessor.extractFrameRange(buffer, ref.filename, params.start_time, params.end_time, params.frame_count ?? 5);
|
|
343
|
+
return {
|
|
344
|
+
success: true,
|
|
345
|
+
text: `Extracted ${frames.length} frames from ${ref.filename} (${params.start_time}s - ${params.end_time}s)`,
|
|
346
|
+
images: frames,
|
|
347
|
+
metadata: {
|
|
348
|
+
startTime: params.start_time,
|
|
349
|
+
endTime: params.end_time,
|
|
350
|
+
frameCount: frames.length,
|
|
351
|
+
},
|
|
352
|
+
};
|
|
353
|
+
}
|
|
354
|
+
// No time range: return full metadata + initial keyframes
|
|
355
|
+
if (!ref.processedContent) {
|
|
356
|
+
await this.processFileOnDemand(ref);
|
|
357
|
+
}
|
|
358
|
+
return {
|
|
359
|
+
success: true,
|
|
360
|
+
text: ref.processedContent || `[Video: ${ref.filename}]`,
|
|
361
|
+
images: ref.extractedImages ?? undefined,
|
|
362
|
+
};
|
|
363
|
+
}
|
|
364
|
+
async extractPdfTargeted(buffer, ref, params) {
|
|
365
|
+
// If specific pages requested, extract those pages
|
|
366
|
+
const pages = params.pages ??
|
|
367
|
+
(params.page_range
|
|
368
|
+
? Array.from({ length: params.page_range.end - params.page_range.start + 1 }, (_, i) => (params.page_range ?? { start: 0 }).start + i)
|
|
369
|
+
: undefined);
|
|
370
|
+
if (pages && pages.length > 0) {
|
|
371
|
+
try {
|
|
372
|
+
const { PDFParse } = await import("pdf-parse");
|
|
373
|
+
const pdf = new PDFParse({ data: new Uint8Array(buffer) });
|
|
374
|
+
try {
|
|
375
|
+
const firstPage = Math.min(...pages);
|
|
376
|
+
const lastPage = Math.max(...pages);
|
|
377
|
+
const textResult = await pdf.getText({
|
|
378
|
+
first: firstPage,
|
|
379
|
+
last: lastPage,
|
|
380
|
+
});
|
|
381
|
+
const totalPages = textResult.total || 0;
|
|
382
|
+
const text = textResult.text?.trim() || "(No text found on the requested pages)";
|
|
383
|
+
// Note: pdf-parse extracts a contiguous range (first..last).
|
|
384
|
+
// For non-contiguous page requests (e.g., [1, 5, 12]), the result
|
|
385
|
+
// includes all pages in the range. This is a limitation of pdf-parse.
|
|
386
|
+
const rangeNote = firstPage !== lastPage
|
|
387
|
+
? ` (extracted pages ${firstPage}-${lastPage})`
|
|
388
|
+
: "";
|
|
389
|
+
return {
|
|
390
|
+
success: true,
|
|
391
|
+
text: `## Pages ${pages.join(", ")} of ${ref.filename}${rangeNote}\n` +
|
|
392
|
+
`Total pages in document: ${totalPages}\n\n${text}`,
|
|
393
|
+
metadata: {
|
|
394
|
+
requestedPages: pages,
|
|
395
|
+
extractedRange: { first: firstPage, last: lastPage },
|
|
396
|
+
totalPages,
|
|
397
|
+
},
|
|
398
|
+
};
|
|
399
|
+
}
|
|
400
|
+
finally {
|
|
401
|
+
await pdf.destroy().catch(() => {
|
|
402
|
+
/* cleanup - ignore destroy errors */
|
|
403
|
+
});
|
|
404
|
+
}
|
|
405
|
+
}
|
|
406
|
+
catch (err) {
|
|
407
|
+
return {
|
|
408
|
+
success: false,
|
|
409
|
+
error: `PDF page extraction failed: ${err instanceof Error ? err.message : String(err)}`,
|
|
410
|
+
};
|
|
411
|
+
}
|
|
412
|
+
}
|
|
413
|
+
// No specific pages: return full content
|
|
414
|
+
if (!ref.processedContent) {
|
|
415
|
+
await this.processFileOnDemand(ref);
|
|
416
|
+
}
|
|
417
|
+
return {
|
|
418
|
+
success: true,
|
|
419
|
+
text: ref.processedContent || `[PDF: ${ref.filename}]`,
|
|
420
|
+
};
|
|
421
|
+
}
|
|
422
|
+
async extractExcelTargeted(buffer, ref, params) {
|
|
423
|
+
const { excelProcessor } = await import("../processors/document/ExcelProcessor.js");
|
|
424
|
+
const text = await excelProcessor.extractSheetRange(buffer, params.sheet, params.row_range?.start ?? 1, params.row_range?.end, params.columns);
|
|
425
|
+
return {
|
|
426
|
+
success: true,
|
|
427
|
+
text,
|
|
428
|
+
metadata: {
|
|
429
|
+
sheet: params.sheet,
|
|
430
|
+
rowRange: params.row_range,
|
|
431
|
+
columns: params.columns,
|
|
432
|
+
},
|
|
433
|
+
};
|
|
434
|
+
}
|
|
435
|
+
async extractPptxTargeted(buffer, ref, params) {
|
|
436
|
+
const pages = params.pages ??
|
|
437
|
+
(params.page_range
|
|
438
|
+
? Array.from({ length: params.page_range.end - params.page_range.start + 1 }, (_, i) => (params.page_range ?? { start: 0 }).start + i)
|
|
439
|
+
: undefined);
|
|
440
|
+
if (pages && pages.length > 0) {
|
|
441
|
+
const { PptxProcessor } = await import("../processors/document/PptxProcessor.js");
|
|
442
|
+
const text = await PptxProcessor.extractSlides(buffer, pages);
|
|
443
|
+
return {
|
|
444
|
+
success: true,
|
|
445
|
+
text,
|
|
446
|
+
metadata: { slides: pages },
|
|
447
|
+
};
|
|
448
|
+
}
|
|
449
|
+
// Full extraction
|
|
450
|
+
if (!ref.processedContent) {
|
|
451
|
+
await this.processFileOnDemand(ref);
|
|
452
|
+
}
|
|
453
|
+
return {
|
|
454
|
+
success: true,
|
|
455
|
+
text: ref.processedContent || `[PPTX: ${ref.filename}]`,
|
|
456
|
+
};
|
|
457
|
+
}
|
|
458
|
+
async extractArchiveTargeted(buffer, ref, params) {
|
|
459
|
+
if (params.entry_path) {
|
|
460
|
+
const { archiveProcessor } = await import("../processors/archive/ArchiveProcessor.js");
|
|
461
|
+
const text = await archiveProcessor.extractEntry(buffer, params.entry_path);
|
|
462
|
+
return {
|
|
463
|
+
success: true,
|
|
464
|
+
text,
|
|
465
|
+
metadata: { entryPath: params.entry_path },
|
|
466
|
+
};
|
|
467
|
+
}
|
|
468
|
+
// No specific entry: return full listing
|
|
469
|
+
if (!ref.processedContent) {
|
|
470
|
+
await this.processFileOnDemand(ref);
|
|
471
|
+
}
|
|
472
|
+
return {
|
|
473
|
+
success: true,
|
|
474
|
+
text: ref.processedContent || `[Archive: ${ref.filename}]`,
|
|
475
|
+
};
|
|
476
|
+
}
|
|
477
|
+
async extractAudioTargeted(_buffer, ref, _params) {
|
|
478
|
+
// Audio doesn't have sub-section extraction yet — return full metadata
|
|
479
|
+
if (!ref.processedContent) {
|
|
480
|
+
await this.processFileOnDemand(ref);
|
|
481
|
+
}
|
|
482
|
+
return {
|
|
483
|
+
success: true,
|
|
484
|
+
text: ref.processedContent || `[Audio: ${ref.filename}]`,
|
|
485
|
+
};
|
|
486
|
+
}
|
|
487
|
+
async extractTextTargeted(ref, params) {
|
|
488
|
+
// For text files, use line-range reading
|
|
489
|
+
const startLine = params.page_range?.start ?? params.row_range?.start ?? 1;
|
|
490
|
+
const endLine = params.page_range?.end ?? params.row_range?.end;
|
|
491
|
+
const result = await this.readSection(ref.id, startLine, endLine, 50_000);
|
|
492
|
+
return {
|
|
493
|
+
success: true,
|
|
494
|
+
text: result.content,
|
|
495
|
+
metadata: {
|
|
496
|
+
startLine: result.startLine,
|
|
497
|
+
endLine: result.endLine,
|
|
498
|
+
totalLines: result.totalLines,
|
|
499
|
+
truncated: result.truncated,
|
|
500
|
+
},
|
|
501
|
+
};
|
|
502
|
+
}
|
|
503
|
+
/**
|
|
504
|
+
* List all registered files.
|
|
505
|
+
* Returns a lightweight summary suitable for the LLM.
|
|
506
|
+
*/
|
|
507
|
+
list() {
|
|
508
|
+
return Array.from(this.files.values());
|
|
509
|
+
}
|
|
510
|
+
/**
|
|
511
|
+
* Generate a formatted table of all registered files for the LLM.
|
|
512
|
+
*/
|
|
513
|
+
listFormatted() {
|
|
514
|
+
const files = this.list();
|
|
515
|
+
if (files.length === 0) {
|
|
516
|
+
return "No files attached.";
|
|
517
|
+
}
|
|
518
|
+
const header = "| # | Filename | Type | Size | Tier | Est. Tokens | Status |\n" +
|
|
519
|
+
"|---|----------|------|------|------|-------------|--------|\n";
|
|
520
|
+
const rows = files.map((f, i) => `| ${i + 1} | ${f.filename} | ${f.detectedType} | ${this.formatSize(f.sizeBytes)} | ` +
|
|
521
|
+
`${f.sizeTier} | ~${f.estimatedTokens.toLocaleString()} | ${f.status} |`);
|
|
522
|
+
return header + rows.join("\n");
|
|
523
|
+
}
|
|
524
|
+
/**
|
|
525
|
+
* Read a section of a registered file.
|
|
526
|
+
*
|
|
527
|
+
* Uses StreamingReader for memory-efficient access.
|
|
528
|
+
*
|
|
529
|
+
* @param fileId - File reference ID
|
|
530
|
+
* @param startLine - Starting line (1-indexed)
|
|
531
|
+
* @param endLine - Ending line (1-indexed)
|
|
532
|
+
* @param tokenBudget - Maximum tokens to return
|
|
533
|
+
* @param provider - Provider name for token estimation
|
|
534
|
+
* @returns FileReadResult
|
|
535
|
+
*/
|
|
536
|
+
async readSection(fileId, startLine = 1, endLine, tokenBudget = 50_000, provider) {
|
|
537
|
+
const ref = this.get(fileId);
|
|
538
|
+
if (!ref) {
|
|
539
|
+
throw new Error(`File reference not found: ${fileId}`);
|
|
540
|
+
}
|
|
541
|
+
// Process binary files on first read — the lazy registration path
|
|
542
|
+
// stores raw binary to temp but never runs processors. We must process
|
|
543
|
+
// on-demand so the LLM gets extracted text, not garbled binary.
|
|
544
|
+
if (!ref.processedContent && !this.isTextType(ref.detectedType)) {
|
|
545
|
+
await this.processFileOnDemand(ref);
|
|
546
|
+
}
|
|
547
|
+
// If content is already cached (or was just processed), use buffer reader
|
|
548
|
+
if (ref.processedContent) {
|
|
549
|
+
return StreamingReader.readFromBuffer(Buffer.from(ref.processedContent, "utf-8"), {
|
|
550
|
+
startLine,
|
|
551
|
+
endLine,
|
|
552
|
+
tokenBudget,
|
|
553
|
+
provider,
|
|
554
|
+
});
|
|
555
|
+
}
|
|
556
|
+
// If we have a temp path or original path, use streaming reader
|
|
557
|
+
// (text files that were not processed on-demand)
|
|
558
|
+
const filePath = ref.tempPath || ref.originalPath;
|
|
559
|
+
if (filePath) {
|
|
560
|
+
const result = await StreamingReader.readLines(filePath, {
|
|
561
|
+
startLine,
|
|
562
|
+
endLine,
|
|
563
|
+
tokenBudget,
|
|
564
|
+
provider,
|
|
565
|
+
});
|
|
566
|
+
// Cache total lines for future reference
|
|
567
|
+
if (!ref.totalLines) {
|
|
568
|
+
ref.totalLines = result.totalLines;
|
|
569
|
+
}
|
|
570
|
+
return result;
|
|
571
|
+
}
|
|
572
|
+
throw new Error(`No accessible content for file "${ref.filename}" (id: ${fileId})`);
|
|
573
|
+
}
|
|
574
|
+
/**
|
|
575
|
+
* Search within a registered file.
|
|
576
|
+
*
|
|
577
|
+
* @param fileId - File reference ID
|
|
578
|
+
* @param pattern - Search pattern (string or regex)
|
|
579
|
+
* @param maxMatches - Maximum matches to return
|
|
580
|
+
* @returns FileSearchResult
|
|
581
|
+
*/
|
|
582
|
+
async search(fileId, pattern, maxMatches = 50) {
|
|
583
|
+
const ref = this.get(fileId);
|
|
584
|
+
if (!ref) {
|
|
585
|
+
throw new Error(`File reference not found: ${fileId}`);
|
|
586
|
+
}
|
|
587
|
+
// Process binary files on first search — same lazy processing as readSection().
|
|
588
|
+
// Without this, search would scan raw PDF/XLSX binary bytes for text patterns.
|
|
589
|
+
if (!ref.processedContent && !this.isTextType(ref.detectedType)) {
|
|
590
|
+
await this.processFileOnDemand(ref);
|
|
591
|
+
}
|
|
592
|
+
// Search in processedContent if available (binary files after on-demand processing, or tiny files)
|
|
593
|
+
if (ref.processedContent) {
|
|
594
|
+
return FileReferenceRegistry.searchInMemory(ref.processedContent, pattern, maxMatches);
|
|
595
|
+
}
|
|
596
|
+
// For text files: use streaming search on the raw temp file (content IS valid UTF-8)
|
|
597
|
+
const filePath = ref.tempPath || ref.originalPath;
|
|
598
|
+
if (filePath) {
|
|
599
|
+
return StreamingReader.searchInFile(filePath, pattern, {
|
|
600
|
+
maxMatches,
|
|
601
|
+
});
|
|
602
|
+
}
|
|
603
|
+
throw new Error(`No searchable content for file "${ref.filename}" (id: ${fileId})`);
|
|
604
|
+
}
|
|
605
|
+
/**
|
|
606
|
+
* Search within in-memory content (for tiny files without temp paths).
|
|
607
|
+
*/
|
|
608
|
+
static searchInMemory(content, pattern, maxMatches) {
|
|
609
|
+
const regex = new RegExp(pattern.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"), "i");
|
|
610
|
+
const lines = content.split("\n");
|
|
611
|
+
const matches = [];
|
|
612
|
+
let totalMatches = 0;
|
|
613
|
+
for (let i = 0; i < lines.length; i++) {
|
|
614
|
+
if (regex.test(lines[i])) {
|
|
615
|
+
totalMatches++;
|
|
616
|
+
if (matches.length < maxMatches) {
|
|
617
|
+
matches.push({
|
|
618
|
+
lineNumber: i + 1,
|
|
619
|
+
line: lines[i],
|
|
620
|
+
contextBefore: lines.slice(Math.max(0, i - 3), i),
|
|
621
|
+
contextAfter: lines.slice(i + 1, Math.min(lines.length, i + 4)),
|
|
622
|
+
});
|
|
623
|
+
}
|
|
624
|
+
}
|
|
625
|
+
}
|
|
626
|
+
return {
|
|
627
|
+
matches,
|
|
628
|
+
totalMatches,
|
|
629
|
+
truncated: totalMatches > maxMatches,
|
|
630
|
+
};
|
|
631
|
+
}
|
|
632
|
+
/**
|
|
633
|
+
* Store a summary for a file reference.
|
|
634
|
+
*/
|
|
635
|
+
setSummary(fileId, summary) {
|
|
636
|
+
const ref = this.files.get(fileId);
|
|
637
|
+
if (ref) {
|
|
638
|
+
ref.summary = summary;
|
|
639
|
+
ref.status = "processed";
|
|
640
|
+
ref.lastAccessedAt = Date.now();
|
|
641
|
+
}
|
|
642
|
+
}
|
|
643
|
+
/**
|
|
644
|
+
* Remove a file reference and clean up its temp file.
|
|
645
|
+
*/
|
|
646
|
+
async remove(fileId) {
|
|
647
|
+
const ref = this.files.get(fileId);
|
|
648
|
+
if (!ref) {
|
|
649
|
+
return false;
|
|
650
|
+
}
|
|
651
|
+
// Clean up temp file (only if we created it, not for original paths)
|
|
652
|
+
if (ref.tempPath && ref.source !== "path") {
|
|
653
|
+
try {
|
|
654
|
+
await unlink(ref.tempPath);
|
|
655
|
+
this.currentTempBytes -= ref.sizeBytes;
|
|
656
|
+
}
|
|
657
|
+
catch {
|
|
658
|
+
// Temp file may already be cleaned up
|
|
659
|
+
}
|
|
660
|
+
}
|
|
661
|
+
this.files.delete(fileId);
|
|
662
|
+
return true;
|
|
663
|
+
}
|
|
664
|
+
/**
|
|
665
|
+
* Clear all file references and clean up temp directory.
|
|
666
|
+
*/
|
|
667
|
+
async clear() {
|
|
668
|
+
const ids = Array.from(this.files.keys());
|
|
669
|
+
for (const id of ids) {
|
|
670
|
+
await this.remove(id);
|
|
671
|
+
}
|
|
672
|
+
this.files.clear();
|
|
673
|
+
this.currentTempBytes = 0;
|
|
674
|
+
}
|
|
675
|
+
/**
|
|
676
|
+
* Get the number of registered files.
|
|
677
|
+
*/
|
|
678
|
+
get size() {
|
|
679
|
+
return this.files.size;
|
|
680
|
+
}
|
|
681
|
+
/**
|
|
682
|
+
* Generate the preview text for the initial prompt.
|
|
683
|
+
*
|
|
684
|
+
* Returns a compact summary of all registered files that uses ~50-100 tokens
|
|
685
|
+
* per file instead of full content. The LLM can use file tools to access
|
|
686
|
+
* more content as needed.
|
|
687
|
+
*
|
|
688
|
+
* @returns Formatted string for prompt injection
|
|
689
|
+
*/
|
|
690
|
+
async generatePromptPreview() {
|
|
691
|
+
const files = this.list();
|
|
692
|
+
if (files.length === 0) {
|
|
693
|
+
return "";
|
|
694
|
+
}
|
|
695
|
+
// Ensure binary files are processed so previews contain real content
|
|
696
|
+
// (e.g., video metadata, audio tags) instead of placeholder strings.
|
|
697
|
+
for (const ref of files) {
|
|
698
|
+
if (!ref.processedContent && !this.isTextType(ref.detectedType)) {
|
|
699
|
+
await this.processFileOnDemand(ref);
|
|
700
|
+
}
|
|
701
|
+
}
|
|
702
|
+
const sections = [];
|
|
703
|
+
sections.push(`\n\n## Attached Files (${files.length})\n`);
|
|
704
|
+
for (const ref of files) {
|
|
705
|
+
const sizeStr = this.formatSize(ref.sizeBytes);
|
|
706
|
+
sections.push(`### File: "${ref.filename}" (${sizeStr}, ${ref.detectedType})`);
|
|
707
|
+
if (ref.sizeTier === "tiny" && ref.processedContent) {
|
|
708
|
+
// Tiny files: include full content inline
|
|
709
|
+
sections.push(ref.processedContent);
|
|
710
|
+
}
|
|
711
|
+
else {
|
|
712
|
+
// Larger files: include preview + guidance
|
|
713
|
+
sections.push(`**Preview** (first ${this.defaultPreviewChars} chars):`);
|
|
714
|
+
sections.push(ref.preview);
|
|
715
|
+
// Add type-specific extraction hints
|
|
716
|
+
const hint = FileReferenceRegistry.getExtractionHint(ref.detectedType, sizeStr);
|
|
717
|
+
if (hint) {
|
|
718
|
+
sections.push(`\n> ${hint}`);
|
|
719
|
+
}
|
|
720
|
+
else if (ref.sizeTier !== "small") {
|
|
721
|
+
sections.push(`\n> This file is ${sizeStr}. Use \`read_file_section\` to read specific ` +
|
|
722
|
+
`sections, \`search_in_file\` to search, or \`summarize_file\` for a full summary.`);
|
|
723
|
+
}
|
|
724
|
+
}
|
|
725
|
+
sections.push(""); // blank line between files
|
|
726
|
+
}
|
|
727
|
+
return sections.join("\n");
|
|
728
|
+
}
|
|
729
|
+
// ─── Private Methods ────────────────────────────────────────────
|
|
730
|
+
/**
|
|
731
|
+
* Get type-specific extraction hints for the LLM prompt.
|
|
732
|
+
* Tells the LLM what parameters it can use with extract_file_content.
|
|
733
|
+
*/
|
|
734
|
+
static getExtractionHint(type, sizeStr) {
|
|
735
|
+
switch (type) {
|
|
736
|
+
case "video":
|
|
737
|
+
return (`This video is ${sizeStr}. Use \`extract_file_content\` with \`start_time\`/\`end_time\` ` +
|
|
738
|
+
`to get frames from specific time ranges (e.g., start_time=5, end_time=10, frame_count=3). ` +
|
|
739
|
+
`Initial keyframes are already provided above.`);
|
|
740
|
+
case "pdf":
|
|
741
|
+
return (`This PDF is ${sizeStr}. Use \`extract_file_content\` with \`pages\` (e.g., [1, 3, 5]) ` +
|
|
742
|
+
`or \`page_range\` (e.g., {start: 1, end: 10}) to get specific pages. ` +
|
|
743
|
+
`Use \`read_file_section\` for line-range access or \`search_in_file\` to search.`);
|
|
744
|
+
case "xlsx":
|
|
745
|
+
return (`This spreadsheet is ${sizeStr}. Use \`extract_file_content\` with \`sheet\` (name or index), ` +
|
|
746
|
+
`\`row_range\` (e.g., {start: 1, end: 50}), and \`columns\` (e.g., ["A", "B", "D"]) ` +
|
|
747
|
+
`for targeted data extraction.`);
|
|
748
|
+
case "pptx":
|
|
749
|
+
return (`This presentation is ${sizeStr}. Use \`extract_file_content\` with \`pages\` ` +
|
|
750
|
+
`(e.g., [1, 3, 5]) to extract specific slides.`);
|
|
751
|
+
case "archive":
|
|
752
|
+
return (`This archive is ${sizeStr}. Use \`extract_file_content\` with \`entry_path\` ` +
|
|
753
|
+
`(e.g., "src/index.ts") to extract a specific file from the archive.`);
|
|
754
|
+
case "audio":
|
|
755
|
+
return (`This audio file is ${sizeStr}. Metadata is shown above. ` +
|
|
756
|
+
`Use \`read_file_section\` or \`search_in_file\` for text-based access.`);
|
|
757
|
+
default:
|
|
758
|
+
return null;
|
|
759
|
+
}
|
|
760
|
+
}
|
|
761
|
+
/**
|
|
762
|
+
* Classify a file into a size tier based on byte size.
|
|
763
|
+
*/
|
|
764
|
+
static classifySizeTier(sizeBytes) {
|
|
765
|
+
if (sizeBytes <= SIZE_TIER_THRESHOLDS.TINY_MAX) {
|
|
766
|
+
return "tiny";
|
|
767
|
+
}
|
|
768
|
+
if (sizeBytes <= SIZE_TIER_THRESHOLDS.SMALL_MAX) {
|
|
769
|
+
return "small";
|
|
770
|
+
}
|
|
771
|
+
if (sizeBytes <= SIZE_TIER_THRESHOLDS.MEDIUM_MAX) {
|
|
772
|
+
return "medium";
|
|
773
|
+
}
|
|
774
|
+
if (sizeBytes <= SIZE_TIER_THRESHOLDS.LARGE_MAX) {
|
|
775
|
+
return "large";
|
|
776
|
+
}
|
|
777
|
+
if (sizeBytes <= SIZE_TIER_THRESHOLDS.HUGE_MAX) {
|
|
778
|
+
return "huge";
|
|
779
|
+
}
|
|
780
|
+
return "oversized";
|
|
781
|
+
}
|
|
782
|
+
/**
|
|
783
|
+
* Process a binary file on-demand, extracting text content via the
|
|
784
|
+
* appropriate processor. This bridges the gap between the lazy registration
|
|
785
|
+
* path (which stores raw binary) and the LLM read tools (which need text).
|
|
786
|
+
*
|
|
787
|
+
* Called lazily on first readSection() or search() for non-text files.
|
|
788
|
+
* Results are cached in ref.processedContent for subsequent reads.
|
|
789
|
+
*/
|
|
790
|
+
async processFileOnDemand(ref) {
|
|
791
|
+
// Prevent concurrent processing of the same file
|
|
792
|
+
if (ref.status === "processing") {
|
|
793
|
+
return;
|
|
794
|
+
}
|
|
795
|
+
ref.status = "processing";
|
|
796
|
+
try {
|
|
797
|
+
const buffer = ref.tempPath ? await readFile(ref.tempPath) : null;
|
|
798
|
+
if (!buffer) {
|
|
799
|
+
ref.status = "error";
|
|
800
|
+
logger.warn(`[FileReferenceRegistry] No buffer available for on-demand processing: "${ref.filename}"`);
|
|
801
|
+
return;
|
|
802
|
+
}
|
|
803
|
+
let extractedText = null;
|
|
804
|
+
switch (ref.detectedType) {
|
|
805
|
+
case "pdf":
|
|
806
|
+
extractedText = await this.extractPdfText(buffer);
|
|
807
|
+
break;
|
|
808
|
+
case "xlsx":
|
|
809
|
+
extractedText = await this.extractExcelText(buffer, ref);
|
|
810
|
+
break;
|
|
811
|
+
case "docx":
|
|
812
|
+
extractedText = await this.extractWordText(buffer, ref);
|
|
813
|
+
break;
|
|
814
|
+
case "pptx":
|
|
815
|
+
extractedText = await this.extractPptxText(buffer);
|
|
816
|
+
break;
|
|
817
|
+
case "video":
|
|
818
|
+
extractedText = await this.extractVideoContent(buffer, ref);
|
|
819
|
+
break;
|
|
820
|
+
case "audio":
|
|
821
|
+
extractedText = await this.extractAudioContent(buffer, ref);
|
|
822
|
+
break;
|
|
823
|
+
case "archive":
|
|
824
|
+
extractedText = await this.extractArchiveContent(buffer, ref);
|
|
825
|
+
break;
|
|
826
|
+
default:
|
|
827
|
+
// For unknown binary types, provide a descriptive fallback
|
|
828
|
+
extractedText =
|
|
829
|
+
`[Binary file: ${ref.filename}, ${this.formatSize(ref.sizeBytes)}, type: ${ref.detectedType}]\n` +
|
|
830
|
+
`This file could not be processed into text content.`;
|
|
831
|
+
break;
|
|
832
|
+
}
|
|
833
|
+
if (extractedText) {
|
|
834
|
+
ref.processedContent = extractedText;
|
|
835
|
+
ref.status = "processed";
|
|
836
|
+
// Update the preview with actual content instead of placeholder metadata
|
|
837
|
+
const previewChars = this.defaultPreviewChars;
|
|
838
|
+
if (extractedText.length <= previewChars) {
|
|
839
|
+
ref.preview = extractedText;
|
|
840
|
+
}
|
|
841
|
+
else {
|
|
842
|
+
const lastNewline = extractedText.lastIndexOf("\n", previewChars);
|
|
843
|
+
ref.preview =
|
|
844
|
+
lastNewline > previewChars * 0.8
|
|
845
|
+
? extractedText.substring(0, lastNewline)
|
|
846
|
+
: extractedText.substring(0, previewChars) + "\n...[truncated]";
|
|
847
|
+
}
|
|
848
|
+
logger.info(`[FileReferenceRegistry] On-demand processed "${ref.filename}" ` +
|
|
849
|
+
`(${ref.detectedType}, ${this.formatSize(ref.sizeBytes)}) → ${extractedText.length} chars`);
|
|
850
|
+
}
|
|
851
|
+
else {
|
|
852
|
+
ref.processedContent =
|
|
853
|
+
`[${ref.detectedType.toUpperCase()} file: ${ref.filename}, ${this.formatSize(ref.sizeBytes)}]\n` +
|
|
854
|
+
`Content could not be extracted. The file may be corrupted or in an unsupported format.`;
|
|
855
|
+
ref.preview = ref.processedContent;
|
|
856
|
+
ref.status = "processed";
|
|
857
|
+
}
|
|
858
|
+
}
|
|
859
|
+
catch (err) {
|
|
860
|
+
const errorMsg = err instanceof Error ? err.message : String(err);
|
|
861
|
+
logger.warn(`[FileReferenceRegistry] On-demand processing failed for "${ref.filename}": ${errorMsg}`);
|
|
862
|
+
ref.processedContent =
|
|
863
|
+
`[Processing error for ${ref.filename}]\n` +
|
|
864
|
+
`Type: ${ref.detectedType}, Size: ${this.formatSize(ref.sizeBytes)}\n` +
|
|
865
|
+
`Error: ${errorMsg}`;
|
|
866
|
+
ref.preview = ref.processedContent;
|
|
867
|
+
ref.status = "error";
|
|
868
|
+
}
|
|
869
|
+
}
|
|
870
|
+
/**
|
|
871
|
+
* Extract text from a PDF buffer using pdf-parse v2 (pdfjs-dist under the hood).
|
|
872
|
+
*
|
|
873
|
+
* Handles compressed streams (FlateDecode), CMap-encoded text, modern PDFs,
|
|
874
|
+
* and most text-based PDF formats. For scanned/image-only PDFs where no text
|
|
875
|
+
* can be extracted, falls back to a descriptive message.
|
|
876
|
+
*/
|
|
877
|
+
async extractPdfText(buffer) {
|
|
878
|
+
try {
|
|
879
|
+
const { PDFParse } = await import("pdf-parse");
|
|
880
|
+
const pdf = new PDFParse({
|
|
881
|
+
data: new Uint8Array(buffer),
|
|
882
|
+
});
|
|
883
|
+
try {
|
|
884
|
+
const textResult = await pdf.getText({
|
|
885
|
+
// Limit to first 100 pages to avoid unbounded processing
|
|
886
|
+
last: 100,
|
|
887
|
+
});
|
|
888
|
+
const text = textResult.text?.trim();
|
|
889
|
+
if (!text || text.length === 0) {
|
|
890
|
+
// No text found — likely a scanned/image-only PDF
|
|
891
|
+
const pageCount = textResult.total || 0;
|
|
892
|
+
return (`[PDF document: ${this.formatSize(buffer.length)}, ${pageCount} page(s)]\n` +
|
|
893
|
+
`This PDF appears to contain scanned images or non-extractable content.\n` +
|
|
894
|
+
`Text could not be extracted from the document. The content may consist of:\n` +
|
|
895
|
+
`- Scanned pages (images of text, not searchable text)\n` +
|
|
896
|
+
`- Forms or graphical content\n` +
|
|
897
|
+
`- Protected/encrypted content`);
|
|
898
|
+
}
|
|
899
|
+
// Clean up excessive blank lines
|
|
900
|
+
const cleaned = text.replace(/\n{3,}/g, "\n\n");
|
|
901
|
+
return cleaned;
|
|
902
|
+
}
|
|
903
|
+
finally {
|
|
904
|
+
// Always clean up the PDF instance to free pdfjs-dist resources
|
|
905
|
+
await pdf.destroy().catch(() => {
|
|
906
|
+
/* cleanup - ignore destroy errors */
|
|
907
|
+
});
|
|
908
|
+
}
|
|
909
|
+
}
|
|
910
|
+
catch (err) {
|
|
911
|
+
logger.warn(`[FileReferenceRegistry] PDF text extraction failed: ${err instanceof Error ? err.message : String(err)}`);
|
|
912
|
+
return null;
|
|
913
|
+
}
|
|
914
|
+
}
|
|
915
|
+
/**
|
|
916
|
+
* Extract text content from an Excel file using ExcelProcessor.
|
|
917
|
+
*/
|
|
918
|
+
async extractExcelText(buffer, ref) {
|
|
919
|
+
try {
|
|
920
|
+
const { processExcel } = await import("../processors/document/ExcelProcessor.js");
|
|
921
|
+
const result = await processExcel({
|
|
922
|
+
id: ref.id,
|
|
923
|
+
name: ref.filename,
|
|
924
|
+
mimetype: ref.mimeType,
|
|
925
|
+
size: ref.sizeBytes,
|
|
926
|
+
buffer,
|
|
927
|
+
});
|
|
928
|
+
if (!result.success || !result.data) {
|
|
929
|
+
return null;
|
|
930
|
+
}
|
|
931
|
+
// Format worksheets as TSV text for LLM consumption
|
|
932
|
+
const worksheets = result.data.worksheets;
|
|
933
|
+
if (worksheets && worksheets.length > 0) {
|
|
934
|
+
const sections = [];
|
|
935
|
+
for (const ws of worksheets) {
|
|
936
|
+
sections.push(`## Sheet: ${ws.name}`);
|
|
937
|
+
if (ws.headers.length > 0) {
|
|
938
|
+
sections.push(ws.headers.join("\t"));
|
|
939
|
+
}
|
|
940
|
+
for (const row of ws.rows) {
|
|
941
|
+
sections.push(row.map((cell) => (cell === null ? "" : String(cell))).join("\t"));
|
|
942
|
+
}
|
|
943
|
+
sections.push("");
|
|
944
|
+
}
|
|
945
|
+
return sections.join("\n");
|
|
946
|
+
}
|
|
947
|
+
return null;
|
|
948
|
+
}
|
|
949
|
+
catch (err) {
|
|
950
|
+
logger.warn(`[FileReferenceRegistry] Excel extraction failed: ${err instanceof Error ? err.message : String(err)}`);
|
|
951
|
+
return null;
|
|
952
|
+
}
|
|
953
|
+
}
|
|
954
|
+
/**
|
|
955
|
+
* Extract text content from a Word document using WordProcessor.
|
|
956
|
+
*/
|
|
957
|
+
async extractWordText(buffer, ref) {
|
|
958
|
+
try {
|
|
959
|
+
const { processWord } = await import("../processors/document/WordProcessor.js");
|
|
960
|
+
const result = await processWord({
|
|
961
|
+
id: ref.id,
|
|
962
|
+
name: ref.filename,
|
|
963
|
+
mimetype: ref.mimeType,
|
|
964
|
+
size: ref.sizeBytes,
|
|
965
|
+
buffer,
|
|
966
|
+
});
|
|
967
|
+
if (!result.success || !result.data) {
|
|
968
|
+
return null;
|
|
969
|
+
}
|
|
970
|
+
return result.data.textContent || null;
|
|
971
|
+
}
|
|
972
|
+
catch (err) {
|
|
973
|
+
logger.warn(`[FileReferenceRegistry] Word extraction failed: ${err instanceof Error ? err.message : String(err)}`);
|
|
974
|
+
return null;
|
|
975
|
+
}
|
|
976
|
+
}
|
|
977
|
+
/**
|
|
978
|
+
* Extract text from a PowerPoint file using PptxProcessor.
|
|
979
|
+
*/
|
|
980
|
+
async extractPptxText(buffer) {
|
|
981
|
+
try {
|
|
982
|
+
const { PptxProcessor } = await import("../processors/document/PptxProcessor.js");
|
|
983
|
+
return await PptxProcessor.extractText(buffer);
|
|
984
|
+
}
|
|
985
|
+
catch (err) {
|
|
986
|
+
logger.warn(`[FileReferenceRegistry] PPTX extraction failed: ${err instanceof Error ? err.message : String(err)}`);
|
|
987
|
+
return null;
|
|
988
|
+
}
|
|
989
|
+
}
|
|
990
|
+
/**
|
|
991
|
+
* Extract metadata and content from a video file using VideoProcessor.
|
|
992
|
+
*/
|
|
993
|
+
async extractVideoContent(buffer, ref) {
|
|
994
|
+
try {
|
|
995
|
+
const { processVideo } = await import("../processors/media/VideoProcessor.js");
|
|
996
|
+
const result = await processVideo({
|
|
997
|
+
id: ref.id,
|
|
998
|
+
name: ref.filename,
|
|
999
|
+
mimetype: ref.mimeType,
|
|
1000
|
+
size: ref.sizeBytes,
|
|
1001
|
+
buffer,
|
|
1002
|
+
});
|
|
1003
|
+
if (!result.success || !result.data) {
|
|
1004
|
+
return null;
|
|
1005
|
+
}
|
|
1006
|
+
// Store keyframe images on the reference for injection into the prompt
|
|
1007
|
+
if (result.data.keyframes && result.data.keyframes.length > 0) {
|
|
1008
|
+
ref.extractedImages = result.data.keyframes;
|
|
1009
|
+
logger.info(`[FileReferenceRegistry] Extracted ${result.data.keyframes.length} keyframes from "${ref.filename}"`);
|
|
1010
|
+
}
|
|
1011
|
+
return result.data.textContent || null;
|
|
1012
|
+
}
|
|
1013
|
+
catch (err) {
|
|
1014
|
+
logger.warn(`[FileReferenceRegistry] Video extraction failed: ${err instanceof Error ? err.message : String(err)}`);
|
|
1015
|
+
// Provide basic metadata even on failure
|
|
1016
|
+
return (`[Video file: ${ref.filename}, ${this.formatSize(ref.sizeBytes)}]\n` +
|
|
1017
|
+
`Video processing requires ffmpeg/ffprobe. Metadata could not be extracted.\n` +
|
|
1018
|
+
`Error: ${err instanceof Error ? err.message : String(err)}`);
|
|
1019
|
+
}
|
|
1020
|
+
}
|
|
1021
|
+
/**
|
|
1022
|
+
* Extract metadata and content from an audio file using AudioProcessor.
|
|
1023
|
+
*/
|
|
1024
|
+
async extractAudioContent(buffer, ref) {
|
|
1025
|
+
try {
|
|
1026
|
+
const { processAudio } = await import("../processors/media/AudioProcessor.js");
|
|
1027
|
+
const result = await processAudio({
|
|
1028
|
+
id: ref.id,
|
|
1029
|
+
name: ref.filename,
|
|
1030
|
+
mimetype: ref.mimeType,
|
|
1031
|
+
size: ref.sizeBytes,
|
|
1032
|
+
buffer,
|
|
1033
|
+
});
|
|
1034
|
+
if (!result.success || !result.data) {
|
|
1035
|
+
return null;
|
|
1036
|
+
}
|
|
1037
|
+
return result.data.textContent || null;
|
|
1038
|
+
}
|
|
1039
|
+
catch (err) {
|
|
1040
|
+
logger.warn(`[FileReferenceRegistry] Audio extraction failed: ${err instanceof Error ? err.message : String(err)}`);
|
|
1041
|
+
return (`[Audio file: ${ref.filename}, ${this.formatSize(ref.sizeBytes)}]\n` +
|
|
1042
|
+
`Audio processing failed. Error: ${err instanceof Error ? err.message : String(err)}`);
|
|
1043
|
+
}
|
|
1044
|
+
}
|
|
1045
|
+
/**
|
|
1046
|
+
* Extract file listing from an archive using ArchiveProcessor.
|
|
1047
|
+
*/
|
|
1048
|
+
async extractArchiveContent(buffer, ref) {
|
|
1049
|
+
try {
|
|
1050
|
+
const { processArchive } = await import("../processors/archive/ArchiveProcessor.js");
|
|
1051
|
+
const result = await processArchive({
|
|
1052
|
+
id: ref.id,
|
|
1053
|
+
name: ref.filename,
|
|
1054
|
+
mimetype: ref.mimeType,
|
|
1055
|
+
size: ref.sizeBytes,
|
|
1056
|
+
buffer,
|
|
1057
|
+
});
|
|
1058
|
+
if (!result.success || !result.data) {
|
|
1059
|
+
return null;
|
|
1060
|
+
}
|
|
1061
|
+
return result.data.textContent || null;
|
|
1062
|
+
}
|
|
1063
|
+
catch (err) {
|
|
1064
|
+
logger.warn(`[FileReferenceRegistry] Archive extraction failed: ${err instanceof Error ? err.message : String(err)}`);
|
|
1065
|
+
return null;
|
|
1066
|
+
}
|
|
1067
|
+
}
|
|
1068
|
+
/**
|
|
1069
|
+
* Extract a preview from a buffer.
|
|
1070
|
+
* For text: first N characters.
|
|
1071
|
+
* For binary: type-specific metadata.
|
|
1072
|
+
*/
|
|
1073
|
+
extractPreview(buffer, type, maxChars) {
|
|
1074
|
+
if (this.isTextType(type, buffer)) {
|
|
1075
|
+
// Text-based: extract first N characters
|
|
1076
|
+
const text = buffer.toString("utf-8", 0, Math.min(buffer.length, maxChars + 100));
|
|
1077
|
+
if (text.length <= maxChars) {
|
|
1078
|
+
return text;
|
|
1079
|
+
}
|
|
1080
|
+
// Break at line boundary
|
|
1081
|
+
const lastNewline = text.lastIndexOf("\n", maxChars);
|
|
1082
|
+
if (lastNewline > maxChars * 0.8) {
|
|
1083
|
+
return text.substring(0, lastNewline);
|
|
1084
|
+
}
|
|
1085
|
+
return text.substring(0, maxChars) + "\n...[truncated]";
|
|
1086
|
+
}
|
|
1087
|
+
// Binary types: type-specific preview
|
|
1088
|
+
const sizeMB = (buffer.length / (1024 * 1024)).toFixed(2);
|
|
1089
|
+
switch (type) {
|
|
1090
|
+
case "image":
|
|
1091
|
+
return `[Image file: ${sizeMB} MB]`;
|
|
1092
|
+
case "video":
|
|
1093
|
+
return `[Video file: ${sizeMB} MB — use read tools for metadata/keyframes]`;
|
|
1094
|
+
case "audio":
|
|
1095
|
+
return `[Audio file: ${sizeMB} MB — use read tools for metadata/transcript]`;
|
|
1096
|
+
case "archive":
|
|
1097
|
+
return `[Archive file: ${sizeMB} MB — use read tools for file listing]`;
|
|
1098
|
+
case "pdf":
|
|
1099
|
+
return `[PDF document: ${sizeMB} MB — use read tools for page content]`;
|
|
1100
|
+
default:
|
|
1101
|
+
return `[Binary file: ${sizeMB} MB, type: ${type}]`;
|
|
1102
|
+
}
|
|
1103
|
+
}
|
|
1104
|
+
/**
|
|
1105
|
+
* Detect file type from buffer magic bytes and extension.
|
|
1106
|
+
*/
|
|
1107
|
+
detectType(buffer, ext) {
|
|
1108
|
+
// Check magic bytes first
|
|
1109
|
+
if (buffer.length >= 4) {
|
|
1110
|
+
const header = buffer.subarray(0, 8);
|
|
1111
|
+
// PNG: 89 50 4E 47
|
|
1112
|
+
if (header[0] === 0x89 &&
|
|
1113
|
+
header[1] === 0x50 &&
|
|
1114
|
+
header[2] === 0x4e &&
|
|
1115
|
+
header[3] === 0x47) {
|
|
1116
|
+
return "image";
|
|
1117
|
+
}
|
|
1118
|
+
// JPEG: FF D8 FF
|
|
1119
|
+
if (header[0] === 0xff && header[1] === 0xd8 && header[2] === 0xff) {
|
|
1120
|
+
return "image";
|
|
1121
|
+
}
|
|
1122
|
+
// GIF: 47 49 46
|
|
1123
|
+
if (header[0] === 0x47 && header[1] === 0x49 && header[2] === 0x46) {
|
|
1124
|
+
return "image";
|
|
1125
|
+
}
|
|
1126
|
+
// WebP: 52 49 46 46 ... 57 45 42 50
|
|
1127
|
+
if (header[0] === 0x52 &&
|
|
1128
|
+
header[1] === 0x49 &&
|
|
1129
|
+
header[2] === 0x46 &&
|
|
1130
|
+
header[3] === 0x46 &&
|
|
1131
|
+
buffer.length >= 12 &&
|
|
1132
|
+
buffer[8] === 0x57 &&
|
|
1133
|
+
buffer[9] === 0x45 &&
|
|
1134
|
+
buffer[10] === 0x42 &&
|
|
1135
|
+
buffer[11] === 0x50) {
|
|
1136
|
+
return "image";
|
|
1137
|
+
}
|
|
1138
|
+
// PDF: 25 50 44 46
|
|
1139
|
+
if (header[0] === 0x25 &&
|
|
1140
|
+
header[1] === 0x50 &&
|
|
1141
|
+
header[2] === 0x44 &&
|
|
1142
|
+
header[3] === 0x46) {
|
|
1143
|
+
return "pdf";
|
|
1144
|
+
}
|
|
1145
|
+
// ZIP (and derivatives: xlsx, docx, pptx)
|
|
1146
|
+
if (header[0] === 0x50 && header[1] === 0x4b) {
|
|
1147
|
+
// Differentiate by extension
|
|
1148
|
+
if (ext === "xlsx") {
|
|
1149
|
+
return "xlsx";
|
|
1150
|
+
}
|
|
1151
|
+
if (ext === "docx") {
|
|
1152
|
+
return "docx";
|
|
1153
|
+
}
|
|
1154
|
+
if (ext === "pptx") {
|
|
1155
|
+
return "pptx";
|
|
1156
|
+
}
|
|
1157
|
+
return "archive";
|
|
1158
|
+
}
|
|
1159
|
+
// MP4/M4A: ftyp
|
|
1160
|
+
if (buffer.length >= 8 &&
|
|
1161
|
+
buffer[4] === 0x66 &&
|
|
1162
|
+
buffer[5] === 0x74 &&
|
|
1163
|
+
buffer[6] === 0x79 &&
|
|
1164
|
+
buffer[7] === 0x70) {
|
|
1165
|
+
if (["m4a", "aac"].includes(ext)) {
|
|
1166
|
+
return "audio";
|
|
1167
|
+
}
|
|
1168
|
+
return "video";
|
|
1169
|
+
}
|
|
1170
|
+
// ID3 (MP3): 49 44 33
|
|
1171
|
+
if (header[0] === 0x49 && header[1] === 0x44 && header[2] === 0x33) {
|
|
1172
|
+
return "audio";
|
|
1173
|
+
}
|
|
1174
|
+
// OGG: 4F 67 67 53
|
|
1175
|
+
if (header[0] === 0x4f &&
|
|
1176
|
+
header[1] === 0x67 &&
|
|
1177
|
+
header[2] === 0x67 &&
|
|
1178
|
+
header[3] === 0x53) {
|
|
1179
|
+
return "audio";
|
|
1180
|
+
}
|
|
1181
|
+
// FLAC: 66 4C 61 43
|
|
1182
|
+
if (header[0] === 0x66 &&
|
|
1183
|
+
header[1] === 0x4c &&
|
|
1184
|
+
header[2] === 0x61 &&
|
|
1185
|
+
header[3] === 0x43) {
|
|
1186
|
+
return "audio";
|
|
1187
|
+
}
|
|
1188
|
+
// WAV: 52 49 46 46 ... 57 41 56 45
|
|
1189
|
+
if (header[0] === 0x52 &&
|
|
1190
|
+
header[1] === 0x49 &&
|
|
1191
|
+
header[2] === 0x46 &&
|
|
1192
|
+
header[3] === 0x46 &&
|
|
1193
|
+
buffer.length >= 12 &&
|
|
1194
|
+
buffer[8] === 0x57 &&
|
|
1195
|
+
buffer[9] === 0x41 &&
|
|
1196
|
+
buffer[10] === 0x56 &&
|
|
1197
|
+
buffer[11] === 0x45) {
|
|
1198
|
+
return "audio";
|
|
1199
|
+
}
|
|
1200
|
+
// MKV/WebM: 1A 45 DF A3
|
|
1201
|
+
if (header[0] === 0x1a &&
|
|
1202
|
+
header[1] === 0x45 &&
|
|
1203
|
+
header[2] === 0xdf &&
|
|
1204
|
+
header[3] === 0xa3) {
|
|
1205
|
+
if (ext === "webm") {
|
|
1206
|
+
return "video";
|
|
1207
|
+
}
|
|
1208
|
+
return "video";
|
|
1209
|
+
}
|
|
1210
|
+
// AVI: 52 49 46 46 ... 41 56 49 20
|
|
1211
|
+
if (header[0] === 0x52 &&
|
|
1212
|
+
header[1] === 0x49 &&
|
|
1213
|
+
header[2] === 0x46 &&
|
|
1214
|
+
header[3] === 0x46 &&
|
|
1215
|
+
buffer.length >= 12 &&
|
|
1216
|
+
buffer[8] === 0x41 &&
|
|
1217
|
+
buffer[9] === 0x56 &&
|
|
1218
|
+
buffer[10] === 0x49 &&
|
|
1219
|
+
buffer[11] === 0x20) {
|
|
1220
|
+
return "video";
|
|
1221
|
+
}
|
|
1222
|
+
}
|
|
1223
|
+
// Fall back to extension
|
|
1224
|
+
return this.detectTypeFromExtension(ext);
|
|
1225
|
+
}
|
|
1226
|
+
/**
|
|
1227
|
+
* Detect file type from extension alone.
|
|
1228
|
+
*/
|
|
1229
|
+
detectTypeFromExtension(ext) {
|
|
1230
|
+
const extensionMap = {
|
|
1231
|
+
// Images
|
|
1232
|
+
png: "image",
|
|
1233
|
+
jpg: "image",
|
|
1234
|
+
jpeg: "image",
|
|
1235
|
+
gif: "image",
|
|
1236
|
+
webp: "image",
|
|
1237
|
+
bmp: "image",
|
|
1238
|
+
tiff: "image",
|
|
1239
|
+
ico: "image",
|
|
1240
|
+
// Video
|
|
1241
|
+
mp4: "video",
|
|
1242
|
+
mkv: "video",
|
|
1243
|
+
webm: "video",
|
|
1244
|
+
avi: "video",
|
|
1245
|
+
mov: "video",
|
|
1246
|
+
m4v: "video",
|
|
1247
|
+
// Audio
|
|
1248
|
+
mp3: "audio",
|
|
1249
|
+
wav: "audio",
|
|
1250
|
+
ogg: "audio",
|
|
1251
|
+
flac: "audio",
|
|
1252
|
+
aac: "audio",
|
|
1253
|
+
m4a: "audio",
|
|
1254
|
+
wma: "audio",
|
|
1255
|
+
// Documents
|
|
1256
|
+
pdf: "pdf",
|
|
1257
|
+
docx: "docx",
|
|
1258
|
+
pptx: "pptx",
|
|
1259
|
+
xlsx: "xlsx",
|
|
1260
|
+
// Data
|
|
1261
|
+
csv: "csv",
|
|
1262
|
+
tsv: "csv",
|
|
1263
|
+
// Markup
|
|
1264
|
+
svg: "svg",
|
|
1265
|
+
// Archives
|
|
1266
|
+
zip: "archive",
|
|
1267
|
+
tar: "archive",
|
|
1268
|
+
gz: "archive",
|
|
1269
|
+
tgz: "archive",
|
|
1270
|
+
"7z": "archive",
|
|
1271
|
+
rar: "archive",
|
|
1272
|
+
// Text & Code
|
|
1273
|
+
txt: "text",
|
|
1274
|
+
md: "text",
|
|
1275
|
+
log: "text",
|
|
1276
|
+
json: "text",
|
|
1277
|
+
yaml: "text",
|
|
1278
|
+
yml: "text",
|
|
1279
|
+
xml: "text",
|
|
1280
|
+
html: "text",
|
|
1281
|
+
htm: "text",
|
|
1282
|
+
css: "text",
|
|
1283
|
+
js: "text",
|
|
1284
|
+
ts: "text",
|
|
1285
|
+
jsx: "text",
|
|
1286
|
+
tsx: "text",
|
|
1287
|
+
py: "text",
|
|
1288
|
+
java: "text",
|
|
1289
|
+
go: "text",
|
|
1290
|
+
rs: "text",
|
|
1291
|
+
rb: "text",
|
|
1292
|
+
php: "text",
|
|
1293
|
+
c: "text",
|
|
1294
|
+
cpp: "text",
|
|
1295
|
+
h: "text",
|
|
1296
|
+
cs: "text",
|
|
1297
|
+
swift: "text",
|
|
1298
|
+
kt: "text",
|
|
1299
|
+
scala: "text",
|
|
1300
|
+
sql: "text",
|
|
1301
|
+
sh: "text",
|
|
1302
|
+
bash: "text",
|
|
1303
|
+
zsh: "text",
|
|
1304
|
+
toml: "text",
|
|
1305
|
+
ini: "text",
|
|
1306
|
+
cfg: "text",
|
|
1307
|
+
env: "text",
|
|
1308
|
+
dockerfile: "text",
|
|
1309
|
+
makefile: "text",
|
|
1310
|
+
};
|
|
1311
|
+
return extensionMap[ext.toLowerCase()] || "unknown";
|
|
1312
|
+
}
|
|
1313
|
+
/**
|
|
1314
|
+
* Whether a file type contains readable text content.
|
|
1315
|
+
* For "unknown" types, optionally checks the buffer for valid UTF-8 text.
|
|
1316
|
+
*/
|
|
1317
|
+
isTextType(type, buffer) {
|
|
1318
|
+
if (["text", "csv", "svg"].includes(type)) {
|
|
1319
|
+
return true;
|
|
1320
|
+
}
|
|
1321
|
+
// For unknown types, heuristically check if the buffer is likely text
|
|
1322
|
+
if (type === "unknown" && buffer && buffer.length > 0) {
|
|
1323
|
+
return FileReferenceRegistry.looksLikeText(buffer);
|
|
1324
|
+
}
|
|
1325
|
+
return false;
|
|
1326
|
+
}
|
|
1327
|
+
/**
|
|
1328
|
+
* Heuristic check: does a buffer look like valid text content?
|
|
1329
|
+
* Checks the first 512 bytes for mostly printable ASCII/UTF-8 characters.
|
|
1330
|
+
* Returns true if >90% of bytes are printable (ASCII 0x20-0x7E, tab, newline, CR).
|
|
1331
|
+
*/
|
|
1332
|
+
static looksLikeText(buffer) {
|
|
1333
|
+
const sampleSize = Math.min(buffer.length, 512);
|
|
1334
|
+
let printable = 0;
|
|
1335
|
+
for (let i = 0; i < sampleSize; i++) {
|
|
1336
|
+
const b = buffer[i];
|
|
1337
|
+
// Printable ASCII, tab, newline, carriage return, or high bytes (UTF-8 multibyte)
|
|
1338
|
+
if ((b >= 0x20 && b <= 0x7e) ||
|
|
1339
|
+
b === 0x09 ||
|
|
1340
|
+
b === 0x0a ||
|
|
1341
|
+
b === 0x0d ||
|
|
1342
|
+
b >= 0x80) {
|
|
1343
|
+
printable++;
|
|
1344
|
+
}
|
|
1345
|
+
}
|
|
1346
|
+
return printable / sampleSize > 0.9;
|
|
1347
|
+
}
|
|
1348
|
+
/**
|
|
1349
|
+
* Guess MIME type from file type and extension.
|
|
1350
|
+
*/
|
|
1351
|
+
guessMimeType(type, ext) {
|
|
1352
|
+
const mimeMap = {
|
|
1353
|
+
// By file type
|
|
1354
|
+
csv: "text/csv",
|
|
1355
|
+
svg: "image/svg+xml",
|
|
1356
|
+
pdf: "application/pdf",
|
|
1357
|
+
docx: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
1358
|
+
pptx: "application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
|
1359
|
+
xlsx: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
1360
|
+
video: "video/mp4",
|
|
1361
|
+
audio: "audio/mpeg",
|
|
1362
|
+
archive: "application/zip",
|
|
1363
|
+
image: "image/png",
|
|
1364
|
+
};
|
|
1365
|
+
if (mimeMap[type]) {
|
|
1366
|
+
return mimeMap[type];
|
|
1367
|
+
}
|
|
1368
|
+
// By extension
|
|
1369
|
+
const extMime = {
|
|
1370
|
+
png: "image/png",
|
|
1371
|
+
jpg: "image/jpeg",
|
|
1372
|
+
jpeg: "image/jpeg",
|
|
1373
|
+
gif: "image/gif",
|
|
1374
|
+
webp: "image/webp",
|
|
1375
|
+
mp4: "video/mp4",
|
|
1376
|
+
mkv: "video/x-matroska",
|
|
1377
|
+
webm: "video/webm",
|
|
1378
|
+
avi: "video/x-msvideo",
|
|
1379
|
+
mov: "video/quicktime",
|
|
1380
|
+
mp3: "audio/mpeg",
|
|
1381
|
+
wav: "audio/wav",
|
|
1382
|
+
ogg: "audio/ogg",
|
|
1383
|
+
flac: "audio/flac",
|
|
1384
|
+
json: "application/json",
|
|
1385
|
+
xml: "application/xml",
|
|
1386
|
+
html: "text/html",
|
|
1387
|
+
css: "text/css",
|
|
1388
|
+
js: "text/javascript",
|
|
1389
|
+
ts: "text/typescript",
|
|
1390
|
+
py: "text/x-python",
|
|
1391
|
+
zip: "application/zip",
|
|
1392
|
+
tar: "application/x-tar",
|
|
1393
|
+
gz: "application/gzip",
|
|
1394
|
+
};
|
|
1395
|
+
return extMime[ext.toLowerCase()] || "application/octet-stream";
|
|
1396
|
+
}
|
|
1397
|
+
/**
|
|
1398
|
+
* Guess file extension from magic bytes.
|
|
1399
|
+
*/
|
|
1400
|
+
guessExtension(buffer) {
|
|
1401
|
+
if (buffer.length < 4) {
|
|
1402
|
+
return "";
|
|
1403
|
+
}
|
|
1404
|
+
if (buffer[0] === 0x89 && buffer[1] === 0x50) {
|
|
1405
|
+
return ".png";
|
|
1406
|
+
}
|
|
1407
|
+
if (buffer[0] === 0xff && buffer[1] === 0xd8) {
|
|
1408
|
+
return ".jpg";
|
|
1409
|
+
}
|
|
1410
|
+
if (buffer[0] === 0x25 && buffer[1] === 0x50) {
|
|
1411
|
+
return ".pdf";
|
|
1412
|
+
}
|
|
1413
|
+
if (buffer[0] === 0x50 && buffer[1] === 0x4b) {
|
|
1414
|
+
return ".zip";
|
|
1415
|
+
}
|
|
1416
|
+
if (buffer[0] === 0x49 && buffer[1] === 0x44) {
|
|
1417
|
+
return ".mp3";
|
|
1418
|
+
}
|
|
1419
|
+
// MP4/MOV/M4V — ftyp atom at offset 4
|
|
1420
|
+
if (buffer.length >= 8 &&
|
|
1421
|
+
buffer[4] === 0x66 &&
|
|
1422
|
+
buffer[5] === 0x74 &&
|
|
1423
|
+
buffer[6] === 0x79 &&
|
|
1424
|
+
buffer[7] === 0x70) {
|
|
1425
|
+
// Check the brand to distinguish MOV vs MP4
|
|
1426
|
+
const brand = buffer.toString("ascii", 8, 12);
|
|
1427
|
+
if (brand === "qt ") {
|
|
1428
|
+
return ".mov";
|
|
1429
|
+
}
|
|
1430
|
+
return ".mp4";
|
|
1431
|
+
}
|
|
1432
|
+
// MKV/WebM — EBML header (0x1A 0x45 0xDF 0xA3)
|
|
1433
|
+
if (buffer.length >= 4 &&
|
|
1434
|
+
buffer[0] === 0x1a &&
|
|
1435
|
+
buffer[1] === 0x45 &&
|
|
1436
|
+
buffer[2] === 0xdf &&
|
|
1437
|
+
buffer[3] === 0xa3) {
|
|
1438
|
+
return ".mkv";
|
|
1439
|
+
}
|
|
1440
|
+
// AVI — RIFF....AVI
|
|
1441
|
+
if (buffer.length >= 12 &&
|
|
1442
|
+
buffer[0] === 0x52 &&
|
|
1443
|
+
buffer[1] === 0x49 &&
|
|
1444
|
+
buffer[2] === 0x46 &&
|
|
1445
|
+
buffer[3] === 0x46 &&
|
|
1446
|
+
buffer[8] === 0x41 &&
|
|
1447
|
+
buffer[9] === 0x56 &&
|
|
1448
|
+
buffer[10] === 0x49) {
|
|
1449
|
+
return ".avi";
|
|
1450
|
+
}
|
|
1451
|
+
// WAV — RIFF....WAVE
|
|
1452
|
+
if (buffer.length >= 12 &&
|
|
1453
|
+
buffer[0] === 0x52 &&
|
|
1454
|
+
buffer[1] === 0x49 &&
|
|
1455
|
+
buffer[2] === 0x46 &&
|
|
1456
|
+
buffer[3] === 0x46 &&
|
|
1457
|
+
buffer[8] === 0x57 &&
|
|
1458
|
+
buffer[9] === 0x41 &&
|
|
1459
|
+
buffer[10] === 0x56 &&
|
|
1460
|
+
buffer[11] === 0x45) {
|
|
1461
|
+
return ".wav";
|
|
1462
|
+
}
|
|
1463
|
+
// FLAC
|
|
1464
|
+
if (buffer.length >= 4 &&
|
|
1465
|
+
buffer[0] === 0x66 &&
|
|
1466
|
+
buffer[1] === 0x4c &&
|
|
1467
|
+
buffer[2] === 0x61 &&
|
|
1468
|
+
buffer[3] === 0x43) {
|
|
1469
|
+
return ".flac";
|
|
1470
|
+
}
|
|
1471
|
+
// OGG
|
|
1472
|
+
if (buffer.length >= 4 &&
|
|
1473
|
+
buffer[0] === 0x4f &&
|
|
1474
|
+
buffer[1] === 0x67 &&
|
|
1475
|
+
buffer[2] === 0x67 &&
|
|
1476
|
+
buffer[3] === 0x53) {
|
|
1477
|
+
return ".ogg";
|
|
1478
|
+
}
|
|
1479
|
+
return "";
|
|
1480
|
+
}
|
|
1481
|
+
/**
|
|
1482
|
+
* Persist a buffer to the temp directory.
|
|
1483
|
+
*/
|
|
1484
|
+
async persistToTemp(id, buffer, ext) {
|
|
1485
|
+
// Check temp space budget
|
|
1486
|
+
if (this.currentTempBytes + buffer.length > this.maxTempBytes) {
|
|
1487
|
+
// Try evicting oldest files
|
|
1488
|
+
this.evictLRU();
|
|
1489
|
+
if (this.currentTempBytes + buffer.length > this.maxTempBytes) {
|
|
1490
|
+
throw new Error(`Temp directory budget exceeded (${this.formatSize(this.maxTempBytes)})`);
|
|
1491
|
+
}
|
|
1492
|
+
}
|
|
1493
|
+
// Ensure temp directory exists
|
|
1494
|
+
if (!this.tempDirCreated) {
|
|
1495
|
+
await mkdir(this.tempDir, { recursive: true });
|
|
1496
|
+
this.tempDirCreated = true;
|
|
1497
|
+
}
|
|
1498
|
+
const tempPath = join(this.tempDir, `${id}${ext ? `.${ext}` : ""}`);
|
|
1499
|
+
await writeFile(tempPath, buffer);
|
|
1500
|
+
this.currentTempBytes += buffer.length;
|
|
1501
|
+
return tempPath;
|
|
1502
|
+
}
|
|
1503
|
+
/**
|
|
1504
|
+
* Evict the least recently used file reference.
|
|
1505
|
+
*/
|
|
1506
|
+
evictLRU() {
|
|
1507
|
+
let oldest = null;
|
|
1508
|
+
let oldestId = null;
|
|
1509
|
+
for (const [id, ref] of this.files) {
|
|
1510
|
+
if (!oldest || ref.lastAccessedAt < oldest.lastAccessedAt) {
|
|
1511
|
+
oldest = ref;
|
|
1512
|
+
oldestId = id;
|
|
1513
|
+
}
|
|
1514
|
+
}
|
|
1515
|
+
if (oldestId && oldest) {
|
|
1516
|
+
logger.info(`[FileReferenceRegistry] Evicting LRU: "${oldest.filename}" ` +
|
|
1517
|
+
`(last accessed ${new Date(oldest.lastAccessedAt).toISOString()})`);
|
|
1518
|
+
// Clean up temp file if we created it
|
|
1519
|
+
if (oldest.tempPath && oldest.source !== "path") {
|
|
1520
|
+
unlink(oldest.tempPath).catch(() => {
|
|
1521
|
+
// Ignore cleanup errors
|
|
1522
|
+
});
|
|
1523
|
+
this.currentTempBytes -= oldest.sizeBytes;
|
|
1524
|
+
}
|
|
1525
|
+
this.files.delete(oldestId);
|
|
1526
|
+
}
|
|
1527
|
+
}
|
|
1528
|
+
/**
|
|
1529
|
+
* Format byte size as human-readable string.
|
|
1530
|
+
*/
|
|
1531
|
+
formatSize(bytes) {
|
|
1532
|
+
if (bytes < 1024) {
|
|
1533
|
+
return `${bytes} B`;
|
|
1534
|
+
}
|
|
1535
|
+
if (bytes < 1024 * 1024) {
|
|
1536
|
+
return `${(bytes / 1024).toFixed(1)} KB`;
|
|
1537
|
+
}
|
|
1538
|
+
if (bytes < 1024 * 1024 * 1024) {
|
|
1539
|
+
return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
|
|
1540
|
+
}
|
|
1541
|
+
return `${(bytes / (1024 * 1024 * 1024)).toFixed(2)} GB`;
|
|
1542
|
+
}
|
|
1543
|
+
}
|