@framers/agentos 0.1.112 → 0.1.114
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +39 -5
- package/dist/api/AgentOS.d.ts +45 -12
- package/dist/api/AgentOS.d.ts.map +1 -1
- package/dist/api/AgentOS.js +225 -78
- package/dist/api/AgentOS.js.map +1 -1
- package/dist/api/AgentOSOrchestrator.d.ts +8 -0
- package/dist/api/AgentOSOrchestrator.d.ts.map +1 -1
- package/dist/api/AgentOSOrchestrator.js +350 -59
- package/dist/api/AgentOSOrchestrator.js.map +1 -1
- package/dist/api/StreamChunkEmitter.d.ts.map +1 -1
- package/dist/api/StreamChunkEmitter.js +2 -0
- package/dist/api/StreamChunkEmitter.js.map +1 -1
- package/dist/api/agency.d.ts.map +1 -1
- package/dist/api/agency.js +47 -1
- package/dist/api/agency.js.map +1 -1
- package/dist/api/agent.d.ts +18 -5
- package/dist/api/agent.d.ts.map +1 -1
- package/dist/api/agent.js +48 -9
- package/dist/api/agent.js.map +1 -1
- package/dist/api/agentExport.d.ts +202 -0
- package/dist/api/agentExport.d.ts.map +1 -0
- package/dist/api/agentExport.js +323 -0
- package/dist/api/agentExport.js.map +1 -0
- package/dist/api/editImage.d.ts +119 -0
- package/dist/api/editImage.d.ts.map +1 -0
- package/dist/api/editImage.js +150 -0
- package/dist/api/editImage.js.map +1 -0
- package/dist/api/embedText.d.ts +137 -0
- package/dist/api/embedText.d.ts.map +1 -0
- package/dist/api/embedText.js +229 -0
- package/dist/api/embedText.js.map +1 -0
- package/dist/api/externalToolRegistry.d.ts +44 -0
- package/dist/api/externalToolRegistry.d.ts.map +1 -0
- package/dist/api/externalToolRegistry.js +245 -0
- package/dist/api/externalToolRegistry.js.map +1 -0
- package/dist/api/generateImage.d.ts +1 -1
- package/dist/api/generateImage.d.ts.map +1 -1
- package/dist/api/generateImage.js +17 -13
- package/dist/api/generateImage.js.map +1 -1
- package/dist/api/generateObject.d.ts +185 -0
- package/dist/api/generateObject.d.ts.map +1 -0
- package/dist/api/generateObject.js +249 -0
- package/dist/api/generateObject.js.map +1 -0
- package/dist/api/generateText.d.ts +13 -3
- package/dist/api/generateText.d.ts.map +1 -1
- package/dist/api/generateText.js +20 -5
- package/dist/api/generateText.js.map +1 -1
- package/dist/api/interfaces/IAgentOS.d.ts +29 -1
- package/dist/api/interfaces/IAgentOS.d.ts.map +1 -1
- package/dist/api/model.d.ts +7 -7
- package/dist/api/model.d.ts.map +1 -1
- package/dist/api/model.js +22 -16
- package/dist/api/model.js.map +1 -1
- package/dist/api/processRequestWithExternalTools.d.ts +26 -0
- package/dist/api/processRequestWithExternalTools.d.ts.map +1 -0
- package/dist/api/processRequestWithExternalTools.js +52 -0
- package/dist/api/processRequestWithExternalTools.js.map +1 -0
- package/dist/api/processRequestWithRegisteredTools.d.ts +56 -0
- package/dist/api/processRequestWithRegisteredTools.d.ts.map +1 -0
- package/dist/api/processRequestWithRegisteredTools.js +125 -0
- package/dist/api/processRequestWithRegisteredTools.js.map +1 -0
- package/dist/api/provider-defaults.d.ts.map +1 -1
- package/dist/api/provider-defaults.js +28 -0
- package/dist/api/provider-defaults.js.map +1 -1
- package/dist/api/resumeExternalToolRequestWithRegisteredTools.d.ts +71 -0
- package/dist/api/resumeExternalToolRequestWithRegisteredTools.d.ts.map +1 -0
- package/dist/api/resumeExternalToolRequestWithRegisteredTools.js +159 -0
- package/dist/api/resumeExternalToolRequestWithRegisteredTools.js.map +1 -0
- package/dist/api/strategies/agentGraphBuilder.d.ts +170 -0
- package/dist/api/strategies/agentGraphBuilder.d.ts.map +1 -0
- package/dist/api/strategies/agentGraphBuilder.js +299 -0
- package/dist/api/strategies/agentGraphBuilder.js.map +1 -0
- package/dist/api/strategies/debate.d.ts +12 -1
- package/dist/api/strategies/debate.d.ts.map +1 -1
- package/dist/api/strategies/debate.js +41 -5
- package/dist/api/strategies/debate.js.map +1 -1
- package/dist/api/strategies/graphCompiler.d.ts +84 -0
- package/dist/api/strategies/graphCompiler.d.ts.map +1 -0
- package/dist/api/strategies/graphCompiler.js +617 -0
- package/dist/api/strategies/graphCompiler.js.map +1 -0
- package/dist/api/strategies/hierarchical.d.ts +15 -1
- package/dist/api/strategies/hierarchical.d.ts.map +1 -1
- package/dist/api/strategies/hierarchical.js +53 -8
- package/dist/api/strategies/hierarchical.js.map +1 -1
- package/dist/api/strategies/index.d.ts +29 -4
- package/dist/api/strategies/index.d.ts.map +1 -1
- package/dist/api/strategies/index.js +28 -4
- package/dist/api/strategies/index.js.map +1 -1
- package/dist/api/strategies/parallel.d.ts +15 -4
- package/dist/api/strategies/parallel.d.ts.map +1 -1
- package/dist/api/strategies/parallel.js +53 -16
- package/dist/api/strategies/parallel.js.map +1 -1
- package/dist/api/strategies/review-loop.d.ts +15 -1
- package/dist/api/strategies/review-loop.d.ts.map +1 -1
- package/dist/api/strategies/review-loop.js +36 -10
- package/dist/api/strategies/review-loop.js.map +1 -1
- package/dist/api/strategies/sequential.d.ts +11 -1
- package/dist/api/strategies/sequential.d.ts.map +1 -1
- package/dist/api/strategies/sequential.js +39 -8
- package/dist/api/strategies/sequential.js.map +1 -1
- package/dist/api/strategies/shared.d.ts +72 -8
- package/dist/api/strategies/shared.d.ts.map +1 -1
- package/dist/api/strategies/shared.js +92 -12
- package/dist/api/strategies/shared.js.map +1 -1
- package/dist/api/streamObject.d.ts +166 -0
- package/dist/api/streamObject.d.ts.map +1 -0
- package/dist/api/streamObject.js +268 -0
- package/dist/api/streamObject.js.map +1 -0
- package/dist/api/streamText.d.ts +1 -1
- package/dist/api/streamText.d.ts.map +1 -1
- package/dist/api/streamText.js +26 -8
- package/dist/api/streamText.js.map +1 -1
- package/dist/api/toolAdapter.d.ts +44 -8
- package/dist/api/toolAdapter.d.ts.map +1 -1
- package/dist/api/toolAdapter.js +224 -45
- package/dist/api/toolAdapter.js.map +1 -1
- package/dist/api/types/AgentOSExternalToolRequest.d.ts +35 -0
- package/dist/api/types/AgentOSExternalToolRequest.d.ts.map +1 -0
- package/dist/api/types/AgentOSExternalToolRequest.js +2 -0
- package/dist/api/types/AgentOSExternalToolRequest.js.map +1 -0
- package/dist/api/types/AgentOSResponse.d.ts +25 -0
- package/dist/api/types/AgentOSResponse.d.ts.map +1 -1
- package/dist/api/types/AgentOSResponse.js +20 -0
- package/dist/api/types/AgentOSResponse.js.map +1 -1
- package/dist/api/types/AgentOSToolResult.d.ts +11 -0
- package/dist/api/types/AgentOSToolResult.d.ts.map +1 -0
- package/dist/api/types/AgentOSToolResult.js +2 -0
- package/dist/api/types/AgentOSToolResult.js.map +1 -0
- package/dist/api/types.d.ts +81 -4
- package/dist/api/types.d.ts.map +1 -1
- package/dist/api/types.js.map +1 -1
- package/dist/api/upscaleImage.d.ts +92 -0
- package/dist/api/upscaleImage.d.ts.map +1 -0
- package/dist/api/upscaleImage.js +133 -0
- package/dist/api/upscaleImage.js.map +1 -0
- package/dist/api/variateImage.d.ts +102 -0
- package/dist/api/variateImage.d.ts.map +1 -0
- package/dist/api/variateImage.js +154 -0
- package/dist/api/variateImage.js.map +1 -0
- package/dist/cognitive_substrate/GMI.d.ts +16 -2
- package/dist/cognitive_substrate/GMI.d.ts.map +1 -1
- package/dist/cognitive_substrate/GMI.js +188 -56
- package/dist/cognitive_substrate/GMI.js.map +1 -1
- package/dist/cognitive_substrate/IGMI.d.ts +10 -0
- package/dist/cognitive_substrate/IGMI.d.ts.map +1 -1
- package/dist/cognitive_substrate/IGMI.js.map +1 -1
- package/dist/config/AgentOSConfig.d.ts +19 -2
- package/dist/config/AgentOSConfig.d.ts.map +1 -1
- package/dist/config/AgentOSConfig.js +46 -29
- package/dist/config/AgentOSConfig.js.map +1 -1
- package/dist/core/guardrails/IGuardrailService.d.ts +1 -1
- package/dist/core/images/IImageProvider.d.ts +93 -0
- package/dist/core/images/IImageProvider.d.ts.map +1 -1
- package/dist/core/images/IImageProvider.js.map +1 -1
- package/dist/core/images/ImageOperationError.d.ts +52 -0
- package/dist/core/images/ImageOperationError.d.ts.map +1 -0
- package/dist/core/images/ImageOperationError.js +58 -0
- package/dist/core/images/ImageOperationError.js.map +1 -0
- package/dist/core/images/imageToBuffer.d.ts +41 -0
- package/dist/core/images/imageToBuffer.d.ts.map +1 -0
- package/dist/core/images/imageToBuffer.js +95 -0
- package/dist/core/images/imageToBuffer.js.map +1 -0
- package/dist/core/images/index.d.ts +4 -0
- package/dist/core/images/index.d.ts.map +1 -1
- package/dist/core/images/index.js +8 -0
- package/dist/core/images/index.js.map +1 -1
- package/dist/core/images/providers/FalImageProvider.d.ts +208 -0
- package/dist/core/images/providers/FalImageProvider.d.ts.map +1 -0
- package/dist/core/images/providers/FalImageProvider.js +301 -0
- package/dist/core/images/providers/FalImageProvider.js.map +1 -0
- package/dist/core/images/providers/FluxImageProvider.d.ts +197 -0
- package/dist/core/images/providers/FluxImageProvider.d.ts.map +1 -0
- package/dist/core/images/providers/FluxImageProvider.js +271 -0
- package/dist/core/images/providers/FluxImageProvider.js.map +1 -0
- package/dist/core/images/providers/OpenAIImageProvider.d.ts +33 -1
- package/dist/core/images/providers/OpenAIImageProvider.d.ts.map +1 -1
- package/dist/core/images/providers/OpenAIImageProvider.js +125 -0
- package/dist/core/images/providers/OpenAIImageProvider.js.map +1 -1
- package/dist/core/images/providers/ReplicateImageProvider.d.ts +26 -1
- package/dist/core/images/providers/ReplicateImageProvider.d.ts.map +1 -1
- package/dist/core/images/providers/ReplicateImageProvider.js +118 -0
- package/dist/core/images/providers/ReplicateImageProvider.js.map +1 -1
- package/dist/core/images/providers/StabilityImageProvider.d.ts +41 -1
- package/dist/core/images/providers/StabilityImageProvider.d.ts.map +1 -1
- package/dist/core/images/providers/StabilityImageProvider.js +180 -7
- package/dist/core/images/providers/StabilityImageProvider.js.map +1 -1
- package/dist/core/images/providers/StableDiffusionLocalProvider.d.ts +29 -1
- package/dist/core/images/providers/StableDiffusionLocalProvider.d.ts.map +1 -1
- package/dist/core/images/providers/StableDiffusionLocalProvider.js +124 -0
- package/dist/core/images/providers/StableDiffusionLocalProvider.js.map +1 -1
- package/dist/core/llm/IPromptEngine.d.ts +2 -2
- package/dist/core/llm/IPromptEngine.d.ts.map +1 -1
- package/dist/core/llm/IPromptEngine.js +2 -2
- package/dist/core/llm/IPromptEngine.js.map +1 -1
- package/dist/core/llm/providers/AIModelProviderManager.d.ts +7 -1
- package/dist/core/llm/providers/AIModelProviderManager.d.ts.map +1 -1
- package/dist/core/llm/providers/AIModelProviderManager.js +24 -0
- package/dist/core/llm/providers/AIModelProviderManager.js.map +1 -1
- package/dist/core/llm/providers/errors/AnthropicProviderError.d.ts +42 -0
- package/dist/core/llm/providers/errors/AnthropicProviderError.d.ts.map +1 -0
- package/dist/core/llm/providers/errors/AnthropicProviderError.js +45 -0
- package/dist/core/llm/providers/errors/AnthropicProviderError.js.map +1 -0
- package/dist/core/llm/providers/errors/GeminiProviderError.d.ts +45 -0
- package/dist/core/llm/providers/errors/GeminiProviderError.d.ts.map +1 -0
- package/dist/core/llm/providers/errors/GeminiProviderError.js +46 -0
- package/dist/core/llm/providers/errors/GeminiProviderError.js.map +1 -0
- package/dist/core/llm/providers/errors/OllamaProviderError.d.ts +1 -1
- package/dist/core/llm/providers/errors/OllamaProviderError.d.ts.map +1 -1
- package/dist/core/llm/providers/errors/OllamaProviderError.js +1 -1
- package/dist/core/llm/providers/errors/OllamaProviderError.js.map +1 -1
- package/dist/core/llm/providers/errors/OpenAIProviderError.d.ts +1 -1
- package/dist/core/llm/providers/errors/OpenAIProviderError.js +1 -1
- package/dist/core/llm/providers/errors/OpenRouterProviderError.d.ts +1 -1
- package/dist/core/llm/providers/errors/OpenRouterProviderError.js +1 -1
- package/dist/core/llm/providers/implementations/AnthropicProvider.d.ts +340 -0
- package/dist/core/llm/providers/implementations/AnthropicProvider.d.ts.map +1 -0
- package/dist/core/llm/providers/implementations/AnthropicProvider.js +959 -0
- package/dist/core/llm/providers/implementations/AnthropicProvider.js.map +1 -0
- package/dist/core/llm/providers/implementations/GeminiProvider.d.ts +339 -0
- package/dist/core/llm/providers/implementations/GeminiProvider.d.ts.map +1 -0
- package/dist/core/llm/providers/implementations/GeminiProvider.js +1004 -0
- package/dist/core/llm/providers/implementations/GeminiProvider.js.map +1 -0
- package/dist/core/llm/providers/implementations/GroqProvider.d.ts +105 -0
- package/dist/core/llm/providers/implementations/GroqProvider.d.ts.map +1 -0
- package/dist/core/llm/providers/implementations/GroqProvider.js +134 -0
- package/dist/core/llm/providers/implementations/GroqProvider.js.map +1 -0
- package/dist/core/llm/providers/implementations/MistralProvider.d.ts +105 -0
- package/dist/core/llm/providers/implementations/MistralProvider.d.ts.map +1 -0
- package/dist/core/llm/providers/implementations/MistralProvider.js +146 -0
- package/dist/core/llm/providers/implementations/MistralProvider.js.map +1 -0
- package/dist/core/llm/providers/implementations/TogetherProvider.d.ts +107 -0
- package/dist/core/llm/providers/implementations/TogetherProvider.d.ts.map +1 -0
- package/dist/core/llm/providers/implementations/TogetherProvider.js +138 -0
- package/dist/core/llm/providers/implementations/TogetherProvider.js.map +1 -0
- package/dist/core/llm/providers/implementations/XAIProvider.d.ts +102 -0
- package/dist/core/llm/providers/implementations/XAIProvider.d.ts.map +1 -0
- package/dist/core/llm/providers/implementations/XAIProvider.js +123 -0
- package/dist/core/llm/providers/implementations/XAIProvider.js.map +1 -0
- package/dist/core/orchestration/AgentOrchestrator.d.ts.map +1 -1
- package/dist/core/orchestration/AgentOrchestrator.js +26 -5
- package/dist/core/orchestration/AgentOrchestrator.js.map +1 -1
- package/dist/core/tools/IToolOrchestrator.d.ts +2 -2
- package/dist/core/tools/IToolOrchestrator.d.ts.map +1 -1
- package/dist/core/tools/ToolExecutor.d.ts +3 -0
- package/dist/core/tools/ToolExecutor.d.ts.map +1 -1
- package/dist/core/tools/ToolExecutor.js +2 -1
- package/dist/core/tools/ToolExecutor.js.map +1 -1
- package/dist/core/tools/ToolOrchestrator.d.ts +7 -7
- package/dist/core/tools/ToolOrchestrator.d.ts.map +1 -1
- package/dist/core/tools/ToolOrchestrator.js +135 -36
- package/dist/core/tools/ToolOrchestrator.js.map +1 -1
- package/dist/core/tools/permissions/ToolPermissionManager.d.ts +6 -5
- package/dist/core/tools/permissions/ToolPermissionManager.d.ts.map +1 -1
- package/dist/core/tools/permissions/ToolPermissionManager.js +47 -21
- package/dist/core/tools/permissions/ToolPermissionManager.js.map +1 -1
- package/dist/core/vision/VisionPipeline.d.ts +437 -0
- package/dist/core/vision/VisionPipeline.d.ts.map +1 -0
- package/dist/core/vision/VisionPipeline.js +1113 -0
- package/dist/core/vision/VisionPipeline.js.map +1 -0
- package/dist/core/vision/index.d.ts +97 -0
- package/dist/core/vision/index.d.ts.map +1 -0
- package/dist/core/vision/index.js +182 -0
- package/dist/core/vision/index.js.map +1 -0
- package/dist/core/vision/providers/LLMVisionProvider.d.ts +135 -0
- package/dist/core/vision/providers/LLMVisionProvider.d.ts.map +1 -0
- package/dist/core/vision/providers/LLMVisionProvider.js +136 -0
- package/dist/core/vision/providers/LLMVisionProvider.js.map +1 -0
- package/dist/core/vision/providers/PipelineVisionProvider.d.ts +154 -0
- package/dist/core/vision/providers/PipelineVisionProvider.d.ts.map +1 -0
- package/dist/core/vision/providers/PipelineVisionProvider.js +160 -0
- package/dist/core/vision/providers/PipelineVisionProvider.js.map +1 -0
- package/dist/core/vision/types.d.ts +286 -0
- package/dist/core/vision/types.d.ts.map +1 -0
- package/dist/core/vision/types.js +24 -0
- package/dist/core/vision/types.js.map +1 -0
- package/dist/discovery/CapabilityDiscoveryEngine.d.ts +1 -1
- package/dist/discovery/CapabilityDiscoveryEngine.d.ts.map +1 -1
- package/dist/discovery/CapabilityDiscoveryEngine.js +1 -1
- package/dist/discovery/CapabilityDiscoveryEngine.js.map +1 -1
- package/dist/emergent/ComposableToolBuilder.d.ts +15 -4
- package/dist/emergent/ComposableToolBuilder.d.ts.map +1 -1
- package/dist/emergent/ComposableToolBuilder.js +29 -14
- package/dist/emergent/ComposableToolBuilder.js.map +1 -1
- package/dist/emergent/EmergentCapabilityEngine.d.ts +3 -3
- package/dist/emergent/EmergentCapabilityEngine.d.ts.map +1 -1
- package/dist/emergent/EmergentCapabilityEngine.js +15 -12
- package/dist/emergent/EmergentCapabilityEngine.js.map +1 -1
- package/dist/emergent/EmergentJudge.d.ts +20 -0
- package/dist/emergent/EmergentJudge.d.ts.map +1 -1
- package/dist/emergent/EmergentJudge.js +121 -26
- package/dist/emergent/EmergentJudge.js.map +1 -1
- package/dist/emergent/EmergentToolRegistry.d.ts +17 -0
- package/dist/emergent/EmergentToolRegistry.d.ts.map +1 -1
- package/dist/emergent/EmergentToolRegistry.js +26 -0
- package/dist/emergent/EmergentToolRegistry.js.map +1 -1
- package/dist/emergent/ForgeToolMetaTool.d.ts +1 -1
- package/dist/emergent/ForgeToolMetaTool.d.ts.map +1 -1
- package/dist/emergent/ForgeToolMetaTool.js +15 -2
- package/dist/emergent/ForgeToolMetaTool.js.map +1 -1
- package/dist/emergent/SandboxedToolForge.d.ts +2 -2
- package/dist/emergent/SandboxedToolForge.d.ts.map +1 -1
- package/dist/emergent/SandboxedToolForge.js +13 -23
- package/dist/emergent/SandboxedToolForge.js.map +1 -1
- package/dist/emergent/SkillExporter.d.ts +119 -0
- package/dist/emergent/SkillExporter.d.ts.map +1 -0
- package/dist/emergent/SkillExporter.js +344 -0
- package/dist/emergent/SkillExporter.js.map +1 -0
- package/dist/emergent/index.d.ts +1 -0
- package/dist/emergent/index.d.ts.map +1 -1
- package/dist/emergent/index.js +1 -0
- package/dist/emergent/index.js.map +1 -1
- package/dist/emergent/types.d.ts +4 -4
- package/dist/index.d.ts +30 -5
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +19 -2
- package/dist/index.js.map +1 -1
- package/dist/memory/facade/Memory.d.ts.map +1 -1
- package/dist/memory/facade/Memory.js +8 -0
- package/dist/memory/facade/Memory.js.map +1 -1
- package/dist/memory/facade/types.d.ts +10 -0
- package/dist/memory/facade/types.d.ts.map +1 -1
- package/dist/memory/index.d.ts +15 -7
- package/dist/memory/index.d.ts.map +1 -1
- package/dist/memory/index.js +7 -0
- package/dist/memory/index.js.map +1 -1
- package/dist/memory/ingestion/DoclingLoader.d.ts +3 -3
- package/dist/memory/ingestion/DoclingLoader.d.ts.map +1 -1
- package/dist/memory/ingestion/DoclingLoader.js +12 -8
- package/dist/memory/ingestion/DoclingLoader.js.map +1 -1
- package/dist/memory/ingestion/FolderScanner.d.ts +7 -7
- package/dist/memory/ingestion/FolderScanner.d.ts.map +1 -1
- package/dist/memory/ingestion/FolderScanner.js +6 -6
- package/dist/memory/ingestion/FolderScanner.js.map +1 -1
- package/dist/memory/ingestion/LoaderRegistry.d.ts +8 -8
- package/dist/memory/ingestion/LoaderRegistry.d.ts.map +1 -1
- package/dist/memory/ingestion/LoaderRegistry.js +9 -11
- package/dist/memory/ingestion/LoaderRegistry.js.map +1 -1
- package/dist/memory/ingestion/MultimodalAggregator.d.ts +1 -1
- package/dist/memory/ingestion/MultimodalAggregator.js +1 -1
- package/dist/memory/ingestion/OcrPdfLoader.d.ts +2 -2
- package/dist/memory/ingestion/OcrPdfLoader.d.ts.map +1 -1
- package/dist/memory/ingestion/OcrPdfLoader.js +12 -8
- package/dist/memory/ingestion/OcrPdfLoader.js.map +1 -1
- package/dist/memory/ingestion/PdfLoader.d.ts +8 -8
- package/dist/memory/ingestion/PdfLoader.d.ts.map +1 -1
- package/dist/memory/ingestion/PdfLoader.js +13 -10
- package/dist/memory/ingestion/PdfLoader.js.map +1 -1
- package/dist/memory/io/MarkdownExporter.d.ts +1 -1
- package/dist/memory/io/MarkdownExporter.d.ts.map +1 -1
- package/dist/memory/io/MarkdownExporter.js +1 -1
- package/dist/memory/io/MarkdownExporter.js.map +1 -1
- package/dist/memory/observation/MemoryObserver.d.ts +63 -1
- package/dist/memory/observation/MemoryObserver.d.ts.map +1 -1
- package/dist/memory/observation/MemoryObserver.js +115 -4
- package/dist/memory/observation/MemoryObserver.js.map +1 -1
- package/dist/memory/observation/ObservationCompressor.d.ts +88 -0
- package/dist/memory/observation/ObservationCompressor.d.ts.map +1 -0
- package/dist/memory/observation/ObservationCompressor.js +207 -0
- package/dist/memory/observation/ObservationCompressor.js.map +1 -0
- package/dist/memory/observation/ObservationReflector.d.ts +82 -0
- package/dist/memory/observation/ObservationReflector.d.ts.map +1 -0
- package/dist/memory/observation/ObservationReflector.js +212 -0
- package/dist/memory/observation/ObservationReflector.js.map +1 -0
- package/dist/memory/observation/temporal.d.ts +54 -0
- package/dist/memory/observation/temporal.d.ts.map +1 -0
- package/dist/memory/observation/temporal.js +115 -0
- package/dist/memory/observation/temporal.js.map +1 -0
- package/dist/memory/tools/MemoryAddTool.d.ts +2 -2
- package/dist/memory/tools/MemoryAddTool.d.ts.map +1 -1
- package/dist/memory/tools/MemoryAddTool.js +8 -3
- package/dist/memory/tools/MemoryAddTool.js.map +1 -1
- package/dist/memory/tools/MemorySearchTool.d.ts +3 -3
- package/dist/memory/tools/MemorySearchTool.d.ts.map +1 -1
- package/dist/memory/tools/MemorySearchTool.js +11 -9
- package/dist/memory/tools/MemorySearchTool.js.map +1 -1
- package/dist/memory/tools/scopeContext.d.ts +11 -0
- package/dist/memory/tools/scopeContext.d.ts.map +1 -0
- package/dist/memory/tools/scopeContext.js +46 -0
- package/dist/memory/tools/scopeContext.js.map +1 -0
- package/dist/orchestration/builders/AgentGraph.d.ts +12 -11
- package/dist/orchestration/builders/AgentGraph.d.ts.map +1 -1
- package/dist/orchestration/builders/AgentGraph.js +12 -11
- package/dist/orchestration/builders/AgentGraph.js.map +1 -1
- package/dist/orchestration/builders/VoiceNodeBuilder.d.ts +82 -25
- package/dist/orchestration/builders/VoiceNodeBuilder.d.ts.map +1 -1
- package/dist/orchestration/builders/VoiceNodeBuilder.js +86 -26
- package/dist/orchestration/builders/VoiceNodeBuilder.js.map +1 -1
- package/dist/orchestration/builders/WorkflowBuilder.d.ts +1 -1
- package/dist/orchestration/builders/WorkflowBuilder.d.ts.map +1 -1
- package/dist/orchestration/builders/WorkflowBuilder.js +1 -1
- package/dist/orchestration/builders/WorkflowBuilder.js.map +1 -1
- package/dist/orchestration/checkpoint/InMemoryCheckpointStore.d.ts +7 -54
- package/dist/orchestration/checkpoint/InMemoryCheckpointStore.d.ts.map +1 -1
- package/dist/orchestration/checkpoint/InMemoryCheckpointStore.js +8 -56
- package/dist/orchestration/checkpoint/InMemoryCheckpointStore.js.map +1 -1
- package/dist/orchestration/events/GraphEvent.d.ts +67 -5
- package/dist/orchestration/events/GraphEvent.d.ts.map +1 -1
- package/dist/orchestration/events/GraphEvent.js.map +1 -1
- package/dist/orchestration/runtime/GraphRuntime.d.ts.map +1 -1
- package/dist/orchestration/runtime/GraphRuntime.js +151 -1
- package/dist/orchestration/runtime/GraphRuntime.js.map +1 -1
- package/dist/orchestration/runtime/LoopController.d.ts +3 -3
- package/dist/orchestration/runtime/LoopController.d.ts.map +1 -1
- package/dist/orchestration/runtime/LoopController.js.map +1 -1
- package/dist/orchestration/runtime/StateManager.d.ts +3 -3
- package/dist/orchestration/runtime/StateManager.js +3 -3
- package/dist/orchestration/runtime/VoiceNodeExecutor.d.ts +103 -26
- package/dist/orchestration/runtime/VoiceNodeExecutor.d.ts.map +1 -1
- package/dist/orchestration/runtime/VoiceNodeExecutor.js +155 -43
- package/dist/orchestration/runtime/VoiceNodeExecutor.js.map +1 -1
- package/dist/orchestration/runtime/VoiceTransportAdapter.d.ts +95 -33
- package/dist/orchestration/runtime/VoiceTransportAdapter.d.ts.map +1 -1
- package/dist/orchestration/runtime/VoiceTransportAdapter.js +83 -29
- package/dist/orchestration/runtime/VoiceTransportAdapter.js.map +1 -1
- package/dist/orchestration/runtime/VoiceTurnCollector.d.ts +73 -20
- package/dist/orchestration/runtime/VoiceTurnCollector.d.ts.map +1 -1
- package/dist/orchestration/runtime/VoiceTurnCollector.js +84 -23
- package/dist/orchestration/runtime/VoiceTurnCollector.js.map +1 -1
- package/dist/query-router/KeywordFallback.d.ts +70 -0
- package/dist/query-router/KeywordFallback.d.ts.map +1 -0
- package/dist/query-router/KeywordFallback.js +132 -0
- package/dist/query-router/KeywordFallback.js.map +1 -0
- package/dist/query-router/QueryClassifier.d.ts +140 -0
- package/dist/query-router/QueryClassifier.d.ts.map +1 -0
- package/dist/query-router/QueryClassifier.js +223 -0
- package/dist/query-router/QueryClassifier.js.map +1 -0
- package/dist/query-router/QueryDispatcher.d.ts +139 -0
- package/dist/query-router/QueryDispatcher.d.ts.map +1 -0
- package/dist/query-router/QueryDispatcher.js +297 -0
- package/dist/query-router/QueryDispatcher.js.map +1 -0
- package/dist/query-router/QueryGenerator.d.ts +184 -0
- package/dist/query-router/QueryGenerator.d.ts.map +1 -0
- package/dist/query-router/QueryGenerator.js +241 -0
- package/dist/query-router/QueryGenerator.js.map +1 -0
- package/dist/query-router/QueryRouter.d.ts +292 -0
- package/dist/query-router/QueryRouter.d.ts.map +1 -0
- package/dist/query-router/QueryRouter.js +803 -0
- package/dist/query-router/QueryRouter.js.map +1 -0
- package/dist/query-router/TopicExtractor.d.ts +73 -0
- package/dist/query-router/TopicExtractor.d.ts.map +1 -0
- package/dist/query-router/TopicExtractor.js +95 -0
- package/dist/query-router/TopicExtractor.js.map +1 -0
- package/dist/query-router/index.d.ts +40 -0
- package/dist/query-router/index.d.ts.map +1 -0
- package/dist/query-router/index.js +46 -0
- package/dist/query-router/index.js.map +1 -0
- package/dist/query-router/types.d.ts +508 -0
- package/dist/query-router/types.d.ts.map +1 -0
- package/dist/query-router/types.js +39 -0
- package/dist/query-router/types.js.map +1 -0
- package/dist/rag/index.d.ts +5 -0
- package/dist/rag/index.d.ts.map +1 -1
- package/dist/rag/index.js +7 -0
- package/dist/rag/index.js.map +1 -1
- package/dist/rag/multimodal/LLMVisionAdapter.d.ts +43 -0
- package/dist/rag/multimodal/LLMVisionAdapter.d.ts.map +1 -0
- package/dist/rag/multimodal/LLMVisionAdapter.js +46 -0
- package/dist/rag/multimodal/LLMVisionAdapter.js.map +1 -0
- package/dist/rag/multimodal/MultimodalIndexer.d.ts +244 -0
- package/dist/rag/multimodal/MultimodalIndexer.d.ts.map +1 -0
- package/dist/rag/multimodal/MultimodalIndexer.js +411 -0
- package/dist/rag/multimodal/MultimodalIndexer.js.map +1 -0
- package/dist/rag/multimodal/MultimodalMemoryBridge.d.ts +448 -0
- package/dist/rag/multimodal/MultimodalMemoryBridge.d.ts.map +1 -0
- package/dist/rag/multimodal/MultimodalMemoryBridge.js +941 -0
- package/dist/rag/multimodal/MultimodalMemoryBridge.js.map +1 -0
- package/dist/rag/multimodal/SpeechProviderAdapter.d.ts +139 -0
- package/dist/rag/multimodal/SpeechProviderAdapter.d.ts.map +1 -0
- package/dist/rag/multimodal/SpeechProviderAdapter.js +143 -0
- package/dist/rag/multimodal/SpeechProviderAdapter.js.map +1 -0
- package/dist/rag/multimodal/createMultimodalIndexerFromResolver.d.ts +172 -0
- package/dist/rag/multimodal/createMultimodalIndexerFromResolver.d.ts.map +1 -0
- package/dist/rag/multimodal/createMultimodalIndexerFromResolver.js +152 -0
- package/dist/rag/multimodal/createMultimodalIndexerFromResolver.js.map +1 -0
- package/dist/rag/multimodal/index.d.ts +44 -0
- package/dist/rag/multimodal/index.d.ts.map +1 -0
- package/dist/rag/multimodal/index.js +42 -0
- package/dist/rag/multimodal/index.js.map +1 -0
- package/dist/rag/multimodal/types.d.ts +276 -0
- package/dist/rag/multimodal/types.d.ts.map +1 -0
- package/dist/rag/multimodal/types.js +26 -0
- package/dist/rag/multimodal/types.js.map +1 -0
- package/dist/social-posting/SocialPostManager.d.ts +3 -3
- package/dist/social-posting/SocialPostManager.d.ts.map +1 -1
- package/dist/social-posting/SocialPostManager.js +3 -5
- package/dist/social-posting/SocialPostManager.js.map +1 -1
- package/dist/speech/FallbackProxy.d.ts +6 -6
- package/dist/speech/FallbackProxy.d.ts.map +1 -1
- package/dist/speech/FallbackProxy.js +3 -3
- package/dist/speech/FallbackProxy.js.map +1 -1
- package/dist/speech/SpeechProviderResolver.d.ts +8 -8
- package/dist/speech/SpeechProviderResolver.d.ts.map +1 -1
- package/dist/speech/SpeechProviderResolver.js +22 -11
- package/dist/speech/SpeechProviderResolver.js.map +1 -1
- package/dist/speech/SpeechRuntime.d.ts +1 -5
- package/dist/speech/SpeechRuntime.d.ts.map +1 -1
- package/dist/speech/SpeechRuntime.js +17 -9
- package/dist/speech/SpeechRuntime.js.map +1 -1
- package/dist/speech/providers/AssemblyAISTTProvider.d.ts +4 -4
- package/dist/speech/providers/AssemblyAISTTProvider.js +4 -4
- package/dist/speech/providers/AzureSpeechTTSProvider.d.ts +3 -3
- package/dist/speech/providers/AzureSpeechTTSProvider.js +2 -2
- package/dist/speech/providers/AzureSpeechTTSProvider.js.map +1 -1
- package/dist/speech/providers/BuiltInAdaptiveVadProvider.d.ts +9 -9
- package/dist/speech/providers/BuiltInAdaptiveVadProvider.d.ts.map +1 -1
- package/dist/speech/providers/BuiltInAdaptiveVadProvider.js +5 -5
- package/dist/speech/providers/BuiltInAdaptiveVadProvider.js.map +1 -1
- package/dist/speech/providers/DeepgramBatchSTTProvider.d.ts +2 -2
- package/dist/speech/providers/DeepgramBatchSTTProvider.js +2 -2
- package/dist/speech/providers/OpenAITextToSpeechProvider.d.ts +3 -3
- package/dist/speech/providers/OpenAITextToSpeechProvider.js +2 -2
- package/dist/speech/providers/OpenAIWhisperSpeechToTextProvider.d.ts +1 -1
- package/dist/speech/providers/OpenAIWhisperSpeechToTextProvider.d.ts.map +1 -1
- package/dist/speech/providers/OpenAIWhisperSpeechToTextProvider.js +1 -1
- package/dist/speech/providers/OpenAIWhisperSpeechToTextProvider.js.map +1 -1
- package/dist/voice/TelephonyStreamTransport.d.ts +6 -6
- package/dist/voice/TelephonyStreamTransport.d.ts.map +1 -1
- package/dist/voice/TelephonyStreamTransport.js +5 -5
- package/dist/voice/TelephonyStreamTransport.js.map +1 -1
- package/dist/voice-pipeline/AcousticEndpointDetector.d.ts +4 -4
- package/dist/voice-pipeline/AcousticEndpointDetector.d.ts.map +1 -1
- package/dist/voice-pipeline/AcousticEndpointDetector.js +4 -4
- package/dist/voice-pipeline/AcousticEndpointDetector.js.map +1 -1
- package/dist/voice-pipeline/HardCutBargeinHandler.d.ts +3 -3
- package/dist/voice-pipeline/HardCutBargeinHandler.js +3 -3
- package/dist/voice-pipeline/HeuristicEndpointDetector.d.ts +3 -3
- package/dist/voice-pipeline/HeuristicEndpointDetector.d.ts.map +1 -1
- package/dist/voice-pipeline/HeuristicEndpointDetector.js +3 -3
- package/dist/voice-pipeline/HeuristicEndpointDetector.js.map +1 -1
- package/dist/voice-pipeline/SoftFadeBargeinHandler.d.ts +5 -5
- package/dist/voice-pipeline/SoftFadeBargeinHandler.js +1 -1
- package/dist/voice-pipeline/VoiceInterruptError.d.ts +6 -6
- package/dist/voice-pipeline/VoiceInterruptError.d.ts.map +1 -1
- package/dist/voice-pipeline/VoiceInterruptError.js +4 -4
- package/dist/voice-pipeline/VoiceInterruptError.js.map +1 -1
- package/dist/voice-pipeline/VoicePipelineOrchestrator.d.ts +9 -9
- package/dist/voice-pipeline/VoicePipelineOrchestrator.d.ts.map +1 -1
- package/dist/voice-pipeline/VoicePipelineOrchestrator.js +8 -8
- package/dist/voice-pipeline/VoicePipelineOrchestrator.js.map +1 -1
- package/dist/voice-pipeline/WebRTCStreamTransport.d.ts +421 -0
- package/dist/voice-pipeline/WebRTCStreamTransport.d.ts.map +1 -0
- package/dist/voice-pipeline/WebRTCStreamTransport.js +573 -0
- package/dist/voice-pipeline/WebRTCStreamTransport.js.map +1 -0
- package/dist/voice-pipeline/WebSocketStreamTransport.d.ts +8 -8
- package/dist/voice-pipeline/WebSocketStreamTransport.js +5 -5
- package/dist/voice-pipeline/index.d.ts +1 -0
- package/dist/voice-pipeline/index.d.ts.map +1 -1
- package/dist/voice-pipeline/index.js +2 -0
- package/dist/voice-pipeline/index.js.map +1 -1
- package/dist/voice-pipeline/types.d.ts +43 -43
- package/dist/voice-pipeline/types.d.ts.map +1 -1
- package/package.json +19 -1
|
@@ -0,0 +1,1113 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @module core/vision/VisionPipeline
|
|
3
|
+
*
|
|
4
|
+
* Unified vision pipeline with progressive enhancement.
|
|
5
|
+
*
|
|
6
|
+
* Processes images through configurable tiers:
|
|
7
|
+
*
|
|
8
|
+
* ```
|
|
9
|
+
* ┌─────────────────────────────────────────────────────────────────────┐
|
|
10
|
+
* │ Image Buffer / URL │
|
|
11
|
+
* │ ↓ │
|
|
12
|
+
* │ Preprocessing (sharp: resize, grayscale, sharpen, normalize) │
|
|
13
|
+
* │ ↓ │
|
|
14
|
+
* │ Tier 1 — Local OCR (PaddleOCR or Tesseract.js) │
|
|
15
|
+
* │ ↓ confidence < threshold? │
|
|
16
|
+
* │ Tier 2 — Local Vision (TrOCR / Florence-2) │
|
|
17
|
+
* │ ↓ still below threshold? │
|
|
18
|
+
* │ Tier 3 — Cloud Vision (GPT-4o / Claude / Gemini via generateText) │
|
|
19
|
+
* │ ↓ │
|
|
20
|
+
* │ Merge: highest-confidence text wins, structured layout preserved │
|
|
21
|
+
* │ │
|
|
22
|
+
* │ [parallel] CLIP embedding runs alongside all tiers │
|
|
23
|
+
* └─────────────────────────────────────────────────────────────────────┘
|
|
24
|
+
* ```
|
|
25
|
+
*
|
|
26
|
+
* ## Dependency loading
|
|
27
|
+
*
|
|
28
|
+
* All heavy ML dependencies (ppu-paddle-ocr, tesseract.js,
|
|
29
|
+
* \@huggingface/transformers) are loaded lazily via dynamic `import()`.
|
|
30
|
+
* If a dependency is missing, the pipeline throws a helpful error
|
|
31
|
+
* with installation instructions — it never crashes on missing
|
|
32
|
+
* optional peer deps at module load time.
|
|
33
|
+
*
|
|
34
|
+
* ## Strategy behaviours
|
|
35
|
+
*
|
|
36
|
+
* | Strategy | Tier 1 | Tier 2 | Tier 3 | Notes |
|
|
37
|
+
* |----------|--------|--------|--------|-------|
|
|
38
|
+
* | progressive | Always | If low confidence | If still low | Default |
|
|
39
|
+
* | local-only | Always | Always | Never | Air-gapped |
|
|
40
|
+
* | cloud-only | Never | Never | Always | Best quality |
|
|
41
|
+
* | parallel | Always | Always | Always | Merge best |
|
|
42
|
+
*
|
|
43
|
+
* @see {@link VisionPipelineConfig} for configuration options.
|
|
44
|
+
* @see {@link VisionResult} for the output shape.
|
|
45
|
+
* @see {@link createVisionPipeline} for the auto-detecting factory.
|
|
46
|
+
*
|
|
47
|
+
* @example
|
|
48
|
+
* ```typescript
|
|
49
|
+
* const pipeline = new VisionPipeline({
|
|
50
|
+
* strategy: 'progressive',
|
|
51
|
+
* ocr: 'paddle',
|
|
52
|
+
* handwriting: true,
|
|
53
|
+
* documentAI: true,
|
|
54
|
+
* embedding: true,
|
|
55
|
+
* cloudProvider: 'openai',
|
|
56
|
+
* confidenceThreshold: 0.8,
|
|
57
|
+
* });
|
|
58
|
+
*
|
|
59
|
+
* const result = await pipeline.process(imageBuffer);
|
|
60
|
+
* console.log(result.text); // extracted text
|
|
61
|
+
* console.log(result.category); // 'printed-text' | 'handwritten' | etc.
|
|
62
|
+
* console.log(result.embedding); // CLIP vector for search
|
|
63
|
+
* console.log(result.layout); // structured document layout
|
|
64
|
+
* ```
|
|
65
|
+
*/
|
|
66
|
+
// ---------------------------------------------------------------------------
|
|
67
|
+
// Constants
|
|
68
|
+
// ---------------------------------------------------------------------------
|
|
69
|
+
/**
 * Fallback confidence cut-off, used when the pipeline config does not
 * provide `confidenceThreshold`. In the progressive strategy an OCR result
 * at or above this value is returned without running further tiers.
 */
const DEFAULT_CONFIDENCE_THRESHOLD = 0.7;
/**
 * Fixed confidence score attributed to cloud vision output. Cloud LLMs do
 * not report a numeric confidence, so a constant high value is assigned on
 * the assumption that they are the most capable tier.
 */
const CLOUD_VISION_CONFIDENCE = 0.95;
/**
 * Instruction text sent to cloud vision LLMs. It asks for a description,
 * a verbatim transcription of visible text, and the content type, and it
 * requests that document reading order be preserved.
 */
const CLOUD_VISION_PROMPT = [
    'Describe this image in detail. Extract all visible text exactly as written. ',
    'Identify the type of content (printed document, handwritten note, photograph, ',
    'diagram, screenshot, etc.). If the image contains a document, preserve the ',
    'logical reading order and structure.',
].join('');
|
|
89
|
+
// ---------------------------------------------------------------------------
|
|
90
|
+
// VisionPipeline
|
|
91
|
+
// ---------------------------------------------------------------------------
|
|
92
|
+
/**
|
|
93
|
+
* Unified vision pipeline with progressive enhancement.
|
|
94
|
+
*
|
|
95
|
+
* Processes images through up to three tiers of increasing capability:
|
|
96
|
+
* 1. Local OCR (PaddleOCR / Tesseract.js) — fast, free, offline
|
|
97
|
+
* 2. Local Vision Models (TrOCR / Florence-2 / CLIP) — offline but slower
|
|
98
|
+
* 3. Cloud Vision LLMs (GPT-4o, Claude, Gemini) — best quality, API cost
|
|
99
|
+
*
|
|
100
|
+
* All heavy dependencies are loaded lazily on first use. The pipeline
|
|
101
|
+
* never imports ML libraries at module load time, so it's safe to
|
|
102
|
+
* instantiate even when optional peer deps are missing — errors only
|
|
103
|
+
* surface when a tier that needs them actually runs.
|
|
104
|
+
*
|
|
105
|
+
* @see {@link createVisionPipeline} for automatic provider detection.
|
|
106
|
+
*/
|
|
107
|
+
export class VisionPipeline {
|
|
108
|
+
// -------------------------------------------------------------------------
|
|
109
|
+
// Constructor
|
|
110
|
+
// -------------------------------------------------------------------------
|
|
111
|
+
/**
|
|
112
|
+
* Create a new vision pipeline.
|
|
113
|
+
*
|
|
114
|
+
* @param config - Pipeline configuration. All heavy dependencies are loaded
|
|
115
|
+
* lazily, so construction is synchronous and never imports ML libraries.
|
|
116
|
+
*
|
|
117
|
+
* @example
|
|
118
|
+
* ```typescript
|
|
119
|
+
* const pipeline = new VisionPipeline({
|
|
120
|
+
* strategy: 'progressive',
|
|
121
|
+
* ocr: 'paddle',
|
|
122
|
+
* handwriting: true,
|
|
123
|
+
* cloudProvider: 'openai',
|
|
124
|
+
* });
|
|
125
|
+
* ```
|
|
126
|
+
*/
|
|
127
|
+
constructor(config) {
|
|
128
|
+
/** Whether dispose() has been called. Guards against use-after-free. */
|
|
129
|
+
this._disposed = false;
|
|
130
|
+
this._config = { ...config };
|
|
131
|
+
}
|
|
132
|
+
// -------------------------------------------------------------------------
|
|
133
|
+
// Public API
|
|
134
|
+
// -------------------------------------------------------------------------
|
|
135
|
+
/**
|
|
136
|
+
* Process an image through the configured tiers.
|
|
137
|
+
*
|
|
138
|
+
* Automatically detects content type (printed text, handwritten, diagram,
|
|
139
|
+
* etc.) and routes through the appropriate processing tiers based on the
|
|
140
|
+
* configured {@link VisionStrategy}.
|
|
141
|
+
*
|
|
142
|
+
* @param image - Image data as a Buffer or file-path / URL string.
|
|
143
|
+
* Buffers are preprocessed with sharp (if configured). URL strings
|
|
144
|
+
* are passed directly to providers that support them.
|
|
145
|
+
* @param options - Optional overrides for this specific invocation.
|
|
146
|
+
* @param options.forceCategory - Force a specific content category
|
|
147
|
+
* instead of auto-detecting from OCR confidence heuristics.
|
|
148
|
+
* @param options.tiers - Run only these specific tiers, ignoring
|
|
149
|
+
* the strategy's normal routing logic.
|
|
150
|
+
* @returns Aggregated vision result with text, confidence, embeddings, etc.
|
|
151
|
+
*
|
|
152
|
+
* @throws {Error} If all configured tiers fail to produce a result.
|
|
153
|
+
* @throws {Error} If a required dependency (e.g. ppu-paddle-ocr) is missing.
|
|
154
|
+
* @throws {Error} If `dispose()` was already called.
|
|
155
|
+
*
|
|
156
|
+
* @example
|
|
157
|
+
* ```typescript
|
|
158
|
+
* // Full progressive pipeline
|
|
159
|
+
* const result = await pipeline.process(imageBuffer);
|
|
160
|
+
*
|
|
161
|
+
* // Force handwriting mode
|
|
162
|
+
* const hw = await pipeline.process(scanBuffer, {
|
|
163
|
+
* forceCategory: 'handwritten',
|
|
164
|
+
* });
|
|
165
|
+
*
|
|
166
|
+
* // Only run OCR and embedding, skip everything else
|
|
167
|
+
* const partial = await pipeline.process(imageBuffer, {
|
|
168
|
+
* tiers: ['ocr', 'embedding'],
|
|
169
|
+
* });
|
|
170
|
+
* ```
|
|
171
|
+
*/
|
|
172
|
+
async process(image, options) {
|
|
173
|
+
this._assertNotDisposed();
|
|
174
|
+
const startTime = Date.now();
|
|
175
|
+
const { strategy } = this._config;
|
|
176
|
+
const threshold = this._config.confidenceThreshold ?? DEFAULT_CONFIDENCE_THRESHOLD;
|
|
177
|
+
// Preprocess the image (resize, grayscale, etc.) if it's a Buffer
|
|
178
|
+
const preprocessed = Buffer.isBuffer(image)
|
|
179
|
+
? await this._preprocess(image)
|
|
180
|
+
: image;
|
|
181
|
+
const tierResults = [];
|
|
182
|
+
let embedding;
|
|
183
|
+
let layout;
|
|
184
|
+
const activeTiers = [];
|
|
185
|
+
// Determine which tiers to run based on strategy (or explicit override)
|
|
186
|
+
const requestedTiers = options?.tiers;
|
|
187
|
+
// -----------------------------------------------------------------------
|
|
188
|
+
// CLIP embedding — runs in parallel with everything else when enabled,
|
|
189
|
+
// because it doesn't affect the text extraction path.
|
|
190
|
+
// -----------------------------------------------------------------------
|
|
191
|
+
const embeddingPromise = this._shouldRunTier('embedding', strategy, requestedTiers)
|
|
192
|
+
? this._runClipEmbedding(preprocessed).catch(() => undefined)
|
|
193
|
+
: Promise.resolve(undefined);
|
|
194
|
+
// -----------------------------------------------------------------------
|
|
195
|
+
// Strategy: cloud-only — skip all local tiers
|
|
196
|
+
// -----------------------------------------------------------------------
|
|
197
|
+
if (strategy === 'cloud-only' && !requestedTiers) {
|
|
198
|
+
const cloudResult = await this._runCloudVision(preprocessed);
|
|
199
|
+
tierResults.push(cloudResult);
|
|
200
|
+
activeTiers.push('cloud-vision');
|
|
201
|
+
embedding = await embeddingPromise;
|
|
202
|
+
if (embedding)
|
|
203
|
+
activeTiers.push('embedding');
|
|
204
|
+
return this._assembleResult(tierResults, activeTiers, embedding, layout, options?.forceCategory, startTime);
|
|
205
|
+
}
|
|
206
|
+
// -----------------------------------------------------------------------
|
|
207
|
+
// Tier 1 — Local OCR (PaddleOCR or Tesseract.js)
|
|
208
|
+
// -----------------------------------------------------------------------
|
|
209
|
+
let ocrResult;
|
|
210
|
+
if (this._shouldRunTier('ocr', strategy, requestedTiers)) {
|
|
211
|
+
ocrResult = await this._runOcr(preprocessed);
|
|
212
|
+
tierResults.push(ocrResult);
|
|
213
|
+
activeTiers.push('ocr');
|
|
214
|
+
// In progressive mode, if OCR confidence is high enough, we can
|
|
215
|
+
// skip expensive downstream tiers and return early.
|
|
216
|
+
if (strategy === 'progressive' &&
|
|
217
|
+
!requestedTiers &&
|
|
218
|
+
ocrResult.confidence >= threshold) {
|
|
219
|
+
embedding = await embeddingPromise;
|
|
220
|
+
if (embedding)
|
|
221
|
+
activeTiers.push('embedding');
|
|
222
|
+
return this._assembleResult(tierResults, activeTiers, embedding, layout, options?.forceCategory, startTime);
|
|
223
|
+
}
|
|
224
|
+
}
|
|
225
|
+
// -----------------------------------------------------------------------
|
|
226
|
+
// Content category detection — decides which Tier 2 models to invoke
|
|
227
|
+
// -----------------------------------------------------------------------
|
|
228
|
+
const category = options?.forceCategory ?? this._detectCategory(ocrResult);
|
|
229
|
+
// -----------------------------------------------------------------------
|
|
230
|
+
// Tier 2a — Handwriting recognition (TrOCR)
|
|
231
|
+
// Triggered when content appears handwritten (low OCR confidence +
|
|
232
|
+
// single-char region heuristic) or when forced via forceCategory.
|
|
233
|
+
// -----------------------------------------------------------------------
|
|
234
|
+
if (this._shouldRunTier('handwriting', strategy, requestedTiers) &&
|
|
235
|
+
(category === 'handwritten' || category === 'mixed')) {
|
|
236
|
+
try {
|
|
237
|
+
const hwResult = await this._runTrOcr(preprocessed);
|
|
238
|
+
tierResults.push(hwResult);
|
|
239
|
+
activeTiers.push('handwriting');
|
|
240
|
+
}
|
|
241
|
+
catch {
|
|
242
|
+
// TrOCR failure is non-fatal — we still have OCR or cloud fallback
|
|
243
|
+
}
|
|
244
|
+
}
|
|
245
|
+
// -----------------------------------------------------------------------
|
|
246
|
+
// Tier 2b — Document understanding (Florence-2)
|
|
247
|
+
// Triggered for complex layouts (many regions with varying sizes).
|
|
248
|
+
// -----------------------------------------------------------------------
|
|
249
|
+
if (this._shouldRunTier('document-ai', strategy, requestedTiers) &&
|
|
250
|
+
(category === 'document-layout' || category === 'mixed')) {
|
|
251
|
+
try {
|
|
252
|
+
const docResult = await this._runFlorence2(preprocessed);
|
|
253
|
+
tierResults.push(docResult.tierResult);
|
|
254
|
+
activeTiers.push('document-ai');
|
|
255
|
+
layout = docResult.layout;
|
|
256
|
+
}
|
|
257
|
+
catch {
|
|
258
|
+
// Florence-2 failure is non-fatal
|
|
259
|
+
}
|
|
260
|
+
}
|
|
261
|
+
// -----------------------------------------------------------------------
|
|
262
|
+
// Tier 3 — Cloud Vision (GPT-4o / Claude / Gemini)
|
|
263
|
+
// In progressive mode: only if we're still below threshold.
|
|
264
|
+
// In parallel mode: always runs.
|
|
265
|
+
// In local-only mode: never runs.
|
|
266
|
+
// -----------------------------------------------------------------------
|
|
267
|
+
const bestLocalConfidence = this._bestConfidence(tierResults);
|
|
268
|
+
if (this._shouldRunCloudVision(strategy, bestLocalConfidence, threshold, requestedTiers)) {
|
|
269
|
+
try {
|
|
270
|
+
const cloudResult = await this._runCloudVision(preprocessed);
|
|
271
|
+
tierResults.push(cloudResult);
|
|
272
|
+
activeTiers.push('cloud-vision');
|
|
273
|
+
}
|
|
274
|
+
catch {
|
|
275
|
+
// Cloud failure is non-fatal if we have local results
|
|
276
|
+
if (tierResults.length === 0) {
|
|
277
|
+
throw new Error('VisionPipeline: cloud vision failed and no local results available.');
|
|
278
|
+
}
|
|
279
|
+
}
|
|
280
|
+
}
|
|
281
|
+
// -----------------------------------------------------------------------
|
|
282
|
+
// Collect CLIP embedding (was running in parallel)
|
|
283
|
+
// -----------------------------------------------------------------------
|
|
284
|
+
embedding = await embeddingPromise;
|
|
285
|
+
if (embedding)
|
|
286
|
+
activeTiers.push('embedding');
|
|
287
|
+
// -----------------------------------------------------------------------
|
|
288
|
+
// Assemble final result
|
|
289
|
+
// -----------------------------------------------------------------------
|
|
290
|
+
return this._assembleResult(tierResults, activeTiers, embedding, layout, options?.forceCategory ?? category, startTime);
|
|
291
|
+
}
|
|
292
|
+
/**
|
|
293
|
+
* Extract text only — fastest path using OCR tier exclusively.
|
|
294
|
+
*
|
|
295
|
+
* Ignores all other tiers (handwriting, document-ai, cloud, embedding).
|
|
296
|
+
* Useful when you just need the text content and don't need confidence
|
|
297
|
+
* scoring, layout analysis, or embeddings.
|
|
298
|
+
*
|
|
299
|
+
* @param image - Image data as a Buffer or file-path / URL string.
|
|
300
|
+
* @returns Extracted text, or empty string if OCR produces no output.
|
|
301
|
+
*
|
|
302
|
+
* @throws {Error} If the configured OCR engine is missing.
|
|
303
|
+
*
|
|
304
|
+
* @example
|
|
305
|
+
* ```typescript
|
|
306
|
+
* const text = await pipeline.extractText(receiptImage);
|
|
307
|
+
* console.log(text); // "ACME STORE\n...\nTotal: $42.99"
|
|
308
|
+
* ```
|
|
309
|
+
*/
|
|
310
|
+
async extractText(image) {
|
|
311
|
+
this._assertNotDisposed();
|
|
312
|
+
const preprocessed = Buffer.isBuffer(image)
|
|
313
|
+
? await this._preprocess(image)
|
|
314
|
+
: image;
|
|
315
|
+
const result = await this._runOcr(preprocessed);
|
|
316
|
+
return result.text;
|
|
317
|
+
}
|
|
318
|
+
/**
|
|
319
|
+
* Generate an image embedding using CLIP — embedding tier only.
|
|
320
|
+
*
|
|
321
|
+
* Useful for building image similarity search indexes without running
|
|
322
|
+
* the full OCR + vision pipeline.
|
|
323
|
+
*
|
|
324
|
+
* @param image - Image data as a Buffer or file-path / URL string.
|
|
325
|
+
* @returns CLIP embedding vector (typically 512 or 768 dimensions).
|
|
326
|
+
*
|
|
327
|
+
* @throws {Error} If `@huggingface/transformers` is not installed.
|
|
328
|
+
* @throws {Error} If CLIP model loading fails.
|
|
329
|
+
*
|
|
330
|
+
* @example
|
|
331
|
+
* ```typescript
|
|
332
|
+
* const embedding = await pipeline.embed(photoBuffer);
|
|
333
|
+
* await vectorStore.upsert('images', [{
|
|
334
|
+
* id: 'photo-1',
|
|
335
|
+
* embedding,
|
|
336
|
+
* metadata: { source: 'upload' },
|
|
337
|
+
* }]);
|
|
338
|
+
* ```
|
|
339
|
+
*/
|
|
340
|
+
async embed(image) {
|
|
341
|
+
this._assertNotDisposed();
|
|
342
|
+
const preprocessed = Buffer.isBuffer(image)
|
|
343
|
+
? await this._preprocess(image)
|
|
344
|
+
: image;
|
|
345
|
+
const result = await this._runClipEmbedding(preprocessed);
|
|
346
|
+
if (!result) {
|
|
347
|
+
throw new Error('VisionPipeline: CLIP embedding returned empty result.');
|
|
348
|
+
}
|
|
349
|
+
return result;
|
|
350
|
+
}
|
|
351
|
+
/**
|
|
352
|
+
* Analyze document layout using Florence-2 — document-ai tier only.
|
|
353
|
+
*
|
|
354
|
+
* Returns structured {@link DocumentLayout} with semantic blocks
|
|
355
|
+
* (text, tables, figures, headings, lists, code) and their bounding
|
|
356
|
+
* boxes within each page.
|
|
357
|
+
*
|
|
358
|
+
* @param image - Image data as a Buffer or file-path / URL string.
|
|
359
|
+
* @returns Structured document layout with pages and blocks.
|
|
360
|
+
*
|
|
361
|
+
* @throws {Error} If `@huggingface/transformers` is not installed.
|
|
362
|
+
* @throws {Error} If Florence-2 model loading fails.
|
|
363
|
+
*
|
|
364
|
+
* @example
|
|
365
|
+
* ```typescript
|
|
366
|
+
* const layout = await pipeline.analyzeLayout(documentScan);
|
|
367
|
+
* for (const page of layout.pages) {
|
|
368
|
+
* for (const block of page.blocks) {
|
|
369
|
+
* console.log(`${block.type}: ${block.content.slice(0, 50)}...`);
|
|
370
|
+
* }
|
|
371
|
+
* }
|
|
372
|
+
* ```
|
|
373
|
+
*/
|
|
374
|
+
async analyzeLayout(image) {
|
|
375
|
+
this._assertNotDisposed();
|
|
376
|
+
const preprocessed = Buffer.isBuffer(image)
|
|
377
|
+
? await this._preprocess(image)
|
|
378
|
+
: image;
|
|
379
|
+
const result = await this._runFlorence2(preprocessed);
|
|
380
|
+
return result.layout;
|
|
381
|
+
}
|
|
382
|
+
/**
|
|
383
|
+
* Shut down the pipeline and release all loaded model resources.
|
|
384
|
+
*
|
|
385
|
+
* After calling dispose(), any further calls to `process()`,
|
|
386
|
+
* `extractText()`, `embed()`, or `analyzeLayout()` will throw.
|
|
387
|
+
*
|
|
388
|
+
* @example
|
|
389
|
+
* ```typescript
|
|
390
|
+
* const pipeline = new VisionPipeline({ strategy: 'progressive' });
|
|
391
|
+
* try {
|
|
392
|
+
* const result = await pipeline.process(image);
|
|
393
|
+
* } finally {
|
|
394
|
+
* await pipeline.dispose();
|
|
395
|
+
* }
|
|
396
|
+
* ```
|
|
397
|
+
*/
|
|
398
|
+
async dispose() {
|
|
399
|
+
this._disposed = true;
|
|
400
|
+
// Release PaddleOCR resources
|
|
401
|
+
if (this._paddleOcr?.dispose) {
|
|
402
|
+
try {
|
|
403
|
+
await this._paddleOcr.dispose();
|
|
404
|
+
}
|
|
405
|
+
catch {
|
|
406
|
+
// Swallow disposal errors — we're tearing down anyway
|
|
407
|
+
}
|
|
408
|
+
}
|
|
409
|
+
this._paddleOcr = undefined;
|
|
410
|
+
// Terminate Tesseract worker
|
|
411
|
+
if (this._tesseract?.terminate) {
|
|
412
|
+
try {
|
|
413
|
+
await this._tesseract.terminate();
|
|
414
|
+
}
|
|
415
|
+
catch {
|
|
416
|
+
// Swallow disposal errors
|
|
417
|
+
}
|
|
418
|
+
}
|
|
419
|
+
this._tesseract = undefined;
|
|
420
|
+
// Release HuggingFace pipelines by dropping references.
|
|
421
|
+
// The transformers library doesn't expose explicit dispose(),
|
|
422
|
+
// so we rely on GC to reclaim WASM/ONNX memory.
|
|
423
|
+
this._trOcrPipeline = undefined;
|
|
424
|
+
this._florencePipeline = undefined;
|
|
425
|
+
this._clipPipeline = undefined;
|
|
426
|
+
}
|
|
427
|
+
// -------------------------------------------------------------------------
|
|
428
|
+
// Preprocessing
|
|
429
|
+
// -------------------------------------------------------------------------
|
|
430
|
+
/**
|
|
431
|
+
* Apply configured preprocessing to an image buffer using sharp.
|
|
432
|
+
*
|
|
433
|
+
* @param image - Raw image buffer.
|
|
434
|
+
* @returns Preprocessed image buffer, or the original if no preprocessing
|
|
435
|
+
* is configured or sharp is unavailable.
|
|
436
|
+
*/
|
|
437
|
+
async _preprocess(image) {
|
|
438
|
+
const pp = this._config.preprocessing;
|
|
439
|
+
if (!pp)
|
|
440
|
+
return image;
|
|
441
|
+
// Only import sharp when preprocessing is actually needed.
|
|
442
|
+
// sharp is already a project dependency, but we guard the import
|
|
443
|
+
// to keep the pipeline functional even if sharp fails to load
|
|
444
|
+
// (e.g. in environments without native bindings).
|
|
445
|
+
let sharp;
|
|
446
|
+
try {
|
|
447
|
+
// @ts-ignore — sharp is an optional native dependency, may not be installed in CI
|
|
448
|
+
sharp = (await import('sharp')).default;
|
|
449
|
+
}
|
|
450
|
+
catch {
|
|
451
|
+
// sharp not available — return original image unmodified.
|
|
452
|
+
// This is a soft failure because preprocessing is an optimization,
|
|
453
|
+
// not a hard requirement.
|
|
454
|
+
return image;
|
|
455
|
+
}
|
|
456
|
+
let pipeline = sharp(image);
|
|
457
|
+
// Resize while preserving aspect ratio — never upscale
|
|
458
|
+
if (pp.resize) {
|
|
459
|
+
pipeline = pipeline.resize({
|
|
460
|
+
width: pp.resize.maxWidth,
|
|
461
|
+
height: pp.resize.maxHeight,
|
|
462
|
+
fit: 'inside',
|
|
463
|
+
withoutEnlargement: true,
|
|
464
|
+
});
|
|
465
|
+
}
|
|
466
|
+
// Convert to grayscale (improves OCR contrast on colored backgrounds)
|
|
467
|
+
if (pp.grayscale) {
|
|
468
|
+
pipeline = pipeline.grayscale();
|
|
469
|
+
}
|
|
470
|
+
// Sharpen (helps blurry scans and camera captures)
|
|
471
|
+
if (pp.sharpen) {
|
|
472
|
+
pipeline = pipeline.sharpen();
|
|
473
|
+
}
|
|
474
|
+
// Normalize brightness/contrast via histogram stretching
|
|
475
|
+
if (pp.normalize) {
|
|
476
|
+
pipeline = pipeline.normalize();
|
|
477
|
+
}
|
|
478
|
+
return pipeline.toBuffer();
|
|
479
|
+
}
|
|
480
|
+
// -------------------------------------------------------------------------
|
|
481
|
+
// Tier 1 — Local OCR
|
|
482
|
+
// -------------------------------------------------------------------------
|
|
483
|
+
/**
|
|
484
|
+
* Run OCR on the image using the configured engine (PaddleOCR or Tesseract.js).
|
|
485
|
+
*
|
|
486
|
+
* @param image - Preprocessed image buffer or URL string.
|
|
487
|
+
* @returns Tier result with extracted text, confidence, and regions.
|
|
488
|
+
* @throws {Error} If OCR engine is 'none' or neither engine is available.
|
|
489
|
+
*/
|
|
490
|
+
async _runOcr(image) {
|
|
491
|
+
const ocrEngine = this._config.ocr ?? 'paddle';
|
|
492
|
+
if (ocrEngine === 'none') {
|
|
493
|
+
throw new Error('VisionPipeline: OCR is set to "none" but OCR tier was requested.');
|
|
494
|
+
}
|
|
495
|
+
if (ocrEngine === 'paddle') {
|
|
496
|
+
return this._runPaddleOcr(image);
|
|
497
|
+
}
|
|
498
|
+
return this._runTesseract(image);
|
|
499
|
+
}
|
|
500
|
+
/**
|
|
501
|
+
* Run PaddleOCR for text extraction.
|
|
502
|
+
*
|
|
503
|
+
* Lazily loads and initializes the ppu-paddle-ocr library on first call.
|
|
504
|
+
* Subsequent calls reuse the cached service instance.
|
|
505
|
+
*
|
|
506
|
+
* @param image - Image buffer or URL string.
|
|
507
|
+
* @returns Tier result with PaddleOCR output.
|
|
508
|
+
* @throws {Error} If ppu-paddle-ocr is not installed.
|
|
509
|
+
*/
|
|
510
|
+
async _runPaddleOcr(image) {
|
|
511
|
+
const start = Date.now();
|
|
512
|
+
const ocr = await this._loadPaddleOcr();
|
|
513
|
+
// PaddleOCR expects a Buffer; convert URL/path to buffer if needed
|
|
514
|
+
const imageBuffer = Buffer.isBuffer(image) ? image : await this._urlToBuffer(image);
|
|
515
|
+
const ocrResult = await ocr.recognize(imageBuffer);
|
|
516
|
+
// Normalize PaddleOCR output into our standard shape.
|
|
517
|
+
// PaddleOCR returns an array of detected text regions with bounding
|
|
518
|
+
// boxes and per-region confidence scores.
|
|
519
|
+
const regions = (ocrResult?.regions ?? ocrResult?.data ?? []).map((r) => ({
|
|
520
|
+
text: r.text ?? r.content ?? '',
|
|
521
|
+
confidence: r.confidence ?? r.score ?? 0,
|
|
522
|
+
bbox: {
|
|
523
|
+
x: r.bbox?.[0]?.[0] ?? r.box?.[0]?.[0] ?? 0,
|
|
524
|
+
y: r.bbox?.[0]?.[1] ?? r.box?.[0]?.[1] ?? 0,
|
|
525
|
+
width: (r.bbox?.[1]?.[0] ?? r.box?.[1]?.[0] ?? 0) - (r.bbox?.[0]?.[0] ?? r.box?.[0]?.[0] ?? 0),
|
|
526
|
+
height: (r.bbox?.[2]?.[1] ?? r.box?.[2]?.[1] ?? 0) - (r.bbox?.[0]?.[1] ?? r.box?.[0]?.[1] ?? 0),
|
|
527
|
+
},
|
|
528
|
+
}));
|
|
529
|
+
const text = regions.map((r) => r.text).join('\n');
|
|
530
|
+
const avgConfidence = regions.length > 0
|
|
531
|
+
? regions.reduce((sum, r) => sum + r.confidence, 0) / regions.length
|
|
532
|
+
: 0;
|
|
533
|
+
return {
|
|
534
|
+
tier: 'ocr',
|
|
535
|
+
provider: 'paddle',
|
|
536
|
+
text,
|
|
537
|
+
confidence: avgConfidence,
|
|
538
|
+
durationMs: Date.now() - start,
|
|
539
|
+
regions,
|
|
540
|
+
};
|
|
541
|
+
}
|
|
542
|
+
/**
|
|
543
|
+
* Run Tesseract.js for text extraction.
|
|
544
|
+
*
|
|
545
|
+
* Lazily loads the tesseract.js library and creates a worker on first call.
|
|
546
|
+
* The worker is reused for subsequent calls and terminated on dispose().
|
|
547
|
+
*
|
|
548
|
+
* @param image - Image buffer or URL string.
|
|
549
|
+
* @returns Tier result with Tesseract output.
|
|
550
|
+
* @throws {Error} If tesseract.js is not installed.
|
|
551
|
+
*/
|
|
552
|
+
async _runTesseract(image) {
|
|
553
|
+
const start = Date.now();
|
|
554
|
+
const worker = await this._loadTesseract();
|
|
555
|
+
// Tesseract.js accepts Buffer, URL, or base64 string
|
|
556
|
+
const input = Buffer.isBuffer(image) ? image : image;
|
|
557
|
+
const result = await worker.recognize(input);
|
|
558
|
+
// Normalize Tesseract output into our standard shape.
|
|
559
|
+
// Tesseract returns paragraphs → lines → words with bounding boxes.
|
|
560
|
+
const regions = (result.data?.words ?? []).map((w) => ({
|
|
561
|
+
text: w.text ?? '',
|
|
562
|
+
confidence: (w.confidence ?? 0) / 100, // Tesseract uses 0-100 scale
|
|
563
|
+
bbox: {
|
|
564
|
+
x: w.bbox?.x0 ?? 0,
|
|
565
|
+
y: w.bbox?.y0 ?? 0,
|
|
566
|
+
width: (w.bbox?.x1 ?? 0) - (w.bbox?.x0 ?? 0),
|
|
567
|
+
height: (w.bbox?.y1 ?? 0) - (w.bbox?.y0 ?? 0),
|
|
568
|
+
},
|
|
569
|
+
}));
|
|
570
|
+
const text = result.data?.text ?? '';
|
|
571
|
+
// Tesseract confidence is 0-100; normalize to 0-1
|
|
572
|
+
const confidence = (result.data?.confidence ?? 0) / 100;
|
|
573
|
+
return {
|
|
574
|
+
tier: 'ocr',
|
|
575
|
+
provider: 'tesseract',
|
|
576
|
+
text,
|
|
577
|
+
confidence,
|
|
578
|
+
durationMs: Date.now() - start,
|
|
579
|
+
regions,
|
|
580
|
+
};
|
|
581
|
+
}
|
|
582
|
+
// -------------------------------------------------------------------------
|
|
583
|
+
// Tier 2a — Handwriting recognition (TrOCR)
|
|
584
|
+
// -------------------------------------------------------------------------
|
|
585
|
+
/**
|
|
586
|
+
* Run TrOCR handwriting recognition via @huggingface/transformers.
|
|
587
|
+
*
|
|
588
|
+
* TrOCR is a transformer model specifically trained for handwritten
|
|
589
|
+
* text recognition. It excels where standard OCR engines (PaddleOCR,
|
|
590
|
+
* Tesseract) produce low-confidence, garbled output on cursive text.
|
|
591
|
+
*
|
|
592
|
+
* @param image - Preprocessed image buffer or URL string.
|
|
593
|
+
* @returns Tier result with handwriting-recognized text.
|
|
594
|
+
* @throws {Error} If @huggingface/transformers is not installed.
|
|
595
|
+
*/
|
|
596
|
+
async _runTrOcr(image) {
|
|
597
|
+
const start = Date.now();
|
|
598
|
+
const pipe = await this._loadTrOcr();
|
|
599
|
+
// The image-to-text pipeline accepts Buffer, URL, or base64 data URL
|
|
600
|
+
const input = Buffer.isBuffer(image)
|
|
601
|
+
? `data:image/png;base64,${image.toString('base64')}`
|
|
602
|
+
: image;
|
|
603
|
+
const output = await pipe(input);
|
|
604
|
+
// The pipeline returns an array of { generated_text: string }
|
|
605
|
+
const text = Array.isArray(output)
|
|
606
|
+
? output.map((o) => o.generated_text ?? '').join('\n')
|
|
607
|
+
: output?.generated_text ?? '';
|
|
608
|
+
return {
|
|
609
|
+
tier: 'handwriting',
|
|
610
|
+
provider: 'trocr',
|
|
611
|
+
text,
|
|
612
|
+
// TrOCR doesn't output per-token confidence for the full sequence,
|
|
613
|
+
// so we assign a moderate default. The progressive strategy will
|
|
614
|
+
// still prefer cloud results if they exist.
|
|
615
|
+
confidence: text.length > 0 ? 0.75 : 0,
|
|
616
|
+
durationMs: Date.now() - start,
|
|
617
|
+
};
|
|
618
|
+
}
|
|
619
|
+
// -------------------------------------------------------------------------
|
|
620
|
+
// Tier 2b — Document understanding (Florence-2)
|
|
621
|
+
// -------------------------------------------------------------------------
|
|
622
|
+
/**
|
|
623
|
+
* Run Florence-2 document understanding via @huggingface/transformers.
|
|
624
|
+
*
|
|
625
|
+
* Florence-2 detects semantic blocks (text, tables, figures, headings,
|
|
626
|
+
* lists, code) and their bounding boxes, producing a structured
|
|
627
|
+
* {@link DocumentLayout} alongside extracted text.
|
|
628
|
+
*
|
|
629
|
+
* @param image - Preprocessed image buffer or URL string.
|
|
630
|
+
* @returns Tier result plus structured document layout.
|
|
631
|
+
* @throws {Error} If @huggingface/transformers is not installed.
|
|
632
|
+
*/
|
|
633
|
+
async _runFlorence2(image) {
|
|
634
|
+
const start = Date.now();
|
|
635
|
+
const pipe = await this._loadFlorence2();
|
|
636
|
+
// Florence-2 uses a VQA-style interface — we ask it to describe
|
|
637
|
+
// the document layout.
|
|
638
|
+
const input = Buffer.isBuffer(image)
|
|
639
|
+
? `data:image/png;base64,${image.toString('base64')}`
|
|
640
|
+
: image;
|
|
641
|
+
const output = await pipe(input, 'Describe the document layout in detail.');
|
|
642
|
+
// Parse Florence-2 output into our structured layout format.
|
|
643
|
+
// The model returns a description — we extract block annotations
|
|
644
|
+
// if the model provides them, or fall back to a single text block.
|
|
645
|
+
const text = Array.isArray(output)
|
|
646
|
+
? output.map((o) => o.generated_text ?? '').join('\n')
|
|
647
|
+
: output?.generated_text ?? '';
|
|
648
|
+
const blocks = [{
|
|
649
|
+
type: 'text',
|
|
650
|
+
content: text,
|
|
651
|
+
bbox: { x: 0, y: 0, width: 0, height: 0 },
|
|
652
|
+
confidence: 0.8,
|
|
653
|
+
}];
|
|
654
|
+
const layout = {
|
|
655
|
+
pages: [{
|
|
656
|
+
pageNumber: 1,
|
|
657
|
+
width: 0,
|
|
658
|
+
height: 0,
|
|
659
|
+
blocks,
|
|
660
|
+
}],
|
|
661
|
+
};
|
|
662
|
+
return {
|
|
663
|
+
tierResult: {
|
|
664
|
+
tier: 'document-ai',
|
|
665
|
+
provider: 'florence-2',
|
|
666
|
+
text,
|
|
667
|
+
confidence: text.length > 0 ? 0.8 : 0,
|
|
668
|
+
durationMs: Date.now() - start,
|
|
669
|
+
},
|
|
670
|
+
layout,
|
|
671
|
+
};
|
|
672
|
+
}
|
|
673
|
+
// -------------------------------------------------------------------------
|
|
674
|
+
// Tier 2c — Image embeddings (CLIP)
|
|
675
|
+
// -------------------------------------------------------------------------
|
|
676
|
+
/**
|
|
677
|
+
* Generate a CLIP image embedding via @huggingface/transformers.
|
|
678
|
+
*
|
|
679
|
+
* CLIP embeddings enable cross-modal similarity search — the embedding
|
|
680
|
+
* lives in the same vector space as text embeddings from the same model,
|
|
681
|
+
* so you can search images with text queries and vice versa.
|
|
682
|
+
*
|
|
683
|
+
* @param image - Preprocessed image buffer or URL string.
|
|
684
|
+
* @returns Embedding vector (typically 512 or 768 dimensions), or undefined
|
|
685
|
+
* if CLIP is not available.
|
|
686
|
+
* @throws {Error} If @huggingface/transformers is not installed.
|
|
687
|
+
*/
|
|
688
|
+
async _runClipEmbedding(image) {
|
|
689
|
+
const pipe = await this._loadClip();
|
|
690
|
+
const input = Buffer.isBuffer(image)
|
|
691
|
+
? `data:image/png;base64,${image.toString('base64')}`
|
|
692
|
+
: image;
|
|
693
|
+
const output = await pipe(input);
|
|
694
|
+
// The feature-extraction pipeline returns a nested tensor-like structure.
|
|
695
|
+
// We extract the flat float array from it.
|
|
696
|
+
if (Array.isArray(output)) {
|
|
697
|
+
// output is [[number, number, ...]] — flatten one level
|
|
698
|
+
const flat = Array.isArray(output[0]) ? output[0] : output;
|
|
699
|
+
return flat.map((v) => Number(v));
|
|
700
|
+
}
|
|
701
|
+
// Handle tensor-like output with .data or .tolist()
|
|
702
|
+
if (output?.data) {
|
|
703
|
+
return Array.from(output.data);
|
|
704
|
+
}
|
|
705
|
+
if (typeof output?.tolist === 'function') {
|
|
706
|
+
const list = output.tolist();
|
|
707
|
+
return Array.isArray(list[0]) ? list[0] : list;
|
|
708
|
+
}
|
|
709
|
+
return undefined;
|
|
710
|
+
}
|
|
711
|
+
// -------------------------------------------------------------------------
|
|
712
|
+
// Tier 3 — Cloud Vision
|
|
713
|
+
// -------------------------------------------------------------------------
|
|
714
|
+
/**
|
|
715
|
+
* Run cloud vision LLM for image understanding.
|
|
716
|
+
*
|
|
717
|
+
* Uses the existing `generateText()` API with a multimodal message
|
|
718
|
+
* containing the image as a base64 data URL. This works with any
|
|
719
|
+
* vision-capable provider (OpenAI GPT-4o, Anthropic Claude, Google
|
|
720
|
+
* Gemini, Ollama with LLaVA, etc.).
|
|
721
|
+
*
|
|
722
|
+
* @param image - Image buffer or URL string.
|
|
723
|
+
* @returns Tier result with cloud vision description.
|
|
724
|
+
* @throws {Error} If no cloud provider is configured.
|
|
725
|
+
* @throws {Error} If the cloud API call fails.
|
|
726
|
+
*/
|
|
727
|
+
async _runCloudVision(image) {
|
|
728
|
+
const start = Date.now();
|
|
729
|
+
if (!this._config.cloudProvider) {
|
|
730
|
+
throw new Error('VisionPipeline: cloud vision requested but no cloudProvider is configured. ' +
|
|
731
|
+
'Set cloudProvider in the pipeline config (e.g. "openai", "anthropic").');
|
|
732
|
+
}
|
|
733
|
+
// Import the high-level API to avoid coupling to any specific provider
|
|
734
|
+
const { generateText } = await import('../../api/generateText.js');
|
|
735
|
+
// Build the base64 data URL for the image
|
|
736
|
+
const base64 = Buffer.isBuffer(image)
|
|
737
|
+
? image.toString('base64')
|
|
738
|
+
: image;
|
|
739
|
+
const imageUrl = Buffer.isBuffer(image)
|
|
740
|
+
? `data:image/png;base64,${base64}`
|
|
741
|
+
: image;
|
|
742
|
+
// Use the multimodal message format supported by the IProvider interface.
|
|
743
|
+
// The `content` array with image_url parts is the standard format
|
|
744
|
+
// across OpenAI, Anthropic, and Gemini providers.
|
|
745
|
+
const result = await generateText({
|
|
746
|
+
provider: this._config.cloudProvider,
|
|
747
|
+
model: this._config.cloudModel,
|
|
748
|
+
messages: [{
|
|
749
|
+
role: 'user',
|
|
750
|
+
// The generateText API passes content through to the provider as-is
|
|
751
|
+
// when it's an array (multimodal message). All major providers support
|
|
752
|
+
// the OpenAI-style content parts array.
|
|
753
|
+
content: JSON.stringify([
|
|
754
|
+
{ type: 'text', text: CLOUD_VISION_PROMPT },
|
|
755
|
+
{ type: 'image_url', image_url: { url: imageUrl } },
|
|
756
|
+
]),
|
|
757
|
+
}],
|
|
758
|
+
});
|
|
759
|
+
return {
|
|
760
|
+
tier: 'cloud-vision',
|
|
761
|
+
provider: this._config.cloudProvider,
|
|
762
|
+
text: result.text,
|
|
763
|
+
confidence: CLOUD_VISION_CONFIDENCE,
|
|
764
|
+
durationMs: Date.now() - start,
|
|
765
|
+
};
|
|
766
|
+
}
|
|
767
|
+
// -------------------------------------------------------------------------
|
|
768
|
+
// Lazy loader methods (optional peer dependency pattern)
|
|
769
|
+
// -------------------------------------------------------------------------
|
|
770
|
+
/**
|
|
771
|
+
* Lazily load and initialize PaddleOCR.
|
|
772
|
+
*
|
|
773
|
+
* @returns Initialized PaddleOCR service instance.
|
|
774
|
+
* @throws {Error} If ppu-paddle-ocr is not installed, with install instructions.
|
|
775
|
+
*/
|
|
776
|
+
async _loadPaddleOcr() {
|
|
777
|
+
if (this._paddleOcr)
|
|
778
|
+
return this._paddleOcr;
|
|
779
|
+
try {
|
|
780
|
+
const mod = await import('ppu-paddle-ocr');
|
|
781
|
+
// ppu-paddle-ocr exports vary by version — handle both default and named
|
|
782
|
+
const PaddleOcrCls = mod.PaddleOcrService ?? mod.default?.PaddleOcrService ?? mod.default;
|
|
783
|
+
const instance = new PaddleOcrCls();
|
|
784
|
+
// PaddleOCR requires async initialization to load ONNX models
|
|
785
|
+
if (typeof instance.init === 'function') {
|
|
786
|
+
await instance.init();
|
|
787
|
+
}
|
|
788
|
+
this._paddleOcr = instance;
|
|
789
|
+
return instance;
|
|
790
|
+
}
|
|
791
|
+
catch (err) {
|
|
792
|
+
// Distinguish between "not installed" and "runtime init failure"
|
|
793
|
+
if (err?.code === 'ERR_MODULE_NOT_FOUND' || err?.code === 'MODULE_NOT_FOUND') {
|
|
794
|
+
throw new Error('ppu-paddle-ocr is not installed. Install with:\n' +
|
|
795
|
+
' npm install ppu-paddle-ocr\n\n' +
|
|
796
|
+
'Or switch to Tesseract.js by setting ocr: "tesseract" in the pipeline config.');
|
|
797
|
+
}
|
|
798
|
+
throw err;
|
|
799
|
+
}
|
|
800
|
+
}
|
|
801
|
+
/**
|
|
802
|
+
* Lazily load and initialize a Tesseract.js worker.
|
|
803
|
+
*
|
|
804
|
+
* @returns Initialized Tesseract worker ready for recognition.
|
|
805
|
+
* @throws {Error} If tesseract.js is not installed, with install instructions.
|
|
806
|
+
*/
|
|
807
|
+
async _loadTesseract() {
|
|
808
|
+
if (this._tesseract)
|
|
809
|
+
return this._tesseract;
|
|
810
|
+
try {
|
|
811
|
+
const mod = await import('tesseract.js');
|
|
812
|
+
const Tesseract = mod.default ?? mod;
|
|
813
|
+
// createWorker() handles downloading trained data on first run
|
|
814
|
+
const worker = await Tesseract.createWorker('eng');
|
|
815
|
+
this._tesseract = worker;
|
|
816
|
+
return worker;
|
|
817
|
+
}
|
|
818
|
+
catch (err) {
|
|
819
|
+
if (err?.code === 'ERR_MODULE_NOT_FOUND' || err?.code === 'MODULE_NOT_FOUND') {
|
|
820
|
+
throw new Error('tesseract.js is not installed. Install with:\n' +
|
|
821
|
+
' npm install tesseract.js\n\n' +
|
|
822
|
+
'Or switch to PaddleOCR by setting ocr: "paddle" in the pipeline config.');
|
|
823
|
+
}
|
|
824
|
+
throw err;
|
|
825
|
+
}
|
|
826
|
+
}
|
|
827
|
+
/**
|
|
828
|
+
* Lazily load the TrOCR image-to-text pipeline from @huggingface/transformers.
|
|
829
|
+
*
|
|
830
|
+
* @returns HuggingFace image-to-text pipeline configured with TrOCR weights.
|
|
831
|
+
* @throws {Error} If @huggingface/transformers is not installed.
|
|
832
|
+
*/
|
|
833
|
+
async _loadTrOcr() {
|
|
834
|
+
if (this._trOcrPipeline)
|
|
835
|
+
return this._trOcrPipeline;
|
|
836
|
+
try {
|
|
837
|
+
const { pipeline } = await import('@huggingface/transformers');
|
|
838
|
+
// TrOCR is an image-to-text model for handwriting recognition.
|
|
839
|
+
// microsoft/trocr-base-handwritten is the standard pretrained checkpoint.
|
|
840
|
+
this._trOcrPipeline = await pipeline('image-to-text', 'microsoft/trocr-base-handwritten');
|
|
841
|
+
return this._trOcrPipeline;
|
|
842
|
+
}
|
|
843
|
+
catch (err) {
|
|
844
|
+
if (err?.code === 'ERR_MODULE_NOT_FOUND' || err?.code === 'MODULE_NOT_FOUND') {
|
|
845
|
+
throw new Error('@huggingface/transformers is not installed. Install with:\n' +
|
|
846
|
+
' npm install @huggingface/transformers\n\n' +
|
|
847
|
+
'This is required for handwriting recognition (TrOCR).');
|
|
848
|
+
}
|
|
849
|
+
throw err;
|
|
850
|
+
}
|
|
851
|
+
}
|
|
852
|
+
/**
|
|
853
|
+
* Lazily load the Florence-2 document understanding pipeline.
|
|
854
|
+
*
|
|
855
|
+
* @returns HuggingFace pipeline configured for Florence-2 document analysis.
|
|
856
|
+
* @throws {Error} If @huggingface/transformers is not installed.
|
|
857
|
+
*/
|
|
858
|
+
async _loadFlorence2() {
|
|
859
|
+
if (this._florencePipeline)
|
|
860
|
+
return this._florencePipeline;
|
|
861
|
+
try {
|
|
862
|
+
const { pipeline } = await import('@huggingface/transformers');
|
|
863
|
+
// Florence-2 uses the image-to-text task with a VQA-style interface.
|
|
864
|
+
// microsoft/Florence-2-base is the standard pretrained checkpoint.
|
|
865
|
+
this._florencePipeline = await pipeline('image-to-text', 'microsoft/Florence-2-base');
|
|
866
|
+
return this._florencePipeline;
|
|
867
|
+
}
|
|
868
|
+
catch (err) {
|
|
869
|
+
if (err?.code === 'ERR_MODULE_NOT_FOUND' || err?.code === 'MODULE_NOT_FOUND') {
|
|
870
|
+
throw new Error('@huggingface/transformers is not installed. Install with:\n' +
|
|
871
|
+
' npm install @huggingface/transformers\n\n' +
|
|
872
|
+
'This is required for document understanding (Florence-2).');
|
|
873
|
+
}
|
|
874
|
+
throw err;
|
|
875
|
+
}
|
|
876
|
+
}
|
|
877
|
+
/**
|
|
878
|
+
* Lazily load the CLIP feature-extraction pipeline for image embeddings.
|
|
879
|
+
*
|
|
880
|
+
* @returns HuggingFace feature-extraction pipeline configured with CLIP.
|
|
881
|
+
* @throws {Error} If @huggingface/transformers is not installed.
|
|
882
|
+
*/
|
|
883
|
+
async _loadClip() {
|
|
884
|
+
if (this._clipPipeline)
|
|
885
|
+
return this._clipPipeline;
|
|
886
|
+
try {
|
|
887
|
+
const { pipeline } = await import('@huggingface/transformers');
|
|
888
|
+
// CLIP ViT-B/32 is the standard model for image embeddings.
|
|
889
|
+
// It produces 512-dimensional vectors in the same space as
|
|
890
|
+
// CLIP text embeddings, enabling cross-modal search.
|
|
891
|
+
this._clipPipeline = await pipeline('feature-extraction', 'Xenova/clip-vit-base-patch32');
|
|
892
|
+
return this._clipPipeline;
|
|
893
|
+
}
|
|
894
|
+
catch (err) {
|
|
895
|
+
if (err?.code === 'ERR_MODULE_NOT_FOUND' || err?.code === 'MODULE_NOT_FOUND') {
|
|
896
|
+
throw new Error('@huggingface/transformers is not installed. Install with:\n' +
|
|
897
|
+
' npm install @huggingface/transformers\n\n' +
|
|
898
|
+
'This is required for CLIP image embeddings.');
|
|
899
|
+
}
|
|
900
|
+
throw err;
|
|
901
|
+
}
|
|
902
|
+
}
|
|
903
|
+
// -------------------------------------------------------------------------
|
|
904
|
+
// Content category heuristics
|
|
905
|
+
// -------------------------------------------------------------------------
|
|
906
|
+
/**
|
|
907
|
+
* Detect the content category from OCR results using heuristics.
|
|
908
|
+
*
|
|
909
|
+
* This avoids running expensive classification models just to decide
|
|
910
|
+
* which Tier 2 model to invoke. The heuristics are deliberately
|
|
911
|
+
* conservative — when in doubt, they return 'mixed' which triggers
|
|
912
|
+
* both handwriting and document-ai tiers.
|
|
913
|
+
*
|
|
914
|
+
* @param ocrResult - Result from Tier 1 OCR, or undefined if OCR was skipped.
|
|
915
|
+
* @returns Detected content category.
|
|
916
|
+
*/
|
|
917
|
+
_detectCategory(ocrResult) {
|
|
918
|
+
if (!ocrResult)
|
|
919
|
+
return 'mixed';
|
|
920
|
+
// High confidence + clean text → printed document
|
|
921
|
+
if (ocrResult.confidence > 0.85)
|
|
922
|
+
return 'printed-text';
|
|
923
|
+
// Low confidence + many single-character detections is a strong
|
|
924
|
+
// handwriting signal: OCR struggles with cursive and often splits
|
|
925
|
+
// connected strokes into individual character guesses.
|
|
926
|
+
const singleCharRegions = ocrResult.regions?.filter((r) => r.text.trim().length === 1);
|
|
927
|
+
if (ocrResult.confidence < 0.5 &&
|
|
928
|
+
singleCharRegions &&
|
|
929
|
+
singleCharRegions.length > 0) {
|
|
930
|
+
return 'handwritten';
|
|
931
|
+
}
|
|
932
|
+
// Many regions with varying sizes suggests a complex document layout
|
|
933
|
+
// with headers, body text, sidebars, tables, etc.
|
|
934
|
+
if (ocrResult.regions && ocrResult.regions.length > 20) {
|
|
935
|
+
return 'document-layout';
|
|
936
|
+
}
|
|
937
|
+
// Moderate confidence but few regions — probably a photograph or
|
|
938
|
+
// diagram with some incidental text.
|
|
939
|
+
if (ocrResult.confidence < 0.6 && (ocrResult.regions?.length ?? 0) < 5) {
|
|
940
|
+
return 'photograph';
|
|
941
|
+
}
|
|
942
|
+
return 'mixed';
|
|
943
|
+
}
|
|
944
|
+
// -------------------------------------------------------------------------
|
|
945
|
+
// Routing helpers
|
|
946
|
+
// -------------------------------------------------------------------------
|
|
947
|
+
/**
|
|
948
|
+
* Determine whether a specific tier should run based on the strategy
|
|
949
|
+
* and any explicit tier overrides.
|
|
950
|
+
*
|
|
951
|
+
* @param tier - The tier to check.
|
|
952
|
+
* @param strategy - The pipeline's configured strategy.
|
|
953
|
+
* @param requestedTiers - Explicit tier overrides from the caller, if any.
|
|
954
|
+
* @returns True if the tier should run.
|
|
955
|
+
*/
|
|
956
|
+
_shouldRunTier(tier, strategy, requestedTiers) {
|
|
957
|
+
// Explicit tier list takes precedence over strategy
|
|
958
|
+
if (requestedTiers)
|
|
959
|
+
return requestedTiers.includes(tier);
|
|
960
|
+
// Strategy-based routing
|
|
961
|
+
switch (tier) {
|
|
962
|
+
case 'ocr':
|
|
963
|
+
// OCR runs in all strategies except cloud-only
|
|
964
|
+
return strategy !== 'cloud-only';
|
|
965
|
+
case 'handwriting':
|
|
966
|
+
// Handwriting only runs if explicitly enabled in config
|
|
967
|
+
if (!this._config.handwriting)
|
|
968
|
+
return false;
|
|
969
|
+
// Runs in progressive (conditionally), local-only, and parallel
|
|
970
|
+
return strategy !== 'cloud-only';
|
|
971
|
+
case 'document-ai':
|
|
972
|
+
// Document AI only runs if explicitly enabled in config
|
|
973
|
+
if (!this._config.documentAI)
|
|
974
|
+
return false;
|
|
975
|
+
return strategy !== 'cloud-only';
|
|
976
|
+
case 'embedding':
|
|
977
|
+
// Embedding only runs if explicitly enabled in config
|
|
978
|
+
if (!this._config.embedding)
|
|
979
|
+
return false;
|
|
980
|
+
return true; // CLIP runs regardless of strategy
|
|
981
|
+
case 'cloud-vision':
|
|
982
|
+
// Cloud vision routing is handled separately in _shouldRunCloudVision
|
|
983
|
+
return false;
|
|
984
|
+
default:
|
|
985
|
+
return false;
|
|
986
|
+
}
|
|
987
|
+
}
|
|
988
|
+
/**
|
|
989
|
+
* Determine whether cloud vision should run based on strategy, current
|
|
990
|
+
* confidence, and threshold.
|
|
991
|
+
*
|
|
992
|
+
* Cloud vision is the most expensive tier, so we're careful about when
|
|
993
|
+
* to invoke it — only when local results are insufficient.
|
|
994
|
+
*
|
|
995
|
+
* @param strategy - Pipeline strategy.
|
|
996
|
+
* @param bestLocalConfidence - Best confidence from local tiers so far.
|
|
997
|
+
* @param threshold - Confidence threshold for cloud escalation.
|
|
998
|
+
* @param requestedTiers - Explicit tier overrides, if any.
|
|
999
|
+
* @returns True if cloud vision should run.
|
|
1000
|
+
*/
|
|
1001
|
+
_shouldRunCloudVision(strategy, bestLocalConfidence, threshold, requestedTiers) {
|
|
1002
|
+
// Explicit tier list takes precedence
|
|
1003
|
+
if (requestedTiers)
|
|
1004
|
+
return requestedTiers.includes('cloud-vision');
|
|
1005
|
+
// No cloud provider configured — can't run
|
|
1006
|
+
if (!this._config.cloudProvider)
|
|
1007
|
+
return false;
|
|
1008
|
+
switch (strategy) {
|
|
1009
|
+
case 'cloud-only':
|
|
1010
|
+
// Already handled at the top of process() — shouldn't reach here
|
|
1011
|
+
return true;
|
|
1012
|
+
case 'local-only':
|
|
1013
|
+
// Never call cloud
|
|
1014
|
+
return false;
|
|
1015
|
+
case 'parallel':
|
|
1016
|
+
// Always run cloud alongside local
|
|
1017
|
+
return true;
|
|
1018
|
+
case 'progressive':
|
|
1019
|
+
// Only escalate when local confidence is below threshold
|
|
1020
|
+
return bestLocalConfidence < threshold;
|
|
1021
|
+
default:
|
|
1022
|
+
return false;
|
|
1023
|
+
}
|
|
1024
|
+
}
|
|
1025
|
+
/**
|
|
1026
|
+
* Find the highest confidence among a set of tier results.
|
|
1027
|
+
*
|
|
1028
|
+
* @param tierResults - Results from tiers that have run so far.
|
|
1029
|
+
* @returns Best confidence score, or 0 if no results.
|
|
1030
|
+
*/
|
|
1031
|
+
_bestConfidence(tierResults) {
|
|
1032
|
+
if (tierResults.length === 0)
|
|
1033
|
+
return 0;
|
|
1034
|
+
return Math.max(...tierResults.map((r) => r.confidence));
|
|
1035
|
+
}
|
|
1036
|
+
// -------------------------------------------------------------------------
|
|
1037
|
+
// Result assembly
|
|
1038
|
+
// -------------------------------------------------------------------------
|
|
1039
|
+
/**
|
|
1040
|
+
* Assemble the final {@link VisionResult} from individual tier outputs.
|
|
1041
|
+
*
|
|
1042
|
+
* The winning tier is the one with the highest confidence. Layout data
|
|
1043
|
+
* from Florence-2 is always included when available, regardless of
|
|
1044
|
+
* which tier's text wins.
|
|
1045
|
+
*
|
|
1046
|
+
* @param tierResults - All tier results collected during processing.
|
|
1047
|
+
* @param activeTiers - Which tiers actually ran (for metadata).
|
|
1048
|
+
* @param embedding - CLIP embedding, if generated.
|
|
1049
|
+
* @param layout - Florence-2 document layout, if generated.
|
|
1050
|
+
* @param forcedCategory - Caller-specified category override.
|
|
1051
|
+
* @param startTime - Timestamp when processing started (for duration).
|
|
1052
|
+
* @returns Assembled vision result.
|
|
1053
|
+
*/
|
|
1054
|
+
_assembleResult(tierResults, activeTiers, embedding, layout, forcedCategory, startTime) {
|
|
1055
|
+
// Pick the tier result with the highest confidence for the primary text
|
|
1056
|
+
const winner = tierResults.reduce((best, current) => (current.confidence > best.confidence ? current : best), tierResults[0] ?? { text: '', confidence: 0, regions: undefined });
|
|
1057
|
+
// Detect category from the OCR result (first tier), unless forced
|
|
1058
|
+
const ocrResult = tierResults.find((r) => r.tier === 'ocr');
|
|
1059
|
+
const category = forcedCategory ?? this._detectCategory(ocrResult);
|
|
1060
|
+
return {
|
|
1061
|
+
text: winner?.text ?? '',
|
|
1062
|
+
confidence: winner?.confidence ?? 0,
|
|
1063
|
+
category,
|
|
1064
|
+
tiers: activeTiers,
|
|
1065
|
+
tierResults,
|
|
1066
|
+
embedding,
|
|
1067
|
+
layout,
|
|
1068
|
+
regions: winner?.regions,
|
|
1069
|
+
durationMs: Date.now() - startTime,
|
|
1070
|
+
};
|
|
1071
|
+
}
|
|
1072
|
+
// -------------------------------------------------------------------------
|
|
1073
|
+
// Utility methods
|
|
1074
|
+
// -------------------------------------------------------------------------
|
|
1075
|
+
/**
|
|
1076
|
+
* Convert a URL or file path to a Buffer by reading the file or
|
|
1077
|
+
* fetching the URL.
|
|
1078
|
+
*
|
|
1079
|
+
* @param url - URL string (http://, https://, file://, or bare path).
|
|
1080
|
+
* @returns Image data as a Buffer.
|
|
1081
|
+
*/
|
|
1082
|
+
async _urlToBuffer(url) {
|
|
1083
|
+
// Handle data URLs by extracting the base64 payload
|
|
1084
|
+
if (url.startsWith('data:')) {
|
|
1085
|
+
const commaIdx = url.indexOf(',');
|
|
1086
|
+
if (commaIdx === -1)
|
|
1087
|
+
throw new Error(`VisionPipeline: invalid data URL.`);
|
|
1088
|
+
return Buffer.from(url.slice(commaIdx + 1), 'base64');
|
|
1089
|
+
}
|
|
1090
|
+
// Handle http/https URLs
|
|
1091
|
+
if (url.startsWith('http://') || url.startsWith('https://')) {
|
|
1092
|
+
const { default: axios } = await import('axios');
|
|
1093
|
+
const response = await axios.get(url, { responseType: 'arraybuffer' });
|
|
1094
|
+
return Buffer.from(response.data);
|
|
1095
|
+
}
|
|
1096
|
+
// Handle file:// URLs and bare file paths
|
|
1097
|
+
const { readFile } = await import('node:fs/promises');
|
|
1098
|
+
const filePath = url.startsWith('file://') ? url.slice(7) : url;
|
|
1099
|
+
return readFile(filePath);
|
|
1100
|
+
}
|
|
1101
|
+
/**
|
|
1102
|
+
* Guard method that throws if the pipeline has been disposed.
|
|
1103
|
+
* Called at the top of every public method to prevent use-after-free.
|
|
1104
|
+
*
|
|
1105
|
+
* @throws {Error} If dispose() has been called.
|
|
1106
|
+
*/
|
|
1107
|
+
_assertNotDisposed() {
|
|
1108
|
+
if (this._disposed) {
|
|
1109
|
+
throw new Error('VisionPipeline: pipeline has been disposed. Create a new instance.');
|
|
1110
|
+
}
|
|
1111
|
+
}
|
|
1112
|
+
}
|
|
1113
|
+
//# sourceMappingURL=VisionPipeline.js.map
|