ummaya 0.2.3 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -3
- package/bin/ummaya +10 -1
- package/npm-shrinkwrap.json +253 -2
- package/package.json +5 -1
- package/prompts/manifest.yaml +2 -2
- package/prompts/session_guidance_v1.md +3 -1
- package/prompts/system_v1.md +9 -7
- package/pyproject.toml +26 -7
- package/specs/2803-document-production-hardening/contracts/document-tools.schema.json +1043 -0
- package/src/ummaya/_canonical/__init__.py +2 -0
- package/src/ummaya/context/builder.py +17 -11
- package/src/ummaya/engine/engine.py +30 -113
- package/src/ummaya/engine/query.py +20 -0
- package/src/ummaya/evidence/__init__.py +44 -0
- package/src/ummaya/evidence/__main__.py +7 -0
- package/src/ummaya/evidence/dataset_contract.py +193 -0
- package/src/ummaya/evidence/document_authoring_cases.py +33 -0
- package/src/ummaya/evidence/document_harness.py +313 -0
- package/src/ummaya/evidence/document_viewer_ux.py +391 -0
- package/src/ummaya/evidence/gates.py +70 -0
- package/src/ummaya/evidence/json_types.py +20 -0
- package/src/ummaya/evidence/models.py +145 -0
- package/src/ummaya/evidence/output_payload.py +89 -0
- package/src/ummaya/evidence/payload_documents.py +233 -0
- package/src/ummaya/evidence/route_contracts.py +224 -0
- package/src/ummaya/evidence/route_helpers.py +150 -0
- package/src/ummaya/evidence/runner.py +177 -0
- package/src/ummaya/evidence/source_provenance.py +246 -0
- package/src/ummaya/evidence/source_provenance_redaction.py +176 -0
- package/src/ummaya/evidence/task_registry.py +264 -0
- package/src/ummaya/evidence/tool_layer.py +39 -0
- package/src/ummaya/evidence/tool_layer_models.py +151 -0
- package/src/ummaya/ipc/adapter_manifest_emitter.py +26 -10
- package/src/ummaya/ipc/document_intent_normalization.py +185 -0
- package/src/ummaya/ipc/frame_schema.py +52 -5
- package/src/ummaya/ipc/route_diagnostics.py +73 -0
- package/src/ummaya/ipc/stdio.py +2282 -417
- package/src/ummaya/llm/client.py +234 -59
- package/src/ummaya/llm/config.py +8 -3
- package/src/ummaya/llm/reasoning.py +84 -0
- package/src/ummaya/primitives/__init__.py +6 -2
- package/src/ummaya/primitives/delegation.py +1 -1
- package/src/ummaya/primitives/document.py +28 -0
- package/src/ummaya/settings.py +0 -3
- package/src/ummaya/tools/discovery_bridge.py +34 -2
- package/src/ummaya/tools/documents/__init__.py +297 -0
- package/src/ummaya/tools/documents/adapter_registry.py +487 -0
- package/src/ummaya/tools/documents/archive_container_probe.py +167 -0
- package/src/ummaya/tools/documents/artifact_store.py +454 -0
- package/src/ummaya/tools/documents/authoring.py +283 -0
- package/src/ummaya/tools/documents/baselines.py +114 -0
- package/src/ummaya/tools/documents/capability.py +331 -0
- package/src/ummaya/tools/documents/contracts.py +112 -0
- package/src/ummaya/tools/documents/conversion.py +521 -0
- package/src/ummaya/tools/documents/diff.py +275 -0
- package/src/ummaya/tools/documents/engines.py +163 -0
- package/src/ummaya/tools/documents/evaluation.py +291 -0
- package/src/ummaya/tools/documents/explicit_values.py +108 -0
- package/src/ummaya/tools/documents/fixtures.py +174 -0
- package/src/ummaya/tools/documents/format_completion_audit.py +471 -0
- package/src/ummaya/tools/documents/formats/__init__.py +2 -0
- package/src/ummaya/tools/documents/formats/archive.py +528 -0
- package/src/ummaya/tools/documents/formats/base.py +41 -0
- package/src/ummaya/tools/documents/formats/code_file.py +211 -0
- package/src/ummaya/tools/documents/formats/data_file.py +272 -0
- package/src/ummaya/tools/documents/formats/hwp.py +284 -0
- package/src/ummaya/tools/documents/formats/hwpx.py +1837 -0
- package/src/ummaya/tools/documents/formats/odf.py +435 -0
- package/src/ummaya/tools/documents/formats/ooxml.py +1030 -0
- package/src/ummaya/tools/documents/formats/passive.py +766 -0
- package/src/ummaya/tools/documents/formats/pdf.py +702 -0
- package/src/ummaya/tools/documents/formats/text_web.py +268 -0
- package/src/ummaya/tools/documents/hwp_conversion_probe.py +178 -0
- package/src/ummaya/tools/documents/hwp_direct_candidate.py +141 -0
- package/src/ummaya/tools/documents/inspection.py +289 -0
- package/src/ummaya/tools/documents/intake.py +1079 -0
- package/src/ummaya/tools/documents/legacy_office_promotion_probe.py +366 -0
- package/src/ummaya/tools/documents/models.py +1598 -0
- package/src/ummaya/tools/documents/odf_promotion_probe.py +167 -0
- package/src/ummaya/tools/documents/orchestrator.py +96 -0
- package/src/ummaya/tools/documents/passive_capability_probe.py +251 -0
- package/src/ummaya/tools/documents/patch.py +170 -0
- package/src/ummaya/tools/documents/pdfa_conformance.py +284 -0
- package/src/ummaya/tools/documents/pdfa_promotion_probe.py +198 -0
- package/src/ummaya/tools/documents/permissions.py +110 -0
- package/src/ummaya/tools/documents/planner.py +616 -0
- package/src/ummaya/tools/documents/registry.py +2733 -0
- package/src/ummaya/tools/documents/render.py +978 -0
- package/src/ummaya/tools/documents/render_comparison.py +113 -0
- package/src/ummaya/tools/documents/render_comparison_models.py +74 -0
- package/src/ummaya/tools/documents/render_comparison_regions.py +73 -0
- package/src/ummaya/tools/documents/render_comparison_style.py +161 -0
- package/src/ummaya/tools/documents/reread.py +157 -0
- package/src/ummaya/tools/documents/runtime_authoring.py +244 -0
- package/src/ummaya/tools/documents/runtime_authoring_bundle.py +76 -0
- package/src/ummaya/tools/documents/scorecard.py +184 -0
- package/src/ummaya/tools/documents/socratic_planner.py +193 -0
- package/src/ummaya/tools/documents/style.py +48 -0
- package/src/ummaya/tools/documents/tool_defs.py +523 -0
- package/src/ummaya/tools/documents/validate.py +347 -0
- package/src/ummaya/tools/executor.py +61 -12
- package/src/ummaya/tools/geocoding/kakao_client.py +1 -2
- package/src/ummaya/tools/kma/apihub_catalog.py +984 -1
- package/src/ummaya/tools/kma/apihub_structured_adapter.py +86 -6
- package/src/ummaya/tools/kma/apihub_url_adapter.py +593 -0
- package/src/ummaya/tools/kma/apihub_url_catalog.py +296 -0
- package/src/ummaya/tools/live_proxy.py +0 -3
- package/src/ummaya/tools/location_adapters.py +8 -6
- package/src/ummaya/tools/manifest_metadata.py +16 -3
- package/src/ummaya/tools/models.py +5 -1
- package/src/ummaya/tools/mvp_surface.py +2 -2
- package/src/ummaya/tools/nmc/emergency_search.py +8 -6
- package/src/ummaya/tools/register_all.py +17 -0
- package/src/ummaya/tools/registry.py +10 -1
- package/src/ummaya/tools/resolve_location.py +4 -4
- package/src/ummaya/tools/routing/__init__.py +59 -0
- package/src/ummaya/tools/routing/builder.py +105 -0
- package/src/ummaya/tools/routing/cards.py +29 -0
- package/src/ummaya/tools/routing/decision_service.py +534 -0
- package/src/ummaya/tools/routing/decision_types.py +74 -0
- package/src/ummaya/tools/routing/feasibility.py +122 -0
- package/src/ummaya/tools/routing/intent.py +17 -0
- package/src/ummaya/tools/routing/intent_extractor.py +207 -0
- package/src/ummaya/tools/routing/intent_patterns.py +160 -0
- package/src/ummaya/tools/routing/intent_public_data.py +150 -0
- package/src/ummaya/tools/routing/intent_types.py +48 -0
- package/src/ummaya/tools/routing/lint.py +78 -0
- package/src/ummaya/tools/routing/metadata.py +174 -0
- package/src/ummaya/tools/routing/projection.py +340 -0
- package/src/ummaya/tools/routing/retrieval_policy.py +629 -0
- package/src/ummaya/tools/routing/schema.py +81 -0
- package/src/ummaya/tools/routing/types.py +96 -0
- package/src/ummaya/tools/routing_index.py +2 -2
- package/src/ummaya/tools/search.py +40 -106
- package/src/ummaya/tools/verified_data_go_kr/_manifest.py +115 -25
- package/src/ummaya/tools/verified_data_go_kr/airkorea_air_quality.py +109 -4
- package/src/ummaya/tools/verified_data_go_kr/nmc_aed_site.py +108 -2
- package/src/ummaya/tools/verified_data_go_kr/pps_bid_public_info.py +174 -9
- package/src/ummaya/tools/verified_data_go_kr/tago_bus_arrival.py +66 -3
- package/src/ummaya/tools/verified_data_go_kr/tago_bus_location.py +12 -2
- package/src/ummaya/tools/verified_data_go_kr/tago_bus_route.py +8 -2
- package/src/ummaya/tools/verified_data_go_kr/tago_bus_route_station.py +114 -0
- package/src/ummaya/tools/verified_data_go_kr/tago_bus_station.py +14 -3
- package/src/ummaya/tools/verify_canonical_map.py +21 -0
- package/tests/fixtures/documents/public_forms/baselines.yaml +113 -0
- package/tui/package.json +1 -2
- package/tui/src/.cc-byte-identical-whitelist.yaml +266 -0
- package/tui/src/QueryEngine.ts +12 -4
- package/tui/src/bridge/inboundAttachments.ts +3 -3
- package/tui/src/cli/handlers/auth.ts +4 -13
- package/tui/src/cli/handlers/mcp.tsx +3 -3
- package/tui/src/cli/print.ts +69 -18
- package/tui/src/cli/update.ts +13 -13
- package/tui/src/commands/copy/index.ts +1 -1
- package/tui/src/commands/cost/cost.ts +2 -2
- package/tui/src/commands/init-verifiers.ts +5 -5
- package/tui/src/commands/init.ts +30 -30
- package/tui/src/commands/insights.ts +44 -44
- package/tui/src/commands/install-github-app/install-github-app.tsx +2 -2
- package/tui/src/commands/install-github-app/setupGitHubActions.ts +3 -3
- package/tui/src/commands/install-github-app/types.ts +8 -30
- package/tui/src/commands/install.tsx +5 -5
- package/tui/src/commands/mcp/addCommand.ts +5 -5
- package/tui/src/commands/mcp/xaaIdpCommand.ts +2 -2
- package/tui/src/commands/plugin/ManageMarketplaces.tsx +2 -2
- package/tui/src/commands/plugin/types.ts +6 -28
- package/tui/src/commands/plugin/unifiedTypes.ts +4 -26
- package/tui/src/commands/reasoning/index.ts +13 -0
- package/tui/src/commands/reasoning/reasoning.tsx +177 -0
- package/tui/src/commands/rename/generateSessionName.ts +1 -1
- package/tui/src/commands/thinkback/thinkback.tsx +3 -3
- package/tui/src/commands.ts +2 -0
- package/tui/src/components/Feedback.tsx +1 -1
- package/tui/src/components/LogoV2/EmergencyTip.tsx +11 -2
- package/tui/src/components/LogoV2/WelcomeV2.tsx +1 -3
- package/tui/src/components/Messages.tsx +2 -1
- package/tui/src/components/ScrollKeybindingHandler.tsx +6 -6
- package/tui/src/components/Spinner/types.ts +6 -28
- package/tui/src/components/Spinner.tsx +2 -2
- package/tui/src/components/agents/generateAgent.ts +1 -1
- package/tui/src/components/agents/new-agent-creation/types.ts +4 -26
- package/tui/src/components/config/EnvSecretIsolatedEditor.tsx +1 -1
- package/tui/src/components/design-system/LoadingState.tsx +2 -2
- package/tui/src/components/mcp/types.ts +16 -38
- package/tui/src/components/messages/AssistantToolUseMessage.tsx +3 -2
- package/tui/src/components/messages/UserCrossSessionMessage.ts +16 -4
- package/tui/src/components/messages/UserForkBoilerplateMessage.ts +16 -4
- package/tui/src/components/messages/UserGitHubWebhookMessage.ts +16 -4
- package/tui/src/components/messages/UserToolResultMessage/utils.tsx +3 -2
- package/tui/src/components/permissions/MonitorPermissionRequest/MonitorPermissionRequest.ts +9 -4
- package/tui/src/components/permissions/ReviewArtifactPermissionRequest/ReviewArtifactPermissionRequest.ts +9 -4
- package/tui/src/components/primitive/DocumentSocraticReviewBlock.tsx +129 -0
- package/tui/src/components/primitive/DocumentToolResultCard.tsx +224 -0
- package/tui/src/components/primitive/documentSocraticReview.ts +215 -0
- package/tui/src/components/primitive/index.tsx +43 -1
- package/tui/src/components/primitive/types.ts +137 -0
- package/tui/src/components/ui/option.ts +4 -26
- package/tui/src/constants/common.ts +0 -2
- package/tui/src/constants/prompts.ts +4 -3
- package/tui/src/constants/querySource.ts +4 -26
- package/tui/src/entrypoints/sdk/controlTypes.ts +26 -48
- package/tui/src/entrypoints/sdk/coreTypes.generated.ts +3 -25
- package/tui/src/entrypoints/sdk/runtimeTypes.ts +38 -60
- package/tui/src/entrypoints/sdk/sdkUtilityTypes.ts +4 -26
- package/tui/src/entrypoints/sdk/settingsTypes.generated.ts +3 -25
- package/tui/src/entrypoints/sdk/toolTypes.ts +3 -25
- package/tui/src/hooks/toolPermission/handlers/interactiveHandler.ts +10 -0
- package/tui/src/hooks/useApiKeyVerification.ts +1 -1
- package/tui/src/hooks/useVirtualScroll.ts +1 -1
- package/tui/src/ink/ink.tsx +33 -14
- package/tui/src/ink/reconciler.ts +2 -3
- package/tui/src/ink/render-to-screen.ts +30 -10
- package/tui/src/ipc/bridge.ts +62 -15
- package/tui/src/ipc/bridgeSingleton.ts +5 -1
- package/tui/src/ipc/codec.ts +29 -3
- package/tui/src/ipc/frames.generated.ts +407 -312
- package/tui/src/ipc/llmClient.ts +279 -76
- package/tui/src/ipc/llmTypes.ts +16 -1
- package/tui/src/ipc/schema/frame.schema.json +1 -3475
- package/tui/src/keybindings/defaultBindings.ts +4 -0
- package/tui/src/main.tsx +32 -11
- package/tui/src/native-ts/file-index/index.ts +33 -3
- package/tui/src/observability/surface.ts +2 -2
- package/tui/src/probes/toolRegistryProbe.tsx +3 -1
- package/tui/src/projectOnboardingState.ts +7 -6
- package/tui/src/query/chatMessageTypes.ts +18 -0
- package/tui/src/query/chatMessagesBuilder.ts +1 -1
- package/tui/src/query/deps.ts +1 -1
- package/tui/src/query/messageGuards.ts +106 -0
- package/tui/src/query/publicDataTerminalRepair.ts +384 -0
- package/tui/src/query/run.ts +1075 -0
- package/tui/src/query/supportBoundary.ts +168 -0
- package/tui/src/query/toolResultErrors.ts +103 -0
- package/tui/src/query/toolRunner.ts +687 -0
- package/tui/src/query/unavailableToolRepair.ts +118 -0
- package/tui/src/query.ts +9 -1721
- package/tui/src/screens/REPL.tsx +42 -31
- package/tui/src/services/api/adapterManifest.ts +4 -0
- package/tui/src/services/api/backendChat/events.ts +117 -0
- package/tui/src/services/api/backendChat/finalMessage.ts +40 -0
- package/tui/src/services/api/backendChat/frame.ts +9 -0
- package/tui/src/services/api/backendChat/streaming.ts +430 -0
- package/tui/src/services/api/backendChat/types.ts +62 -0
- package/tui/src/services/api/backendChat.ts +1 -0
- package/tui/src/services/api/client.ts +98 -14
- package/tui/src/services/api/errorUtils.ts +5 -5
- package/tui/src/services/api/errors.ts +1 -1
- package/tui/src/services/api/logging.ts +1 -1
- package/tui/src/services/api/ummaya/evidence.ts +194 -0
- package/tui/src/services/api/ummaya/messages.ts +255 -0
- package/tui/src/services/api/ummaya/nonStreaming.ts +66 -0
- package/tui/src/services/api/ummaya/provider.ts +200 -0
- package/tui/src/services/api/ummaya/reasoning.ts +24 -0
- package/tui/src/services/api/ummaya/request.ts +200 -0
- package/tui/src/services/api/ummaya/selectionContext.ts +240 -0
- package/tui/src/services/api/ummaya/streaming.ts +365 -0
- package/tui/src/services/api/ummaya/streamingPayload.ts +129 -0
- package/tui/src/services/api/ummaya/streamingReader.ts +40 -0
- package/tui/src/services/api/ummaya/toolSelection.ts +217 -0
- package/tui/src/services/api/ummaya/types.ts +110 -0
- package/tui/src/services/api/ummaya/usage.ts +30 -0
- package/tui/src/services/api/ummaya.ts +26 -364
- package/tui/src/services/api/withRetry.ts +1 -1
- package/tui/src/services/awaySummary.ts +2 -2
- package/tui/src/services/claudeAiLimits.ts +1 -1
- package/tui/src/services/compact/autoCompact.ts +1 -1
- package/tui/src/services/compact/compact.ts +1 -1
- package/tui/src/services/lsp/types.ts +8 -30
- package/tui/src/services/tips/types.ts +6 -28
- package/tui/src/services/tokenEstimation.ts +1 -1
- package/tui/src/services/toolRegistry/bootGuard.ts +5 -5
- package/tui/src/services/toolUseSummary/toolUseSummaryGenerator.ts +1 -1
- package/tui/src/services/tools/toolExecution.ts +94 -1
- package/tui/src/skills/bundled/stuck.ts +12 -12
- package/tui/src/state/AppStateStore.ts +7 -0
- package/tui/src/store/pendingPermissionSlot.ts +1 -1
- package/tui/src/store/session-store.ts +10 -36
- package/tui/src/stubs/any-stub.ts +15 -10
- package/tui/src/stubs/color-diff-napi.ts +37 -23
- package/tui/src/stubs/globals.d.ts +3 -3
- package/tui/src/stubs/macro-preload.ts +23 -12
- package/tui/src/tools/AdapterTool/AdapterTool.ts +1239 -163
- package/tui/src/tools/AdapterTool/routeDiagnostics.ts +75 -0
- package/tui/src/tools/AgentTool/AgentTool.tsx +84 -1371
- package/tui/src/tools/AgentTool/agentToolHandoff.ts +114 -0
- package/tui/src/tools/AgentTool/agentToolPartialResult.ts +16 -0
- package/tui/src/tools/AgentTool/agentToolProgress.ts +32 -0
- package/tui/src/tools/AgentTool/agentToolResolver.ts +161 -0
- package/tui/src/tools/AgentTool/agentToolResult.ts +163 -0
- package/tui/src/tools/AgentTool/agentToolUtils.ts +14 -686
- package/tui/src/tools/AgentTool/asyncAgentLifecycle.ts +208 -0
- package/tui/src/tools/AgentTool/asyncLifecycle.ts +153 -0
- package/tui/src/tools/AgentTool/backgroundedCompletion.ts +126 -0
- package/tui/src/tools/AgentTool/backgroundedLifecycle.ts +174 -0
- package/tui/src/tools/AgentTool/foregroundBackground.ts +83 -0
- package/tui/src/tools/AgentTool/foregroundDrain.tsx +133 -0
- package/tui/src/tools/AgentTool/foregroundFinalize.ts +98 -0
- package/tui/src/tools/AgentTool/foregroundLifecycle.tsx +237 -0
- package/tui/src/tools/AgentTool/foregroundProgress.tsx +169 -0
- package/tui/src/tools/AgentTool/foregroundTask.ts +89 -0
- package/tui/src/tools/AgentTool/forkSubagent.ts +1 -12
- package/tui/src/tools/AgentTool/forkSubagentGate.ts +34 -0
- package/tui/src/tools/AgentTool/launchRouting.ts +203 -0
- package/tui/src/tools/AgentTool/lifecycle.ts +244 -0
- package/tui/src/tools/AgentTool/mcpRouting.ts +73 -0
- package/tui/src/tools/AgentTool/orchestrationSupport.ts +70 -0
- package/tui/src/tools/AgentTool/permissions.ts +39 -0
- package/tui/src/tools/AgentTool/promptSetup.ts +181 -0
- package/tui/src/tools/AgentTool/remoteRouting.ts +62 -0
- package/tui/src/tools/AgentTool/resultMapping.ts +116 -0
- package/tui/src/tools/AgentTool/resumeAgent.ts +39 -107
- package/tui/src/tools/AgentTool/resumeAgentHelpers.ts +140 -0
- package/tui/src/tools/AgentTool/runAgent.ts +1 -1
- package/tui/src/tools/AgentTool/runtimeConfig.ts +57 -0
- package/tui/src/tools/AgentTool/schemas.ts +196 -0
- package/tui/src/tools/AgentTool/sourceVerificationPropagation.ts +263 -0
- package/tui/src/tools/AgentTool/worktreeLifecycle.ts +105 -0
- package/tui/src/tools/AskUserQuestionTool/AskUserQuestionTool.tsx +174 -202
- package/tui/src/tools/BashTool/BashTool.tsx +71 -1072
- package/tui/src/tools/BashTool/bashCommandHelpers.ts +12 -12
- package/tui/src/tools/BashTool/bashPermissions/astPreflight.ts +173 -0
- package/tui/src/tools/BashTool/bashPermissions/classifierChecks.ts +199 -0
- package/tui/src/tools/BashTool/bashPermissions/compoundGuards.ts +53 -0
- package/tui/src/tools/BashTool/bashPermissions/constants.ts +99 -0
- package/tui/src/tools/BashTool/bashPermissions/index.ts +38 -0
- package/tui/src/tools/BashTool/bashPermissions/legacyMisparsing.ts +62 -0
- package/tui/src/tools/BashTool/bashPermissions/main.ts +135 -0
- package/tui/src/tools/BashTool/bashPermissions/normalizedCommands.ts +33 -0
- package/tui/src/tools/BashTool/bashPermissions/operatorFlow.ts +98 -0
- package/tui/src/tools/BashTool/bashPermissions/permissionChecks.ts +200 -0
- package/tui/src/tools/BashTool/bashPermissions/prefixSuggestions.ts +88 -0
- package/tui/src/tools/BashTool/bashPermissions/promptClassifierRules.ts +125 -0
- package/tui/src/tools/BashTool/bashPermissions/ruleDelegates.ts +19 -0
- package/tui/src/tools/BashTool/bashPermissions/ruleMatching.ts +145 -0
- package/tui/src/tools/BashTool/bashPermissions/sandboxAutoAllow.ts +75 -0
- package/tui/src/tools/BashTool/bashPermissions/subcommandFlow.ts +205 -0
- package/tui/src/tools/BashTool/bashPermissions/subcommandGuards.ts +73 -0
- package/tui/src/tools/BashTool/bashPermissions/subcommandResultHelpers.ts +116 -0
- package/tui/src/tools/BashTool/bashPermissions/types.ts +26 -0
- package/tui/src/tools/BashTool/bashPermissions/wrapperStripping.ts +139 -0
- package/tui/src/tools/BashTool/bashPermissions.ts +26 -2621
- package/tui/src/tools/BashTool/call.ts +202 -0
- package/tui/src/tools/BashTool/callLoader.ts +35 -0
- package/tui/src/tools/BashTool/commandClassification.ts +151 -0
- package/tui/src/tools/BashTool/commandClassificationLoader.ts +40 -0
- package/tui/src/tools/BashTool/cwdReset.ts +33 -0
- package/tui/src/tools/BashTool/lineTruncation.ts +11 -0
- package/tui/src/tools/BashTool/modeValidation.ts +13 -1
- package/tui/src/tools/BashTool/outputPersistence.ts +42 -0
- package/tui/src/tools/BashTool/permissionClassification.ts +66 -0
- package/tui/src/tools/BashTool/permissionLoader.ts +44 -0
- package/tui/src/tools/BashTool/resultLoader.ts +29 -0
- package/tui/src/tools/BashTool/resultMapping.ts +83 -0
- package/tui/src/tools/BashTool/sandboxPolicy.ts +79 -0
- package/tui/src/tools/BashTool/schemas.ts +65 -0
- package/tui/src/tools/BashTool/sedEditExecution.ts +59 -0
- package/tui/src/tools/BashTool/shellExecution.tsx +245 -0
- package/tui/src/tools/BashTool/shellOutputUtils.ts +85 -0
- package/tui/src/tools/BashTool/shellPermissionGauntlet.ts +97 -0
- package/tui/src/tools/BashTool/uiLoader.ts +37 -0
- package/tui/src/tools/BriefTool/upload.ts +1 -1
- package/tui/src/tools/CalculatorTool/parser.ts +2 -2
- package/tui/src/tools/DocumentPrimitive/DocumentPrimitive.ts +262 -0
- package/tui/src/tools/DocumentPrimitive/dispatchNormalization.ts +270 -0
- package/tui/src/tools/DocumentPrimitive/documentDestinationPath.ts +18 -0
- package/tui/src/tools/DocumentPrimitive/documentMutationGuard.ts +22 -0
- package/tui/src/tools/DocumentPrimitive/documentPatchNormalization.ts +248 -0
- package/tui/src/tools/DocumentPrimitive/documentSourceVerification.ts +245 -0
- package/tui/src/tools/DocumentPrimitive/documentSourceVerificationFields.ts +103 -0
- package/tui/src/tools/DocumentPrimitive/modelVisibleOutput.ts +40 -0
- package/tui/src/tools/DocumentPrimitive/prompt.ts +35 -0
- package/tui/src/tools/FileEditTool/FileEditTool.ts +9 -507
- package/tui/src/tools/FileEditTool/call.ts +228 -0
- package/tui/src/tools/FileEditTool/validateInput.ts +196 -0
- package/tui/src/tools/FileReadTool/imageProcessor.ts +13 -0
- package/tui/src/tools/FileWriteTool/FileWriteTool.ts +7 -300
- package/tui/src/tools/FileWriteTool/call.ts +223 -0
- package/tui/src/tools/FileWriteTool/validateInput.ts +80 -0
- package/tui/src/tools/ListMcpResourcesTool/ListMcpResourcesTool.ts +19 -3
- package/tui/src/tools/LookupPrimitive/LookupPrimitive.ts +48 -29
- package/tui/src/tools/LookupPrimitive/prompt.ts +6 -7
- package/tui/src/tools/MCPTool/trustPolicy.ts +118 -0
- package/tui/src/tools/McpAuthTool/McpAuthTool.ts +21 -3
- package/tui/src/tools/NotebookEditTool/NotebookEditTool.ts +7 -326
- package/tui/src/tools/NotebookEditTool/call.ts +254 -0
- package/tui/src/tools/NotebookEditTool/notebookModel.ts +51 -0
- package/tui/src/tools/NotebookEditTool/validateInput.ts +142 -0
- package/tui/src/tools/PowerShellTool/PowerShellTool.tsx +46 -937
- package/tui/src/tools/PowerShellTool/acceptEditsCommandValidation.ts +162 -0
- package/tui/src/tools/PowerShellTool/call.ts +179 -0
- package/tui/src/tools/PowerShellTool/callLoader.ts +37 -0
- package/tui/src/tools/PowerShellTool/commandClassification.ts +86 -0
- package/tui/src/tools/PowerShellTool/modeValidation.ts +25 -332
- package/tui/src/tools/PowerShellTool/outputPersistence.ts +42 -0
- package/tui/src/tools/PowerShellTool/permissionClassification.ts +28 -0
- package/tui/src/tools/PowerShellTool/resultLoader.ts +31 -0
- package/tui/src/tools/PowerShellTool/resultMapping.ts +75 -0
- package/tui/src/tools/PowerShellTool/schemas.ts +40 -0
- package/tui/src/tools/PowerShellTool/shellExecution.tsx +258 -0
- package/tui/src/tools/PowerShellTool/symlinkModeValidation.ts +44 -0
- package/tui/src/tools/PowerShellTool/uiLoader.ts +37 -0
- package/tui/src/tools/PowerShellTool/validation.ts +39 -0
- package/tui/src/tools/ReadMcpResourceTool/ReadMcpResourceTool.ts +19 -3
- package/tui/src/tools/ResolveLocationPrimitive/ResolveLocationPrimitive.ts +30 -19
- package/tui/src/tools/ResolveLocationPrimitive/prompt.ts +2 -6
- package/tui/src/tools/SkillTool/SkillTool.ts +2 -2
- package/tui/src/tools/SubmitPrimitive/SubmitPrimitive.ts +51 -18
- package/tui/src/tools/TaskCreateTool/TaskCreateTool.ts +16 -2
- package/tui/src/tools/TaskGetTool/TaskGetTool.ts +23 -3
- package/tui/src/tools/TaskListTool/TaskListTool.ts +22 -4
- package/tui/src/tools/TaskOutputTool/TaskOutputTool.tsx +46 -547
- package/tui/src/tools/TaskOutputTool/lookup.ts +216 -0
- package/tui/src/tools/TaskOutputTool/render.tsx +257 -0
- package/tui/src/tools/TaskOutputTool/schemas.ts +55 -0
- package/tui/src/tools/TaskOutputTool/serialization.ts +36 -0
- package/tui/src/tools/TaskStopTool/TaskStopTool.ts +10 -0
- package/tui/src/tools/TaskUpdateTool/TaskUpdateTool.ts +14 -364
- package/tui/src/tools/TaskUpdateTool/completion.ts +62 -0
- package/tui/src/tools/TaskUpdateTool/schemas.ts +62 -0
- package/tui/src/tools/TaskUpdateTool/serialization.ts +46 -0
- package/tui/src/tools/TaskUpdateTool/statusUpdate.ts +247 -0
- package/tui/src/tools/TodoWriteTool/TodoWriteTool.ts +21 -2
- package/tui/src/tools/ToolSearchTool/ToolSearchTool.ts +21 -302
- package/tui/src/tools/ToolSearchTool/ccSupportTools.ts +223 -0
- package/tui/src/tools/ToolSearchTool/descriptionCache.ts +50 -0
- package/tui/src/tools/ToolSearchTool/keywordSearch.ts +216 -0
- package/tui/src/tools/ToolSearchTool/prompt.ts +10 -4
- package/tui/src/tools/ToolSearchTool/resultMapping.ts +30 -0
- package/tui/src/tools/ToolSearchTool/schemas.ts +30 -0
- package/tui/src/tools/ToolSearchTool/searchPool.ts +47 -0
- package/tui/src/tools/ToolSearchTool/supportIntentHints.ts +140 -0
- package/tui/src/tools/TranslateTool/TranslateTool.ts +1 -1
- package/tui/src/tools/VerifyPrimitive/VerifyPrimitive.ts +27 -10
- package/tui/src/tools/WebFetchTool/WebFetchTool.ts +43 -138
- package/tui/src/tools/WebFetchTool/call.ts +227 -0
- package/tui/src/tools/WebFetchTool/resolvedAddressSafety.ts +78 -0
- package/tui/src/tools/WebFetchTool/sourceVerification.ts +204 -0
- package/tui/src/tools/WebFetchTool/types.ts +23 -0
- package/tui/src/tools/WebFetchTool/urlSafety.ts +181 -0
- package/tui/src/tools/WebFetchTool/utils.ts +1 -1
- package/tui/src/tools/WebSearchTool/UI.tsx +0 -1
- package/tui/src/tools/WebSearchTool/WebSearchTool.ts +9 -313
- package/tui/src/tools/WebSearchTool/call.ts +33 -0
- package/tui/src/tools/WebSearchTool/responseMapping.ts +190 -0
- package/tui/src/tools/WebSearchTool/resultBlock.ts +47 -0
- package/tui/src/tools/WebSearchTool/schemas.ts +47 -0
- package/tui/src/tools/WebSearchTool/toolSchema.ts +12 -0
- package/tui/src/tools/WorkspaceToolAdapter/WorkspaceToolAdapter.ts +79 -0
- package/tui/src/tools/WorkspaceToolAdapter/allowedRootPolicy.ts +85 -0
- package/tui/src/tools/WorkspaceToolAdapter/documentFormatGuards.ts +73 -0
- package/tui/src/tools/WorkspaceToolAdapter/inputNormalization.ts +105 -0
- package/tui/src/tools/WorkspaceToolAdapter/mcpExposurePolicy.ts +64 -0
- package/tui/src/tools/WorkspaceToolAdapter/toolDefFactory.ts +215 -0
- package/tui/src/tools/WorkspaceToolAdapter/toolNames.ts +6 -0
- package/tui/src/tools/WorkspaceToolAdapter/workspacePolicy.ts +15 -0
- package/tui/src/tools/_shared/citizenUserText.ts +49 -0
- package/tui/src/tools/_shared/dispatchPrimitive.ts +6 -6
- package/tui/src/tools/_shared/documentChangeToPatch.ts +125 -0
- package/tui/src/tools/_shared/documentDispatchArguments.ts +87 -0
- package/tui/src/tools/_shared/documentPrimitiveTimeout.ts +13 -0
- package/tui/src/tools/_shared/documentToolResultRender.ts +98 -0
- package/tui/src/tools/_shared/locationInputRepair.ts +112 -0
- package/tui/src/tools/_shared/pendingCallRegistry.ts +1 -6
- package/tui/src/tools/_shared/rootPrimitiveInput.ts +68 -0
- package/tui/src/tools/_shared/toolChoiceRepair/documentCompletionPatterns.ts +58 -0
- package/tui/src/tools/_shared/toolChoiceRepair/documentCompletionPrompt.ts +271 -0
- package/tui/src/tools/_shared/toolChoiceRepair/documentRepair.ts +452 -0
- package/tui/src/tools/_shared/toolChoiceRepair/messageAccess.ts +80 -0
- package/tui/src/tools/_shared/toolChoiceRepair/publicDataRepair.ts +92 -0
- package/tui/src/tools/_shared/toolChoiceRepair/supportRepair.ts +135 -0
- package/tui/src/tools/_shared/toolChoiceRepair.ts +61 -0
- package/tui/src/tools/shared/mockDisclaimer.ts +1 -1
- package/tui/src/tools.ts +39 -190
- package/tui/src/types/fileSuggestion.ts +4 -26
- package/tui/src/types/generated/events_mono/claude_code/v1/claude_code_internal_event.ts +186 -148
- package/tui/src/types/generated/events_mono/common/v1/auth.ts +25 -11
- package/tui/src/types/generated/events_mono/growthbook/v1/growthbook_experiment_event.ts +47 -30
- package/tui/src/types/generated/google/protobuf/timestamp.ts +21 -7
- package/tui/src/types/message.ts +80 -102
- package/tui/src/types/messageQueueTypes.ts +6 -28
- package/tui/src/types/notebook.ts +16 -38
- package/tui/src/types/statusLine.ts +4 -26
- package/tui/src/types/tools.ts +24 -46
- package/tui/src/types/utils.ts +6 -28
- package/tui/src/upstreamproxy/relay.ts +7 -3
- package/tui/src/upstreamproxy/upstreamproxy.ts +1 -1
- package/tui/src/utils/assistantMessageFactories.ts +9 -3
- package/tui/src/utils/attachments.ts +1 -1
- package/tui/src/utils/auth.ts +129 -139
- package/tui/src/utils/bash/ast.ts +23 -23
- package/tui/src/utils/bash/bashParser.ts +5 -5
- package/tui/src/utils/billing.ts +1 -1
- package/tui/src/utils/collapseReadSearch.ts +3 -3
- package/tui/src/utils/cronTasks.ts +1 -1
- package/tui/src/utils/execFileNoThrow.ts +1 -1
- package/tui/src/utils/filePersistence/types.ts +16 -38
- package/tui/src/utils/forkedAgent.ts +1 -1
- package/tui/src/utils/gracefulShutdown.ts +4 -4
- package/tui/src/utils/heapDumpService.ts +12 -8
- package/tui/src/utils/hooks/apiQueryHookHelper.ts +1 -1
- package/tui/src/utils/hooks/execPromptHook.ts +1 -1
- package/tui/src/utils/hooks/skillImprovement.ts +1 -1
- package/tui/src/utils/kExaoneReasoning.ts +138 -0
- package/tui/src/utils/mcp/dateTimeParser.ts +1 -1
- package/tui/src/utils/messages.ts +19 -0
- package/tui/src/utils/migrateSessions.ts +3 -3
- package/tui/src/utils/model/model.ts +6 -6
- package/tui/src/utils/multiToolLayout.ts +13 -0
- package/tui/src/utils/permissions/yoloClassifier.ts +1 -1
- package/tui/src/utils/plugins/headlessPluginInstall.ts +1 -1
- package/tui/src/utils/plugins/mcpPluginIntegration.ts +1 -1
- package/tui/src/utils/plugins/mcpbHandler.ts +1 -1
- package/tui/src/utils/plugins/pluginLoader.ts +8 -8
- package/tui/src/utils/processUserInput/processSlashCommand.tsx +2 -2
- package/tui/src/utils/processUserInput/processUserInput.ts +26 -0
- package/tui/src/utils/protectedNamespace.ts +5 -3
- package/tui/src/utils/rawJsonToolCall.ts +242 -0
- package/tui/src/utils/ripgrep.ts +16 -7
- package/tui/src/utils/sessionTitle.ts +1 -1
- package/tui/src/utils/settings/applySettingsChange.ts +4 -0
- package/tui/src/utils/settings/permissionValidation.ts +14 -2
- package/tui/src/utils/settings/types.ts +9 -3
- package/tui/src/utils/shell/prefix.ts +1 -1
- package/tui/src/utils/sideQuery.ts +1 -1
- package/tui/src/utils/stats.ts +1 -1
- package/tui/src/utils/systemThemeWatcher.ts +13 -3
- package/tui/src/utils/teleport.tsx +1 -1
- package/uv.lock +394 -22
- package/assets/copilot-gate-logo.svg +0 -58
- package/assets/govon-logo.svg +0 -40
- package/src/ummaya/eval/__init__.py +0 -5
- package/src/ummaya/eval/retrieval.py +0 -713
- package/tui/src/services/api/claude.ts +0 -3510
- package/tui/src/utils/messageStream.ts +0 -186
|
@@ -0,0 +1,766 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
"""Known-only passive adapters for non-promoted document families."""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import csv
|
|
7
|
+
import gzip
|
|
8
|
+
import io
|
|
9
|
+
import json
|
|
10
|
+
import tarfile
|
|
11
|
+
import zipfile
|
|
12
|
+
from html.parser import HTMLParser
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import TYPE_CHECKING
|
|
15
|
+
from xml.etree import ElementTree as StdElementTree
|
|
16
|
+
|
|
17
|
+
import yaml
|
|
18
|
+
from defusedxml import ElementTree # type: ignore[import-untyped]
|
|
19
|
+
|
|
20
|
+
from ummaya.tools.documents.models import (
|
|
21
|
+
DocumentExtraction,
|
|
22
|
+
DocumentFormat,
|
|
23
|
+
ImageReference,
|
|
24
|
+
KnownDocumentFormat,
|
|
25
|
+
MetadataValue,
|
|
26
|
+
ParagraphBlock,
|
|
27
|
+
TableBlock,
|
|
28
|
+
TableCell,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
if TYPE_CHECKING:
|
|
32
|
+
from ummaya.tools.documents.tool_defs import DocumentFieldPatch
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# Format families. Each tuple is assigned as ``known_formats`` on exactly one
# adapter class defined later in this module.

# ODF package formats (OdfDocumentAdapter).
_ODF_FORMATS: tuple[KnownDocumentFormat, ...] = (
    KnownDocumentFormat.odt,
    KnownDocumentFormat.ods,
    KnownDocumentFormat.odp,
)
# Structured-data and text-data formats (DataFileDocumentAdapter).
_DATA_FORMATS: tuple[KnownDocumentFormat, ...] = (
    KnownDocumentFormat.csv,
    KnownDocumentFormat.tsv,
    KnownDocumentFormat.xml,
    KnownDocumentFormat.rdf,
    KnownDocumentFormat.ttl,
    KnownDocumentFormat.lod,
    KnownDocumentFormat.json,
    KnownDocumentFormat.jsonl,
    KnownDocumentFormat.yaml,
    KnownDocumentFormat.yml,
    KnownDocumentFormat.geojson,
    KnownDocumentFormat.gpx,
    KnownDocumentFormat.kml,
    KnownDocumentFormat.fasta,
    KnownDocumentFormat.sgml,
    KnownDocumentFormat.dtd,
    KnownDocumentFormat.hml,
    KnownDocumentFormat.etc,
)
# HTML, plain text, RTF and Markdown exports (TextWebExportAdapter).
_TEXT_WEB_FORMATS: tuple[KnownDocumentFormat, ...] = (
    KnownDocumentFormat.html,
    KnownDocumentFormat.htm,
    KnownDocumentFormat.txt,
    KnownDocumentFormat.rtf,
    KnownDocumentFormat.md,
)
# Legacy Office binaries (LegacyOfficeDocumentAdapter).
_LEGACY_OFFICE_FORMATS: tuple[KnownDocumentFormat, ...] = (
    KnownDocumentFormat.doc,
    KnownDocumentFormat.xls,
    KnownDocumentFormat.ppt,
)
# Source-code attachments (CodeFileDocumentAdapter).
_CODE_FORMATS: tuple[KnownDocumentFormat, ...] = (KnownDocumentFormat.python,)
# Raster images (ImageScanDocumentAdapter).
_IMAGE_FORMATS: tuple[KnownDocumentFormat, ...] = (
    KnownDocumentFormat.png,
    KnownDocumentFormat.jpg,
    KnownDocumentFormat.jpeg,
    KnownDocumentFormat.gif,
    KnownDocumentFormat.tif,
    KnownDocumentFormat.tiff,
    KnownDocumentFormat.bmp,
    KnownDocumentFormat.webp,
)
# Geospatial and 3D-model files (GeospatialDocumentAdapter).
_GEOSPATIAL_FORMATS: tuple[KnownDocumentFormat, ...] = (
    KnownDocumentFormat.shp,
    KnownDocumentFormat.shx,
    KnownDocumentFormat.dbf,
    KnownDocumentFormat.prj,
    KnownDocumentFormat.stl,
)
# Audio/video assets (MediaAssetDocumentAdapter).
_MEDIA_FORMATS: tuple[KnownDocumentFormat, ...] = (
    KnownDocumentFormat.wav,
    KnownDocumentFormat.mp3,
    KnownDocumentFormat.mp4,
)
# Archive containers (default ``known_formats`` of ArchiveDocumentSetAdapter).
_ARCHIVE_FORMATS: tuple[KnownDocumentFormat, ...] = (
    KnownDocumentFormat.epub,
    KnownDocumentFormat.zip,
    KnownDocumentFormat.seven_z,
    KnownDocumentFormat.tar,
    KnownDocumentFormat.gz,
)
|
|
102
|
+
|
|
103
|
+
# Maps a file suffix (lowercase, leading dot included) to its known format.
# ``_known_format`` lowercases ``path.suffix`` before looking it up here and
# falls back to ``KnownDocumentFormat.txt`` for suffixes that are absent.
# Only the final suffix is consulted, so "x.tar.gz" resolves through ".gz".
_KNOWN_BY_EXTENSION = {
    ".odt": KnownDocumentFormat.odt,
    ".ods": KnownDocumentFormat.ods,
    ".odp": KnownDocumentFormat.odp,
    ".doc": KnownDocumentFormat.doc,
    ".xls": KnownDocumentFormat.xls,
    ".ppt": KnownDocumentFormat.ppt,
    ".csv": KnownDocumentFormat.csv,
    ".tsv": KnownDocumentFormat.tsv,
    ".xml": KnownDocumentFormat.xml,
    ".rdf": KnownDocumentFormat.rdf,
    ".ttl": KnownDocumentFormat.ttl,
    ".lod": KnownDocumentFormat.lod,
    ".json": KnownDocumentFormat.json,
    ".jsonl": KnownDocumentFormat.jsonl,
    ".yaml": KnownDocumentFormat.yaml,
    ".yml": KnownDocumentFormat.yml,
    ".geojson": KnownDocumentFormat.geojson,
    ".gpx": KnownDocumentFormat.gpx,
    ".kml": KnownDocumentFormat.kml,
    ".fasta": KnownDocumentFormat.fasta,
    ".sgml": KnownDocumentFormat.sgml,
    ".dtd": KnownDocumentFormat.dtd,
    ".hml": KnownDocumentFormat.hml,
    ".etc": KnownDocumentFormat.etc,
    # The enum member is named ``python`` although the suffix is ".py".
    ".py": KnownDocumentFormat.python,
    ".html": KnownDocumentFormat.html,
    ".htm": KnownDocumentFormat.htm,
    ".txt": KnownDocumentFormat.txt,
    ".rtf": KnownDocumentFormat.rtf,
    ".md": KnownDocumentFormat.md,
    ".png": KnownDocumentFormat.png,
    ".jpg": KnownDocumentFormat.jpg,
    ".jpeg": KnownDocumentFormat.jpeg,
    ".gif": KnownDocumentFormat.gif,
    ".tif": KnownDocumentFormat.tif,
    ".tiff": KnownDocumentFormat.tiff,
    ".bmp": KnownDocumentFormat.bmp,
    ".webp": KnownDocumentFormat.webp,
    ".shp": KnownDocumentFormat.shp,
    ".shx": KnownDocumentFormat.shx,
    ".dbf": KnownDocumentFormat.dbf,
    ".prj": KnownDocumentFormat.prj,
    ".stl": KnownDocumentFormat.stl,
    ".wav": KnownDocumentFormat.wav,
    ".mp3": KnownDocumentFormat.mp3,
    ".mp4": KnownDocumentFormat.mp4,
    ".epub": KnownDocumentFormat.epub,
    ".zip": KnownDocumentFormat.zip,
    # The enum member is named ``seven_z`` although the suffix is ".7z".
    ".7z": KnownDocumentFormat.seven_z,
    ".tar": KnownDocumentFormat.tar,
    ".gz": KnownDocumentFormat.gz,
}
|
|
156
|
+
|
|
157
|
+
# Content type reported for each raster format by ImageScanDocumentAdapter.
# Formats missing from this table are reported there as "image/unknown".
_IMAGE_MIME = {
    KnownDocumentFormat.png: "image/png",
    KnownDocumentFormat.jpg: "image/jpeg",
    KnownDocumentFormat.jpeg: "image/jpeg",
    KnownDocumentFormat.gif: "image/gif",
    KnownDocumentFormat.tif: "image/tiff",
    KnownDocumentFormat.tiff: "image/tiff",
    KnownDocumentFormat.bmp: "image/bmp",
    KnownDocumentFormat.webp: "image/webp",
}
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
class _KnownOnlyAdapterBase:
    """Common surface shared by the passive, known-only adapters.

    Subclasses provide ``adapter_id`` and ``known_formats``;
    ``promoted_formats`` defaults to an empty tuple.
    """

    adapter_id: str
    known_formats: tuple[KnownDocumentFormat, ...]
    promoted_formats: tuple[DocumentFormat, ...] = ()

    @property
    def engine_id(self) -> str:
        """Expose ``adapter_id`` under the name used for diagnostics."""
        identifier = self.adapter_id
        return identifier

    def normalize_fill_patches(
        self,
        patches: tuple[DocumentFieldPatch, ...],
        *,
        extraction: DocumentExtraction | None,
    ) -> tuple[DocumentFieldPatch, ...]:
        """Hand the patches back untouched.

        Passive adapters do not promote writes, so there is nothing to
        normalize; ``extraction`` is accepted only for interface parity.
        """
        del extraction  # intentionally unused
        return patches
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
class OdfDocumentAdapter(_KnownOnlyAdapterBase):
    """Read-only ODF package candidate backed by ZIP/XML inspection."""

    adapter_id = "odf-package-read-only-adapter"
    known_formats = _ODF_FORMATS

    def inspect(self, path: Path, *, artifact_id: str) -> DocumentExtraction:
        """Extract ODF package text from content.xml without claiming mutation.

        Args:
            path: Location of the ODF package on disk.
            artifact_id: Identifier copied onto the extraction and used as the
                prefix of every paragraph block id.

        Returns:
            An extraction whose paragraphs come from the text nodes of
            ``content.xml``. A package that cannot be opened as ZIP, lacks
            ``content.xml``, or carries a malformed ``content.xml`` yields no
            paragraphs and a warning instead of an exception.
        """
        known_format = _known_format(path)
        paragraphs: list[ParagraphBlock] = []
        warnings: list[str] = []
        metadata: dict[str, MetadataValue] = _base_metadata(
            path,
            known_format=known_format,
            adapter_id=self.adapter_id,
            mutation_policy="read_only_odf_candidate",
        )
        try:
            with zipfile.ZipFile(path) as archive:
                member_names = archive.namelist()
                metadata["package_entry_count"] = len(member_names)
                if "content.xml" in member_names:
                    root = ElementTree.fromstring(archive.read("content.xml"))
                    paragraphs = _paragraphs_from_text_lines(
                        artifact_id,
                        _xml_text_lines(root),
                        source_prefix="content.xml",
                    )
                else:
                    warnings.append("ODF package does not contain content.xml.")
        except zipfile.BadZipFile:
            warnings.append("ODF read-only candidate could not open the package as ZIP.")
        except StdElementTree.ParseError:
            # Treat a damaged content.xml like a damaged package: report it,
            # keep the metadata gathered so far, and return no paragraphs.
            warnings.append("ODF read-only candidate could not parse content.xml as XML.")

        return DocumentExtraction(
            artifact_id=artifact_id,
            paragraphs=paragraphs,
            metadata=metadata,
            warnings=warnings,
        )
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
class DataFileDocumentAdapter(_KnownOnlyAdapterBase):
    """Read-only adapter for data files that records serializer round-trip evidence."""

    adapter_id = "data-file-read-only-adapter"
    known_formats = _DATA_FORMATS

    def inspect(self, path: Path, *, artifact_id: str) -> DocumentExtraction:
        """Route the file to the parser for its format family.

        The format is derived from the path suffix. When no family matches,
        a metadata-only extraction with a warning is returned.
        """
        detected = _known_format(path)
        # Ordered (family, handler) pairs; the first family containing the
        # detected format handles the file.
        routes = (
            (
                (KnownDocumentFormat.csv, KnownDocumentFormat.tsv),
                _inspect_delimited,
            ),
            (
                (
                    KnownDocumentFormat.json,
                    KnownDocumentFormat.jsonl,
                    KnownDocumentFormat.geojson,
                ),
                _inspect_json,
            ),
            (
                (KnownDocumentFormat.yaml, KnownDocumentFormat.yml),
                _inspect_yaml,
            ),
            (
                (
                    KnownDocumentFormat.xml,
                    KnownDocumentFormat.rdf,
                    KnownDocumentFormat.gpx,
                    KnownDocumentFormat.kml,
                    KnownDocumentFormat.hml,
                ),
                _inspect_xml,
            ),
            (
                (
                    KnownDocumentFormat.ttl,
                    KnownDocumentFormat.lod,
                    KnownDocumentFormat.fasta,
                    KnownDocumentFormat.sgml,
                    KnownDocumentFormat.dtd,
                    KnownDocumentFormat.etc,
                ),
                _inspect_text_data,
            ),
        )
        for family, handler in routes:
            if detected in family:
                return handler(path, artifact_id=artifact_id, known_format=detected)

        fallback_metadata = _base_metadata(
            path,
            known_format=detected,
            adapter_id=self.adapter_id,
            mutation_policy="read_only_data_file",
        )
        return DocumentExtraction(
            artifact_id=artifact_id,
            metadata=fallback_metadata,
            warnings=[f"No passive data parser is implemented for {detected.value}."],
        )
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
class LegacyOfficeDocumentAdapter(_KnownOnlyAdapterBase):
    """Adapter for pre-OOXML Office binaries that reports metadata only."""

    adapter_id = "legacy-office-metadata-only-adapter"
    known_formats = _LEGACY_OFFICE_FORMATS

    def inspect(self, path: Path, *, artifact_id: str) -> DocumentExtraction:
        """Describe a legacy Office file without reading its binary contents."""
        details = _base_metadata(
            path,
            known_format=_known_format(path),
            adapter_id=self.adapter_id,
            mutation_policy="conversion_required_legacy_office",
        )
        notice = (
            "Legacy Office binary inspection is metadata-only until an explicit local "
            "conversion bridge is approved."
        )
        return DocumentExtraction(
            artifact_id=artifact_id,
            metadata=details,
            warnings=[notice],
        )
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
class TextWebExportAdapter(_KnownOnlyAdapterBase):
    """Read-only adapter for HTML, plain-text, RTF, and Markdown exports."""

    adapter_id = "text-web-export-read-only-adapter"
    known_formats = _TEXT_WEB_FORMATS

    def inspect(self, path: Path, *, artifact_id: str) -> DocumentExtraction:
        """Turn the file's text into one paragraph block per non-empty line.

        HTML/HTM goes through the HTML text extractor. Every other format is
        passed through the minimal RTF stripper and then split into lines.
        Undecodable bytes are replaced rather than raising.
        """
        detected = _known_format(path)
        raw_text = path.read_text(encoding="utf-8", errors="replace")
        if detected in {KnownDocumentFormat.html, KnownDocumentFormat.htm}:
            text_lines = _html_text_lines(raw_text)
        else:
            text_lines = _plain_text_lines(_strip_minimal_rtf(raw_text))
        blocks = _paragraphs_from_text_lines(
            artifact_id,
            text_lines,
            source_prefix=path.name,
        )
        details = _base_metadata(
            path,
            known_format=detected,
            adapter_id=self.adapter_id,
            mutation_policy="read_only_text_export",
        )
        return DocumentExtraction(
            artifact_id=artifact_id,
            paragraphs=blocks,
            metadata=details,
        )
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
class CodeFileDocumentAdapter(_KnownOnlyAdapterBase):
    """Read-only adapter for source-code attachments."""

    adapter_id = "code-file-read-only-adapter"
    known_formats = _CODE_FORMATS

    def inspect(self, path: Path, *, artifact_id: str) -> DocumentExtraction:
        """Return up to the first 200 non-empty source lines as paragraph blocks."""
        source_text = path.read_text(encoding="utf-8", errors="replace")
        preview = _plain_text_lines(source_text)[:200]
        blocks = _paragraphs_from_text_lines(
            artifact_id,
            preview,
            source_prefix=path.name,
        )
        details = _base_metadata(
            path,
            known_format=_known_format(path),
            adapter_id=self.adapter_id,
            mutation_policy="read_only_code_file",
        )
        return DocumentExtraction(
            artifact_id=artifact_id,
            paragraphs=blocks,
            metadata=details,
            warnings=["Code files are not public-form documents and cannot be mutated here."],
        )
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
class ImageScanDocumentAdapter(_KnownOnlyAdapterBase):
    """Adapter for raster images and scans that only references the file."""

    adapter_id = "image-scan-extraction-only-adapter"
    known_formats = _IMAGE_FORMATS

    def inspect(self, path: Path, *, artifact_id: str) -> DocumentExtraction:
        """Emit a single image reference for the file; no OCR, no writes."""
        detected = _known_format(path)
        reference = ImageReference(
            image_id=f"image-{detected.value}",
            source_path=str(path),
            # Formats without a table entry are reported as "image/unknown".
            content_type=_IMAGE_MIME.get(detected, "image/unknown"),
        )
        details = _base_metadata(
            path,
            known_format=detected,
            adapter_id=self.adapter_id,
            mutation_policy="extraction_only",
        )
        return DocumentExtraction(
            artifact_id=artifact_id,
            images=[reference],
            metadata=details,
            warnings=["Image scan adapter does not mutate raster originals."],
        )
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
class GeospatialDocumentAdapter(_KnownOnlyAdapterBase):
    """Adapter for geospatial and 3D-model files that reports metadata only."""

    adapter_id = "geospatial-metadata-only-adapter"
    known_formats = _GEOSPATIAL_FORMATS

    def inspect(self, path: Path, *, artifact_id: str) -> DocumentExtraction:
        """Classify the asset and, for PRJ/STL only, preview its first 40 text lines."""
        detected = _known_format(path)
        preview: list[ParagraphBlock] = []
        # Only PRJ and STL are read as text; the other formats stay unread.
        if detected in {KnownDocumentFormat.prj, KnownDocumentFormat.stl}:
            decoded = path.read_text(encoding="utf-8", errors="replace")
            preview = _paragraphs_from_text_lines(
                artifact_id,
                _plain_text_lines(decoded)[:40],
                source_prefix=path.name,
            )
        details = _base_metadata(
            path,
            known_format=detected,
            adapter_id=self.adapter_id,
            mutation_policy="metadata_only_geospatial_asset",
        )
        notice = (
            "Geospatial and 3D geometry files are classified for routing, not mutated "
            "as public documents."
        )
        return DocumentExtraction(
            artifact_id=artifact_id,
            paragraphs=preview,
            metadata=details,
            warnings=[notice],
        )
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
class MediaAssetDocumentAdapter(_KnownOnlyAdapterBase):
    """Adapter for audio/video files that reports metadata only."""

    adapter_id = "media-asset-metadata-only-adapter"
    known_formats = _MEDIA_FORMATS

    def inspect(self, path: Path, *, artifact_id: str) -> DocumentExtraction:
        """Describe a media attachment without transcribing or reading its content."""
        details = _base_metadata(
            path,
            known_format=_known_format(path),
            adapter_id=self.adapter_id,
            mutation_policy="metadata_only_media_asset",
        )
        notice = (
            "Media files need a dedicated transcription or extraction adapter before "
            "content can be written into a public document derivative."
        )
        return DocumentExtraction(
            artifact_id=artifact_id,
            metadata=details,
            warnings=[notice],
        )
|
|
444
|
+
|
|
445
|
+
|
|
446
|
+
class ArchiveDocumentSetAdapter(_KnownOnlyAdapterBase):
    """Read-only adapter that lists archive members for child routing."""

    adapter_id = "archive-document-set-read-only-adapter"
    known_formats = _ARCHIVE_FORMATS

    def __init__(
        self,
        known_formats: tuple[KnownDocumentFormat, ...] | None = None,
    ) -> None:
        """Optionally narrow the handled formats; a falsy value keeps the full set."""
        if known_formats:
            self.known_formats = known_formats
        else:
            self.known_formats = _ARCHIVE_FORMATS

    def inspect(self, path: Path, *, artifact_id: str) -> DocumentExtraction:
        """List the archive's member names as paragraphs, one block per name."""
        detected = _known_format(path)
        member_names, notes = _archive_member_names(path, known_format=detected)
        details = _base_metadata(
            path,
            known_format=detected,
            adapter_id=self.adapter_id,
            mutation_policy="archive_read_only",
        )
        details.update(
            entry_count=len(member_names),
            child_mutation_policy="route_children_as_derivatives",
        )
        listing = _paragraphs_from_text_lines(
            artifact_id,
            member_names,
            source_prefix=path.name,
        )
        return DocumentExtraction(
            artifact_id=artifact_id,
            paragraphs=listing,
            metadata=details,
            warnings=notes,
        )
|
|
480
|
+
|
|
481
|
+
|
|
482
|
+
def _inspect_delimited(
    path: Path,
    *,
    artifact_id: str,
    known_format: KnownDocumentFormat,
) -> DocumentExtraction:
    """Parse a CSV/TSV file into one table plus one paragraph per row.

    TSV uses a tab delimiter; every other format uses a comma. The rows are
    written back out and re-read, and ``round_trip_passed`` records whether
    the re-read rows equal the first parse. Paragraph text always joins cells
    with a comma, regardless of the file's delimiter.
    """
    if known_format is KnownDocumentFormat.tsv:
        delimiter = "\t"
    else:
        delimiter = ","

    def parse(text: str) -> list[list[str]]:
        return list(csv.reader(io.StringIO(text), delimiter=delimiter))

    # utf-8-sig discards a leading BOM when one is present.
    rows = parse(path.read_text(encoding="utf-8-sig", errors="replace"))
    buffer = io.StringIO()
    csv.writer(buffer, delimiter=delimiter, lineterminator="\n").writerows(rows)
    reread = parse(buffer.getvalue())

    details = _base_metadata(
        path,
        known_format=known_format,
        adapter_id=DataFileDocumentAdapter.adapter_id,
        mutation_policy="read_only_data_file",
    )
    details["serializer"] = known_format.value
    details["round_trip_passed"] = rows == reread
    details["row_count"] = len(rows)
    details["column_count"] = max(map(len, rows), default=0)

    table = _table_from_rows(rows, source_path=path.name)
    row_blocks = _paragraphs_from_text_lines(
        artifact_id,
        [",".join(cells) for cells in rows],
        source_prefix=path.name,
    )
    return DocumentExtraction(
        artifact_id=artifact_id,
        tables=[table],
        paragraphs=row_blocks,
        metadata=details,
    )
|
|
519
|
+
|
|
520
|
+
|
|
521
|
+
def _inspect_json(
    path: Path,
    *,
    artifact_id: str,
    known_format: KnownDocumentFormat,
) -> DocumentExtraction:
    """Parse a JSON, JSON Lines, or GeoJSON file and record round-trip evidence.

    Args:
        path: File to read.
        artifact_id: Identifier forwarded to the resulting extraction.
        known_format: ``jsonl`` selects line-by-line parsing; any other value
            parses the whole payload as a single JSON document.

    Returns:
        The extraction built by ``_structured_data_extraction``, with
        ``round_trip_passed`` set when dumping and re-loading the parsed value
        reproduces it.

    Raises:
        json.JSONDecodeError: If the payload, or any JSONL line, is not valid JSON.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # utf-8-sig drops a leading byte-order mark, which json.loads rejects.
    payload = path.read_text(encoding="utf-8-sig")
    if known_format is KnownDocumentFormat.jsonl:
        # Split on "\n" only. str.splitlines() also breaks on U+2028, U+2029
        # and U+0085, which may legally appear unescaped inside JSON strings
        # and would cut a record in half.
        values = [json.loads(line) for line in payload.split("\n") if line.strip()]
        serialized = "\n".join(
            json.dumps(value, ensure_ascii=False, sort_keys=True) for value in values
        )
        # ``if line`` keeps the empty-file case at [] instead of parsing "".
        reparsed: object = [json.loads(line) for line in serialized.split("\n") if line]
        parsed: object = values
    else:
        parsed = json.loads(payload)
        serialized = json.dumps(parsed, ensure_ascii=False, sort_keys=True)
        reparsed = json.loads(serialized)
    return _structured_data_extraction(
        artifact_id,
        path=path,
        known_format=known_format,
        serializer=known_format.value,
        parsed=parsed,
        round_trip_passed=parsed == reparsed,
    )
|
|
547
|
+
|
|
548
|
+
|
|
549
|
+
def _inspect_yaml(
    path: Path,
    *,
    artifact_id: str,
    known_format: KnownDocumentFormat,
) -> DocumentExtraction:
    """Load a YAML file with the safe loader and record round-trip evidence.

    The parsed value is dumped with the safe dumper and loaded again;
    ``round_trip_passed`` records whether the second load equals the first.
    """
    source_text = path.read_text(encoding="utf-8")
    parsed = yaml.safe_load(source_text)
    dumped = yaml.safe_dump(parsed, allow_unicode=True, sort_keys=True)
    survived = parsed == yaml.safe_load(dumped)
    return _structured_data_extraction(
        artifact_id,
        path=path,
        known_format=known_format,
        serializer="yaml.safe_load/safe_dump",
        parsed=parsed,
        round_trip_passed=survived,
    )
|
|
565
|
+
|
|
566
|
+
|
|
567
|
+
def _inspect_xml(
    path: Path,
    *,
    artifact_id: str,
    known_format: KnownDocumentFormat,
) -> DocumentExtraction:
    """Parse an XML-family file and summarize its root tag and text.

    The tree is serialized and parsed again; ``round_trip_passed`` compares
    only the namespace-stripped root tag names of the two trees.
    """
    root = ElementTree.fromstring(path.read_bytes())
    as_text = StdElementTree.tostring(root, encoding="unicode")
    second_root = ElementTree.fromstring(as_text.encode("utf-8"))
    root_name = _local_name(root.tag)
    summary = {"root_tag": root_name, "text": " ".join(_xml_text_lines(root))}
    return _structured_data_extraction(
        artifact_id,
        path=path,
        known_format=known_format,
        serializer="defusedxml.ElementTree",
        parsed=summary,
        round_trip_passed=root_name == _local_name(second_root.tag),
    )
|
|
584
|
+
|
|
585
|
+
|
|
586
|
+
def _inspect_text_data(
    path: Path,
    *,
    artifact_id: str,
    known_format: KnownDocumentFormat,
) -> DocumentExtraction:
    """Preview a text-based data file as up to 200 non-empty lines.

    No real serializer is involved, so ``round_trip_passed`` is always True.
    ``line_count`` counts the preview lines, i.e. it is capped at 200.
    """
    decoded = path.read_text(encoding="utf-8", errors="replace")
    preview = _plain_text_lines(decoded)[:200]
    details = _base_metadata(
        path,
        known_format=known_format,
        adapter_id=DataFileDocumentAdapter.adapter_id,
        mutation_policy="read_only_data_file",
    )
    details.update(
        serializer="plain-text-preview",
        round_trip_passed=True,
        line_count=len(preview),
    )
    blocks = _paragraphs_from_text_lines(
        artifact_id,
        preview,
        source_prefix=path.name,
    )
    return DocumentExtraction(
        artifact_id=artifact_id,
        paragraphs=blocks,
        metadata=details,
    )
|
|
612
|
+
|
|
613
|
+
|
|
614
|
+
def _structured_data_extraction(
    artifact_id: str,
    *,
    path: Path,
    known_format: KnownDocumentFormat,
    serializer: str,
    parsed: object,
    round_trip_passed: bool,
) -> DocumentExtraction:
    """Wrap an already-parsed value in a read-only data-file extraction.

    The metadata records the serializer label, the caller-supplied round-trip
    verdict, and the Python type name of ``parsed``; the paragraphs are the
    preview lines derived from ``parsed``.
    """
    details = _base_metadata(
        path,
        known_format=known_format,
        adapter_id=DataFileDocumentAdapter.adapter_id,
        mutation_policy="read_only_data_file",
    )
    details["serializer"] = serializer
    details["round_trip_passed"] = round_trip_passed
    details["root_type"] = type(parsed).__name__
    blocks = _paragraphs_from_text_lines(
        artifact_id,
        _structured_preview_lines(parsed),
        source_prefix=path.name,
    )
    return DocumentExtraction(
        artifact_id=artifact_id,
        paragraphs=blocks,
        metadata=details,
    )
|
|
646
|
+
|
|
647
|
+
|
|
648
|
+
def _table_from_rows(rows: list[list[str]], *, source_path: str) -> TableBlock:
    """Build the single data table for a delimited file.

    Cell indices are zero-based; the ``#r..c..`` suffix in each cell's
    ``source_path`` is one-based.
    """
    grid = [
        TableCell(
            row_index=r,
            column_index=c,
            text=cell_text,
            source_path=f"{source_path}#r{r + 1}c{c + 1}",
        )
        for r, cells in enumerate(rows)
        for c, cell_text in enumerate(cells)
    ]
    return TableBlock(block_id="data-table-001", source_path=source_path, cells=grid)
|
|
661
|
+
|
|
662
|
+
|
|
663
|
+
def _archive_member_names(
    path: Path,
    *,
    known_format: KnownDocumentFormat,
) -> tuple[list[str], list[str]]:
    """List the member names of an archive without extracting anything.

    Args:
        path: Archive file on disk.
        known_format: Container type that selects the enumerator.

    Returns:
        ``(names, warnings)``. ZIP, EPUB and TAR return their filtered,
        sorted member names. Gzip returns one name: the file name with its
        final suffix removed. 7z and any other format return no names and a
        warning explaining why.

    Raises:
        zipfile.BadZipFile, tarfile.ReadError, gzip.BadGzipFile, OSError:
            Propagated unchanged when the file is not a readable archive of
            the claimed type.
    """
    # EPUB is a ZIP-based container, so it shares the ZIP enumerator instead
    # of falling through to the 7z warning.
    if known_format in (KnownDocumentFormat.zip, KnownDocumentFormat.epub):
        with zipfile.ZipFile(path) as archive:
            return _safe_member_names(archive.namelist()), []
    if known_format is KnownDocumentFormat.tar:
        with tarfile.open(path) as archive:
            return _safe_member_names(archive.getnames()), []
    if known_format is KnownDocumentFormat.gz:
        # Read one byte so a file that is not valid gzip fails here.
        with gzip.open(path) as payload:
            payload.read(1)
        return [path.with_suffix("").name or path.name], [
            "Gzip payload is treated as one compressed child candidate."
        ]
    if known_format is KnownDocumentFormat.seven_z:
        return [], ["7z archive enumeration is known but not promoted without a 7z runtime."]
    return [], [f"No archive enumerator is implemented for {known_format.value}."]
|
|
681
|
+
|
|
682
|
+
|
|
683
|
+
def _safe_member_names(names: list[str]) -> list[str]:
    """Return the sorted member names that cannot escape the archive root.

    A name is dropped when it is empty, is rooted (starts with ``/`` or
    ``\\``), or has a ``..`` path component. Backslashes count as separators
    so Windows-style traversal is caught too. Names that merely contain two
    dots inside a component, such as ``notes..v2.txt``, are kept.
    """

    def is_safe(name: str) -> bool:
        if not name:
            return False
        normalized = name.replace("\\", "/")
        if normalized.startswith("/"):
            return False
        # Compare whole components so "a..b" is not mistaken for traversal.
        return ".." not in normalized.split("/")

    return sorted(name for name in names if is_safe(name))
|
|
685
|
+
|
|
686
|
+
|
|
687
|
+
def _html_text_lines(payload: str) -> list[str]:
    """Return the stripped, non-empty text chunks found in an HTML payload."""
    parser = _VisibleTextParser()
    parser.feed(payload)
    # close() flushes text the parser holds back while it waits for more
    # input, e.g. a trailing chunk that ends with an unterminated "&".
    parser.close()
    return parser.lines
|
|
691
|
+
|
|
692
|
+
|
|
693
|
+
def _strip_minimal_rtf(payload: str) -> str:
    """Apply a minimal RTF clean-up: paragraph marks to newlines, braces removed.

    Payloads that do not start (after leading whitespace) with ``{\\rtf`` are
    returned unchanged. Only the exact control word ``\\par`` becomes a
    newline; longer control words that start with the same letters, such as
    ``\\pard``, are left intact. All other control words stay in the text.
    """
    if not payload.lstrip().startswith("{\\rtf"):
        return payload
    head, *tail = payload.split("\\par")
    pieces = [head]
    for piece in tail:
        following = piece[:1]
        if following.isascii() and following.isalpha():
            # "\par" was only the prefix of a longer control word; restore it.
            pieces.append("\\par" + piece)
        else:
            pieces.append("\n" + piece)
    return "".join(pieces).translate({ord("{"): None, ord("}"): None})
|
|
698
|
+
|
|
699
|
+
|
|
700
|
+
def _plain_text_lines(payload: str) -> list[str]:
    """Split text into lines, trim each one, and drop those left empty."""
    trimmed = (raw.strip() for raw in payload.splitlines())
    return [line for line in trimmed if line]
|
|
702
|
+
|
|
703
|
+
|
|
704
|
+
def _xml_text_lines(root: StdElementTree.Element) -> list[str]:
    """Collect the trimmed, non-blank text fragments under ``root`` in document order."""
    collected: list[str] = []
    for fragment in root.itertext():
        trimmed = fragment.strip() if fragment else ""
        if trimmed:
            collected.append(trimmed)
    return collected
|
|
706
|
+
|
|
707
|
+
|
|
708
|
+
def _structured_preview_lines(value: object) -> list[str]:
    """Render a short, line-oriented preview of parsed structured data.

    A dict yields ``"key: value"`` for its first 20 items. A list yields one
    JSON rendering for each of its first 20 items. Anything else yields a
    single ``str()`` line.

    List items are not guaranteed to be JSON-serializable: YAML can produce
    dates, sets, mixed-type mapping keys, or self-referencing structures.
    Such values are stringified rather than raising.
    """

    def render(item: object) -> str:
        try:
            # default=str is deliberate: this is a human-facing preview, not
            # a serialization that must round-trip.
            return json.dumps(item, ensure_ascii=False, sort_keys=True, default=str)
        except (TypeError, ValueError):
            # TypeError: unsortable or unsupported mapping keys.
            # ValueError: circular reference.
            return str(item)

    if isinstance(value, dict):
        return [f"{key}: {preview}" for key, preview in list(value.items())[:20]]
    if isinstance(value, list):
        return [render(item) for item in value[:20]]
    return [str(value)]
|
|
714
|
+
|
|
715
|
+
|
|
716
|
+
def _paragraphs_from_text_lines(
    artifact_id: str,
    lines: list[str],
    *,
    source_prefix: str,
) -> list[ParagraphBlock]:
    """Convert text lines into paragraph blocks, skipping empty lines.

    Numbering is one-based and follows the position in ``lines``, so a
    skipped empty line still consumes its number.
    """
    blocks: list[ParagraphBlock] = []
    for position, content in enumerate(lines, start=1):
        if not content:
            continue
        blocks.append(
            ParagraphBlock(
                block_id=f"{artifact_id}-line-{position:03d}",
                text=content,
                source_path=f"{source_prefix}#line[{position}]",
            )
        )
    return blocks
|
|
731
|
+
|
|
732
|
+
|
|
733
|
+
def _base_metadata(
    path: Path,
    *,
    known_format: KnownDocumentFormat,
    adapter_id: str,
    mutation_policy: str,
) -> dict[str, MetadataValue]:
    """Build the metadata entries every passive adapter reports.

    ``byte_size`` comes from ``path.stat()``, so a missing file raises here.
    """
    details: dict[str, MetadataValue] = {"adapter_id": adapter_id}
    details["known_format"] = known_format.value
    details["mutation_policy"] = mutation_policy
    details["byte_size"] = path.stat().st_size
    return details
|
|
746
|
+
|
|
747
|
+
|
|
748
|
+
def _known_format(path: Path) -> KnownDocumentFormat:
    """Resolve the known format from the path's final suffix, case-insensitively.

    Suffixes missing from the extension table resolve to plain text.
    """
    suffix = path.suffix.lower()
    try:
        return _KNOWN_BY_EXTENSION[suffix]
    except KeyError:
        return KnownDocumentFormat.txt
|
|
750
|
+
|
|
751
|
+
|
|
752
|
+
def _local_name(tag: str) -> str:
    """Strip a ``{namespace}`` prefix from an element tag, if present."""
    # rpartition returns the whole tag as the last item when "}" is absent.
    return tag.rpartition("}")[2]
|
|
754
|
+
|
|
755
|
+
|
|
756
|
+
class _VisibleTextParser(HTMLParser):
    """Small HTML text extractor for passive public-form exports.

    Collects stripped, non-empty text chunks in ``lines``. Text inside
    ``<script>`` and ``<style>`` elements is skipped, because HTMLParser
    reports their bodies through ``handle_data`` as well.
    """

    _HIDDEN_TAGS = frozenset({"script", "style"})

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self.lines: list[str] = []
        # Number of currently open hidden elements; text is dropped while > 0.
        self._hidden_depth = 0

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Start suppressing text when a hidden element opens."""
        if tag in self._HIDDEN_TAGS:
            self._hidden_depth += 1

    def handle_endtag(self, tag: str) -> None:
        """Stop suppressing text when a hidden element closes."""
        # The depth check ignores a stray closing tag that has no opener.
        if tag in self._HIDDEN_TAGS and self._hidden_depth:
            self._hidden_depth -= 1

    def handle_data(self, data: str) -> None:
        """Record a text chunk unless it is hidden or blank."""
        if self._hidden_depth:
            return
        text = data.strip()
        if text:
            self.lines.append(text)
|