@ngocsangairvds/vsaf 3.1.27 → 3.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/src/global.js +65 -39
- package/tools/skills/vds-scripts-skill/.openskills.json +6 -0
- package/tools/skills/vds-scripts-skill/QUALITY.md +44 -0
- package/tools/skills/vds-scripts-skill/SKILL.md +135 -0
- package/tools/skills/vds-scripts-skill/references/audit-commands.md +171 -0
- package/tools/skills/vds-scripts-skill/references/capability-index.md +34 -0
- package/tools/skills/vds-scripts-skill/references/development-commands.md +12 -0
- package/tools/skills/vds-scripts-skill/references/google-sheets.md +73 -0
- package/tools/skills/vds-scripts-skill/references/integration-commands.md +17 -0
- package/tools/skills/vds-scripts-skill/references/platform-bootstrap.md +31 -0
- package/tools/skills/vds-scripts-skill/references/specialist-routing.md +14 -0
- package/tools/skills/vds-scripts-skill/references/validation-commands.md +15 -0
- package/tools/skills/vsaf-build/SKILL.md +32 -2
- package/tools/skills/vsaf-ship/SKILL.md +41 -10
- package/tools/skills/vsaf-test/SKILL.md +8 -0
- package/tools/vds-scripts/.mcp.json +11 -0
- package/tools/vds-scripts/.secrets.baseline +133 -0
- package/tools/vds-scripts/AGENTS.md +152 -0
- package/tools/vds-scripts/CLAUDE.md +101 -0
- package/tools/vds-scripts/CLI_COMMAND_OPTIMIZATION.md +156 -0
- package/tools/vds-scripts/PACKAGE_P125B_IMPLEMENTATION_SUMMARY.md +131 -0
- package/tools/vds-scripts/PROJECT_COMPLETION_SUMMARY.md +45 -0
- package/tools/vds-scripts/README.md +97 -0
- package/tools/vds-scripts/bitbucket_manifest_mapping.toml +34 -0
- package/tools/vds-scripts/bitbucket_orchestrator/ARCHITECTURE_ANALYSIS.md +258 -0
- package/tools/vds-scripts/bitbucket_orchestrator/BITBUCKET_API_PRACTICES.md +393 -0
- package/tools/vds-scripts/bitbucket_orchestrator/EVALUATION_REPORT.md +61 -0
- package/tools/vds-scripts/bitbucket_orchestrator/FEATURES.md +908 -0
- package/tools/vds-scripts/bitbucket_orchestrator/README.md +687 -0
- package/tools/vds-scripts/bitbucket_orchestrator/pyproject.toml +40 -0
- package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/__init__.py +20 -0
- package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/async_client.py +657 -0
- package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/cli.py +2108 -0
- package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/client.py +2534 -0
- package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/config.py +171 -0
- package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/errors.py +67 -0
- package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/factory.py +185 -0
- package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/protocols.py +244 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/__init__.py +8 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/conftest.py +65 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_advanced_search.py +151 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_async_client.py +546 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_branch_permissions.py +145 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_cli.py +115 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client.py +157 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_branch_conditions.py +79 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_code_advanced.py +163 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_code_file.py +32 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_deployment_environments.py +194 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_issues.py +164 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_pipelines_advanced.py +179 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_pr_blockers.py +119 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_repository_variables.py +156 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_code.py +98 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_code_advanced.py +282 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_code_insights.py +335 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_conditions.py +147 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_config.py +131 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_deployment_env.py +352 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_factory.py +371 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_fork_operations.py +204 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_issue_cli.py +261 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_pipeline_advanced.py +270 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_pr_blocker.py +204 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_protocols.py +334 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_repo_settings.py +343 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_repo_variables.py +270 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_webhooks.py +189 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_workspace.py +233 -0
- package/tools/vds-scripts/bitbucket_orchestrator/uv.lock +742 -0
- package/tools/vds-scripts/confluence_orchestrator/Dockerfile +19 -0
- package/tools/vds-scripts/confluence_orchestrator/README.md +412 -0
- package/tools/vds-scripts/confluence_orchestrator/SYNC_SCRIPTS.md +127 -0
- package/tools/vds-scripts/confluence_orchestrator/SYNC_STANDARDIZATION.md +108 -0
- package/tools/vds-scripts/confluence_orchestrator/pyproject.toml +48 -0
- package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/__init__.py +20 -0
- package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/cli.py +2532 -0
- package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/config.py +175 -0
- package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/content.py +290 -0
- package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/content_v2.py +94 -0
- package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/crawl_tree.py +1835 -0
- package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/errors.py +80 -0
- package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/eventing.py +109 -0
- package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/http.py +1114 -0
- package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/orchestration.py +165 -0
- package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/reporting.py +78 -0
- package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/tree.py +121 -0
- package/tools/vds-scripts/confluence_orchestrator/sync_pdfs_from_markdown.py +213 -0
- package/tools/vds-scripts/confluence_orchestrator/sync_pdfs_to_confluence.py +305 -0
- package/tools/vds-scripts/confluence_orchestrator/sync_png_attachments.py +305 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/__init__.py +0 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/conftest.py +8 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_advanced_content.py +224 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_advanced_search.py +188 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_cache_management.py +247 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_cli.py +499 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_config.py +83 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_content.py +186 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_content_flags.py +27 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_crawl_tree.py +2250 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_draft_management.py +223 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_eventing.py +71 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_eventing_chaos.py +37 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_eventing_rate_limit.py +44 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_eventing_timeout.py +49 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_export.py +230 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_history.py +204 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_http.py +117 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_orchestration.py +91 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_reporting.py +24 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_search_cql.py +34 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_space_management.py +237 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_space_permissions.py +332 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_user_group_management.py +388 -0
- package/tools/vds-scripts/confluence_orchestrator/uv.lock +1023 -0
- package/tools/vds-scripts/git_orchestrator/ENHANCEMENT_SUMMARY.md +119 -0
- package/tools/vds-scripts/git_orchestrator/README.md +280 -0
- package/tools/vds-scripts/git_orchestrator/VERIFICATION_REPORT.md +152 -0
- package/tools/vds-scripts/git_orchestrator/pyproject.toml +35 -0
- package/tools/vds-scripts/git_orchestrator/src/vds_git_orchestrator/__init__.py +7 -0
- package/tools/vds-scripts/git_orchestrator/src/vds_git_orchestrator/__main__.py +4 -0
- package/tools/vds-scripts/git_orchestrator/src/vds_git_orchestrator/cli.py +847 -0
- package/tools/vds-scripts/git_orchestrator/src/vds_git_orchestrator/logging_config.py +63 -0
- package/tools/vds-scripts/git_orchestrator/src/vds_git_orchestrator/manifest.py +129 -0
- package/tools/vds-scripts/git_orchestrator/src/vds_git_orchestrator/orchestrator.py +819 -0
- package/tools/vds-scripts/git_orchestrator/src/vds_git_orchestrator/reporting.py +53 -0
- package/tools/vds-scripts/git_orchestrator/tests/__init__.py +0 -0
- package/tools/vds-scripts/git_orchestrator/tests/test_cli_settings.py +21 -0
- package/tools/vds-scripts/git_orchestrator/tests/test_integration.py +74 -0
- package/tools/vds-scripts/git_orchestrator/tests/test_manifest.py +79 -0
- package/tools/vds-scripts/git_orchestrator/tests/test_orchestrator.py +204 -0
- package/tools/vds-scripts/git_orchestrator/tests/test_public_api.py +236 -0
- package/tools/vds-scripts/git_orchestrator/tests/test_resilience.py +345 -0
- package/tools/vds-scripts/git_orchestrator/uv.lock +271 -0
- package/tools/vds-scripts/jira_orchestrator/README.md +770 -0
- package/tools/vds-scripts/jira_orchestrator/pyproject.toml +39 -0
- package/tools/vds-scripts/jira_orchestrator/src/vds_jira_orchestrator/__init__.py +1 -0
- package/tools/vds-scripts/jira_orchestrator/src/vds_jira_orchestrator/adapter.py +1320 -0
- package/tools/vds-scripts/jira_orchestrator/src/vds_jira_orchestrator/cli.py +2271 -0
- package/tools/vds-scripts/jira_orchestrator/src/vds_jira_orchestrator/config.py +138 -0
- package/tools/vds-scripts/jira_orchestrator/src/vds_jira_orchestrator/errors.py +67 -0
- package/tools/vds-scripts/jira_orchestrator/src/vds_jira_orchestrator/reporting.py +65 -0
- package/tools/vds-scripts/jira_orchestrator/tests/__init__.py +1 -0
- package/tools/vds-scripts/jira_orchestrator/tests/conftest.py +86 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_agile_list_payloads.py +54 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_bulk_operations.py +69 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_components.py +57 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_createmeta.py +45 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_dashboard.py +117 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_issue_properties.py +54 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_permissions_compat.py +42 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_reindex.py +42 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_remote_links.py +76 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_transitions.py +91 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_user_management.py +110 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_version_management.py +133 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_watchers.py +41 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_advanced_search.py +164 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_agile.py +256 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_application_properties.py +193 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_backlog.py +91 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_bulk_operations.py +277 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_cli.py +106 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_components.py +106 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_config.py +164 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_dashboard.py +122 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_discover_fields.py +207 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_filter_management.py +333 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_issue_archiving.py +164 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_issue_links.py +257 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_issue_properties.py +171 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_link_types.py +314 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_parse_set.py +37 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_permissions.py +273 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_reindex.py +81 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_remote_links.py +254 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_security_schemes.py +170 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_transitions_changelog.py +114 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_user_management.py +226 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_version_management.py +339 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_watchers.py +101 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_worklog.py +223 -0
- package/tools/vds-scripts/jira_orchestrator/uv.lock +738 -0
- package/tools/vds-scripts/mcp_server/Dockerfile +34 -0
- package/tools/vds-scripts/mcp_server/README.md +140 -0
- package/tools/vds-scripts/mcp_server/pyproject.toml +42 -0
- package/tools/vds-scripts/mcp_server/src/vds_mcp_server/__init__.py +4 -0
- package/tools/vds-scripts/mcp_server/src/vds_mcp_server/config.py +36 -0
- package/tools/vds-scripts/mcp_server/src/vds_mcp_server/server.py +66 -0
- package/tools/vds-scripts/mcp_server/src/vds_mcp_server/tools/__init__.py +14 -0
- package/tools/vds-scripts/mcp_server/src/vds_mcp_server/tools/bitbucket_tools.py +47 -0
- package/tools/vds-scripts/mcp_server/src/vds_mcp_server/tools/confluence_tools.py +59 -0
- package/tools/vds-scripts/mcp_server/src/vds_mcp_server/tools/git_tools.py +71 -0
- package/tools/vds-scripts/mcp_server/src/vds_mcp_server/tools/jira_tools.py +63 -0
- package/tools/vds-scripts/mcp_server/tests/__init__.py +2 -0
- package/tools/vds-scripts/mcp_server/tests/conftest.py +29 -0
- package/tools/vds-scripts/mcp_server/tests/unit/__init__.py +2 -0
- package/tools/vds-scripts/mcp_server/tests/unit/test_bitbucket_tools.py +25 -0
- package/tools/vds-scripts/mcp_server/tests/unit/test_confluence_tools.py +25 -0
- package/tools/vds-scripts/mcp_server/tests/unit/test_git_tools.py +32 -0
- package/tools/vds-scripts/mcp_server/tests/unit/test_jira_tools.py +32 -0
- package/tools/vds-scripts/mcp_server/tests/verification/__init__.py +2 -0
- package/tools/vds-scripts/mcp_server/tests/verification/test_mcp_confluence_tools.py +40 -0
- package/tools/vds-scripts/mcp_server/tests/verification/test_mcp_jira_tools.py +37 -0
- package/tools/vds-scripts/mcp_server/tests/verification/test_mcp_tool_registration.py +47 -0
- package/tools/vds-scripts/mcp_server/uv.lock +1032 -0
- package/tools/vds-scripts/mypy.ini +5 -0
- package/tools/vds-scripts/pyproject.toml +29 -0
- package/tools/vds-scripts/repo-manifest.yaml +273 -0
- package/tools/vds-scripts/repo-manifest.yaml.example +25 -0
- package/tools/vds-scripts/scripts/BRD-Validation-API.postman_collection.json +706 -0
- package/tools/vds-scripts/scripts/BRD-Validation-README.md +308 -0
- package/tools/vds-scripts/scripts/README.md +162 -0
- package/tools/vds-scripts/scripts/bootstrap_uv.sh +30 -0
- package/tools/vds-scripts/scripts/brd-validation-environment.json +51 -0
- package/tools/vds-scripts/scripts/brd-validation-test-results.json +13023 -0
- package/tools/vds-scripts/scripts/brd_coverage_report.json +276 -0
- package/tools/vds-scripts/scripts/create_memory_session.py +35 -0
- package/tools/vds-scripts/scripts/deployment/load_docker_images_offline.sh +90 -0
- package/tools/vds-scripts/scripts/final_completion_report.md +139 -0
- package/tools/vds-scripts/scripts/folder_structure_report.json +321 -0
- package/tools/vds-scripts/scripts/generate_completion_report.py +125 -0
- package/tools/vds-scripts/scripts/generate_intellij_modules.py +150 -0
- package/tools/vds-scripts/scripts/link_integrity_report.json +807 -0
- package/tools/vds-scripts/scripts/move_audit_artifact_pages.py +255 -0
- package/tools/vds-scripts/scripts/move_audit_artifact_pages_rest.py +165 -0
- package/tools/vds-scripts/scripts/move_wrong_dept_pages.py +216 -0
- package/tools/vds-scripts/scripts/save_intellij_memories.py +120 -0
- package/tools/vds-scripts/scripts/save_memories_to_vds_ai.py +83 -0
- package/tools/vds-scripts/scripts/save_memories_vds_style.py +129 -0
- package/tools/vds-scripts/scripts/search_intellij_memories.py +50 -0
- package/tools/vds-scripts/scripts/setup_intellij_workspace.py +65 -0
- package/tools/vds-scripts/scripts/target-state-automation/README.md +89 -0
- package/tools/vds-scripts/scripts/target-state-automation/confluence_sync_coordinator.sh +27 -0
- package/tools/vds-scripts/scripts/target-state-automation/coordination.sh +114 -0
- package/tools/vds-scripts/scripts/target-state-automation/diagram_coordinator.sh +25 -0
- package/tools/vds-scripts/scripts/target-state-automation/docs_root.sh +22 -0
- package/tools/vds-scripts/scripts/target-state-automation/generate_diagrams.sh +22 -0
- package/tools/vds-scripts/scripts/target-state-automation/markdown_coordinator.sh +25 -0
- package/tools/vds-scripts/scripts/target-state-automation/progress_dashboard.sh +17 -0
- package/tools/vds-scripts/scripts/target-state-automation/schema_coordinator.sh +25 -0
- package/tools/vds-scripts/scripts/target-state-automation/sync_confluence.sh +30 -0
- package/tools/vds-scripts/scripts/target-state-automation/update_dependencies.sh +19 -0
- package/tools/vds-scripts/scripts/target-state-automation/validate_links.sh +86 -0
- package/tools/vds-scripts/scripts/target-state-automation/validate_markdown.sh +52 -0
- package/tools/vds-scripts/scripts/target-state-automation/validate_schemas.sh +26 -0
- package/tools/vds-scripts/scripts/target-state-automation/validate_structure.sh +98 -0
- package/tools/vds-scripts/scripts/update_modules_xml.py +190 -0
- package/tools/vds-scripts/scripts/uv-workspace-alignment-verification-2026-03-25.md +128 -0
- package/tools/vds-scripts/scripts/validate_brd_coverage.py +179 -0
- package/tools/vds-scripts/scripts/validate_folder_structure.py +240 -0
- package/tools/vds-scripts/scripts/validate_link_integrity.py +272 -0
- package/tools/vds-scripts/scripts/vds_sh_helpers.sh +180 -0
- package/tools/vds-scripts/scripts/verification/phase2_portable_paths_ubuntu_docker.sh +26 -0
- package/tools/vds-scripts/scripts/worktree_uv.sh +48 -0
- package/tools/vds-scripts/uv.lock +8 -0
- package/tools/vds-scripts/vds_cli/README.md +126 -0
- package/tools/vds-scripts/vds_cli/VERIFICATION_REPORT.md +41 -0
- package/tools/vds-scripts/vds_cli/pyproject.toml +38 -0
- package/tools/vds-scripts/vds_cli/src/vds_cli/__init__.py +3 -0
- package/tools/vds-scripts/vds_cli/src/vds_cli/cli.py +173 -0
- package/tools/vds-scripts/vds_cli/src/vds_cli/docs_sync.py +1203 -0
- package/tools/vds-scripts/vds_cli/src/vds_cli/env.py +41 -0
- package/tools/vds-scripts/vds_cli/src/vds_cli/google_sheets_orchestrator/__init__.py +3 -0
- package/tools/vds-scripts/vds_cli/src/vds_cli/google_sheets_orchestrator/google_sheets_orchestrator.py +198 -0
- package/tools/vds-scripts/vds_cli/src/vds_cli/router.py +93 -0
- package/tools/vds-scripts/vds_cli/src/vds_cli/sync_api.py +647 -0
- package/tools/vds-scripts/vds_cli/src/vds_cli/sync_service.py +266 -0
- package/tools/vds-scripts/vds_cli/tests/__init__.py +2 -0
- package/tools/vds-scripts/vds_cli/tests/conftest.py +49 -0
- package/tools/vds-scripts/vds_cli/tests/unit/__init__.py +2 -0
- package/tools/vds-scripts/vds_cli/tests/unit/test_cli.py +143 -0
- package/tools/vds-scripts/vds_cli/tests/unit/test_docs_sync.py +422 -0
- package/tools/vds-scripts/vds_cli/tests/unit/test_env.py +51 -0
- package/tools/vds-scripts/vds_cli/tests/unit/test_router.py +72 -0
- package/tools/vds-scripts/vds_cli/tests/unit/test_sync_api.py +357 -0
- package/tools/vds-scripts/vds_cli/tests/unit/test_sync_service.py +160 -0
- package/tools/vds-scripts/vds_cli/tests/verification/__init__.py +2 -0
- package/tools/vds-scripts/vds_cli/tests/verification/test_bitbucket_real.py +33 -0
- package/tools/vds-scripts/vds_cli/tests/verification/test_confluence_real.py +35 -0
- package/tools/vds-scripts/vds_cli/tests/verification/test_jira_real.py +41 -0
- package/tools/vds-scripts/vds_cli/uv.lock +524 -0
- package/tools/vds-scripts/vds_cli_common/README.md +190 -0
- package/tools/vds-scripts/vds_cli_common/pyproject.toml +92 -0
- package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/__init__.py +34 -0
- package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/completers.py +139 -0
- package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/context.py +201 -0
- package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/env.py +119 -0
- package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/errors.py +318 -0
- package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/output.py +284 -0
- package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/paths.py +78 -0
- package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/testing.py +213 -0
- package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/version.py +85 -0
- package/tools/vds-scripts/vds_cli_common/tests/__init__.py +1 -0
- package/tools/vds-scripts/vds_cli_common/tests/test_completers.py +148 -0
- package/tools/vds-scripts/vds_cli_common/tests/test_context.py +192 -0
- package/tools/vds-scripts/vds_cli_common/tests/test_env.py +102 -0
- package/tools/vds-scripts/vds_cli_common/tests/test_errors.py +186 -0
- package/tools/vds-scripts/vds_cli_common/tests/test_output.py +229 -0
- package/tools/vds-scripts/vds_cli_common/tests/test_paths.py +61 -0
- package/tools/vds-scripts/vds_cli_common/tests/test_testing.py +138 -0
- package/tools/vds-scripts/vds_cli_common/tests/test_version.py +64 -0
|
@@ -0,0 +1,2250 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import json
|
|
5
|
+
import threading
|
|
6
|
+
import time
|
|
7
|
+
import zipfile
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
import pytest
|
|
12
|
+
from confluence_orchestrator.cli import app
|
|
13
|
+
from confluence_orchestrator.crawl_tree import CrawlOptions, CrawlState, _create_bundle_zip, crawl_tree_to_disk
|
|
14
|
+
from typer.testing import CliRunner
|
|
15
|
+
|
|
16
|
+
# Module-level Typer test runner; the tests below call ``runner.invoke(app, [...])`` on it.
runner = CliRunner()
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def test_crawl_options_phase80_defaults() -> None:
    """Check the default limits of a ``CrawlOptions`` built with no arguments.

    The expected defaults are ``max_depth == 3``, ``max_nodes == 500`` and
    ``page_concurrency == 5``.
    """
    defaults = CrawlOptions()
    # Checked in this order so the first mismatch reported matches a field-by-field comparison.
    expected_defaults = (
        ("max_depth", 3),
        ("max_nodes", 500),
        ("page_concurrency", 5),
    )
    for field_name, expected_value in expected_defaults:
        assert getattr(defaults, field_name) == expected_value
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_crawl_tree_downloads_pages_and_attachments(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """Invoke ``crawl-tree`` on numeric root ``1`` with every ``--emit-*`` flag and inspect the output.

    A stub client replaces ``confluence_orchestrator.cli._build_http_client``. It serves a
    single page (id ``1``, title ``Root``) whose rendered body contains a user link, one
    table and three page links, plus a single attachment ``a1`` / ``file.txt``.

    The test then checks:
    * the command exits with code 0 and prints a JSON payload on stdout;
    * ``1_Root.html`` and ``attachments/a1_file.txt`` exist under ``--out``;
    * the corpus, tables, mentions and link-graph files referenced by the payload
      contain the records asserted below.
    """
    # Rendered ("view") body of the root page: heading, paragraph, user mention,
    # a 2x2 table, and three links (display path, pageId query, /pages/<id>/ path).
    rendered_body = (
        "<h1>Intro</h1>"
        "<p>Hello</p>"
        '<a class="confluence-userlink" data-username="john.doe" href="/display/~john.doe">@John Doe</a>'
        "<table><tr><th>H1</th><th>H2</th></tr><tr><td>A</td><td>B</td></tr></table>"
        '<a href="/display/X/Y">Y</a>'
        '<a href="/pages/viewpage.action?pageId=777">P777</a>'
        '<a href="/pages/888/Test">P888</a>'
    )

    class DummyClient:
        """In-memory stand-in for the HTTP client; every method only accepts the ids used here."""

        def search_cql(self, cql: str, limit: int = 25, expand: list[str] | None = None, **_: Any) -> dict[str, Any]:
            # A numeric root must be resolved without any CQL search.
            raise AssertionError(f"search_cql should not be called for numeric root (got {cql})")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            assert page_id == "1"
            assert expand is not None
            body = {
                "view": {"value": rendered_body},
                "storage": {"value": "<p>fallback</p>"},
            }
            return {
                "id": page_id,
                "title": "Root",
                "body": body,
                "_links": {"base": "http://example", "webui": "/display/SPACE/Root"},
                "space": {"key": "SPACE"},
                "version": {"number": 1, "when": "2026-01-01T00:00:00.000+00:00"},
                "ancestors": [{"id": "10", "title": "Ancestor"}],
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            assert page_id == "1"
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            assert page_id == "1"
            attachment = {
                "id": "a1",
                "title": "file.txt",
                "version": {"number": 2},
                "extensions": {"fileSize": 3},
            }
            return [attachment]

        def get_attachment_content(self, attachment_id: str) -> bytes:
            assert attachment_id == "a1"
            return b"abc"

    stub_client = DummyClient()

    def _fake_build_http_client(settings, server):
        # Same parameter names as the patched factory call expects; always hands back the stub.
        return stub_client

    monkeypatch.setattr("confluence_orchestrator.cli._build_http_client", _fake_build_http_client)

    target_dir = tmp_path / "out"
    cli_args = [
        "--report-dir",
        str(tmp_path),
        "crawl-tree",
        "1",
        "--out",
        str(target_dir),
        "--depth",
        "0",
        "--emit-corpus",
        "--emit-link-graph",
        "--emit-tables",
        "--emit-mentions",
    ]
    outcome = runner.invoke(app, cli_args)

    assert outcome.exit_code == 0, outcome.output
    summary = json.loads(outcome.stdout)

    assert summary["root_page_id"] == "1"
    assert (target_dir / "1_Root.html").exists()
    assert (target_dir / "attachments" / "a1_file.txt").exists()
    for artifact_key in ("corpus_path", "link_graph_path", "tables_path", "mentions_path"):
        assert summary[artifact_key]

    def _load_jsonl(jsonl_file: Path) -> list[Any]:
        """Parse every non-blank line of *jsonl_file* as JSON."""
        raw_lines = jsonl_file.read_text(encoding="utf-8").splitlines()
        return [json.loads(raw) for raw in raw_lines if raw.strip()]

    corpus_records = _load_jsonl(Path(summary["corpus_path"]))
    assert corpus_records
    root_record = corpus_records[0]
    assert root_record["depth"] == 0
    assert root_record["parent_id"] is None
    assert root_record["ancestors"] == ["10"]
    assert root_record["ancestor_titles"] == ["Ancestor"]

    tables_file = Path(summary["tables_path"])
    mentions_file = Path(summary["mentions_path"])
    assert tables_file.exists()
    assert mentions_file.exists()

    # Only the first line is parsed here, and without blank-line filtering.
    first_table_line = tables_file.read_text(encoding="utf-8").splitlines()[0]
    table_record = json.loads(first_table_line)
    assert table_record["page_id"] == "1"
    assert table_record["table_index"] == 0
    assert table_record["headers"] == ["H1", "H2"]
    assert table_record["rows"] == [["A", "B"]]

    mention_records = _load_jsonl(mentions_file)
    assert mention_records
    assert all(entry["page_id"] == "1" for entry in mention_records)
    assert any(entry.get("type") == "confluence_page" and entry.get("value") == "777" for entry in mention_records)

    link_graph = json.loads(Path(summary["link_graph_path"]).read_text(encoding="utf-8"))
    # Index edges by resolved target page id; edges without one are ignored.
    edges_by_target: dict[Any, Any] = {}
    for edge in link_graph["edges"]:
        target_id = edge.get("target_page_id")
        if target_id:
            edges_by_target[target_id] = edge
    for linked_page_id in ("777", "888"):
        assert edges_by_target[linked_page_id]["target_type"] == "confluence_page"
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def test_crawl_tree_appends_attachment_suffix_when_missing(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """Check the on-disk name of an attachment whose title has no file extension.

    The stub client exposes one attachment with id ``a2``, title ``diagram`` and
    ``mediaType`` ``application/pdf``. After ``crawl-tree 1 --depth 0 --emit-corpus``
    succeeds, the file is expected at ``attachments/a2_diagram.pdf`` under ``--out``.
    """

    class DummyClient:
        """In-memory stand-in for the HTTP client serving one page and one attachment."""

        def search_cql(self, cql: str, limit: int = 25, expand: list[str] | None = None, **_: Any) -> dict[str, Any]:
            # A numeric root must be resolved without any CQL search.
            raise AssertionError(f"search_cql should not be called for numeric root (got {cql})")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            assert page_id == "1"
            body = {"view": {"value": "<p>ok</p>"}, "storage": {"value": ""}}
            links = {"base": "http://example", "webui": "/display/SPACE/Root"}
            return {
                "id": page_id,
                "title": "Root",
                "body": body,
                "_links": links,
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            extensionless_attachment = {
                "id": "a2",
                "title": "diagram",
                "extensions": {"fileSize": 3, "mediaType": "application/pdf"},
            }
            return [extensionless_attachment]

        def get_attachment_content(self, attachment_id: str) -> bytes:
            assert attachment_id == "a2"
            return b"abc"

    stub_client = DummyClient()

    def _fake_build_http_client(settings, server):
        # Same parameter names as the patched factory call expects; always hands back the stub.
        return stub_client

    monkeypatch.setattr("confluence_orchestrator.cli._build_http_client", _fake_build_http_client)

    target_dir = tmp_path / "out"
    cli_args = [
        "--report-dir",
        str(tmp_path),
        "crawl-tree",
        "1",
        "--out",
        str(target_dir),
        "--depth",
        "0",
        "--emit-corpus",
    ]
    outcome = runner.invoke(app, cli_args)

    assert outcome.exit_code == 0, outcome.output
    saved_attachment = target_dir / "attachments" / "a2_diagram.pdf"
    assert saved_attachment.exists()
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def test_crawl_tree_infers_server_from_root_url(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """Check which ``server`` value reaches the client factory when the root is a ``pageId`` URL.

    The patched factory records its ``server`` argument; the test expects ``"external"``
    for a root URL hosted on ``10.254.136.35:8090``.
    """

    class StubConfluence:
        """Serves page ``1`` with no children and no attachments."""

        def search_cql(self, cql: str, limit: int = 25, expand: list[str] | None = None, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called for pageId URL root")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            assert page_id == "1"
            page_body = {"view": {"value": "<p>ok</p>"}, "storage": {"value": ""}}
            page_links = {"base": "http://example", "webui": "/display/SPACE/Root"}
            return {"id": page_id, "title": "Root", "body": page_body, "_links": page_links}

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            raise AssertionError("no attachments expected")

    stub = StubConfluence()
    captured: dict[str, str] = {}

    def recording_factory(settings, server: str):
        # Remember the server label the CLI picked, then hand back the stub.
        captured["server"] = server
        return stub

    monkeypatch.setattr("confluence_orchestrator.cli._build_http_client", recording_factory)

    target_dir = tmp_path / "out"
    root_url = "http://10.254.136.35:8090/pages/viewpage.action?pageId=1"
    global_args = ["--report-dir", str(tmp_path)]
    crawl_args = ["crawl-tree", root_url, "--out", str(target_dir), "--depth", "0", "--no-include-attachments"]
    outcome = runner.invoke(app, global_args + crawl_args)

    assert outcome.exit_code == 0, outcome.output
    # 10.254.136.35 is one of the two old external Confluence servers — both map to "external"
    assert captured["server"] == "external"
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def test_crawl_tree_does_not_emit_tables_or_mentions_by_default(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """Crawl a page containing a user link and a table without requesting table/mention output.

    The JSON payload must report ``None`` for ``tables_path`` and ``mentions_path``,
    and neither ``tables.jsonl`` nor ``mentions.jsonl`` may exist in the output directory.
    """
    # One user mention followed by a one-cell table.
    view_markup = (
        '<a class="confluence-userlink" data-username="john.doe">@John Doe</a>'
        "<table><tr><td>only</td></tr></table>"
    )

    class StubConfluence:
        """Serves page ``1`` whose view body holds a mention and a table."""

        def search_cql(self, cql: str, limit: int = 25, expand: list[str] | None = None, **_: Any) -> dict[str, Any]:
            raise AssertionError(f"search_cql should not be called for numeric root (got {cql})")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            assert page_id == "1"
            page_body = {"view": {"value": view_markup}, "storage": {"value": ""}}
            page_links = {"base": "http://example", "webui": "/display/SPACE/Root"}
            return {"id": page_id, "title": "Root", "body": page_body, "_links": page_links}

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            raise AssertionError("no attachments expected")

    stub = StubConfluence()
    monkeypatch.setattr("confluence_orchestrator.cli._build_http_client", lambda settings, server: stub)

    target_dir = tmp_path / "out"
    global_args = ["--report-dir", str(tmp_path)]
    crawl_args = ["crawl-tree", "1", "--out", str(target_dir), "--depth", "0", "--no-include-attachments"]
    outcome = runner.invoke(app, global_args + crawl_args)

    assert outcome.exit_code == 0, outcome.output
    summary = json.loads(outcome.stdout)
    for path_key in ("tables_path", "mentions_path"):
        assert summary[path_key] is None
    for file_name in ("tables.jsonl", "mentions.jsonl"):
        assert not (target_dir / file_name).exists()
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def test_crawl_tree_resolves_display_url(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """Give the CLI a ``/display/<space>/<title>`` URL and expect it to crawl the page found via CQL.

    The stub's ``search_cql`` checks that the query names space ``TDOV`` and title
    ``My Page`` and answers with page id ``42``; the test then expects
    ``root_page_id == "42"`` and an HTML file named ``42_My Page.html``.
    """

    class StubConfluence:
        """Answers the CQL lookup with page ``42`` and serves that page."""

        def search_cql(self, cql: str, limit: int = 25, expand: list[str] | None = None, **_: Any) -> dict[str, Any]:
            assert 'space="TDOV"' in cql
            assert 'title="My Page"' in cql
            return {"results": [{"id": "42"}]}

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            assert page_id == "42"
            page_body = {"storage": {"value": "<p>ok</p>"}, "view": {"value": ""}}
            page_links = {"base": "http://example", "webui": "/display/TDOV/My+Page"}
            return {"id": page_id, "title": "My Page", "body": page_body, "_links": page_links}

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            raise AssertionError("no attachments expected")

    stub = StubConfluence()
    monkeypatch.setattr("confluence_orchestrator.cli._build_http_client", lambda settings, server: stub)

    target_dir = tmp_path / "out"
    display_url = "http://example/display/TDOV/My+Page"
    global_args = ["--report-dir", str(tmp_path)]
    crawl_args = ["crawl-tree", display_url, "--out", str(target_dir), "--depth", "0", "--no-include-attachments"]
    outcome = runner.invoke(app, global_args + crawl_args)

    assert outcome.exit_code == 0, outcome.output
    summary = json.loads(outcome.stdout)
    assert summary["root_page_id"] == "42"
    assert (target_dir / "42_My Page.html").exists()
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
def test_crawl_tree_splits_corpus_chunks_by_max_chars(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """Run with ``--emit-corpus --chunk-max-chars 50`` over a long paragraph and inspect the corpus.

    Expectations: more than one chunk, every chunk text at most 50 characters,
    ordinals covering ``0..len-1``, fixed metadata on the first chunk, and chunk ids
    equal to the first 16 hex digits of SHA-256 over ``"1|Intro"`` and ``"1|Intro|1"``.
    """
    paragraph = "word " * 200  # > 50 chars

    class StubConfluence:
        """Serves page ``1`` with an ``Intro`` heading followed by the long paragraph."""

        def search_cql(self, cql: str, limit: int = 25, expand: list[str] | None = None, **_: Any) -> dict[str, Any]:
            raise AssertionError(f"search_cql should not be called for numeric root (got {cql})")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            assert page_id == "1"
            page_body = {"view": {"value": f"<h1>Intro</h1><p>{paragraph}</p>"}, "storage": {"value": ""}}
            page_links = {"base": "http://example", "webui": "/display/SPACE/Root"}
            return {"id": page_id, "title": "Root", "body": page_body, "_links": page_links}

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            raise AssertionError("no attachments expected")

    stub = StubConfluence()
    monkeypatch.setattr("confluence_orchestrator.cli._build_http_client", lambda settings, server: stub)

    target_dir = tmp_path / "out"
    global_args = ["--report-dir", str(tmp_path)]
    crawl_args = ["crawl-tree", "1", "--out", str(target_dir), "--depth", "0", "--no-include-attachments"]
    corpus_args = ["--emit-corpus", "--chunk-max-chars", "50"]
    outcome = runner.invoke(app, global_args + crawl_args + corpus_args)

    assert outcome.exit_code == 0, outcome.output
    summary = json.loads(outcome.stdout)
    corpus_file = Path(summary["corpus_path"])
    raw_lines = corpus_file.read_text(encoding="utf-8").splitlines()
    chunks = [json.loads(raw) for raw in raw_lines if raw.strip()]
    assert len(chunks) > 1
    assert all(len(chunk.get("text") or "") <= 50 for chunk in chunks)
    seen_ordinals = {chunk.get("chunk_ordinal") for chunk in chunks}
    assert seen_ordinals == set(range(len(chunks)))

    head = chunks[0]
    assert head["source_type"] == "confluence"
    assert head["doc_class"] == "canonical_doc"
    assert head["generated_flag"] is False
    assert head["historical_flag"] is False
    assert head["discovery"] == "confluence_crawl_page"

    def short_digest(key: str) -> str:
        # First 16 hex characters of the SHA-256 of the UTF-8 encoded key.
        return hashlib.sha256(key.encode("utf-8")).hexdigest()[:16]

    section_key = "1|Intro"
    assert head["chunk_id"] == short_digest(section_key)
    assert chunks[1]["chunk_id"] == short_digest(f"{section_key}|1")
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
def test_crawl_tree_uses_export_view_when_requested(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """Pass ``--body-format export_view`` and expect the export body to be requested and saved.

    The stub asserts that ``body.export_view`` appears in ``expand``; afterwards the
    written HTML must contain ``export`` and the page's ``source_format`` must be
    ``html-export-view``.
    """

    class StubConfluence:
        """Serves page ``1`` with distinct view, export_view and storage bodies."""

        def search_cql(self, cql: str, limit: int = 25, expand: list[str] | None = None, **_: Any) -> dict[str, Any]:
            raise AssertionError(f"search_cql should not be called for numeric root (got {cql})")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            assert page_id == "1"
            assert expand is not None
            assert "body.export_view" in expand
            page_body = {
                "view": {"value": "<p>view</p>"},
                "export_view": {"value": "<p>export</p>"},
                "storage": {"value": "<p>storage</p>"},
            }
            page_links = {"base": "http://example", "webui": "/display/SPACE/Root"}
            return {"id": page_id, "title": "Root", "body": page_body, "_links": page_links}

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            raise AssertionError("no attachments expected")

    stub = StubConfluence()
    monkeypatch.setattr("confluence_orchestrator.cli._build_http_client", lambda settings, server: stub)

    target_dir = tmp_path / "out"
    global_args = ["--report-dir", str(tmp_path)]
    crawl_args = ["crawl-tree", "1", "--out", str(target_dir), "--depth", "0", "--no-include-attachments"]
    format_args = ["--body-format", "export_view"]
    outcome = runner.invoke(app, global_args + crawl_args + format_args)

    assert outcome.exit_code == 0, outcome.output
    summary = json.loads(outcome.stdout)
    page_record = summary["page"]
    saved_html = Path(page_record["html_path"])
    assert "export" in saved_html.read_text(encoding="utf-8")
    assert summary["page"]["source_format"] == "html-export-view"
|
|
483
|
+
|
|
484
|
+
|
|
485
|
+
def test_crawl_tree_concurrency_preserves_deterministic_order(tmp_path: Path) -> None:
    """Compare a ``page_concurrency=1`` crawl with a ``page_concurrency=4`` crawl of the same tree.

    Both runs must list the children as ``["2", "3"]`` and produce corpus files whose
    ``page_id`` sequence is identical, even though page ``2`` is slower to fetch than page ``3``.
    """
    # Force non-deterministic completion order when concurrency > 1.
    fetch_delay = {"2": 0.04, "3": 0.01}

    class StubConfluence:
        """Root page ``1`` with children ``2`` and ``3``; child fetches are artificially delayed."""

        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            if page_id in fetch_delay:
                time.sleep(fetch_delay[page_id])
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": {"view": {"value": f"<h1>{page_id}</h1><p>Content {page_id}</p>"}},
                "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
                "version": {"number": int(page_id)},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return [{"id": "2"}, {"id": "3"}] if page_id == "1" else []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    shared_client = StubConfluence()

    def crawl_into(subdir: str, workers: int) -> dict[str, Any]:
        # Same client and root for both runs; only the folder and worker count vary.
        return crawl_tree_to_disk(
            http=shared_client,
            root_ref="1",
            output_dir=tmp_path / subdir,
            options=CrawlOptions(max_depth=1, include_attachments=False, emit_corpus=True, page_concurrency=workers),
        )

    serial_run = crawl_into("seq", 1)
    parallel_run = crawl_into("concurrent", 4)

    assert [child["page_id"] for child in serial_run["child_pages"]] == ["2", "3"]
    assert [child["page_id"] for child in parallel_run["child_pages"]] == ["2", "3"]

    serial_lines = Path(serial_run["corpus_path"]).read_text(encoding="utf-8").splitlines()
    parallel_lines = Path(parallel_run["corpus_path"]).read_text(encoding="utf-8").splitlines()
    serial_ids = [json.loads(raw)["page_id"] for raw in serial_lines]
    parallel_ids = [json.loads(raw)["page_id"] for raw in parallel_lines]
    assert serial_ids == parallel_ids
|
|
535
|
+
|
|
536
|
+
|
|
537
|
+
def test_crawl_tree_parallelizes_child_discovery_with_page_fetch(tmp_path: Path) -> None:
    """Detect at least one moment where ``get_page`` and ``get_child_pages`` are in flight together.

    Each stub call registers itself under a lock, notes whether a call of the other
    kind is currently active, sleeps 30 ms, then unregisters. The crawl runs with
    ``page_concurrency=1`` and the test passes when an overlap was observed.
    """
    guard = threading.Lock()
    # overlap: times one kind of call started while the other was active;
    # page / children: number of calls of that kind currently in flight.
    tally = {"overlap": 0, "page": 0, "children": 0}

    class StubConfluence:
        """Root page ``1`` with a single child ``2``; both call kinds track their concurrency."""

        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            with guard:
                tally["page"] += 1
                if tally["children"] > 0:
                    tally["overlap"] += 1
            # Sleep outside the lock so the other call kind can start meanwhile.
            time.sleep(0.03)
            with guard:
                tally["page"] -= 1
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": {"view": {"value": f"<p>Content {page_id}</p>"}},
                "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
                "version": {"number": int(page_id)},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            with guard:
                tally["children"] += 1
                if tally["page"] > 0:
                    tally["overlap"] += 1
            time.sleep(0.03)
            with guard:
                tally["children"] -= 1
            return [{"id": "2"}] if page_id == "1" else []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    crawl_summary = crawl_tree_to_disk(
        http=StubConfluence(),
        root_ref="1",
        output_dir=tmp_path / "out",
        options=CrawlOptions(max_depth=1, include_attachments=False, page_concurrency=1),
    )

    first_child = crawl_summary["child_pages"][0]
    assert first_child["page_id"] == "2"
    assert tally["overlap"] > 0
|
|
592
|
+
|
|
593
|
+
|
|
594
|
+
def test_crawl_tree_concurrent_throughput_exceeds_2x_sequential(tmp_path: Path) -> None:
    """Time a ``page_concurrency=1`` crawl against a ``page_concurrency=5`` crawl of the same tree.

    Every ``get_page`` call sleeps 80 ms and the root has six children; the concurrent
    run must take at most half the wall-clock time of the sequential run.
    """

    class StubConfluence:
        """Root page ``1`` with children ``2``..``7``; each page fetch costs 80 ms."""

        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            time.sleep(0.08)
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": {"view": {"value": f"<p>Content {page_id}</p>"}},
                "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
                "version": {"number": int(page_id)},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            if page_id != "1":
                return []
            return [{"id": str(child)} for child in range(2, 8)]

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    shared_client = StubConfluence()

    def timed_crawl(subdir: str, workers: int) -> float:
        # Wall-clock seconds for one full crawl with the given worker count.
        began = time.perf_counter()
        crawl_tree_to_disk(
            http=shared_client,
            root_ref="1",
            output_dir=tmp_path / subdir,
            options=CrawlOptions(max_depth=1, include_attachments=False, page_concurrency=workers),
        )
        return time.perf_counter() - began

    sequential_seconds = timed_crawl("seq", 1)
    concurrent_seconds = timed_crawl("concurrent", 5)

    assert concurrent_seconds * 2 <= sequential_seconds
|
|
640
|
+
|
|
641
|
+
|
|
642
|
+
def test_crawl_tree_structure_first_mode_pre_discovers_before_fetch(tmp_path: Path) -> None:
    """With ``staged_mode="structure_first"``, child discovery must be logged before any page fetch.

    The stub records every ``get_page`` / ``get_child_pages`` call in order. The crawl
    is capped at ``max_nodes=2``, so only child ``2`` is expected in the result and the
    resume state must report ``max_nodes_reached`` with a non-empty pending queue.
    """
    events: list[tuple[str, str]] = []

    class StubConfluence:
        """Tree ``1 -> (2 -> 4, 3)`` that logs each page and children request."""

        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            events.append(("page", page_id))
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": {"view": {"value": f"<p>Content {page_id}</p>"}},
                "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
                "version": {"number": int(page_id)},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            events.append(("children", page_id))
            if page_id == "1":
                return [{"id": "2"}, {"id": "3"}]
            if page_id == "2":
                return [{"id": "4"}]
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    staged_options = CrawlOptions(
        max_depth=2,
        include_attachments=False,
        page_concurrency=2,
        staged_mode="structure_first",
        max_nodes=2,
    )
    crawl_summary = crawl_tree_to_disk(
        http=StubConfluence(),
        root_ref="1",
        output_dir=tmp_path / "out",
        options=staged_options,
    )

    first_fetch_at = next(position for position, entry in enumerate(events) if entry[0] == "page")
    discovery_prefix = events[:first_fetch_at]
    assert all(entry[0] == "children" for entry in discovery_prefix)
    assert events[0] == ("children", "1")
    assert [child["page_id"] for child in crawl_summary["child_pages"]] == ["2"]

    resume_info = crawl_summary["resume_state"]
    assert resume_info["termination_reason"] == "max_nodes_reached"
    assert resume_info["pending_queue_size"] > 0

    mode_info = crawl_summary["crawl_mode"]
    assert mode_info["staged_mode"] == "structure_first"
    assert mode_info["page_concurrency"] == 2
|
|
694
|
+
|
|
695
|
+
|
|
696
|
+
# =============================================================================
|
|
697
|
+
# TSK-188: Resume/Incremental Crawling Tests
|
|
698
|
+
# =============================================================================
|
|
699
|
+
|
|
700
|
+
|
|
701
|
+
def test_crawl_state_serialization() -> None:
    """Round-trip a populated CrawlState through ``to_dict`` and ``from_dict`` and check its fields."""
    started = "2026-01-01T00:00:00Z"
    synced = "2026-01-01T00:05:00Z"
    visited = {
        "123": {"version": 5, "html_sha256": "abc123", "visited_at": "2026-01-01T00:00:00Z"},
        "456": {"version": 3, "html_sha256": "def456", "visited_at": "2026-01-01T00:01:00Z"},
    }
    queued = [("789", "123", 1), ("101", "123", 1)]

    original = CrawlState(
        root_page_id="123",
        body_format="view",
        visited_pages=visited,
        pending_queue=queued,
        started_at=started,
        last_updated_at=synced,
        completed=False,
        last_sync_timestamp=synced,
        last_full_reconciliation_timestamp=started,
    )

    # Serialize
    as_dict = original.to_dict()
    assert as_dict["root_page_id"] == "123"
    assert len(as_dict["visited_pages"]) == 2
    assert len(as_dict["pending_queue"]) == 2
    assert as_dict["completed"] is False
    assert as_dict["last_sync_timestamp"] == "2026-01-01T00:05:00Z"
    assert as_dict["last_full_reconciliation_timestamp"] == "2026-01-01T00:00:00Z"

    # Deserialize
    rebuilt = CrawlState.from_dict(as_dict)
    assert rebuilt.root_page_id == "123"
    assert rebuilt.body_format == "view"
    assert len(rebuilt.visited_pages) == 2
    assert rebuilt.visited_pages["123"]["version"] == 5
    assert len(rebuilt.pending_queue) == 2
    assert rebuilt.pending_queue[0] == ("789", "123", 1)
    assert rebuilt.last_sync_timestamp == "2026-01-01T00:05:00Z"
    assert rebuilt.last_full_reconciliation_timestamp == "2026-01-01T00:00:00Z"
|
|
737
|
+
|
|
738
|
+
|
|
739
|
+
def test_crawl_state_is_page_unchanged_by_version() -> None:
    """Exercise ``is_page_unchanged`` with version numbers only (hash argument is ``None``)."""
    epoch = "2026-01-01T00:00:00Z"
    known_page = {"version": 5, "html_sha256": "abc", "visited_at": "2026-01-01T00:00:00Z"}
    tracker = CrawlState(
        root_page_id="1",
        body_format="view",
        visited_pages={"1": known_page},
        pending_queue=[],
        started_at=epoch,
        last_updated_at=epoch,
    )

    # Recorded version 5, asked about version 5: unchanged.
    assert tracker.is_page_unchanged("1", 5, None) is True
    # Recorded version 5, asked about version 6: changed.
    assert tracker.is_page_unchanged("1", 6, None) is False
    # Page "2" was never recorded: reported as changed.
    assert tracker.is_page_unchanged("2", 5, None) is False
|
|
756
|
+
|
|
757
|
+
|
|
758
|
+
def test_crawl_state_is_page_unchanged_by_hash() -> None:
    """Exercise ``is_page_unchanged`` with content hashes only (recorded and queried version are ``None``)."""
    epoch = "2026-01-01T00:00:00Z"
    known_page = {"version": None, "html_sha256": "abc123", "visited_at": "2026-01-01T00:00:00Z"}
    tracker = CrawlState(
        root_page_id="1",
        body_format="view",
        visited_pages={"1": known_page},
        pending_queue=[],
        started_at=epoch,
        last_updated_at=epoch,
    )

    # Matching hash: unchanged.
    assert tracker.is_page_unchanged("1", None, "abc123") is True
    # Different hash: changed.
    assert tracker.is_page_unchanged("1", None, "xyz789") is False
|
|
773
|
+
|
|
774
|
+
|
|
775
|
+
def test_crawl_tree_max_nodes_limits_pages(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """Run the CLI with ``--max-nodes 3`` on a root that has five children.

    Expect exactly three processed nodes and three ``get_page`` calls, an incomplete
    resume state with ``max_nodes_reached``, and a non-empty pending queue.
    """
    fetched_ids: list[str] = []

    class StubConfluence:
        """Root page ``1`` with children ``2``..``6``; records every fetched page id."""

        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            fetched_ids.append(page_id)
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": {"view": {"value": f"<p>Content {page_id}</p>"}},
                "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
                "version": {"number": 1},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            # Only the root has children (five of them).
            if page_id != "1":
                return []
            return [{"id": str(child)} for child in range(2, 7)]

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    stub = StubConfluence()
    monkeypatch.setattr("confluence_orchestrator.cli._build_http_client", lambda settings, server: stub)

    target_dir = tmp_path / "out"
    global_args = ["--report-dir", str(tmp_path)]
    crawl_args = ["crawl-tree", "1", "--out", str(target_dir), "--depth", "2", "--no-include-attachments"]
    limit_args = ["--max-nodes", "3"]
    outcome = runner.invoke(app, global_args + crawl_args + limit_args)

    assert outcome.exit_code == 0, outcome.output
    summary = json.loads(outcome.stdout)
    resume_info = summary["resume_state"]

    # The cap of 3 must be hit exactly, leaving work queued.
    assert resume_info["nodes_processed"] == 3
    assert resume_info["completed"] is False
    assert resume_info["termination_reason"] == "max_nodes_reached"
    assert resume_info["pending_queue_size"] > 0
    assert len(fetched_ids) == 3
|
|
835
|
+
|
|
836
|
+
|
|
837
|
+
def test_crawl_tree_cli_accepts_crawl_max_aliases(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """Invoke ``crawl-tree`` with ``--crawl-max-depth`` and ``--crawl-max-nodes`` and expect success.

    With depth ``0`` only one node should be processed; the payload is also expected
    to report ``page_concurrency == 5`` although no concurrency flag is passed.
    """

    class StubConfluence:
        """Root page ``1`` with one child ``2``; all pages share the same body."""

        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": {"view": {"value": "<p>Content</p>"}},
                "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
                "version": {"number": 1},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            if page_id == "1":
                return [{"id": "2"}]
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    # A fresh stub is built on every factory call.
    monkeypatch.setattr("confluence_orchestrator.cli._build_http_client", lambda settings, server: StubConfluence())

    target_dir = tmp_path / "out"
    global_args = ["--report-dir", str(tmp_path)]
    crawl_args = ["crawl-tree", "1", "--out", str(target_dir)]
    alias_args = ["--crawl-max-depth", "0", "--crawl-max-nodes", "10", "--no-include-attachments"]
    outcome = runner.invoke(app, global_args + crawl_args + alias_args)

    assert outcome.exit_code == 0, outcome.output
    summary = json.loads(outcome.stdout)
    assert summary["resume_state"]["nodes_processed"] == 1
    assert summary["crawl_mode"]["page_concurrency"] == 5
|
|
884
|
+
|
|
885
|
+
|
|
886
|
+
def test_crawl_tree_rejects_depth_out_of_range(tmp_path: Path) -> None:
    """Pass ``--depth -1`` and expect a non-zero exit with an ``Invalid value for '--depth'`` message."""
    target_dir = tmp_path / "out"
    global_args = ["--report-dir", str(tmp_path)]
    crawl_args = ["crawl-tree", "1", "--out", str(target_dir), "--depth", "-1"]

    outcome = runner.invoke(app, global_args + crawl_args)

    assert outcome.exit_code != 0
    assert "Invalid value for '--depth'" in outcome.output
|
|
904
|
+
|
|
905
|
+
|
|
906
|
+
def test_crawl_tree_resume_saves_state(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """Run ``crawl-tree --resume`` on a single page and inspect the written ``crawl_state.json``.

    The state file must exist, name root page ``1``, be marked completed, list page
    ``1`` as visited, and record ``queue_exhausted`` with an empty pending queue.
    """

    class StubConfluence:
        """Serves any requested page at version 1, with no children or attachments."""

        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": {"view": {"value": f"<p>Content {page_id}</p>"}},
                "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
                "version": {"number": 1},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    stub = StubConfluence()
    monkeypatch.setattr("confluence_orchestrator.cli._build_http_client", lambda settings, server: stub)

    target_dir = tmp_path / "out"
    global_args = ["--report-dir", str(tmp_path)]
    crawl_args = ["crawl-tree", "1", "--out", str(target_dir), "--depth", "0", "--no-include-attachments"]
    outcome = runner.invoke(app, global_args + crawl_args + ["--resume"])

    assert outcome.exit_code == 0, outcome.output

    # The crawl must leave its state file in the output directory.
    state_file = target_dir / "crawl_state.json"
    assert state_file.exists()

    persisted = json.loads(state_file.read_text(encoding="utf-8"))
    assert persisted["root_page_id"] == "1"
    assert persisted["completed"] is True
    assert "1" in persisted["visited_pages"]
    diagnostics = persisted["run_diagnostics"]
    assert diagnostics["termination_reason"] == "queue_exhausted"
    assert diagnostics["pending_queue_size"] == 0
|
|
963
|
+
|
|
964
|
+
|
|
965
|
+
def test_crawl_tree_resume_skips_unchanged_pages(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """Test that resume mode skips pages that haven't changed.

    The same single-page tree is crawled twice into one output directory with
    ``resume=True``. The stub always reports version 1, so the second run is
    expected to count the root page as skipped-unchanged.

    ``monkeypatch`` is not used by this test; it is kept so the fixture
    signature stays as it was.
    """

    class DummyClient:
        """Stub HTTP client serving one childless page whose version never changes."""

        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            # The payload is the same for every ``expand`` value, so this stub
            # cannot tell a version probe from a full fetch.
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": {"view": {"value": f"<p>Content {page_id}</p>"}},
                "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
                "version": {"number": 1},  # Version stays at 1
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    dummy = DummyClient()
    out_dir = tmp_path / "out"

    # First crawl: populates the output directory and the resume state.
    result1 = crawl_tree_to_disk(
        http=dummy,
        root_ref="1",
        output_dir=out_dir,
        options=CrawlOptions(max_depth=0, include_attachments=False, resume=True),
    )

    assert result1["resume_state"]["nodes_processed"] == 1

    # Second crawl with resume - should skip unchanged page
    result2 = crawl_tree_to_disk(
        http=dummy,
        root_ref="1",
        output_dir=out_dir,
        options=CrawlOptions(max_depth=0, include_attachments=False, resume=True),
    )

    # The unchanged root page must be reported as skipped in both summaries.
    assert result2["resume_state"]["skipped_unchanged"] == 1
    assert result2["progress_event"]["pages_skipped_unchanged"] == 1
|
|
1019
|
+
|
|
1020
|
+
|
|
1021
|
+
def test_crawl_tree_resume_avoids_full_fetch_for_unchanged_pages(tmp_path: Path) -> None:
    """Phase 27 TSK-275.1: a warm resume run should need version probes only.

    A root with two children is crawled twice with ``resume=True``. The first
    run must issue three non-probe ``get_page`` calls; the second must issue
    none and report all three pages as skipped-unchanged.
    """
    root_id = "88716667"
    body_fetches: list[str] = []

    def _payload(page_id: str, html: str) -> dict[str, Any]:
        # Shared response shape; only the rendered body differs between calls.
        return {
            "id": page_id,
            "title": f"Page {page_id}",
            "body": {"view": {"value": html}},
            "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
            "version": {"number": 1},
        }

    class DummyClient:
        """Stub client that records every call that is not a version-only probe."""

        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            version_only = bool(expand) and len(expand) == 1 and "version" in expand
            if version_only:
                return _payload(page_id, "")
            body_fetches.append(page_id)
            return _payload(page_id, f"<p>Content {page_id}</p>")

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return [{"id": "100"}, {"id": "200"}] if page_id == root_id else []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    target_dir = tmp_path / "out"
    fake_http = DummyClient()

    def _crawl() -> Any:
        # Fresh options object per run, as each run is configured independently.
        return crawl_tree_to_disk(
            http=fake_http,
            root_ref=root_id,
            output_dir=target_dir,
            options=CrawlOptions(max_depth=1, include_attachments=False, resume=True),
        )

    _crawl()
    assert len(body_fetches) == 3

    body_fetches.clear()
    warm = _crawl()
    assert warm["resume_state"]["skipped_unchanged"] == 3
    assert body_fetches == []
|
|
1077
|
+
|
|
1078
|
+
|
|
1079
|
+
def test_crawl_tree_resume_runs_version_pass_before_full_fetch(tmp_path: Path) -> None:
    """Check the order of version probes and full fetches on a resumed crawl.

    After a first run over pages "1" -> "2", page "2" is bumped to version 2.
    The second run must probe page "1" without fully fetching it, and must
    probe page "2" before its full fetch appears in the call log.
    """
    versions = {"1": 1, "2": 1}
    call_log: list[tuple[str, str]] = []

    class DummyClient:
        """Stub client logging each ``get_page`` call as ("version" | "full", page_id)."""

        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            probe = expand == ["version"]
            call_log.append(("version" if probe else "full", page_id))
            doc: dict[str, Any] = {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": {"view": {"value": "" if probe else f"<p>Content {page_id}</p>"}},
                "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
            }
            if not probe:
                # Only the full payload carries the space key.
                doc["space"] = {"key": "SPACE"}
            doc["version"] = {"number": versions[page_id]}
            return doc

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return [{"id": "2"}] if page_id == "1" else []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    target_dir = tmp_path / "out"
    fake_http = DummyClient()

    def _crawl() -> Any:
        return crawl_tree_to_disk(
            http=fake_http,
            root_ref="1",
            output_dir=target_dir,
            options=CrawlOptions(max_depth=1, include_attachments=False, resume=True),
        )

    _crawl()

    # Mutate only the child, then start the log from scratch for the warm run.
    call_log.clear()
    versions["2"] = 2
    warm = _crawl()

    assert warm["resume_state"]["skipped_unchanged"] == 1
    assert ("version", "1") in call_log
    assert ("full", "1") not in call_log
    probe_at = call_log.index(("version", "2"))
    fetch_at = call_log.index(("full", "2"))
    assert probe_at < fetch_at
|
|
1141
|
+
|
|
1142
|
+
|
|
1143
|
+
def test_crawl_tree_resume_refreshes_only_changed_page(tmp_path: Path) -> None:
    """Phase 27 TSK-275.2: a resumed crawl should fully fetch only the mutated page.

    After an initial run over a root with children "100" and "200", page "200"
    is bumped to version 2. The second run must skip two pages as unchanged and
    issue exactly one non-probe ``get_page`` call, for "200".
    """
    root_id = "88716667"
    versions = {root_id: 1, "100": 1, "200": 1}
    body_fetches: list[str] = []

    def _payload(page_id: str, html: str) -> dict[str, Any]:
        # Version is read at call time so later bumps are visible to the crawler.
        return {
            "id": page_id,
            "title": f"Page {page_id}",
            "body": {"view": {"value": html}},
            "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
            "version": {"number": versions[page_id]},
        }

    class DummyClient:
        """Stub client that records every call that is not a version-only probe."""

        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            version_only = bool(expand) and len(expand) == 1 and "version" in expand
            if version_only:
                return _payload(page_id, "")
            body_fetches.append(page_id)
            return _payload(page_id, f"<p>Content {page_id} v{versions[page_id]}</p>")

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return [{"id": "100"}, {"id": "200"}] if page_id == root_id else []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    target_dir = tmp_path / "out"
    fake_http = DummyClient()

    def _crawl() -> Any:
        return crawl_tree_to_disk(
            http=fake_http,
            root_ref=root_id,
            output_dir=target_dir,
            options=CrawlOptions(max_depth=1, include_attachments=False, resume=True),
        )

    _crawl()

    body_fetches.clear()
    versions["200"] = 2
    warm = _crawl()

    assert warm["resume_state"]["skipped_unchanged"] == 2
    assert body_fetches == ["200"]
|
|
1200
|
+
|
|
1201
|
+
|
|
1202
|
+
def test_crawl_tree_resume_emit_corpus_preserves_unchanged_pages_on_partial_refresh(tmp_path: Path) -> None:
    """Corpus emitted by a partial refresh must still cover every page.

    Pages "1" -> ("2", "3") are crawled with ``emit_corpus=True``; page "3" is
    then bumped to version 2 and the crawl is resumed. The refreshed corpus
    file must contain rows for all three page ids, and at least one row for
    page "3" must carry the "v2" text.
    """
    versions = {"1": 1, "2": 1, "3": 1}

    def _payload(page_id: str, html: str) -> dict[str, Any]:
        return {
            "id": page_id,
            "title": f"Page {page_id}",
            "body": {"view": {"value": html}},
            "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
            "version": {"number": versions[page_id]},
        }

    class DummyClient:
        """Stub client whose page bodies embed the current version number."""

        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            version_only = bool(expand) and len(expand) == 1 and "version" in expand
            if version_only:
                return _payload(page_id, "")
            return _payload(page_id, f"<p>Content {page_id} v{versions[page_id]}</p>")

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return [{"id": "2"}, {"id": "3"}] if page_id == "1" else []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    target_dir = tmp_path / "out"
    fake_http = DummyClient()

    def _crawl() -> Any:
        return crawl_tree_to_disk(
            http=fake_http,
            root_ref="1",
            output_dir=target_dir,
            options=CrawlOptions(max_depth=1, include_attachments=False, resume=True, emit_corpus=True),
        )

    first_pass = _crawl()
    assert first_pass["corpus_path"]

    versions["3"] = 2
    second_pass = _crawl()
    assert second_pass["resume_state"]["skipped_unchanged"] == 2

    # The corpus is read as JSON Lines; blank lines are ignored.
    corpus_file = Path(str(second_pass["corpus_path"]))
    rows = []
    for raw_line in corpus_file.read_text(encoding="utf-8").splitlines():
        if raw_line.strip():
            rows.append(json.loads(raw_line))

    seen_ids = set()
    for row in rows:
        if isinstance(row, dict):
            seen_ids.add(str(row.get("page_id") or ""))
    assert seen_ids == {"1", "2", "3"}

    texts_for_three = []
    for row in rows:
        if str(row.get("page_id") or "") == "3":
            texts_for_three.append(str(row.get("text") or ""))
    assert any("v2" in chunk for chunk in texts_for_three)
|
|
1263
|
+
|
|
1264
|
+
|
|
1265
|
+
def test_crawl_tree_cold_vs_warm_call_profile_for_root_88716667(tmp_path: Path) -> None:
    """Phase 27 TSK-275.3 benchmark proxy: a warm run issues fewer full fetches.

    A root with three children is crawled twice with ``resume=True``. The cold
    run must make four non-probe ``get_page`` calls; the warm run must make
    strictly fewer.
    """
    root_id = "88716667"
    body_fetches: list[str] = []

    def _payload(page_id: str, html: str) -> dict[str, Any]:
        return {
            "id": page_id,
            "title": f"Page {page_id}",
            "body": {"view": {"value": html}},
            "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
            "version": {"number": 1},
        }

    class DummyClient:
        """Stub client that records every call that is not a version-only probe."""

        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            version_only = bool(expand) and len(expand) == 1 and "version" in expand
            if version_only:
                return _payload(page_id, "")
            body_fetches.append(page_id)
            return _payload(page_id, f"<p>Content {page_id}</p>")

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            if page_id != root_id:
                return []
            return [{"id": "100"}, {"id": "200"}, {"id": "300"}]

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    target_dir = tmp_path / "out"
    fake_http = DummyClient()

    def _crawl() -> Any:
        return crawl_tree_to_disk(
            http=fake_http,
            root_ref=root_id,
            output_dir=target_dir,
            options=CrawlOptions(max_depth=1, include_attachments=False, resume=True),
        )

    _crawl()
    cold_count = len(body_fetches)
    assert cold_count == 4

    body_fetches.clear()
    _crawl()
    warm_count = len(body_fetches)
    assert warm_count < cold_count
|
|
1322
|
+
|
|
1323
|
+
|
|
1324
|
+
def test_crawl_tree_resume_state_reports_queue_exhausted(tmp_path: Path) -> None:
    """A finished single-page crawl must report a drained queue in ``resume_state``.

    Asserts ``completed`` is True, ``termination_reason`` is "queue_exhausted"
    and ``pending_queue_size`` is 0 on the returned result.
    """

    class DummyClient:
        """Stub client serving one childless, attachment-free page."""

        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            title = f"Page {page_id}"
            html = f"<p>Content {page_id}</p>"
            links = {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"}
            return {
                "id": page_id,
                "title": title,
                "body": {"view": {"value": html}},
                "_links": links,
                "version": {"number": 1},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    outcome = crawl_tree_to_disk(
        http=DummyClient(),
        root_ref="1",
        output_dir=tmp_path / "out",
        options=CrawlOptions(max_depth=0, include_attachments=False, resume=True),
    )

    resume_info = outcome["resume_state"]
    assert resume_info["completed"] is True
    assert resume_info["termination_reason"] == "queue_exhausted"
    assert resume_info["pending_queue_size"] == 0
|
|
1357
|
+
|
|
1358
|
+
|
|
1359
|
+
def test_crawl_tree_resume_skip_unchanged_with_concurrency(tmp_path: Path) -> None:
    """Resume should still cut full fetches when ``page_concurrency=3``.

    A root with three children is crawled twice with ``resume=True``. The cold
    run must make four non-probe ``get_page`` calls and the warm run strictly
    fewer.
    """
    body_fetches: list[str] = []

    def _payload(page_id: str, html: str) -> dict[str, Any]:
        return {
            "id": page_id,
            "title": f"Page {page_id}",
            "body": {"view": {"value": html}},
            "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
            "version": {"number": 1},
        }

    class DummyClient:
        """Stub client that records every call that is not a version-only probe."""

        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            version_only = bool(expand) and len(expand) == 1 and "version" in expand
            if version_only:
                return _payload(page_id, "")
            body_fetches.append(page_id)
            return _payload(page_id, f"<p>Content {page_id}</p>")

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            if page_id != "1":
                return []
            return [{"id": "2"}, {"id": "3"}, {"id": "4"}]

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    target_dir = tmp_path / "out"
    fake_http = DummyClient()

    def _crawl() -> Any:
        return crawl_tree_to_disk(
            http=fake_http,
            root_ref="1",
            output_dir=target_dir,
            options=CrawlOptions(max_depth=1, include_attachments=False, resume=True, page_concurrency=3),
        )

    _crawl()
    cold_count = len(body_fetches)
    assert cold_count == 4

    body_fetches.clear()
    _crawl()
    assert len(body_fetches) < cold_count
|
|
1414
|
+
|
|
1415
|
+
|
|
1416
|
+
def test_crawl_tree_resume_supports_incremental_high_water_mark_and_reconciliation(tmp_path: Path) -> None:
    """Resumed crawl must run the incremental search and reconcile a removed page.

    The first run sees pages "1" -> "2". Before the second run the stub stops
    listing "2" as a child and in unfiltered search results. The persisted
    ``crawl_state.json`` and the returned ``resume_state`` are then checked for
    the sync timestamps, the reconciliation diagnostics, and the recorded
    ``search_by_space_and_type`` calls.
    """
    search_calls: list[dict[str, Any]] = []
    child_removed = False  # flipped between the two runs; the stub reads it lazily

    class DummyClient:
        """Stub client that records ``search_by_space_and_type`` invocations."""

        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            version_block = {"number": 1}
            header = {"id": page_id, "title": f"Page {page_id}"}
            if expand == ["version"]:
                return {**header, "version": version_block}
            return {
                **header,
                "body": {"view": {"value": f"<p>Content {page_id}</p>"}},
                "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
                "space": {"key": "SPACE"},
                "version": version_block,
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            if child_removed or page_id != "1":
                return []
            return [{"id": "2"}]

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

        def search_by_space_and_type(
            self,
            space_key: str | None = None,
            content_type: str | None = None,
            *,
            limit: int = 25,
            start: int = 0,
            expand: list[str] | None = None,
            last_modified_after: Any = None,
            overlap_minutes: int = 0,
            **_: Any,
        ) -> dict[str, Any]:
            search_calls.append(
                dict(
                    space_key=space_key,
                    content_type=content_type,
                    limit=limit,
                    start=start,
                    last_modified_after=last_modified_after,
                    overlap_minutes=overlap_minutes,
                )
            )
            hits: list[dict[str, Any]]
            if last_modified_after is not None:
                # Filtered (incremental) queries never report changes.
                hits = []
            elif child_removed:
                hits = [{"id": "1"}]
            else:
                hits = [{"id": "1"}, {"id": "2"}]
            return {"results": hits}

    target_dir = tmp_path / "out"
    fake_http = DummyClient()

    def _crawl() -> Any:
        return crawl_tree_to_disk(
            http=fake_http,
            root_ref="1",
            output_dir=target_dir,
            options=CrawlOptions(max_depth=1, include_attachments=False, resume=True),
        )

    _crawl()

    child_removed = True
    search_calls.clear()
    outcome = _crawl()

    persisted = json.loads((target_dir / "crawl_state.json").read_text(encoding="utf-8"))
    assert "2" not in persisted["visited_pages"]
    assert persisted["last_sync_timestamp"] is not None
    assert persisted["last_full_reconciliation_timestamp"] is not None

    diag = persisted["run_diagnostics"]
    assert diag["full_reconciliation_ran"] is True
    assert diag["full_reconciliation_deleted_ids"] == ["2"]
    assert diag["full_reconciliation_skipped"] is False
    assert diag["full_reconciliation_skip_reason"] is None
    assert diag["incremental_known_page_count"] == 2
    assert diag["incremental_high_water_mark_ran"] is True
    assert diag["incremental_high_water_mark_skipped"] is False
    assert diag["incremental_high_water_mark_skip_reason"] is None

    filtered = [entry for entry in search_calls if entry["last_modified_after"] is not None]
    assert filtered
    assert all(entry["overlap_minutes"] == 5 for entry in filtered)
    unfiltered = [entry for entry in search_calls if entry["last_modified_after"] is None]
    assert unfiltered

    resume_info = outcome["resume_state"]
    assert resume_info["last_sync_timestamp"] is not None
    assert resume_info["last_full_reconciliation_timestamp"] is not None
|
|
1520
|
+
|
|
1521
|
+
|
|
1522
|
+
def test_crawl_tree_resume_skips_full_reconciliation_for_root_only_state(tmp_path: Path) -> None:
    """With only the root page known, resume must skip both search passes.

    A childless root is crawled twice at ``max_depth=0``. After the second run
    the persisted diagnostics must mark full reconciliation and the incremental
    high-water-mark pass as skipped with reason "known_pages_lte_1", and no
    ``search_by_space_and_type`` call may have been recorded.
    """
    search_calls: list[dict[str, Any]] = []

    class DummyClient:
        """Stub client that records ``search_by_space_and_type`` invocations."""

        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            header = {"id": page_id, "title": f"Page {page_id}"}
            if expand == ["version"]:
                return {**header, "version": {"number": 1}}
            return {
                **header,
                "body": {"view": {"value": f"<p>Content {page_id}</p>"}},
                "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
                "space": {"key": "SPACE"},
                "version": {"number": 1},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

        def search_by_space_and_type(
            self,
            space_key: str | None = None,
            content_type: str | None = None,
            *,
            limit: int = 25,
            start: int = 0,
            expand: list[str] | None = None,
            last_modified_after: Any = None,
            overlap_minutes: int = 0,
            **_: Any,
        ) -> dict[str, Any]:
            search_calls.append(
                dict(
                    space_key=space_key,
                    content_type=content_type,
                    limit=limit,
                    start=start,
                    last_modified_after=last_modified_after,
                    overlap_minutes=overlap_minutes,
                )
            )
            hits: list[dict[str, Any]] = [] if last_modified_after is not None else [{"id": "1"}]
            return {"results": hits}

    target_dir = tmp_path / "out"
    fake_http = DummyClient()

    def _crawl() -> Any:
        return crawl_tree_to_disk(
            http=fake_http,
            root_ref="1",
            output_dir=target_dir,
            options=CrawlOptions(max_depth=0, include_attachments=False, resume=True),
        )

    _crawl()

    search_calls.clear()
    outcome = _crawl()

    persisted = json.loads((target_dir / "crawl_state.json").read_text(encoding="utf-8"))
    assert persisted["last_full_reconciliation_timestamp"] is not None

    diag = persisted["run_diagnostics"]
    assert diag["full_reconciliation_ran"] is False
    assert diag["full_reconciliation_skipped"] is True
    assert diag["full_reconciliation_skip_reason"] == "known_pages_lte_1"
    assert diag["incremental_known_page_count"] == 1
    assert diag["incremental_high_water_mark_ran"] is False
    assert diag["incremental_high_water_mark_skipped"] is True
    assert diag["incremental_high_water_mark_skip_reason"] == "known_pages_lte_1"

    filtered = [entry for entry in search_calls if entry["last_modified_after"] is not None]
    assert not filtered
    unfiltered = [entry for entry in search_calls if entry["last_modified_after"] is None]
    assert not unfiltered

    assert outcome["resume_state"]["last_full_reconciliation_timestamp"] is not None
|
|
1610
|
+
|
|
1611
|
+
|
|
1612
|
+
def test_crawl_tree_exposes_http_telemetry_in_manifest_and_state(tmp_path: Path) -> None:
    """Telemetry reported by the client must appear in the result and state file.

    The stub exposes ``get_runtime_telemetry``; the test checks that its values
    are present under both ``http_telemetry`` and ``telemetry`` in the returned
    result and in ``run_diagnostics`` of the persisted ``crawl_state.json``.
    """

    class DummyClient:
        """Stub client serving a single page plus fixed runtime telemetry."""

        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            html = f"<p>Content {page_id}</p>"
            links = {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"}
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": {"view": {"value": html}},
                "_links": links,
                "version": {"number": 1},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

        def get_runtime_telemetry(self) -> dict[str, Any]:
            throttle = {"effective_concurrency_signal": 0.73, "pressure_score": 0.27}
            retry_events = [{"reason": "http_429"}, {"reason": "timeout"}]
            return {
                "retries_total": 2,
                "rate_limit_total": 1,
                "timeout_total": 1,
                "adaptive_throttle": throttle,
                "recent_retry_events": retry_events,
            }

    target_dir = tmp_path / "out"
    outcome = crawl_tree_to_disk(
        http=DummyClient(),
        root_ref="1",
        output_dir=target_dir,
        options=CrawlOptions(max_depth=0, include_attachments=False, resume=True),
    )

    # In-memory result.
    assert outcome["http_telemetry"]["retries_total"] == 2
    assert outcome["telemetry"]["retries_total"] == 2
    assert outcome["http_telemetry"]["adaptive_throttle"]["effective_concurrency_signal"] == 0.73
    assert outcome["resume_state"]["completed"] is True

    # Persisted state file.
    state_file = target_dir / "crawl_state.json"
    persisted = json.loads(state_file.read_text(encoding="utf-8"))
    assert persisted["run_diagnostics"]["http_telemetry"]["timeout_total"] == 1
    assert persisted["run_diagnostics"]["telemetry"]["timeout_total"] == 1
    assert persisted["run_diagnostics"]["progress_event"]["pages_discovered"] == 1
|
|
1661
|
+
|
|
1662
|
+
|
|
1663
|
+
def test_crawl_tree_emits_crawl_progress_runtime_events(tmp_path: Path) -> None:
    """The ``runtime_event_sink`` must receive "crawl_progress" events.

    A root with two children is crawled with ``page_concurrency=2`` and a sink
    that captures every event. At least one "crawl_progress" event must arrive,
    and the last one must match the returned ``progress_event`` on the four
    page counters.
    """
    captured: list[tuple[str, dict[str, Any]]] = []

    class DummyClient:
        """Stub client serving root "1" with children "2" and "3"."""

        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            html = f"<p>Content {page_id}</p>"
            links = {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"}
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": {"view": {"value": html}},
                "_links": links,
                "version": {"number": 1},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return [{"id": "2"}, {"id": "3"}] if page_id == "1" else []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    def _capture_runtime_event(event: str, payload: dict[str, Any]) -> None:
        captured.append((event, payload))

    crawl_options = CrawlOptions(
        max_depth=1,
        include_attachments=False,
        page_concurrency=2,
        runtime_event_sink=_capture_runtime_event,
    )
    outcome = crawl_tree_to_disk(
        http=DummyClient(),
        root_ref="1",
        output_dir=tmp_path / "out",
        options=crawl_options,
    )

    progress_payloads = [body for label, body in captured if label == "crawl_progress"]
    assert progress_payloads

    # The final emitted event must agree with the summary returned to the caller.
    last_event = progress_payloads[-1]
    for counter in ("pages_discovered", "pages_fetched", "pages_failed", "pages_skipped_unchanged"):
        assert last_event[counter] == outcome["progress_event"][counter]
|
|
1711
|
+
|
|
1712
|
+
|
|
1713
|
+
def test_crawl_tree_emits_pages_deleted_event(tmp_path: Path) -> None:
    """A resumed crawl must emit ``pages_deleted`` once child page "2" disappears."""
    captured: list[tuple[str, dict[str, Any]]] = []
    search_calls: list[dict[str, Any]] = []
    # Flipped to True between the two crawls to simulate page "2" being removed.
    child_removed = False

    class DummyClient:
        """Stub client whose child listing and search results depend on ``child_removed``."""

        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            title = f"Page {page_id}"
            if expand == ["version"]:
                # Version-only lookup: return the lightweight payload.
                return {"id": page_id, "title": title, "version": {"number": 1}}
            return {
                "id": page_id,
                "title": title,
                "body": {"view": {"value": f"<p>Content {page_id}</p>"}},
                "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
                "space": {"key": "SPACE"},
                "version": {"number": 1},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            if child_removed or page_id != "1":
                return []
            return [{"id": "2"}]

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

        def search_by_space_and_type(
            self,
            space_key: str | None = None,
            content_type: str | None = None,
            *,
            limit: int = 25,
            start: int = 0,
            expand: list[str] | None = None,
            last_modified_after: Any = None,
            overlap_minutes: int = 0,
            **_: Any,
        ) -> dict[str, Any]:
            call_record = {
                "space_key": space_key,
                "content_type": content_type,
                "limit": limit,
                "start": start,
                "last_modified_after": last_modified_after,
                "overlap_minutes": overlap_minutes,
            }
            search_calls.append(call_record)

            if last_modified_after is not None:
                live_ids: list[str] = []
            elif child_removed:
                live_ids = ["1"]
            else:
                live_ids = ["1", "2"]
            return {"results": [{"id": live_id} for live_id in live_ids]}

    def _capture_runtime_event(event: str, payload: dict[str, Any]) -> None:
        captured.append((event, payload))

    out_dir = tmp_path / "out"
    client = DummyClient()

    def _run_crawl() -> None:
        # Both passes use the same client, output directory and freshly built options.
        crawl_tree_to_disk(
            http=client,
            root_ref="1",
            output_dir=out_dir,
            options=CrawlOptions(
                max_depth=1,
                include_attachments=False,
                resume=True,
                runtime_event_sink=_capture_runtime_event,
            ),
        )

    # First pass: both pages exist.
    _run_crawl()

    child_removed = True
    captured.clear()
    search_calls.clear()

    # Second pass: page "2" is gone.
    _run_crawl()

    deletion_payloads = [body for kind, body in captured if kind == "pages_deleted"]
    assert deletion_payloads
    latest = deletion_payloads[-1]
    assert latest["deleted_count"] == 1
    assert latest["page_ids"] == ["2"]
|
|
1810
|
+
|
|
1811
|
+
|
|
1812
|
+
def test_crawl_tree_throughput_metrics_and_adaptive_signal(tmp_path: Path) -> None:
    """Throughput telemetry and the throttle section of the progress event are reported."""

    class DummyClient:
        """Stub client with a three-page tree and canned runtime telemetry."""

        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            page: dict[str, Any] = {"id": page_id, "title": f"Page {page_id}"}
            page["body"] = {"view": {"value": f"<p>Content {page_id}</p>"}}
            page["_links"] = {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"}
            page["version"] = {"number": 1}
            return page

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return [{"id": "2"}, {"id": "3"}] if page_id == "1" else []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

        def get_runtime_telemetry(self) -> dict[str, Any]:
            throttle = {"effective_concurrency_signal": 0.2, "pressure_score": 0.8}
            retries = [{"reason": "http_429"}, {"reason": "timeout"}]
            return {
                "adaptive_throttle": throttle,
                "latency_ms": {"operation": {"p95": 123.4}},
                "recent_retry_events": retries,
            }

    client = DummyClient()
    destination = tmp_path / "out"
    crawl_options = CrawlOptions(max_depth=1, include_attachments=False, page_concurrency=4)
    result = crawl_tree_to_disk(
        http=client,
        root_ref="1",
        output_dir=destination,
        options=crawl_options,
    )

    telemetry = result["http_telemetry"]
    assert telemetry["inflight_workers"] == 1
    assert telemetry["queue_depth"] == 0
    assert telemetry["pages_per_second"] > 0
    assert telemetry["retry_events"] == 2
    assert telemetry["p95_page_latency_ms"] == 123.4

    expected_throttle = {
        "configured_page_concurrency": 4,
        "effective_page_concurrency": 1,
        "effective_concurrency_signal": 0.2,
        "pressure_score": 0.8,
    }
    expected_progress = {
        "pages_discovered": 3,
        "pages_fetched": 3,
        "pages_failed": 0,
        "pages_skipped": 0,
        "pages_skipped_unchanged": 0,
        "throttle": expected_throttle,
    }
    assert result["progress_event"] == expected_progress
|
|
1872
|
+
|
|
1873
|
+
|
|
1874
|
+
def test_crawl_tree_resume_continues_from_pending_queue(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """A crawl cut short by ``max_nodes`` is reported incomplete and finishes when resumed."""
    # NOTE(review): this list is filled and cleared but never asserted on — confirm
    # whether the resumed run should be checked for fetching pages 3 and 4.
    pages_fetched: list[str] = []

    class DummyClient:
        """Stub client: root "1" with children "2", "3", "4"; records every get_page call."""

        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            pages_fetched.append(page_id)
            # Each page reports its numeric id as its version number.
            version_number = int(page_id)
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": {"view": {"value": f"<p>Content {page_id}</p>"}},
                "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
                "version": {"number": version_number},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            if page_id != "1":
                return []
            return [{"id": child_id} for child_id in ("2", "3", "4")]

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    dummy = DummyClient()
    out_dir = tmp_path / "out"

    # First pass is capped at two nodes, so the crawl must stop before finishing.
    capped_options = CrawlOptions(max_depth=1, include_attachments=False, resume=True, max_nodes=2)
    first_run = crawl_tree_to_disk(
        http=dummy,
        root_ref="1",
        output_dir=out_dir,
        options=capped_options,
    )
    first_state = first_run["resume_state"]
    assert first_state["nodes_processed"] == 2
    assert first_state["completed"] is False

    pages_fetched.clear()

    # Second pass has no node cap and resumes from the saved state.
    uncapped_options = CrawlOptions(max_depth=1, include_attachments=False, resume=True)
    second_run = crawl_tree_to_disk(
        http=dummy,
        root_ref="1",
        output_dir=out_dir,
        options=uncapped_options,
    )

    assert second_run["resume_state"]["completed"] is True
|
|
1929
|
+
|
|
1930
|
+
|
|
1931
|
+
def test_crawl_tree_cli_resume_flag(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """``crawl-tree --resume`` exits cleanly and reports the state file path."""

    class DummyClient:
        """Stub client serving a single childless page without attachments."""

        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            page: dict[str, Any] = {"id": page_id, "title": f"Page {page_id}"}
            page["body"] = {"view": {"value": "<p>Content</p>"}}
            page["_links"] = {"base": "http://example", "webui": "/display/SPACE/Page"}
            page["version"] = {"number": 1}
            return page

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    dummy = DummyClient()
    # Replace the CLI's HTTP client factory so no real client is built.
    monkeypatch.setattr("confluence_orchestrator.cli._build_http_client", lambda settings, server: dummy)

    out_dir = tmp_path / "out"
    cli_args = ["--report-dir", str(tmp_path), "crawl-tree", "1"]
    cli_args += ["--out", str(out_dir)]
    cli_args += ["--depth", "0"]
    cli_args += ["--no-include-attachments", "--resume"]

    result = runner.invoke(app, cli_args)

    assert result.exit_code == 0, result.output
    payload = json.loads(result.stdout)
    assert "resume_state" in payload
    expected_state_path = str(out_dir / "crawl_state.json")
    assert payload["resume_state"]["state_path"] == expected_state_path
|
|
1982
|
+
|
|
1983
|
+
|
|
1984
|
+
def test_crawl_tree_cli_accepts_phase27_compat_flags(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """Phase 27 compatibility: crawl-tree accepts the cache and content-root flags and echoes them back."""

    class DummyClient:
        """Stub client serving a single childless page without attachments."""

        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            page: dict[str, Any] = {"id": page_id, "title": f"Page {page_id}"}
            page["body"] = {"view": {"value": "<p>Content</p>"}}
            page["_links"] = {"base": "http://example", "webui": "/display/SPACE/Page"}
            page["version"] = {"number": 1}
            return page

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    # The factory hands out a fresh stub every time the CLI asks for a client.
    monkeypatch.setattr("confluence_orchestrator.cli._build_http_client", lambda settings, server: DummyClient())

    out_dir = tmp_path / "out"
    content_root = tmp_path / "content-root"

    base_args = ["--report-dir", str(tmp_path), "crawl-tree", "1", "--out", str(out_dir)]
    crawl_args = ["--depth", "0", "--no-include-attachments"]
    cache_args = [
        "--cache-mode",
        "refresh",
        "--cache-ttl-hours",
        "48",
        "--incremental",
        "--content-root",
        str(content_root),
        "--project-storage-key",
        "project-x",
    ]
    result = runner.invoke(app, base_args + crawl_args + cache_args)

    assert result.exit_code == 0, result.output
    cache_info = json.loads(result.stdout)["cache"]
    assert cache_info["mode"] == "refresh"
    assert cache_info["ttl_hours"] == 48
    assert cache_info["incremental"] is True
    assert cache_info["effective_resume"] is True
    assert cache_info["content_root"] == str(content_root)
    assert cache_info["project_storage_key"] == "project-x"
|
|
2045
|
+
|
|
2046
|
+
|
|
2047
|
+
def test_crawl_tree_cli_accepts_concurrency_and_staged_mode_flags(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """``--page-concurrency`` and ``--staged-mode`` are accepted and reported under ``crawl_mode``."""

    class DummyClient:
        """Stub client serving a single childless page without attachments."""

        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            page: dict[str, Any] = {"id": page_id, "title": f"Page {page_id}"}
            page["body"] = {"view": {"value": "<p>Content</p>"}}
            page["_links"] = {"base": "http://example", "webui": "/display/SPACE/Page"}
            page["version"] = {"number": 1}
            return page

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    monkeypatch.setattr("confluence_orchestrator.cli._build_http_client", lambda settings, server: DummyClient())

    out_dir = tmp_path / "out"
    cli_args = ["--report-dir", str(tmp_path), "crawl-tree", "1"]
    cli_args += ["--out", str(out_dir)]
    cli_args += ["--depth", "0", "--no-include-attachments"]
    cli_args += ["--page-concurrency", "3"]
    cli_args += ["--staged-mode", "structure-first"]

    result = runner.invoke(app, cli_args)

    assert result.exit_code == 0, result.output
    crawl_mode = json.loads(result.stdout)["crawl_mode"]
    assert crawl_mode["page_concurrency"] == 3
    # The hyphenated CLI value is expected back with an underscore.
    assert crawl_mode["staged_mode"] == "structure_first"
|
|
2098
|
+
|
|
2099
|
+
|
|
2100
|
+
# =============================================================================
|
|
2101
|
+
# TSK-189: Bundle Zip Tests
|
|
2102
|
+
# =============================================================================
|
|
2103
|
+
|
|
2104
|
+
|
|
2105
|
+
def test_create_bundle_zip_contains_all_artifacts(tmp_path: Path) -> None:
    """Every artifact written to the output directory must appear in the bundle zip."""
    out_dir = tmp_path / "out"
    out_dir.mkdir()
    attachments_dir = out_dir / "attachments"
    attachments_dir.mkdir()
    manifest_path = out_dir / "crawl_manifest.json"

    # Fixture files, written in this order; the manifest goes last.
    fixture_files = [
        (out_dir / "123_Root.html", "<p>Root page</p>"),
        (out_dir / "456_Child.html", "<p>Child page</p>"),
        (attachments_dir / "a1_file.txt", "attachment content"),
        (out_dir / "corpus.jsonl", '{"chunk_id": "abc"}\n'),
        (out_dir / "link-graph.json", '{"edges": []}'),
        (out_dir / "tables.jsonl", '{"table_id": "t1"}\n'),
        (out_dir / "mentions.jsonl", '{"mention_id": "m1"}\n'),
        (out_dir / "crawl_state.json", '{"completed": true}'),
        (manifest_path, '{"root_page_id": "123"}'),
    ]
    for target, text in fixture_files:
        target.write_text(text)

    bundle_path = _create_bundle_zip(out_dir, "123", manifest_path)

    assert bundle_path.exists()
    assert bundle_path.suffix == ".zip"
    assert "crawl_bundle_123_" in bundle_path.name

    with zipfile.ZipFile(bundle_path, "r") as archive:
        names = archive.namelist()

    expected_members = (
        "123_Root.html",
        "456_Child.html",
        "attachments/a1_file.txt",
        "corpus.jsonl",
        "link-graph.json",
        "tables.jsonl",
        "mentions.jsonl",
        "crawl_state.json",
        "crawl_manifest.json",
    )
    for member in expected_members:
        assert member in names
|
|
2143
|
+
|
|
2144
|
+
|
|
2145
|
+
def test_crawl_tree_bundle_option(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """``crawl-tree --bundle`` reports a ``bundle_path`` pointing at an existing zip archive."""

    class DummyClient:
        """Stub client: one childless page with a single text attachment."""

        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            page: dict[str, Any] = {"id": page_id, "title": f"Page {page_id}"}
            page["body"] = {"view": {"value": f"<p>Content {page_id}</p>"}}
            page["_links"] = {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"}
            page["version"] = {"number": 1}
            return page

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            attachment = {"id": "a1", "title": "file.txt", "version": {"number": 1}}
            return [attachment]

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b"attachment content"

    dummy = DummyClient()
    monkeypatch.setattr("confluence_orchestrator.cli._build_http_client", lambda settings, server: dummy)

    out_dir = tmp_path / "out"
    cli_args = ["--report-dir", str(tmp_path), "crawl-tree", "1"]
    cli_args += ["--out", str(out_dir)]
    cli_args += ["--depth", "0"]
    cli_args += ["--emit-corpus", "--emit-link-graph", "--bundle"]

    result = runner.invoke(app, cli_args)

    assert result.exit_code == 0, result.output
    payload = json.loads(result.stdout)

    # The CLI output must point at a bundle that exists on disk.
    assert "bundle_path" in payload
    bundle_path = Path(payload["bundle_path"])
    assert bundle_path.exists()

    with zipfile.ZipFile(bundle_path, "r") as archive:
        names = archive.namelist()

    html_members = [name for name in names if name.endswith(".html")]
    assert html_members
    for member in ("corpus.jsonl", "link-graph.json", "crawl_manifest.json"):
        assert member in names
|
|
2206
|
+
|
|
2207
|
+
|
|
2208
|
+
def test_crawl_tree_bundle_with_no_attachments(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
|
|
2209
|
+
"""Test bundle works even without attachments."""
|
|
2210
|
+
|
|
2211
|
+
class DummyClient:
|
|
2212
|
+
def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
|
|
2213
|
+
raise AssertionError("search_cql should not be called")
|
|
2214
|
+
|
|
2215
|
+
def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
|
|
2216
|
+
return {
|
|
2217
|
+
"id": page_id,
|
|
2218
|
+
"title": "Root",
|
|
2219
|
+
"body": {"view": {"value": "<p>Content</p>"}},
|
|
2220
|
+
"_links": {"base": "http://example", "webui": "/display/SPACE/Root"},
|
|
2221
|
+
"version": {"number": 1},
|
|
2222
|
+
}
|
|
2223
|
+
|
|
2224
|
+
def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
|
|
2225
|
+
return []
|
|
2226
|
+
|
|
2227
|
+
def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
|
|
2228
|
+
return []
|
|
2229
|
+
|
|
2230
|
+
def get_attachment_content(self, attachment_id: str) -> bytes:
|
|
2231
|
+
return b""
|
|
2232
|
+
|
|
2233
|
+
dummy = DummyClient()
|
|
2234
|
+
out_dir = tmp_path / "out"
|
|
2235
|
+
|
|
2236
|
+
result = crawl_tree_to_disk(
|
|
2237
|
+
http=dummy,
|
|
2238
|
+
root_ref="1",
|
|
2239
|
+
output_dir=out_dir,
|
|
2240
|
+
options=CrawlOptions(max_depth=0, include_attachments=False, bundle=True),
|
|
2241
|
+
)
|
|
2242
|
+
|
|
2243
|
+
assert "bundle_path" in result
|
|
2244
|
+
bundle_path = Path(result["bundle_path"])
|
|
2245
|
+
assert bundle_path.exists()
|
|
2246
|
+
|
|
2247
|
+
with zipfile.ZipFile(bundle_path, "r") as zf:
|
|
2248
|
+
names = zf.namelist()
|
|
2249
|
+
assert "1_Root.html" in names
|
|
2250
|
+
assert "crawl_manifest.json" in names
|