@ngocsangairvds/vsaf 3.1.27 → 3.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/src/global.js +70 -10
- package/tools/skills/vds-scripts-skill/.openskills.json +6 -0
- package/tools/skills/vds-scripts-skill/QUALITY.md +44 -0
- package/tools/skills/vds-scripts-skill/SKILL.md +135 -0
- package/tools/skills/vds-scripts-skill/references/audit-commands.md +171 -0
- package/tools/skills/vds-scripts-skill/references/capability-index.md +34 -0
- package/tools/skills/vds-scripts-skill/references/development-commands.md +12 -0
- package/tools/skills/vds-scripts-skill/references/google-sheets.md +73 -0
- package/tools/skills/vds-scripts-skill/references/integration-commands.md +17 -0
- package/tools/skills/vds-scripts-skill/references/platform-bootstrap.md +31 -0
- package/tools/skills/vds-scripts-skill/references/specialist-routing.md +14 -0
- package/tools/skills/vds-scripts-skill/references/validation-commands.md +15 -0
- package/tools/skills/vsaf-build/SKILL.md +32 -2
- package/tools/skills/vsaf-ship/SKILL.md +41 -10
- package/tools/skills/vsaf-test/SKILL.md +8 -0
- package/tools/vds-scripts/.mcp.json +11 -0
- package/tools/vds-scripts/.secrets.baseline +133 -0
- package/tools/vds-scripts/AGENTS.md +152 -0
- package/tools/vds-scripts/CLAUDE.md +101 -0
- package/tools/vds-scripts/CLI_COMMAND_OPTIMIZATION.md +156 -0
- package/tools/vds-scripts/PACKAGE_P125B_IMPLEMENTATION_SUMMARY.md +131 -0
- package/tools/vds-scripts/PROJECT_COMPLETION_SUMMARY.md +45 -0
- package/tools/vds-scripts/README.md +97 -0
- package/tools/vds-scripts/bitbucket_manifest_mapping.toml +34 -0
- package/tools/vds-scripts/bitbucket_orchestrator/ARCHITECTURE_ANALYSIS.md +258 -0
- package/tools/vds-scripts/bitbucket_orchestrator/BITBUCKET_API_PRACTICES.md +393 -0
- package/tools/vds-scripts/bitbucket_orchestrator/EVALUATION_REPORT.md +61 -0
- package/tools/vds-scripts/bitbucket_orchestrator/FEATURES.md +908 -0
- package/tools/vds-scripts/bitbucket_orchestrator/README.md +687 -0
- package/tools/vds-scripts/bitbucket_orchestrator/pyproject.toml +40 -0
- package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/__init__.py +20 -0
- package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/async_client.py +657 -0
- package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/cli.py +2108 -0
- package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/client.py +2534 -0
- package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/config.py +171 -0
- package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/errors.py +67 -0
- package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/factory.py +185 -0
- package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/protocols.py +244 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/__init__.py +8 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/conftest.py +65 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_advanced_search.py +151 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_async_client.py +546 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_branch_permissions.py +145 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_cli.py +115 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client.py +157 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_branch_conditions.py +79 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_code_advanced.py +163 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_code_file.py +32 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_deployment_environments.py +194 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_issues.py +164 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_pipelines_advanced.py +179 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_pr_blockers.py +119 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_repository_variables.py +156 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_code.py +98 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_code_advanced.py +282 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_code_insights.py +335 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_conditions.py +147 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_config.py +131 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_deployment_env.py +352 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_factory.py +371 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_fork_operations.py +204 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_issue_cli.py +261 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_pipeline_advanced.py +270 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_pr_blocker.py +204 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_protocols.py +334 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_repo_settings.py +343 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_repo_variables.py +270 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_webhooks.py +189 -0
- package/tools/vds-scripts/bitbucket_orchestrator/tests/test_workspace.py +233 -0
- package/tools/vds-scripts/bitbucket_orchestrator/uv.lock +742 -0
- package/tools/vds-scripts/confluence_orchestrator/Dockerfile +19 -0
- package/tools/vds-scripts/confluence_orchestrator/README.md +412 -0
- package/tools/vds-scripts/confluence_orchestrator/SYNC_SCRIPTS.md +127 -0
- package/tools/vds-scripts/confluence_orchestrator/SYNC_STANDARDIZATION.md +108 -0
- package/tools/vds-scripts/confluence_orchestrator/pyproject.toml +48 -0
- package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/__init__.py +20 -0
- package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/cli.py +2532 -0
- package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/config.py +175 -0
- package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/content.py +290 -0
- package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/content_v2.py +94 -0
- package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/crawl_tree.py +1835 -0
- package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/errors.py +80 -0
- package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/eventing.py +109 -0
- package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/http.py +1114 -0
- package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/orchestration.py +165 -0
- package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/reporting.py +78 -0
- package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/tree.py +121 -0
- package/tools/vds-scripts/confluence_orchestrator/sync_pdfs_from_markdown.py +213 -0
- package/tools/vds-scripts/confluence_orchestrator/sync_pdfs_to_confluence.py +305 -0
- package/tools/vds-scripts/confluence_orchestrator/sync_png_attachments.py +305 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/__init__.py +0 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/conftest.py +8 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_advanced_content.py +224 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_advanced_search.py +188 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_cache_management.py +247 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_cli.py +499 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_config.py +83 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_content.py +186 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_content_flags.py +27 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_crawl_tree.py +2250 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_draft_management.py +223 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_eventing.py +71 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_eventing_chaos.py +37 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_eventing_rate_limit.py +44 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_eventing_timeout.py +49 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_export.py +230 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_history.py +204 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_http.py +117 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_orchestration.py +91 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_reporting.py +24 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_search_cql.py +34 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_space_management.py +237 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_space_permissions.py +332 -0
- package/tools/vds-scripts/confluence_orchestrator/tests/test_user_group_management.py +388 -0
- package/tools/vds-scripts/confluence_orchestrator/uv.lock +1023 -0
- package/tools/vds-scripts/git_orchestrator/ENHANCEMENT_SUMMARY.md +119 -0
- package/tools/vds-scripts/git_orchestrator/README.md +280 -0
- package/tools/vds-scripts/git_orchestrator/VERIFICATION_REPORT.md +152 -0
- package/tools/vds-scripts/git_orchestrator/pyproject.toml +35 -0
- package/tools/vds-scripts/git_orchestrator/src/vds_git_orchestrator/__init__.py +7 -0
- package/tools/vds-scripts/git_orchestrator/src/vds_git_orchestrator/__main__.py +4 -0
- package/tools/vds-scripts/git_orchestrator/src/vds_git_orchestrator/cli.py +847 -0
- package/tools/vds-scripts/git_orchestrator/src/vds_git_orchestrator/logging_config.py +63 -0
- package/tools/vds-scripts/git_orchestrator/src/vds_git_orchestrator/manifest.py +129 -0
- package/tools/vds-scripts/git_orchestrator/src/vds_git_orchestrator/orchestrator.py +819 -0
- package/tools/vds-scripts/git_orchestrator/src/vds_git_orchestrator/reporting.py +53 -0
- package/tools/vds-scripts/git_orchestrator/tests/__init__.py +0 -0
- package/tools/vds-scripts/git_orchestrator/tests/test_cli_settings.py +21 -0
- package/tools/vds-scripts/git_orchestrator/tests/test_integration.py +74 -0
- package/tools/vds-scripts/git_orchestrator/tests/test_manifest.py +79 -0
- package/tools/vds-scripts/git_orchestrator/tests/test_orchestrator.py +204 -0
- package/tools/vds-scripts/git_orchestrator/tests/test_public_api.py +236 -0
- package/tools/vds-scripts/git_orchestrator/tests/test_resilience.py +345 -0
- package/tools/vds-scripts/git_orchestrator/uv.lock +271 -0
- package/tools/vds-scripts/jira_orchestrator/README.md +770 -0
- package/tools/vds-scripts/jira_orchestrator/pyproject.toml +39 -0
- package/tools/vds-scripts/jira_orchestrator/src/vds_jira_orchestrator/__init__.py +1 -0
- package/tools/vds-scripts/jira_orchestrator/src/vds_jira_orchestrator/adapter.py +1320 -0
- package/tools/vds-scripts/jira_orchestrator/src/vds_jira_orchestrator/cli.py +2271 -0
- package/tools/vds-scripts/jira_orchestrator/src/vds_jira_orchestrator/config.py +138 -0
- package/tools/vds-scripts/jira_orchestrator/src/vds_jira_orchestrator/errors.py +67 -0
- package/tools/vds-scripts/jira_orchestrator/src/vds_jira_orchestrator/reporting.py +65 -0
- package/tools/vds-scripts/jira_orchestrator/tests/__init__.py +1 -0
- package/tools/vds-scripts/jira_orchestrator/tests/conftest.py +86 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_agile_list_payloads.py +54 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_bulk_operations.py +69 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_components.py +57 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_createmeta.py +45 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_dashboard.py +117 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_issue_properties.py +54 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_permissions_compat.py +42 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_reindex.py +42 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_remote_links.py +76 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_transitions.py +91 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_user_management.py +110 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_version_management.py +133 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_watchers.py +41 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_advanced_search.py +164 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_agile.py +256 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_application_properties.py +193 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_backlog.py +91 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_bulk_operations.py +277 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_cli.py +106 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_components.py +106 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_config.py +164 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_dashboard.py +122 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_discover_fields.py +207 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_filter_management.py +333 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_issue_archiving.py +164 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_issue_links.py +257 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_issue_properties.py +171 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_link_types.py +314 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_parse_set.py +37 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_permissions.py +273 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_reindex.py +81 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_remote_links.py +254 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_security_schemes.py +170 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_transitions_changelog.py +114 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_user_management.py +226 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_version_management.py +339 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_watchers.py +101 -0
- package/tools/vds-scripts/jira_orchestrator/tests/test_worklog.py +223 -0
- package/tools/vds-scripts/jira_orchestrator/uv.lock +738 -0
- package/tools/vds-scripts/mcp_server/Dockerfile +34 -0
- package/tools/vds-scripts/mcp_server/README.md +140 -0
- package/tools/vds-scripts/mcp_server/pyproject.toml +42 -0
- package/tools/vds-scripts/mcp_server/src/vds_mcp_server/__init__.py +4 -0
- package/tools/vds-scripts/mcp_server/src/vds_mcp_server/config.py +36 -0
- package/tools/vds-scripts/mcp_server/src/vds_mcp_server/server.py +66 -0
- package/tools/vds-scripts/mcp_server/src/vds_mcp_server/tools/__init__.py +14 -0
- package/tools/vds-scripts/mcp_server/src/vds_mcp_server/tools/bitbucket_tools.py +47 -0
- package/tools/vds-scripts/mcp_server/src/vds_mcp_server/tools/confluence_tools.py +59 -0
- package/tools/vds-scripts/mcp_server/src/vds_mcp_server/tools/git_tools.py +71 -0
- package/tools/vds-scripts/mcp_server/src/vds_mcp_server/tools/jira_tools.py +63 -0
- package/tools/vds-scripts/mcp_server/tests/__init__.py +2 -0
- package/tools/vds-scripts/mcp_server/tests/conftest.py +29 -0
- package/tools/vds-scripts/mcp_server/tests/unit/__init__.py +2 -0
- package/tools/vds-scripts/mcp_server/tests/unit/test_bitbucket_tools.py +25 -0
- package/tools/vds-scripts/mcp_server/tests/unit/test_confluence_tools.py +25 -0
- package/tools/vds-scripts/mcp_server/tests/unit/test_git_tools.py +32 -0
- package/tools/vds-scripts/mcp_server/tests/unit/test_jira_tools.py +32 -0
- package/tools/vds-scripts/mcp_server/tests/verification/__init__.py +2 -0
- package/tools/vds-scripts/mcp_server/tests/verification/test_mcp_confluence_tools.py +40 -0
- package/tools/vds-scripts/mcp_server/tests/verification/test_mcp_jira_tools.py +37 -0
- package/tools/vds-scripts/mcp_server/tests/verification/test_mcp_tool_registration.py +47 -0
- package/tools/vds-scripts/mcp_server/uv.lock +1032 -0
- package/tools/vds-scripts/mypy.ini +5 -0
- package/tools/vds-scripts/pyproject.toml +29 -0
- package/tools/vds-scripts/repo-manifest.yaml +273 -0
- package/tools/vds-scripts/repo-manifest.yaml.example +25 -0
- package/tools/vds-scripts/scripts/BRD-Validation-API.postman_collection.json +706 -0
- package/tools/vds-scripts/scripts/BRD-Validation-README.md +308 -0
- package/tools/vds-scripts/scripts/README.md +162 -0
- package/tools/vds-scripts/scripts/bootstrap_uv.sh +30 -0
- package/tools/vds-scripts/scripts/brd-validation-environment.json +51 -0
- package/tools/vds-scripts/scripts/brd-validation-test-results.json +13023 -0
- package/tools/vds-scripts/scripts/brd_coverage_report.json +276 -0
- package/tools/vds-scripts/scripts/create_memory_session.py +35 -0
- package/tools/vds-scripts/scripts/deployment/load_docker_images_offline.sh +90 -0
- package/tools/vds-scripts/scripts/final_completion_report.md +139 -0
- package/tools/vds-scripts/scripts/folder_structure_report.json +321 -0
- package/tools/vds-scripts/scripts/generate_completion_report.py +125 -0
- package/tools/vds-scripts/scripts/generate_intellij_modules.py +150 -0
- package/tools/vds-scripts/scripts/link_integrity_report.json +807 -0
- package/tools/vds-scripts/scripts/move_audit_artifact_pages.py +255 -0
- package/tools/vds-scripts/scripts/move_audit_artifact_pages_rest.py +165 -0
- package/tools/vds-scripts/scripts/move_wrong_dept_pages.py +216 -0
- package/tools/vds-scripts/scripts/save_intellij_memories.py +120 -0
- package/tools/vds-scripts/scripts/save_memories_to_vds_ai.py +83 -0
- package/tools/vds-scripts/scripts/save_memories_vds_style.py +129 -0
- package/tools/vds-scripts/scripts/search_intellij_memories.py +50 -0
- package/tools/vds-scripts/scripts/setup_intellij_workspace.py +65 -0
- package/tools/vds-scripts/scripts/target-state-automation/README.md +89 -0
- package/tools/vds-scripts/scripts/target-state-automation/confluence_sync_coordinator.sh +27 -0
- package/tools/vds-scripts/scripts/target-state-automation/coordination.sh +114 -0
- package/tools/vds-scripts/scripts/target-state-automation/diagram_coordinator.sh +25 -0
- package/tools/vds-scripts/scripts/target-state-automation/docs_root.sh +22 -0
- package/tools/vds-scripts/scripts/target-state-automation/generate_diagrams.sh +22 -0
- package/tools/vds-scripts/scripts/target-state-automation/markdown_coordinator.sh +25 -0
- package/tools/vds-scripts/scripts/target-state-automation/progress_dashboard.sh +17 -0
- package/tools/vds-scripts/scripts/target-state-automation/schema_coordinator.sh +25 -0
- package/tools/vds-scripts/scripts/target-state-automation/sync_confluence.sh +30 -0
- package/tools/vds-scripts/scripts/target-state-automation/update_dependencies.sh +19 -0
- package/tools/vds-scripts/scripts/target-state-automation/validate_links.sh +86 -0
- package/tools/vds-scripts/scripts/target-state-automation/validate_markdown.sh +52 -0
- package/tools/vds-scripts/scripts/target-state-automation/validate_schemas.sh +26 -0
- package/tools/vds-scripts/scripts/target-state-automation/validate_structure.sh +98 -0
- package/tools/vds-scripts/scripts/update_modules_xml.py +190 -0
- package/tools/vds-scripts/scripts/uv-workspace-alignment-verification-2026-03-25.md +128 -0
- package/tools/vds-scripts/scripts/validate_brd_coverage.py +179 -0
- package/tools/vds-scripts/scripts/validate_folder_structure.py +240 -0
- package/tools/vds-scripts/scripts/validate_link_integrity.py +272 -0
- package/tools/vds-scripts/scripts/vds_sh_helpers.sh +180 -0
- package/tools/vds-scripts/scripts/verification/phase2_portable_paths_ubuntu_docker.sh +26 -0
- package/tools/vds-scripts/scripts/worktree_uv.sh +48 -0
- package/tools/vds-scripts/uv.lock +8 -0
- package/tools/vds-scripts/vds_cli/README.md +126 -0
- package/tools/vds-scripts/vds_cli/VERIFICATION_REPORT.md +41 -0
- package/tools/vds-scripts/vds_cli/pyproject.toml +38 -0
- package/tools/vds-scripts/vds_cli/src/vds_cli/__init__.py +3 -0
- package/tools/vds-scripts/vds_cli/src/vds_cli/cli.py +173 -0
- package/tools/vds-scripts/vds_cli/src/vds_cli/docs_sync.py +1203 -0
- package/tools/vds-scripts/vds_cli/src/vds_cli/env.py +41 -0
- package/tools/vds-scripts/vds_cli/src/vds_cli/google_sheets_orchestrator/__init__.py +3 -0
- package/tools/vds-scripts/vds_cli/src/vds_cli/google_sheets_orchestrator/google_sheets_orchestrator.py +198 -0
- package/tools/vds-scripts/vds_cli/src/vds_cli/router.py +93 -0
- package/tools/vds-scripts/vds_cli/src/vds_cli/sync_api.py +647 -0
- package/tools/vds-scripts/vds_cli/src/vds_cli/sync_service.py +266 -0
- package/tools/vds-scripts/vds_cli/tests/__init__.py +2 -0
- package/tools/vds-scripts/vds_cli/tests/conftest.py +49 -0
- package/tools/vds-scripts/vds_cli/tests/unit/__init__.py +2 -0
- package/tools/vds-scripts/vds_cli/tests/unit/test_cli.py +143 -0
- package/tools/vds-scripts/vds_cli/tests/unit/test_docs_sync.py +422 -0
- package/tools/vds-scripts/vds_cli/tests/unit/test_env.py +51 -0
- package/tools/vds-scripts/vds_cli/tests/unit/test_router.py +72 -0
- package/tools/vds-scripts/vds_cli/tests/unit/test_sync_api.py +357 -0
- package/tools/vds-scripts/vds_cli/tests/unit/test_sync_service.py +160 -0
- package/tools/vds-scripts/vds_cli/tests/verification/__init__.py +2 -0
- package/tools/vds-scripts/vds_cli/tests/verification/test_bitbucket_real.py +33 -0
- package/tools/vds-scripts/vds_cli/tests/verification/test_confluence_real.py +35 -0
- package/tools/vds-scripts/vds_cli/tests/verification/test_jira_real.py +41 -0
- package/tools/vds-scripts/vds_cli/uv.lock +524 -0
- package/tools/vds-scripts/vds_cli_common/README.md +190 -0
- package/tools/vds-scripts/vds_cli_common/pyproject.toml +92 -0
- package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/__init__.py +34 -0
- package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/completers.py +139 -0
- package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/context.py +201 -0
- package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/env.py +119 -0
- package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/errors.py +318 -0
- package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/output.py +284 -0
- package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/paths.py +78 -0
- package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/testing.py +213 -0
- package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/version.py +85 -0
- package/tools/vds-scripts/vds_cli_common/tests/__init__.py +1 -0
- package/tools/vds-scripts/vds_cli_common/tests/test_completers.py +148 -0
- package/tools/vds-scripts/vds_cli_common/tests/test_context.py +192 -0
- package/tools/vds-scripts/vds_cli_common/tests/test_env.py +102 -0
- package/tools/vds-scripts/vds_cli_common/tests/test_errors.py +186 -0
- package/tools/vds-scripts/vds_cli_common/tests/test_output.py +229 -0
- package/tools/vds-scripts/vds_cli_common/tests/test_paths.py +61 -0
- package/tools/vds-scripts/vds_cli_common/tests/test_testing.py +138 -0
- package/tools/vds-scripts/vds_cli_common/tests/test_version.py +64 -0
|
@@ -0,0 +1,1835 @@
|
|
|
1
|
+
"""Confluence tree crawling and on-disk export helpers.
|
|
2
|
+
|
|
3
|
+
This module is intentionally dependency-light (std lib + BeautifulSoup) so the output
|
|
4
|
+
can be used as input for downstream LLM/document analysis pipelines.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import hashlib
|
|
10
|
+
import json
|
|
11
|
+
import re
|
|
12
|
+
import time
|
|
13
|
+
import zipfile
|
|
14
|
+
from collections.abc import Callable
|
|
15
|
+
from concurrent.futures import Future, ThreadPoolExecutor
|
|
16
|
+
from dataclasses import dataclass, field
|
|
17
|
+
from datetime import UTC, datetime, timedelta
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from typing import Any
|
|
20
|
+
from urllib.parse import parse_qs, unquote, urlparse
|
|
21
|
+
|
|
22
|
+
from bs4 import BeautifulSoup
|
|
23
|
+
|
|
24
|
+
from .http import ConfluenceClient
|
|
25
|
+
|
|
26
|
+
_JIRA_KEY_RE = re.compile(r"\b[A-Z][A-Z0-9]+-\d+\b")
|
|
27
|
+
_MIME_SUFFIX_MAP: dict[str, str] = {
|
|
28
|
+
"application/json": ".json",
|
|
29
|
+
"application/msword": ".doc",
|
|
30
|
+
"application/pdf": ".pdf",
|
|
31
|
+
"application/vnd.jgraph.mxfile": ".drawio",
|
|
32
|
+
"application/vnd.ms-excel": ".xls",
|
|
33
|
+
"application/vnd.ms-powerpoint": ".ppt",
|
|
34
|
+
"application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
|
|
35
|
+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
|
|
36
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
|
|
37
|
+
"image/gif": ".gif",
|
|
38
|
+
"image/jpeg": ".jpg",
|
|
39
|
+
"image/png": ".png",
|
|
40
|
+
"image/svg+xml": ".svg",
|
|
41
|
+
"text/csv": ".csv",
|
|
42
|
+
"text/plain": ".txt",
|
|
43
|
+
}
|
|
44
|
+
def _now_iso() -> str:
|
|
45
|
+
return datetime.now(UTC).isoformat()
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _sha256_bytes(data: bytes) -> str:
|
|
49
|
+
return hashlib.sha256(data).hexdigest()
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _sha256_text(text: str) -> str:
    """Return the SHA-256 hex digest of *text* after encoding it as UTF-8."""
    encoded = text.encode("utf-8")
    return _sha256_bytes(encoded)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _parse_iso_timestamp(value: str | None) -> datetime | None:
|
|
57
|
+
if not value:
|
|
58
|
+
return None
|
|
59
|
+
cleaned = value.strip()
|
|
60
|
+
if not cleaned:
|
|
61
|
+
return None
|
|
62
|
+
if cleaned.endswith("Z"):
|
|
63
|
+
cleaned = f"{cleaned[:-1]}+00:00"
|
|
64
|
+
try:
|
|
65
|
+
parsed = datetime.fromisoformat(cleaned)
|
|
66
|
+
except ValueError:
|
|
67
|
+
return None
|
|
68
|
+
if parsed.tzinfo is None:
|
|
69
|
+
return parsed.replace(tzinfo=UTC)
|
|
70
|
+
return parsed.astimezone(UTC)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _safe_filename(name: str) -> str:
|
|
74
|
+
# Preserve unicode for VN titles, but remove path separators + control chars.
|
|
75
|
+
cleaned = name.replace("/", "-").replace("\\", "-").replace("\0", "")
|
|
76
|
+
cleaned = re.sub(r"[\r\n\t]+", " ", cleaned).strip()
|
|
77
|
+
return cleaned or "untitled"
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _normalise_suffix(value: str) -> str:
|
|
81
|
+
cleaned = (value or "").strip().lower()
|
|
82
|
+
if not cleaned:
|
|
83
|
+
return ""
|
|
84
|
+
return cleaned if cleaned.startswith(".") else f".{cleaned}"
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _guess_attachment_suffix(attachment: dict[str, Any]) -> str | None:
|
|
88
|
+
extensions = attachment.get("extensions") or {}
|
|
89
|
+
metadata = attachment.get("metadata") or {}
|
|
90
|
+
|
|
91
|
+
if isinstance(extensions, dict):
|
|
92
|
+
ext = extensions.get("fileExtension") or extensions.get("file_extension")
|
|
93
|
+
if isinstance(ext, str) and ext.strip():
|
|
94
|
+
normalised = _normalise_suffix(ext)
|
|
95
|
+
return normalised if normalised else None
|
|
96
|
+
|
|
97
|
+
media_type = None
|
|
98
|
+
if isinstance(extensions, dict):
|
|
99
|
+
media_type = extensions.get("mediaType") or extensions.get("media_type")
|
|
100
|
+
if not media_type and isinstance(metadata, dict):
|
|
101
|
+
media_type = metadata.get("mediaType") or metadata.get("media_type")
|
|
102
|
+
|
|
103
|
+
if isinstance(media_type, str) and media_type.strip():
|
|
104
|
+
mapped = _MIME_SUFFIX_MAP.get(media_type.strip().lower())
|
|
105
|
+
if mapped:
|
|
106
|
+
return mapped
|
|
107
|
+
return None
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _safe_attachment_filename(attachment_id: str, title: str, *, fallback_suffix: str | None = None) -> str:
    """Build the on-disk file name ``"<attachment_id>_<stem><suffix>"`` for an attachment.

    The title is cleaned with ``_safe_filename`` and split into stem and
    suffix. The attachment id prefix is always kept intact for stability; only
    the human-readable stem is shortened when the name would be too long.

    Args:
        attachment_id: Identifier placed in front of the title.
        title: Attachment title as reported by the server.
        fallback_suffix: Suffix to use when the title itself has none (with or
            without a leading dot).

    Returns:
        A file name whose stem is at most 180 characters and, whenever the
        prefix and suffix leave room, whose UTF-8 encoding fits in 255 bytes.
    """
    safe_title = _safe_filename(title)
    path = Path(safe_title)
    stem = path.stem or "attachment"
    suffix = path.suffix
    if not suffix and fallback_suffix:
        suffix = _normalise_suffix(fallback_suffix)

    # Character cap retained so names that were already short enough do not change.
    max_stem_length = 180
    if len(stem) > max_stem_length:
        stem = stem[:max_stem_length].rstrip()

    # Common file systems limit a name to 255 *bytes*, and non-ASCII titles take
    # several bytes per character in UTF-8, so the character cap alone is not
    # sufficient. "surrogatepass" keeps the measurement from raising on lone
    # surrogates that may appear in decoded JSON.
    max_name_bytes = 255
    prefix = f"{attachment_id}_"
    stem_budget = (
        max_name_bytes
        - len(prefix.encode("utf-8", errors="surrogatepass"))
        - len(suffix.encode("utf-8", errors="surrogatepass"))
    )
    encoded_stem = stem.encode("utf-8", errors="surrogatepass")
    if 0 < stem_budget < len(encoded_stem):
        # errors="ignore" drops a multi-byte character cut in half by the slice.
        stem = encoded_stem[:stem_budget].decode("utf-8", errors="ignore").rstrip() or "attachment"

    return f"{prefix}{stem}{suffix}"
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _escape_cql(value: str) -> str:
|
|
128
|
+
return value.replace("\\", "\\\\").replace('"', '\\"')
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def resolve_page_ref(page_ref: str, http: ConfluenceClient) -> str:
    """Resolve a page reference to a numeric page ID.

    Supported forms:
    - numeric ID: "74014717"
    - viewpage: ".../pages/viewpage.action?pageId=74014717"
    - display URL: ".../display/TDOV/13.02.10.+BRD"

    Args:
        page_ref: The reference to resolve; surrounding whitespace is ignored.
        http: Client used only for display URLs, where the page is looked up
            with ``http.search_cql(cql, limit=5)`` by space and exact title.

    Returns:
        The page ID as a string of digits.

    Raises:
        ValueError: If *page_ref* is empty, has an unsupported form, or is a
            display URL for which the search returns no numeric ID.
    """
    page_ref = (page_ref or "").strip()
    if not page_ref:
        raise ValueError("page_ref is empty")

    if page_ref.isdigit():
        return page_ref

    parsed = urlparse(page_ref)
    if parsed.query:
        qs = parse_qs(parsed.query)
        page_ids = qs.get("pageId") or qs.get("pageid")
        if page_ids and page_ids[0].isdigit():
            return page_ids[0]

    # /display/<SPACE>/<TITLE>
    match = re.search(r"/display/([^/]+)/(.+)$", parsed.path or "")
    if match is None:
        raise ValueError(f"Unsupported page reference: {page_ref}")

    space = unquote(match.group(1))
    # Confluence "display" URLs often encode spaces as '+'. Convert those
    # BEFORE percent-decoding so that a literal plus sign, which arrives as
    # %2B, survives in the title (e.g. "C%2B%2B+Guide" -> "C++ Guide").
    title = unquote(match.group(2).replace("+", " ")).strip()
    cql = f'space="{_escape_cql(space)}" and title="{_escape_cql(title)}" and type=page'
    result = http.search_cql(cql, limit=5)
    results = result.get("results", []) if isinstance(result, dict) else []
    for item in results:
        if not isinstance(item, dict):
            continue
        # Confluence CQL results usually wrap content under `content`.
        raw_id = item.get("id")
        if raw_id is None and isinstance(item.get("content"), dict):
            raw_id = item["content"].get("id")
        if raw_id is not None and str(raw_id).isdigit():
            return str(raw_id)

    raise ValueError(f"Unable to resolve display URL to page ID (space={space!r}, title={title!r})")
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
@dataclass(slots=True)
|
|
178
|
+
class CrawlOptions:
|
|
179
|
+
max_depth: int = 3
|
|
180
|
+
include_attachments: bool = True
|
|
181
|
+
emit_corpus: bool = False
|
|
182
|
+
emit_link_graph: bool = False
|
|
183
|
+
emit_tables: bool = False
|
|
184
|
+
emit_mentions: bool = False
|
|
185
|
+
body_format: str = "view"
|
|
186
|
+
chunk_max_chars: int | None = 4000
|
|
187
|
+
resume: bool = False
|
|
188
|
+
max_nodes: int | None = 500
|
|
189
|
+
bundle: bool = False
|
|
190
|
+
page_concurrency: int = 5
|
|
191
|
+
staged_mode: str = "off"
|
|
192
|
+
incremental_cql: bool = True
|
|
193
|
+
incremental_overlap_minutes: int = 5
|
|
194
|
+
full_reconciliation_interval_days: int = 7
|
|
195
|
+
rate_limit_budget: int = 50_000
|
|
196
|
+
runtime_event_sink: Callable[[str, dict[str, Any]], None] | None = None
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def _emit_runtime_event(options: CrawlOptions, event: str, **payload: Any) -> None:
|
|
200
|
+
if not callable(options.runtime_event_sink):
|
|
201
|
+
return
|
|
202
|
+
try:
|
|
203
|
+
options.runtime_event_sink(event, payload)
|
|
204
|
+
except Exception:
|
|
205
|
+
return
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def _create_bundle_zip(output_dir: Path, root_page_id: str, manifest_path: Path) -> Path:
|
|
209
|
+
"""Create a zip bundle of all crawl artifacts for offline handoff (TSK-189)."""
|
|
210
|
+
timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
|
|
211
|
+
bundle_name = f"crawl_bundle_{root_page_id}_{timestamp}.zip"
|
|
212
|
+
bundle_path = output_dir / bundle_name
|
|
213
|
+
|
|
214
|
+
with zipfile.ZipFile(bundle_path, "w", zipfile.ZIP_DEFLATED) as zf:
|
|
215
|
+
# Add all HTML files
|
|
216
|
+
for html_file in output_dir.glob("*.html"):
|
|
217
|
+
zf.write(html_file, html_file.name)
|
|
218
|
+
|
|
219
|
+
# Add attachments directory
|
|
220
|
+
attachments_dir = output_dir / "attachments"
|
|
221
|
+
if attachments_dir.exists():
|
|
222
|
+
for att_file in attachments_dir.iterdir():
|
|
223
|
+
if att_file.is_file():
|
|
224
|
+
zf.write(att_file, f"attachments/{att_file.name}")
|
|
225
|
+
|
|
226
|
+
# Add artifact files
|
|
227
|
+
artifact_files = [
|
|
228
|
+
"corpus.jsonl",
|
|
229
|
+
"link-graph.json",
|
|
230
|
+
"tables.jsonl",
|
|
231
|
+
"mentions.jsonl",
|
|
232
|
+
"crawl_state.json",
|
|
233
|
+
]
|
|
234
|
+
for artifact_name in artifact_files:
|
|
235
|
+
artifact_path = output_dir / artifact_name
|
|
236
|
+
if artifact_path.exists():
|
|
237
|
+
zf.write(artifact_path, artifact_name)
|
|
238
|
+
|
|
239
|
+
# Add manifest
|
|
240
|
+
if manifest_path.exists():
|
|
241
|
+
zf.write(manifest_path, manifest_path.name)
|
|
242
|
+
|
|
243
|
+
return bundle_path
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
@dataclass(slots=True)
|
|
247
|
+
class CrawlState:
|
|
248
|
+
"""Persistent crawl state for resume/incremental crawling (TSK-188)."""
|
|
249
|
+
|
|
250
|
+
root_page_id: str
|
|
251
|
+
body_format: str
|
|
252
|
+
visited_pages: dict[str, dict[str, Any]] # page_id -> {version, html_sha256, visited_at}
|
|
253
|
+
pending_queue: list[tuple[str, str | None, int]] # [(page_id, parent_id, depth), ...]
|
|
254
|
+
started_at: str
|
|
255
|
+
last_updated_at: str
|
|
256
|
+
completed: bool = False
|
|
257
|
+
last_sync_timestamp: str | None = None
|
|
258
|
+
last_full_reconciliation_timestamp: str | None = None
|
|
259
|
+
run_diagnostics: dict[str, Any] = field(default_factory=dict)
|
|
260
|
+
|
|
261
|
+
def to_dict(self) -> dict[str, Any]:
|
|
262
|
+
"""Serialize state to JSON-compatible dict."""
|
|
263
|
+
return {
|
|
264
|
+
"root_page_id": self.root_page_id,
|
|
265
|
+
"body_format": self.body_format,
|
|
266
|
+
"visited_pages": self.visited_pages,
|
|
267
|
+
"pending_queue": [list(item) for item in self.pending_queue],
|
|
268
|
+
"started_at": self.started_at,
|
|
269
|
+
"last_updated_at": self.last_updated_at,
|
|
270
|
+
"completed": self.completed,
|
|
271
|
+
"last_sync_timestamp": self.last_sync_timestamp,
|
|
272
|
+
"last_full_reconciliation_timestamp": self.last_full_reconciliation_timestamp,
|
|
273
|
+
"run_diagnostics": self.run_diagnostics,
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
@classmethod
|
|
277
|
+
def from_dict(cls, data: dict[str, Any]) -> CrawlState:
|
|
278
|
+
"""Deserialize state from JSON-compatible dict."""
|
|
279
|
+
return cls(
|
|
280
|
+
root_page_id=data["root_page_id"],
|
|
281
|
+
body_format=data.get("body_format", "view"),
|
|
282
|
+
visited_pages=data.get("visited_pages", {}),
|
|
283
|
+
pending_queue=[tuple(item) for item in data.get("pending_queue", [])],
|
|
284
|
+
started_at=data.get("started_at", _now_iso()),
|
|
285
|
+
last_updated_at=data.get("last_updated_at", _now_iso()),
|
|
286
|
+
completed=data.get("completed", False),
|
|
287
|
+
last_sync_timestamp=data.get("last_sync_timestamp"),
|
|
288
|
+
last_full_reconciliation_timestamp=data.get("last_full_reconciliation_timestamp"),
|
|
289
|
+
run_diagnostics=data.get("run_diagnostics", {}),
|
|
290
|
+
)
|
|
291
|
+
|
|
292
|
+
def is_page_unchanged(self, page_id: str, version: int | None, html_sha256: str | None) -> bool:
|
|
293
|
+
"""Check if a page is unchanged since last crawl."""
|
|
294
|
+
if page_id not in self.visited_pages:
|
|
295
|
+
return False
|
|
296
|
+
prev = self.visited_pages[page_id]
|
|
297
|
+
prev_version = prev.get("version")
|
|
298
|
+
prev_hash = prev.get("html_sha256")
|
|
299
|
+
# If both version and hash match, page is unchanged
|
|
300
|
+
if prev_version is not None and version is not None and prev_version == version:
|
|
301
|
+
return True
|
|
302
|
+
if prev_hash is not None and html_sha256 is not None and prev_hash == html_sha256:
|
|
303
|
+
return True
|
|
304
|
+
return False
|
|
305
|
+
|
|
306
|
+
def record_page(self, page_id: str, version: int | None, html_sha256: str | None) -> None:
|
|
307
|
+
"""Record a visited page."""
|
|
308
|
+
self.visited_pages[page_id] = {
|
|
309
|
+
"version": version,
|
|
310
|
+
"html_sha256": html_sha256,
|
|
311
|
+
"visited_at": _now_iso(),
|
|
312
|
+
}
|
|
313
|
+
self.last_updated_at = _now_iso()
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
@dataclass(slots=True)
|
|
317
|
+
class _FetchedPage:
|
|
318
|
+
page_id: str
|
|
319
|
+
parent_id: str | None
|
|
320
|
+
depth: int
|
|
321
|
+
status: str # fetched|unchanged|error
|
|
322
|
+
page_payload: dict[str, Any] | None = None
|
|
323
|
+
version_payload: dict[str, Any] | None = None
|
|
324
|
+
children: list[tuple[str, str | None, int]] = field(default_factory=list)
|
|
325
|
+
error: str | None = None
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def _load_crawl_state(state_path: Path) -> CrawlState | None:
|
|
329
|
+
"""Load crawl state from disk if it exists."""
|
|
330
|
+
if not state_path.exists():
|
|
331
|
+
return None
|
|
332
|
+
try:
|
|
333
|
+
data = json.loads(state_path.read_text(encoding="utf-8"))
|
|
334
|
+
return CrawlState.from_dict(data)
|
|
335
|
+
except (json.JSONDecodeError, KeyError, TypeError):
|
|
336
|
+
return None
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
def _save_crawl_state(state: CrawlState, state_path: Path) -> None:
    """Persist crawl state to disk, replacing the previous file atomically.

    ``state.last_updated_at`` is refreshed before serialisation. The JSON is
    first written to a sibling ``<name>.tmp`` file and then renamed over
    *state_path*, so an interruption mid-write cannot leave a truncated state
    file behind and silently discard resume progress.

    Raises:
        OSError: if the temporary file cannot be written or renamed.
    """
    state.last_updated_at = _now_iso()
    serialised = json.dumps(state.to_dict(), ensure_ascii=False, indent=2)

    # NOTE(review): the fixed temp name assumes one writer per state file at a
    # time - confirm this is never called concurrently from worker threads.
    tmp_path = state_path.with_name(f"{state_path.name}.tmp")
    try:
        tmp_path.write_text(serialised, encoding="utf-8")
        tmp_path.replace(state_path)
    except OSError:
        # Do not leave a stale partial file next to the real state.
        tmp_path.unlink(missing_ok=True)
        raise
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def _normalise_body_format(value: str | None) -> str:
|
|
346
|
+
normalised = (value or "view").strip().lower().replace("-", "_")
|
|
347
|
+
if normalised in {"view", "storage", "export_view"}:
|
|
348
|
+
return normalised
|
|
349
|
+
raise ValueError(f"Unsupported body format: {value!r} (expected: view|export_view|storage)")
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
def _normalise_page_concurrency(value: int | None) -> int:
|
|
353
|
+
if value is None:
|
|
354
|
+
return 5
|
|
355
|
+
return max(1, int(value))
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
def _normalise_staged_mode(value: str | None) -> str:
|
|
359
|
+
normalised = (value or "off").strip().lower().replace("-", "_")
|
|
360
|
+
if normalised in {"off", "structure_first"}:
|
|
361
|
+
return normalised
|
|
362
|
+
raise ValueError(f"Unsupported staged mode: {value!r} (expected: off|structure_first)")
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
def _split_text_preserving_words(text: str, max_chars: int) -> list[str]:
|
|
366
|
+
"""Split a long string into <= max_chars chunks, preferring whitespace boundaries."""
|
|
367
|
+
stripped = (text or "").strip()
|
|
368
|
+
if not stripped:
|
|
369
|
+
return []
|
|
370
|
+
if max_chars <= 0:
|
|
371
|
+
return [stripped]
|
|
372
|
+
if len(stripped) <= max_chars:
|
|
373
|
+
return [stripped]
|
|
374
|
+
|
|
375
|
+
words = stripped.split()
|
|
376
|
+
parts: list[str] = []
|
|
377
|
+
current: list[str] = []
|
|
378
|
+
current_len = 0
|
|
379
|
+
|
|
380
|
+
def flush() -> None:
|
|
381
|
+
nonlocal current, current_len
|
|
382
|
+
if not current:
|
|
383
|
+
return
|
|
384
|
+
parts.append(" ".join(current))
|
|
385
|
+
current = []
|
|
386
|
+
current_len = 0
|
|
387
|
+
|
|
388
|
+
for word in words:
|
|
389
|
+
if not current:
|
|
390
|
+
current = [word]
|
|
391
|
+
current_len = len(word)
|
|
392
|
+
elif current_len + 1 + len(word) <= max_chars:
|
|
393
|
+
current.append(word)
|
|
394
|
+
current_len += 1 + len(word)
|
|
395
|
+
else:
|
|
396
|
+
flush()
|
|
397
|
+
if len(word) <= max_chars:
|
|
398
|
+
current = [word]
|
|
399
|
+
current_len = len(word)
|
|
400
|
+
else:
|
|
401
|
+
# Pathological case: a single token exceeds max_chars; hard-split.
|
|
402
|
+
for offset in range(0, len(word), max_chars):
|
|
403
|
+
parts.append(word[offset : offset + max_chars])
|
|
404
|
+
|
|
405
|
+
flush()
|
|
406
|
+
return parts
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
def _pack_segments(segments: list[str], max_chars: int) -> list[str]:
|
|
410
|
+
if max_chars <= 0:
|
|
411
|
+
joined = "\n".join([seg for seg in segments if seg.strip()]).strip()
|
|
412
|
+
return [joined] if joined else []
|
|
413
|
+
|
|
414
|
+
packed: list[str] = []
|
|
415
|
+
current: list[str] = []
|
|
416
|
+
current_len = 0
|
|
417
|
+
|
|
418
|
+
def flush() -> None:
|
|
419
|
+
nonlocal current, current_len
|
|
420
|
+
if not current:
|
|
421
|
+
return
|
|
422
|
+
packed.append("\n".join(current).strip())
|
|
423
|
+
current = []
|
|
424
|
+
current_len = 0
|
|
425
|
+
|
|
426
|
+
for seg in segments:
|
|
427
|
+
seg = (seg or "").strip()
|
|
428
|
+
if not seg:
|
|
429
|
+
continue
|
|
430
|
+
sep_len = 1 if current else 0 # newline
|
|
431
|
+
if not current or current_len + sep_len + len(seg) <= max_chars:
|
|
432
|
+
if current:
|
|
433
|
+
current_len += 1
|
|
434
|
+
current.append(seg)
|
|
435
|
+
current_len += len(seg)
|
|
436
|
+
continue
|
|
437
|
+
|
|
438
|
+
flush()
|
|
439
|
+
current = [seg]
|
|
440
|
+
current_len = len(seg)
|
|
441
|
+
|
|
442
|
+
flush()
|
|
443
|
+
return [chunk for chunk in packed if chunk.strip()]
|
|
444
|
+
|
|
445
|
+
|
|
446
|
+
def _extract_target_page_id(target_url: str | None) -> str | None:
|
|
447
|
+
if not target_url:
|
|
448
|
+
return None
|
|
449
|
+
|
|
450
|
+
parsed = urlparse(target_url)
|
|
451
|
+
if parsed.query:
|
|
452
|
+
qs = parse_qs(parsed.query)
|
|
453
|
+
for key, values in qs.items():
|
|
454
|
+
if key.lower() != "pageid" or not values:
|
|
455
|
+
continue
|
|
456
|
+
candidate = values[0]
|
|
457
|
+
if isinstance(candidate, str) and candidate.isdigit():
|
|
458
|
+
return candidate
|
|
459
|
+
|
|
460
|
+
match = re.search(r"/pages/(\d+)(?:/|$)", parsed.path or "")
|
|
461
|
+
if match:
|
|
462
|
+
return match.group(1)
|
|
463
|
+
return None
|
|
464
|
+
|
|
465
|
+
|
|
466
|
+
def _classify_target_type(target_url: str | None) -> str | None:
|
|
467
|
+
if not target_url:
|
|
468
|
+
return None
|
|
469
|
+
|
|
470
|
+
url = target_url.strip()
|
|
471
|
+
if not url:
|
|
472
|
+
return None
|
|
473
|
+
if url.startswith("mailto:"):
|
|
474
|
+
return "email"
|
|
475
|
+
|
|
476
|
+
parsed = urlparse(url)
|
|
477
|
+
host = (parsed.netloc or "").lower()
|
|
478
|
+
path = parsed.path or ""
|
|
479
|
+
|
|
480
|
+
if "/display/" in path or "viewpage.action" in path or re.search(r"/pages/\d+(?:/|$)", path):
|
|
481
|
+
return "confluence_page"
|
|
482
|
+
if "jira" in host or "/browse/" in path:
|
|
483
|
+
return "jira"
|
|
484
|
+
if "bitbucket" in host or "/scm/" in path:
|
|
485
|
+
return "bitbucket"
|
|
486
|
+
if parsed.scheme in {"http", "https"}:
|
|
487
|
+
return "external"
|
|
488
|
+
return "external"
|
|
489
|
+
|
|
490
|
+
|
|
491
|
+
def _extract_jira_key_from_url(url: str) -> str | None:
|
|
492
|
+
parsed = urlparse(url)
|
|
493
|
+
match = re.search(r"/browse/([A-Z][A-Z0-9]+-\d+)", parsed.path or "")
|
|
494
|
+
if match:
|
|
495
|
+
return match.group(1)
|
|
496
|
+
return None
|
|
497
|
+
|
|
498
|
+
|
|
499
|
+
def _extract_links(html: str, *, base_url: str | None = None) -> list[dict[str, Any]]:
    """Collect the ``<a href>`` links found in *html*.

    The shape of the result depends on *base_url*:

    - without it, every anchor with a non-blank href is returned as
      ``{"href": ..., "text": ...}``, untouched;
    - with it, entries are ``{"url": ..., "text": ...}``: hrefs starting with
      ``#`` are dropped, root-relative hrefs (``/...``) are prefixed with the
      base (trailing ``/`` removed), everything else is kept as written, and
      the fragment is removed from every URL.

    ``text`` is the anchor's visible text, or ``None`` when it is empty.
    """
    soup = BeautifulSoup(html, "html.parser")

    raw_links: list[dict[str, Any]] = []
    for anchor in soup.find_all("a"):
        target = anchor.get("href")
        target = str(target).strip() if target else ""
        if target:
            raw_links.append({"href": target, "text": anchor.get_text(" ", strip=True) or None})

    if not base_url:
        return raw_links

    root = base_url.rstrip("/")
    resolved: list[dict[str, Any]] = []
    for item in raw_links:
        target = item["href"]
        if target.startswith("#"):
            continue
        # Only root-relative hrefs are joined to the base. Absolute http(s)
        # URLs and other relative forms pass through as-is (best-effort: they
        # could be relative to the current page).
        absolute = f"{root}{target}" if target.startswith("/") else target
        # Drop fragments for canonicalization.
        canonical = urlparse(absolute)._replace(fragment="").geturl()
        resolved.append({"url": canonical, "text": item.get("text")})
    return resolved
|
|
532
|
+
|
|
533
|
+
|
|
534
|
+
def _chunk_html_by_headings(html: str, *, max_chars: int | None = None) -> list[dict[str, Any]]:
    """Chunk HTML into heading-scoped blocks for JSONL corpus output.

    The document is walked in order over ``h1``-``h6``, ``p``, ``li`` and
    ``pre`` elements. Each heading closes the text collected so far and
    updates the current heading path; the text between headings becomes one or
    more chunks.

    Args:
        html: Markup to chunk.
        max_chars: Optional positive chunk size limit. When set, overlong
            lines are split on word boundaries and lines are packed into
            chunks of at most this many characters; otherwise each heading
            scope yields a single chunk.

    Returns:
        Dicts with ``heading_path`` (list of heading titles), ``text`` and
        ``ordinal`` (position of the chunk within its heading scope).
    """
    soup = BeautifulSoup(html, "html.parser")
    heading_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]
    text_tags = ["p", "li", "pre"]
    limit = max_chars if isinstance(max_chars, int) and max_chars > 0 else None

    chunks: list[dict[str, Any]] = []
    heading_stack: list[str] = []
    buffer: list[str] = []

    def flush() -> None:
        """Turn the buffered lines into chunk(s) under the current heading path."""
        lines = [line for line in buffer if line.strip()]
        buffer.clear()
        if not lines:
            return

        if limit is None:
            text = "\n".join(line.strip() for line in lines).strip()
            if text:
                chunks.append({"heading_path": list(heading_stack), "text": text, "ordinal": 0})
            return

        segments: list[str] = []
        for line in lines:
            if len(line) > limit:
                segments.extend(_split_text_preserving_words(line, limit))
            else:
                segments.append(line.strip())
        for ordinal, text in enumerate(_pack_segments(segments, limit)):
            chunks.append({"heading_path": list(heading_stack), "text": text, "ordinal": ordinal})

    for el in soup.find_all([*heading_tags, *text_tags]):
        name = el.name.lower()
        if name in heading_tags:
            flush()
            title = el.get_text(" ", strip=True)
            if not title:
                continue
            # Keep only the ancestors above this heading level, then push it.
            level = int(name[1])
            heading_stack[:] = heading_stack[: max(0, level - 1)]
            heading_stack.append(title)
            continue

        # find_all also returns text elements nested inside other text
        # elements (a <p> inside an <li>, nested list items, ...). The outer
        # element's get_text() already contains their text, so emitting them
        # again would duplicate content in the corpus.
        if el.find_parent(text_tags) is not None:
            continue

        text = el.get_text(" ", strip=True)
        if text:
            buffer.append(text)

    flush()
    return chunks
|
|
585
|
+
|
|
586
|
+
|
|
587
|
+
def _extract_tables(html: str) -> list[dict[str, Any]]:
    """Extract every ``<table>`` in *html* as headers plus row data.

    Headers come from the first row of ``<thead>`` when present; otherwise the
    table's first row is used if it contains a ``<th>``. The header row is
    never repeated in ``rows``. Empty cells are kept as ``""`` so that each
    value stays in its column and lines up with ``headers``; rows in which
    every cell is empty are skipped.

    Returns:
        One dict per table with ``table_index``, ``headers`` (list or
        ``None``), ``rows``, ``row_count`` and ``col_count`` (widest row).
    """
    soup = BeautifulSoup(html, "html.parser")
    extracted: list[dict[str, Any]] = []

    def cell_texts(tr: Any) -> list[str]:
        """Return the text of each ``th``/``td`` cell of a row, in order."""
        return [cell.get_text(" ", strip=True) for cell in tr.find_all(["th", "td"])]

    for index, table in enumerate(soup.find_all("table")):
        headers: list[str] | None = None
        rows: list[list[str]] = []

        # Prefer explicit <thead> headers when present.
        thead = table.find("thead")
        header_tr = thead.find("tr") if thead else None
        if header_tr:
            headers = cell_texts(header_tr)

        # NOTE(review): rows of nested tables are still visited here as part
        # of the outer table, as before - confirm whether that is wanted.
        first_tr = table.find("tr")
        for tr in table.find_all("tr"):
            if tr is header_tr:
                # Already captured as headers; do not duplicate it as data.
                continue
            row = cell_texts(tr)
            if not any(row):
                # No cells at all, or only empty ones.
                continue
            if headers is None and tr is first_tr and tr.find("th"):
                # No <thead>: treat the first row as headers when it has <th>.
                headers = row
                continue
            rows.append(row)

        extracted.append(
            {
                "table_index": index,
                "headers": headers,
                "rows": rows,
                "row_count": len(rows),
                "col_count": max((len(row) for row in rows), default=0),
            }
        )

    return extracted
|
|
632
|
+
|
|
633
|
+
|
|
634
|
+
def _extract_mentions(html: str, *, base_url: str | None = None) -> list[dict[str, Any]]:
    """Extract link-based and plain-text mentions from *html*.

    Two sources are combined:

    1. every link returned by ``_extract_links``, classified with
       ``_classify_target_type`` and reduced to a compact ``value`` (page ID,
       Jira key or e-mail address when one can be extracted, else the URL);
    2. Jira keys matched by ``_JIRA_KEY_RE`` in the visible text
       (type ``"jira_key"``, no URL).

    Returns:
        Deduplicated dicts with ``type``, ``value``, ``url``, ``anchor_text``
        and ``source`` (``"link"`` or ``"text"``), sorted by type, value, URL.
    """
    soup = BeautifulSoup(html, "html.parser")
    extracted: list[dict[str, Any]] = []

    # 1) Link-based mentions.
    for link in _extract_links(html, base_url=base_url):
        # _extract_links reports "url" when a base URL is given but "href"
        # when it is not; read both so links are not silently dropped in the
        # default base_url=None case.
        url = str(link.get("url") or link.get("href") or "").strip()
        # In-page anchors are not mentions (the base_url path already drops them).
        if not url or url.startswith("#"):
            continue

        mention_type = _classify_target_type(url)
        if mention_type == "confluence_page":
            value = _extract_target_page_id(url) or url
        elif mention_type == "jira":
            value = _extract_jira_key_from_url(url) or url
        elif mention_type == "email":
            value = url.removeprefix("mailto:").strip() or url
        else:
            value = url

        extracted.append(
            {
                "type": mention_type or "external",
                "value": value,
                "url": url,
                "anchor_text": link.get("text"),
                "source": "link",
            }
        )

    # 2) Plain-text Jira keys (best-effort).
    plain_text = soup.get_text(" ", strip=True)
    for key in sorted(set(_JIRA_KEY_RE.findall(plain_text or ""))):
        extracted.append(
            {
                "type": "jira_key",
                "value": key,
                "url": None,
                "anchor_text": None,
                "source": "text",
            }
        )

    # Deduplicate mentions by stable key and keep deterministic ordering.
    seen: set[tuple[str, str, str | None]] = set()
    unique: list[dict[str, Any]] = []
    for entry in extracted:
        mention_type = str(entry.get("type") or "").strip() or "external"
        value = str(entry.get("value") or "").strip()
        url = entry.get("url")
        url = str(url).strip() if isinstance(url, str) else None
        if not value:
            continue
        key = (mention_type, value, url)
        if key in seen:
            continue
        seen.add(key)
        unique.append({**entry, "type": mention_type, "value": value, "url": url})

    unique.sort(key=lambda m: (m.get("type") or "", m.get("value") or "", m.get("url") or ""))
    return unique
|
|
695
|
+
|
|
696
|
+
|
|
697
|
+
def crawl_tree_to_disk(
|
|
698
|
+
*,
|
|
699
|
+
http: ConfluenceClient,
|
|
700
|
+
root_ref: str,
|
|
701
|
+
output_dir: Path,
|
|
702
|
+
options: CrawlOptions,
|
|
703
|
+
manifest_path: Path | None = None,
|
|
704
|
+
) -> dict[str, Any]:
|
|
705
|
+
"""Crawl a Confluence page tree and download content/attachments to disk.
|
|
706
|
+
|
|
707
|
+
Supports resume/incremental crawling (TSK-188):
|
|
708
|
+
- resume=True: Load state from previous run, skip unchanged pages
|
|
709
|
+
- max_nodes: Limit total pages processed (safety for large trees)
|
|
710
|
+
"""
|
|
711
|
+
body_format = _normalise_body_format(options.body_format)
|
|
712
|
+
page_concurrency = _normalise_page_concurrency(options.page_concurrency)
|
|
713
|
+
staged_mode = _normalise_staged_mode(options.staged_mode)
|
|
714
|
+
use_structure_first = staged_mode == "structure_first"
|
|
715
|
+
chunk_max_chars = options.chunk_max_chars if isinstance(options.chunk_max_chars, int) else None
|
|
716
|
+
resolved_root_id = resolve_page_ref(root_ref, http)
|
|
717
|
+
output_dir = output_dir.resolve()
|
|
718
|
+
output_dir.mkdir(parents=True, exist_ok=True)
|
|
719
|
+
attachments_dir = output_dir / "attachments"
|
|
720
|
+
attachments_dir.mkdir(exist_ok=True)
|
|
721
|
+
resolved_manifest_path = manifest_path or (output_dir / "crawl_manifest.json")
|
|
722
|
+
|
|
723
|
+
def _meta_from_existing_html(
|
|
724
|
+
*,
|
|
725
|
+
page_id: str,
|
|
726
|
+
parent_id: str | None,
|
|
727
|
+
depth: int,
|
|
728
|
+
page_payload: dict[str, Any] | None,
|
|
729
|
+
) -> dict[str, Any]:
|
|
730
|
+
html_candidates = sorted(output_dir.glob(f"{page_id}_*.html"))
|
|
731
|
+
html_path = html_candidates[0] if html_candidates else None
|
|
732
|
+
title = None
|
|
733
|
+
page_url = None
|
|
734
|
+
page_version = None
|
|
735
|
+
page_updated = None
|
|
736
|
+
space_key = None
|
|
737
|
+
if isinstance(page_payload, dict):
|
|
738
|
+
raw_title = page_payload.get("title")
|
|
739
|
+
if raw_title:
|
|
740
|
+
title = str(raw_title)
|
|
741
|
+
links = page_payload.get("_links")
|
|
742
|
+
if isinstance(links, dict):
|
|
743
|
+
base_url = str(links.get("base") or "")
|
|
744
|
+
web_ui = str(links.get("webui") or "")
|
|
745
|
+
page_url = f"{base_url}{web_ui}" if base_url and web_ui else None
|
|
746
|
+
version = page_payload.get("version")
|
|
747
|
+
if isinstance(version, dict):
|
|
748
|
+
page_version = version.get("number")
|
|
749
|
+
page_updated = version.get("when")
|
|
750
|
+
space = page_payload.get("space")
|
|
751
|
+
if isinstance(space, dict):
|
|
752
|
+
raw_space = space.get("key")
|
|
753
|
+
if raw_space:
|
|
754
|
+
space_key = str(raw_space)
|
|
755
|
+
if title is None and html_path is not None:
|
|
756
|
+
title = html_path.stem.split("_", 1)[1] if "_" in html_path.stem else html_path.stem
|
|
757
|
+
|
|
758
|
+
meta: dict[str, Any] = {
|
|
759
|
+
"page_id": page_id,
|
|
760
|
+
"title": title,
|
|
761
|
+
"page_url": page_url,
|
|
762
|
+
"parent_id": parent_id,
|
|
763
|
+
"depth": depth,
|
|
764
|
+
"status": "downloaded" if html_path is not None else "unchanged",
|
|
765
|
+
"html_path": str(html_path) if html_path is not None else None,
|
|
766
|
+
"html_sha256": None,
|
|
767
|
+
"html_size_bytes": None,
|
|
768
|
+
"source_format": "html-view",
|
|
769
|
+
"space_key": space_key,
|
|
770
|
+
"page_version": page_version,
|
|
771
|
+
"page_updated": page_updated,
|
|
772
|
+
"ancestor_ids": [],
|
|
773
|
+
"ancestor_titles": [],
|
|
774
|
+
"error": None,
|
|
775
|
+
}
|
|
776
|
+
if html_path is not None and html_path.exists():
|
|
777
|
+
html_bytes = html_path.read_bytes()
|
|
778
|
+
meta["html_sha256"] = _sha256_bytes(html_bytes)
|
|
779
|
+
meta["html_size_bytes"] = html_path.stat().st_size
|
|
780
|
+
return meta
|
|
781
|
+
|
|
782
|
+
previous_page_meta_by_id: dict[str, dict[str, Any]] = {}
|
|
783
|
+
previous_attachments_by_page: dict[str, list[dict[str, Any]]] = {}
|
|
784
|
+
if options.resume and resolved_manifest_path.exists():
|
|
785
|
+
try:
|
|
786
|
+
loaded = json.loads(resolved_manifest_path.read_text(encoding="utf-8"))
|
|
787
|
+
if isinstance(loaded, dict):
|
|
788
|
+
root_meta = loaded.get("page")
|
|
789
|
+
if isinstance(root_meta, dict):
|
|
790
|
+
root_id = root_meta.get("page_id")
|
|
791
|
+
if root_id:
|
|
792
|
+
previous_page_meta_by_id[str(root_id)] = root_meta
|
|
793
|
+
raw_children = loaded.get("child_pages")
|
|
794
|
+
if isinstance(raw_children, list):
|
|
795
|
+
for child in raw_children:
|
|
796
|
+
if not isinstance(child, dict):
|
|
797
|
+
continue
|
|
798
|
+
child_id = child.get("page_id")
|
|
799
|
+
if child_id:
|
|
800
|
+
previous_page_meta_by_id[str(child_id)] = child
|
|
801
|
+
raw_attachments = loaded.get("attachments")
|
|
802
|
+
if isinstance(raw_attachments, list):
|
|
803
|
+
for attachment in raw_attachments:
|
|
804
|
+
if not isinstance(attachment, dict):
|
|
805
|
+
continue
|
|
806
|
+
page_id = attachment.get("page_id")
|
|
807
|
+
if page_id is None:
|
|
808
|
+
continue
|
|
809
|
+
previous_attachments_by_page.setdefault(str(page_id), []).append(attachment)
|
|
810
|
+
except Exception:
|
|
811
|
+
previous_page_meta_by_id = {}
|
|
812
|
+
previous_attachments_by_page = {}
|
|
813
|
+
|
|
814
|
+
previous_corpus_chunks_by_page: dict[str, list[dict[str, Any]]] = {}
|
|
815
|
+
if options.resume and options.emit_corpus:
|
|
816
|
+
previous_corpus_path = output_dir / "corpus.jsonl"
|
|
817
|
+
if previous_corpus_path.exists():
|
|
818
|
+
try:
|
|
819
|
+
with previous_corpus_path.open("r", encoding="utf-8") as previous_corpus_fp:
|
|
820
|
+
for raw_line in previous_corpus_fp:
|
|
821
|
+
line = raw_line.strip()
|
|
822
|
+
if not line:
|
|
823
|
+
continue
|
|
824
|
+
try:
|
|
825
|
+
payload = json.loads(line)
|
|
826
|
+
except json.JSONDecodeError:
|
|
827
|
+
continue
|
|
828
|
+
if not isinstance(payload, dict):
|
|
829
|
+
continue
|
|
830
|
+
page_id = str(payload.get("page_id") or "").strip()
|
|
831
|
+
if not page_id:
|
|
832
|
+
continue
|
|
833
|
+
previous_corpus_chunks_by_page.setdefault(page_id, []).append(payload)
|
|
834
|
+
except Exception:
|
|
835
|
+
previous_corpus_chunks_by_page = {}
|
|
836
|
+
|
|
837
|
+
# State file for resume support (TSK-188)
|
|
838
|
+
state_path = output_dir / "crawl_state.json"
|
|
839
|
+
crawl_state: CrawlState | None = None
|
|
840
|
+
skipped_unchanged: int = 0
|
|
841
|
+
skipped_already_visited: int = 0
|
|
842
|
+
nodes_processed: int = 0
|
|
843
|
+
pages_fetched: int = 0
|
|
844
|
+
pages_failed: int = 0
|
|
845
|
+
termination_reason = "queue_exhausted"
|
|
846
|
+
unchanged_page_ids: set[str] = set()
|
|
847
|
+
|
|
848
|
+
# Track previously visited pages for skip-unchanged logic (separate from current run)
|
|
849
|
+
previous_visited: dict[str, dict[str, Any]] = {}
|
|
850
|
+
|
|
851
|
+
if options.resume:
|
|
852
|
+
crawl_state = _load_crawl_state(state_path)
|
|
853
|
+
if crawl_state is not None:
|
|
854
|
+
# Validate state matches current crawl parameters
|
|
855
|
+
if crawl_state.root_page_id != resolved_root_id:
|
|
856
|
+
# Root changed, start fresh
|
|
857
|
+
crawl_state = None
|
|
858
|
+
elif crawl_state.body_format != body_format:
|
|
859
|
+
# Format changed, start fresh
|
|
860
|
+
crawl_state = None
|
|
861
|
+
elif crawl_state.completed:
|
|
862
|
+
# Previous crawl completed - keep visited_pages for skip-unchanged checks
|
|
863
|
+
# but reset queue to re-crawl from root
|
|
864
|
+
previous_visited = dict(crawl_state.visited_pages)
|
|
865
|
+
previous_sync_timestamp = crawl_state.last_sync_timestamp
|
|
866
|
+
previous_full_reconciliation_timestamp = crawl_state.last_full_reconciliation_timestamp
|
|
867
|
+
crawl_state = CrawlState(
|
|
868
|
+
root_page_id=resolved_root_id,
|
|
869
|
+
body_format=body_format,
|
|
870
|
+
visited_pages=previous_visited, # Keep for version checks
|
|
871
|
+
pending_queue=[(resolved_root_id, None, 0)],
|
|
872
|
+
started_at=_now_iso(),
|
|
873
|
+
last_updated_at=_now_iso(),
|
|
874
|
+
completed=False,
|
|
875
|
+
last_sync_timestamp=previous_sync_timestamp,
|
|
876
|
+
last_full_reconciliation_timestamp=previous_full_reconciliation_timestamp,
|
|
877
|
+
)
|
|
878
|
+
else:
|
|
879
|
+
# Incomplete crawl - restore visited pages for skip logic
|
|
880
|
+
previous_visited = dict(crawl_state.visited_pages)
|
|
881
|
+
|
|
882
|
+
# Initialize state if needed
|
|
883
|
+
if crawl_state is None:
|
|
884
|
+
crawl_state = CrawlState(
|
|
885
|
+
root_page_id=resolved_root_id,
|
|
886
|
+
body_format=body_format,
|
|
887
|
+
visited_pages={},
|
|
888
|
+
pending_queue=[(resolved_root_id, None, 0)],
|
|
889
|
+
started_at=_now_iso(),
|
|
890
|
+
last_updated_at=_now_iso(),
|
|
891
|
+
completed=False,
|
|
892
|
+
)
|
|
893
|
+
|
|
894
|
+
# For resume mode, track pages visited in current session to avoid re-processing
|
|
895
|
+
visited_pages: set[str] = set()
|
|
896
|
+
downloaded_attachments: set[str] = set()
|
|
897
|
+
|
|
898
|
+
queue: list[tuple[str, str | None, int]] = list(crawl_state.pending_queue)
|
|
899
|
+
discovered_page_ids: set[str] = {resolved_root_id}
|
|
900
|
+
for queued_page_id, _, _ in queue:
|
|
901
|
+
discovered_page_ids.add(queued_page_id)
|
|
902
|
+
|
|
903
|
+
root_meta: dict[str, Any] | None = None
|
|
904
|
+
child_pages: list[dict[str, Any]] = []
|
|
905
|
+
attachments: list[dict[str, Any]] = []
|
|
906
|
+
link_edges: list[dict[str, Any]] = []
|
|
907
|
+
corpus_chunks: list[dict[str, Any]] = []
|
|
908
|
+
tables: list[dict[str, Any]] = []
|
|
909
|
+
mentions: list[dict[str, Any]] = []
|
|
910
|
+
telemetry_getter = getattr(http, "get_runtime_telemetry", None)
|
|
911
|
+
configure_rate_limit_budget = getattr(http, "configure_rate_limit_budget", None)
|
|
912
|
+
if callable(configure_rate_limit_budget):
|
|
913
|
+
try:
|
|
914
|
+
configure_rate_limit_budget(int(options.rate_limit_budget))
|
|
915
|
+
except Exception:
|
|
916
|
+
pass
|
|
917
|
+
|
|
918
|
+
def _children_for(page_id: str, depth: int) -> list[tuple[str, str | None, int]]:
|
|
919
|
+
if options.max_depth >= 0 and depth >= options.max_depth:
|
|
920
|
+
return []
|
|
921
|
+
try:
|
|
922
|
+
child_payload = http.get_child_pages(page_id)
|
|
923
|
+
except Exception:
|
|
924
|
+
return []
|
|
925
|
+
collected: list[tuple[str, str | None, int]] = []
|
|
926
|
+
for child in child_payload:
|
|
927
|
+
if not isinstance(child, dict):
|
|
928
|
+
continue
|
|
929
|
+
child_id = child.get("id")
|
|
930
|
+
if child_id is None:
|
|
931
|
+
continue
|
|
932
|
+
collected.append((str(child_id), page_id, depth + 1))
|
|
933
|
+
return collected
|
|
934
|
+
|
|
935
|
+
body_expands: list[str]
|
|
936
|
+
if body_format == "export_view":
|
|
937
|
+
body_expands = ["body.export_view", "body.view", "body.storage"]
|
|
938
|
+
elif body_format == "storage":
|
|
939
|
+
body_expands = ["body.storage", "body.view"]
|
|
940
|
+
else:
|
|
941
|
+
body_expands = ["body.view", "body.storage"]
|
|
942
|
+
|
|
943
|
+
def _fetch_page(node: tuple[str, str | None, int]) -> _FetchedPage:
    """Fetch one queued page and, when applicable, list its children in parallel.

    Args:
        node: ``(page_id, parent_id, depth)`` queue entry.

    Returns:
        A ``_FetchedPage`` whose ``status`` is:

        * ``"error"`` when either ``http.get_page`` call raises (``error``
          holds ``str(exc)``);
        * ``"unchanged"`` when resuming, the page is already in
          ``crawl_state.visited_pages`` and ``crawl_state.is_page_unchanged``
          accepts its current version number (``version_payload`` is set);
        * ``"fetched"`` otherwise (``page_payload`` is set).

        ``children`` holds the result of ``_children_for`` when the listing
        was started, and an empty list otherwise.
    """
    page_id, parent_id, depth = node

    # Children are listed here only when structure-first discovery is off and
    # the depth cap (negative means no cap) still allows descending.
    listing_pool: ThreadPoolExecutor | None = None
    listing_job: Future[list[tuple[str, str | None, int]]] | None = None
    within_depth = options.max_depth < 0 or depth < options.max_depth
    if not use_structure_first and within_depth:
        listing_pool = ThreadPoolExecutor(max_workers=1)
        listing_job = listing_pool.submit(_children_for, page_id, depth)

    def _await_children() -> list[tuple[str, str | None, int]]:
        # Block on the child listing if one was started.
        if listing_job is None:
            return []
        return listing_job.result()

    def _failure(exc: Exception) -> _FetchedPage:
        # Children are still collected so the main loop can queue them.
        found = _await_children()
        return _FetchedPage(
            page_id=page_id,
            parent_id=parent_id,
            depth=depth,
            status="error",
            error=str(exc),
            children=found,
        )

    try:
        already_known = options.resume and page_id in crawl_state.visited_pages
        if already_known:
            # Cheap version probe before paying for the full body download.
            try:
                version_check = http.get_page(page_id, expand=["version"])
            except Exception as exc:
                return _failure(exc)
            current_version = (version_check.get("version") or {}).get("number")
            if crawl_state.is_page_unchanged(page_id, current_version, None):
                return _FetchedPage(
                    page_id=page_id,
                    parent_id=parent_id,
                    depth=depth,
                    status="unchanged",
                    version_payload=version_check,
                    children=_await_children(),
                )

        try:
            page_payload = http.get_page(
                page_id,
                expand=[*body_expands, "version", "space", "ancestors"],
            )
        except Exception as exc:
            return _failure(exc)

        found_children = _await_children()
        return _FetchedPage(
            page_id=page_id,
            parent_id=parent_id,
            depth=depth,
            status="fetched",
            page_payload=page_payload,
            children=found_children,
        )
    finally:
        # Always release the per-page worker, whichever branch returned.
        if listing_pool is not None:
            listing_pool.shutdown(wait=True)
|
|
1008
|
+
|
|
1009
|
+
def _extract_page_id_from_search_item(item: Any) -> str | None:
|
|
1010
|
+
if not isinstance(item, dict):
|
|
1011
|
+
return None
|
|
1012
|
+
raw_id = item.get("id")
|
|
1013
|
+
if raw_id is None and isinstance(item.get("content"), dict):
|
|
1014
|
+
raw_id = item["content"].get("id")
|
|
1015
|
+
if raw_id is None:
|
|
1016
|
+
return None
|
|
1017
|
+
page_id = str(raw_id).strip()
|
|
1018
|
+
return page_id if page_id.isdigit() else None
|
|
1019
|
+
|
|
1020
|
+
search_by_space_and_type = getattr(http, "search_by_space_and_type", None)
|
|
1021
|
+
|
|
1022
|
+
def _search_space_page_ids(
    *,
    space_key: str,
    last_modified_after: datetime | None = None,
    overlap_minutes: int = 0,
) -> tuple[set[str], bool]:
    """Page through ``search_by_space_and_type`` and collect page IDs.

    Args:
        space_key: Space to search.
        last_modified_after: Forwarded unchanged to the search call.
        overlap_minutes: Forwarded unchanged to the search call.

    Returns:
        ``(page_ids, complete)``. ``complete`` is ``True`` only when the
        listing was read to its end (a page with fewer than ``limit`` results,
        including an empty one). It is ``False`` when the search callable is
        unavailable, when any response is not a dict, when ``results`` is not
        a list, or when the ``max_pages`` cap is hit while pages are still
        full. In those cases ``page_ids`` may be partial and must not be
        treated as the full set; callers that subtract it from the known
        pages would otherwise drop pages that still exist.

    Raises:
        Whatever ``search_by_space_and_type`` raises; exceptions are not
        caught here.
    """
    if not callable(search_by_space_and_type):
        return set(), False

    page_ids: set[str] = set()
    limit = 100
    max_pages = 500  # hard stop so a misbehaving endpoint cannot loop forever

    for page_index in range(max_pages):
        response = search_by_space_and_type(
            space_key=space_key,
            content_type="page",
            limit=limit,
            start=page_index * limit,
            expand=["version"],
            last_modified_after=last_modified_after,
            overlap_minutes=overlap_minutes,
        )
        if not isinstance(response, dict):
            # Malformed response: whatever was collected so far is incomplete.
            return page_ids, False

        results = response.get("results")
        if not isinstance(results, list):
            # Missing/invalid result list is not the same as "no more pages".
            return page_ids, False

        for item in results:
            page_id = _extract_page_id_from_search_item(item)
            if page_id:
                page_ids.add(page_id)

        if len(results) < limit:
            # Short (or empty) page marks the end of the listing.
            return page_ids, True

    # Cap reached while every page was still full: the listing is truncated.
    return page_ids, False
|
|
1062
|
+
|
|
1063
|
+
def _resolve_known_space_key() -> str | None:
    """Return the space key stored in the previous metadata of the root page.

    Looks up ``resolved_root_id`` in ``previous_page_meta_by_id`` and reads its
    ``"space_key"`` entry.

    Returns:
        The whitespace-stripped key, or ``None`` when the root has no dict
        metadata, the entry is not a string, or it is blank after stripping.
    """
    stored_root = previous_page_meta_by_id.get(resolved_root_id)
    candidate = stored_root.get("space_key") if isinstance(stored_root, dict) else None
    if isinstance(candidate, str):
        trimmed = candidate.strip()
        if trimmed:
            return trimmed
    return None
|
|
1072
|
+
|
|
1073
|
+
def _discover_structure_first_tree(
    initial_queue: list[tuple[str, str | None, int]],
) -> tuple[list[tuple[str, str | None, int]], list[tuple[str, str | None, int]], set[str]]:
    """Walk the page tree level by level using only child listings.

    Args:
        initial_queue: ``(page_id, parent_id, depth)`` nodes to start from.

    Returns:
        ``(ordered, pending, discovered)`` where

        * ``ordered`` holds each accepted node once, in level order;
        * ``pending`` holds nodes of the level being processed that were left
          out because ``options.max_nodes`` had been reached;
        * ``discovered`` holds the IDs from ``initial_queue`` plus every child
          ID returned by ``_children_for`` during the walk.
    """
    ordered: list[tuple[str, str | None, int]] = []
    pending: list[tuple[str, str | None, int]] = []
    pending_ids: set[str] = set()
    seen: set[str] = set()
    frontier: list[tuple[str, str | None, int]] = list(initial_queue)
    discovered: set[str] = {node_id for node_id, _, _ in frontier}

    def _budget_spent() -> bool:
        # True once the optional node budget has been consumed.
        return options.max_nodes is not None and len(ordered) >= options.max_nodes

    def _defer(node_id: str, node_parent: str | None, node_depth: int) -> None:
        # Park a node for a later run, at most once per ID.
        if node_id in pending_ids:
            return
        pending.append((node_id, node_parent, node_depth))
        pending_ids.add(node_id)

    while frontier:
        accepted: list[tuple[str, str | None, int]] = []
        for node_id, node_parent, node_depth in frontier:
            if node_id in seen:
                continue
            if _budget_spent():
                _defer(node_id, node_parent, node_depth)
                continue
            seen.add(node_id)
            entry = (node_id, node_parent, node_depth)
            ordered.append(entry)
            accepted.append(entry)

        if _budget_spent():
            # Second sweep over the level: defer anything neither accepted
            # nor already parked, then stop descending.
            for node_id, node_parent, node_depth in frontier:
                if node_id not in seen:
                    _defer(node_id, node_parent, node_depth)
            break

        # Only nodes above the depth cap (negative cap = unlimited) are expanded.
        expandable: list[tuple[str, int]] = [
            (node_id, node_depth)
            for node_id, _, node_depth in accepted
            if options.max_depth < 0 or node_depth < options.max_depth
        ]
        if not expandable:
            break

        child_batches: list[list[tuple[str, str | None, int]]]
        if len(expandable) == 1:
            # A single parent does not justify spinning up a pool.
            child_batches = [_children_for(*expandable[0])]
        else:
            pool_size = min(page_concurrency, len(expandable))
            with ThreadPoolExecutor(max_workers=pool_size) as executor:
                listing_jobs: list[Future[list[tuple[str, str | None, int]]]] = [
                    executor.submit(_children_for, *parent) for parent in expandable
                ]
                # Collect in submission order so level ordering stays stable.
                child_batches = [job.result() for job in listing_jobs]

        upcoming: list[tuple[str, str | None, int]] = []
        upcoming_ids: set[str] = set()
        for batch in child_batches:
            for child_id, child_parent, child_depth in batch:
                if child_id in seen or child_id in upcoming_ids:
                    continue
                discovered.add(child_id)
                upcoming_ids.add(child_id)
                upcoming.append((child_id, child_parent, child_depth))
        frontier = upcoming

    return ordered, pending, discovered
|
|
1142
|
+
|
|
1143
|
+
incremental_candidate_ids: set[str] = set()
|
|
1144
|
+
incremental_high_water_mark_ran = False
|
|
1145
|
+
incremental_high_water_mark_skipped = False
|
|
1146
|
+
incremental_high_water_mark_skip_reason: str | None = None
|
|
1147
|
+
incremental_known_page_count = 0
|
|
1148
|
+
full_reconciliation_deleted_ids: list[str] = []
|
|
1149
|
+
full_reconciliation_ran = False
|
|
1150
|
+
full_reconciliation_skipped = False
|
|
1151
|
+
full_reconciliation_skip_reason: str | None = None
|
|
1152
|
+
full_reconciliation_known_page_count = 0
|
|
1153
|
+
space_key_for_incremental = _resolve_known_space_key()
|
|
1154
|
+
if options.resume and options.incremental_cql and callable(search_by_space_and_type):
|
|
1155
|
+
known_page_ids = set(crawl_state.visited_pages.keys())
|
|
1156
|
+
incremental_known_page_count = len(known_page_ids)
|
|
1157
|
+
last_sync_dt = _parse_iso_timestamp(crawl_state.last_sync_timestamp)
|
|
1158
|
+
if space_key_for_incremental and last_sync_dt is not None:
|
|
1159
|
+
if incremental_known_page_count <= 1:
|
|
1160
|
+
incremental_high_water_mark_skipped = True
|
|
1161
|
+
incremental_high_water_mark_skip_reason = "known_pages_lte_1"
|
|
1162
|
+
else:
|
|
1163
|
+
incremental_high_water_mark_ran = True
|
|
1164
|
+
try:
|
|
1165
|
+
incremental_ids, incremental_ok = _search_space_page_ids(
|
|
1166
|
+
space_key=space_key_for_incremental,
|
|
1167
|
+
last_modified_after=last_sync_dt,
|
|
1168
|
+
overlap_minutes=max(0, int(options.incremental_overlap_minutes)),
|
|
1169
|
+
)
|
|
1170
|
+
except Exception:
|
|
1171
|
+
incremental_ids, incremental_ok = set(), False
|
|
1172
|
+
if incremental_ok:
|
|
1173
|
+
incremental_candidate_ids = {page_id for page_id in incremental_ids if page_id in known_page_ids}
|
|
1174
|
+
if incremental_candidate_ids:
|
|
1175
|
+
queued_ids = {queued_page_id for queued_page_id, _, _ in queue}
|
|
1176
|
+
for page_id in sorted(incremental_candidate_ids):
|
|
1177
|
+
if page_id in queued_ids:
|
|
1178
|
+
continue
|
|
1179
|
+
previous_meta = previous_page_meta_by_id.get(page_id) or {}
|
|
1180
|
+
parent_id = previous_meta.get("parent_id")
|
|
1181
|
+
if parent_id is not None:
|
|
1182
|
+
parent_id = str(parent_id)
|
|
1183
|
+
depth = previous_meta.get("depth")
|
|
1184
|
+
if not isinstance(depth, int) or depth < 0:
|
|
1185
|
+
depth = 0
|
|
1186
|
+
queue.append((page_id, parent_id, depth))
|
|
1187
|
+
queued_ids.add(page_id)
|
|
1188
|
+
discovered_page_ids.add(page_id)
|
|
1189
|
+
|
|
1190
|
+
full_reconciliation_due = False
|
|
1191
|
+
last_full_reconciliation_dt = _parse_iso_timestamp(crawl_state.last_full_reconciliation_timestamp)
|
|
1192
|
+
if last_full_reconciliation_dt is None:
|
|
1193
|
+
full_reconciliation_due = True
|
|
1194
|
+
else:
|
|
1195
|
+
full_reconciliation_due = (
|
|
1196
|
+
datetime.now(UTC)
|
|
1197
|
+
- last_full_reconciliation_dt
|
|
1198
|
+
>= timedelta(days=max(1, int(options.full_reconciliation_interval_days)))
|
|
1199
|
+
)
|
|
1200
|
+
|
|
1201
|
+
if space_key_for_incremental and full_reconciliation_due:
|
|
1202
|
+
known_page_ids = set(crawl_state.visited_pages.keys())
|
|
1203
|
+
full_reconciliation_known_page_count = len(known_page_ids)
|
|
1204
|
+
reconciliation_marker_timestamp = _now_iso()
|
|
1205
|
+
if full_reconciliation_known_page_count <= 1:
|
|
1206
|
+
full_reconciliation_skipped = True
|
|
1207
|
+
full_reconciliation_skip_reason = "known_pages_lte_1"
|
|
1208
|
+
crawl_state.last_full_reconciliation_timestamp = reconciliation_marker_timestamp
|
|
1209
|
+
else:
|
|
1210
|
+
try:
|
|
1211
|
+
remote_page_ids, reconciliation_ok = _search_space_page_ids(space_key=space_key_for_incremental)
|
|
1212
|
+
except Exception:
|
|
1213
|
+
remote_page_ids, reconciliation_ok = set(), False
|
|
1214
|
+
if reconciliation_ok:
|
|
1215
|
+
full_reconciliation_ran = True
|
|
1216
|
+
full_reconciliation_deleted_ids = sorted(known_page_ids - remote_page_ids)
|
|
1217
|
+
if full_reconciliation_deleted_ids:
|
|
1218
|
+
for page_id in full_reconciliation_deleted_ids:
|
|
1219
|
+
crawl_state.visited_pages.pop(page_id, None)
|
|
1220
|
+
previous_visited.pop(page_id, None)
|
|
1221
|
+
previous_page_meta_by_id.pop(page_id, None)
|
|
1222
|
+
previous_attachments_by_page.pop(page_id, None)
|
|
1223
|
+
discovered_page_ids.discard(page_id)
|
|
1224
|
+
_emit_runtime_event(
|
|
1225
|
+
options,
|
|
1226
|
+
"pages_deleted",
|
|
1227
|
+
root_page_id=resolved_root_id,
|
|
1228
|
+
deleted_count=len(full_reconciliation_deleted_ids),
|
|
1229
|
+
page_ids=full_reconciliation_deleted_ids,
|
|
1230
|
+
)
|
|
1231
|
+
crawl_state.last_full_reconciliation_timestamp = reconciliation_marker_timestamp
|
|
1232
|
+
|
|
1233
|
+
structure_first_pending_queue: list[tuple[str, str | None, int]] = []
|
|
1234
|
+
structure_pre_discovered_nodes = 0
|
|
1235
|
+
if use_structure_first:
|
|
1236
|
+
queue, structure_first_pending_queue, structure_discovered_ids = _discover_structure_first_tree(queue)
|
|
1237
|
+
structure_pre_discovered_nodes = len(queue) + len(structure_first_pending_queue)
|
|
1238
|
+
discovered_page_ids.update(structure_discovered_ids)
|
|
1239
|
+
|
|
1240
|
+
crawl_started_at = time.perf_counter()
|
|
1241
|
+
max_inflight_workers = 0
|
|
1242
|
+
effective_page_concurrency = page_concurrency
|
|
1243
|
+
effective_concurrency_signal: float | None = None
|
|
1244
|
+
throttle_pressure_score: float | None = None
|
|
1245
|
+
|
|
1246
|
+
while queue:
|
|
1247
|
+
if options.max_nodes is not None and nodes_processed >= options.max_nodes:
|
|
1248
|
+
termination_reason = "max_nodes_reached"
|
|
1249
|
+
break
|
|
1250
|
+
|
|
1251
|
+
if options.max_nodes is None:
|
|
1252
|
+
remaining = len(queue)
|
|
1253
|
+
else:
|
|
1254
|
+
remaining = min(len(queue), options.max_nodes - nodes_processed)
|
|
1255
|
+
if remaining <= 0:
|
|
1256
|
+
termination_reason = "max_nodes_reached"
|
|
1257
|
+
break
|
|
1258
|
+
runtime_page_concurrency = page_concurrency
|
|
1259
|
+
if callable(telemetry_getter):
|
|
1260
|
+
try:
|
|
1261
|
+
telemetry_snapshot = telemetry_getter()
|
|
1262
|
+
except Exception:
|
|
1263
|
+
telemetry_snapshot = None
|
|
1264
|
+
if isinstance(telemetry_snapshot, dict):
|
|
1265
|
+
throttle = telemetry_snapshot.get("adaptive_throttle")
|
|
1266
|
+
if isinstance(throttle, dict):
|
|
1267
|
+
signal = throttle.get("effective_concurrency_signal")
|
|
1268
|
+
if isinstance(signal, (float, int)):
|
|
1269
|
+
effective_concurrency_signal = float(signal)
|
|
1270
|
+
scaled = int(round(page_concurrency * float(signal)))
|
|
1271
|
+
runtime_page_concurrency = max(1, min(page_concurrency, scaled))
|
|
1272
|
+
pressure_score = throttle.get("pressure_score")
|
|
1273
|
+
if isinstance(pressure_score, (float, int)):
|
|
1274
|
+
throttle_pressure_score = float(pressure_score)
|
|
1275
|
+
effective_page_concurrency = runtime_page_concurrency
|
|
1276
|
+
batch_size = min(runtime_page_concurrency, remaining)
|
|
1277
|
+
max_inflight_workers = max(max_inflight_workers, batch_size)
|
|
1278
|
+
|
|
1279
|
+
batch: list[tuple[str, str | None, int]] = []
|
|
1280
|
+
while queue and len(batch) < batch_size:
|
|
1281
|
+
page_id, parent_id, depth = queue.pop(0)
|
|
1282
|
+
if page_id in visited_pages:
|
|
1283
|
+
skipped_already_visited += 1
|
|
1284
|
+
continue
|
|
1285
|
+
visited_pages.add(page_id)
|
|
1286
|
+
nodes_processed += 1
|
|
1287
|
+
batch.append((page_id, parent_id, depth))
|
|
1288
|
+
|
|
1289
|
+
crawl_state.pending_queue = list(queue)
|
|
1290
|
+
if not batch:
|
|
1291
|
+
continue
|
|
1292
|
+
|
|
1293
|
+
fetched: list[_FetchedPage] = []
|
|
1294
|
+
if batch_size == 1:
|
|
1295
|
+
fetched = [_fetch_page(node) for node in batch]
|
|
1296
|
+
else:
|
|
1297
|
+
with ThreadPoolExecutor(max_workers=batch_size) as executor:
|
|
1298
|
+
futures: dict[Future[_FetchedPage], int] = {
|
|
1299
|
+
executor.submit(_fetch_page, node): idx for idx, node in enumerate(batch)
|
|
1300
|
+
}
|
|
1301
|
+
fetched_by_index: dict[int, _FetchedPage] = {}
|
|
1302
|
+
for future, idx in futures.items():
|
|
1303
|
+
fetched_by_index[idx] = future.result()
|
|
1304
|
+
fetched = [fetched_by_index[idx] for idx in range(len(batch))]
|
|
1305
|
+
|
|
1306
|
+
for result in fetched:
|
|
1307
|
+
page_id = result.page_id
|
|
1308
|
+
parent_id = result.parent_id
|
|
1309
|
+
depth = result.depth
|
|
1310
|
+
|
|
1311
|
+
if result.status == "unchanged":
|
|
1312
|
+
skipped_unchanged += 1
|
|
1313
|
+
unchanged_page_ids.add(page_id)
|
|
1314
|
+
previous_meta = previous_page_meta_by_id.get(page_id)
|
|
1315
|
+
if previous_meta is None:
|
|
1316
|
+
previous_meta = _meta_from_existing_html(
|
|
1317
|
+
page_id=page_id,
|
|
1318
|
+
parent_id=parent_id,
|
|
1319
|
+
depth=depth,
|
|
1320
|
+
page_payload=result.version_payload if isinstance(result.version_payload, dict) else None,
|
|
1321
|
+
)
|
|
1322
|
+
if previous_meta is not None:
|
|
1323
|
+
if depth == 0:
|
|
1324
|
+
root_meta = previous_meta
|
|
1325
|
+
else:
|
|
1326
|
+
child_pages.append(previous_meta)
|
|
1327
|
+
if options.include_attachments:
|
|
1328
|
+
for previous_attachment in previous_attachments_by_page.get(page_id, []):
|
|
1329
|
+
attachment_id = previous_attachment.get("attachment_id")
|
|
1330
|
+
if attachment_id is not None:
|
|
1331
|
+
downloaded_attachments.add(str(attachment_id))
|
|
1332
|
+
attachments.append(previous_attachment)
|
|
1333
|
+
for child_id, child_parent, child_depth in result.children:
|
|
1334
|
+
if child_id in visited_pages:
|
|
1335
|
+
continue
|
|
1336
|
+
discovered_page_ids.add(child_id)
|
|
1337
|
+
queue.append((child_id, child_parent, child_depth))
|
|
1338
|
+
continue
|
|
1339
|
+
|
|
1340
|
+
if result.status == "error":
|
|
1341
|
+
pages_failed += 1
|
|
1342
|
+
meta = {
|
|
1343
|
+
"page_id": page_id,
|
|
1344
|
+
"title": None,
|
|
1345
|
+
"page_url": None,
|
|
1346
|
+
"parent_id": parent_id,
|
|
1347
|
+
"depth": depth,
|
|
1348
|
+
"status": "error",
|
|
1349
|
+
"error": result.error,
|
|
1350
|
+
}
|
|
1351
|
+
if depth == 0:
|
|
1352
|
+
root_meta = meta
|
|
1353
|
+
else:
|
|
1354
|
+
child_pages.append(meta)
|
|
1355
|
+
continue
|
|
1356
|
+
|
|
1357
|
+
pages_fetched += 1
|
|
1358
|
+
page = result.page_payload if isinstance(result.page_payload, dict) else {}
|
|
1359
|
+
title = str(page.get("title") or "untitled")
|
|
1360
|
+
safe_title = _safe_filename(title)
|
|
1361
|
+
body_obj = page.get("body") or {}
|
|
1362
|
+
view_html = (body_obj.get("view") or {}).get("value", "")
|
|
1363
|
+
export_html = (body_obj.get("export_view") or {}).get("value", "")
|
|
1364
|
+
storage_html = (body_obj.get("storage") or {}).get("value", "")
|
|
1365
|
+
candidates = {
|
|
1366
|
+
"view": view_html if isinstance(view_html, str) else "",
|
|
1367
|
+
"export_view": export_html if isinstance(export_html, str) else "",
|
|
1368
|
+
"storage": storage_html if isinstance(storage_html, str) else "",
|
|
1369
|
+
}
|
|
1370
|
+
chosen_html = candidates.get(body_format, "") or ""
|
|
1371
|
+
chosen_format = body_format
|
|
1372
|
+
if not chosen_html.strip():
|
|
1373
|
+
fallback_order = (
|
|
1374
|
+
["view", "storage", "export_view"]
|
|
1375
|
+
if body_format == "view"
|
|
1376
|
+
else ["storage", "view", "export_view"]
|
|
1377
|
+
if body_format == "storage"
|
|
1378
|
+
else ["export_view", "view", "storage"]
|
|
1379
|
+
)
|
|
1380
|
+
for fmt in fallback_order:
|
|
1381
|
+
candidate = candidates.get(fmt, "") or ""
|
|
1382
|
+
if isinstance(candidate, str) and candidate.strip():
|
|
1383
|
+
chosen_html = candidate
|
|
1384
|
+
chosen_format = fmt
|
|
1385
|
+
break
|
|
1386
|
+
|
|
1387
|
+
html = chosen_html if isinstance(chosen_html, str) else ""
|
|
1388
|
+
source_format = f"html-{chosen_format.replace('_', '-')}"
|
|
1389
|
+
|
|
1390
|
+
links = page.get("_links", {}) if isinstance(page.get("_links"), dict) else {}
|
|
1391
|
+
base_url = str(links.get("base") or "")
|
|
1392
|
+
web_ui = str(links.get("webui") or "")
|
|
1393
|
+
page_url = f"{base_url}{web_ui}" if base_url and web_ui else None
|
|
1394
|
+
|
|
1395
|
+
version = page.get("version") or {}
|
|
1396
|
+
page_version = version.get("number") if isinstance(version, dict) else None
|
|
1397
|
+
page_updated = version.get("when") if isinstance(version, dict) else None
|
|
1398
|
+
space_key = (page.get("space") or {}).get("key") if isinstance(page.get("space"), dict) else None
|
|
1399
|
+
|
|
1400
|
+
ancestors = page.get("ancestors") or []
|
|
1401
|
+
ancestor_ids: list[str] = []
|
|
1402
|
+
ancestor_titles: list[str] = []
|
|
1403
|
+
if isinstance(ancestors, list):
|
|
1404
|
+
for entry in ancestors:
|
|
1405
|
+
if not isinstance(entry, dict):
|
|
1406
|
+
continue
|
|
1407
|
+
if entry.get("id") is not None:
|
|
1408
|
+
ancestor_ids.append(str(entry["id"]))
|
|
1409
|
+
if entry.get("title"):
|
|
1410
|
+
ancestor_titles.append(str(entry["title"]))
|
|
1411
|
+
|
|
1412
|
+
html_path = output_dir / f"{page_id}_{safe_title}.html"
|
|
1413
|
+
html_path.write_text(html, encoding="utf-8")
|
|
1414
|
+
html_bytes = html.encode("utf-8")
|
|
1415
|
+
html_sha256 = _sha256_bytes(html_bytes)
|
|
1416
|
+
crawl_state.record_page(page_id, page_version, html_sha256)
|
|
1417
|
+
|
|
1418
|
+
meta = {
|
|
1419
|
+
"page_id": page_id,
|
|
1420
|
+
"title": title,
|
|
1421
|
+
"page_url": page_url,
|
|
1422
|
+
"parent_id": parent_id,
|
|
1423
|
+
"depth": depth,
|
|
1424
|
+
"status": "downloaded",
|
|
1425
|
+
"html_path": str(html_path),
|
|
1426
|
+
"html_sha256": html_sha256,
|
|
1427
|
+
"html_size_bytes": html_path.stat().st_size,
|
|
1428
|
+
"source_format": source_format,
|
|
1429
|
+
"space_key": space_key,
|
|
1430
|
+
"page_version": page_version,
|
|
1431
|
+
"page_updated": page_updated,
|
|
1432
|
+
"ancestor_ids": ancestor_ids,
|
|
1433
|
+
"ancestor_titles": ancestor_titles,
|
|
1434
|
+
"error": None,
|
|
1435
|
+
}
|
|
1436
|
+
if depth == 0:
|
|
1437
|
+
root_meta = meta
|
|
1438
|
+
else:
|
|
1439
|
+
child_pages.append(meta)
|
|
1440
|
+
|
|
1441
|
+
if options.emit_link_graph:
|
|
1442
|
+
for link in _extract_links(html, base_url=base_url):
|
|
1443
|
+
target_url = link.get("url")
|
|
1444
|
+
target_type = _classify_target_type(str(target_url)) if target_url else None
|
|
1445
|
+
target_page_id = _extract_target_page_id(str(target_url)) if target_url else None
|
|
1446
|
+
edge: dict[str, Any] = {
|
|
1447
|
+
"source_page_id": page_id,
|
|
1448
|
+
"source_page_url": page_url,
|
|
1449
|
+
"target_url": target_url,
|
|
1450
|
+
"anchor_text": link.get("text"),
|
|
1451
|
+
}
|
|
1452
|
+
if target_type:
|
|
1453
|
+
edge["target_type"] = target_type
|
|
1454
|
+
if target_type == "confluence_page" and target_page_id:
|
|
1455
|
+
edge["target_page_id"] = target_page_id
|
|
1456
|
+
link_edges.append(edge)
|
|
1457
|
+
|
|
1458
|
+
if options.emit_corpus:
|
|
1459
|
+
for chunk in _chunk_html_by_headings(html, max_chars=chunk_max_chars):
|
|
1460
|
+
heading_path = chunk.get("heading_path") or []
|
|
1461
|
+
ordinal = chunk.get("ordinal") if isinstance(chunk.get("ordinal"), int) else 0
|
|
1462
|
+
base_key = f"{page_id}|{'/'.join(heading_path)}"
|
|
1463
|
+
key = base_key if ordinal == 0 else f"{base_key}|{ordinal}"
|
|
1464
|
+
corpus_chunks.append(
|
|
1465
|
+
{
|
|
1466
|
+
"chunk_id": _sha256_text(key)[:16],
|
|
1467
|
+
"page_id": page_id,
|
|
1468
|
+
"page_url": page_url,
|
|
1469
|
+
"title": title,
|
|
1470
|
+
"depth": depth,
|
|
1471
|
+
"parent_id": parent_id,
|
|
1472
|
+
"ancestors": ancestor_ids,
|
|
1473
|
+
"ancestor_ids": ancestor_ids,
|
|
1474
|
+
"space_key": space_key,
|
|
1475
|
+
"page_version": page_version,
|
|
1476
|
+
"page_updated": page_updated,
|
|
1477
|
+
"ancestor_titles": ancestor_titles,
|
|
1478
|
+
"heading_path": heading_path,
|
|
1479
|
+
"chunk_ordinal": ordinal,
|
|
1480
|
+
"text": chunk.get("text"),
|
|
1481
|
+
"source_type": "confluence",
|
|
1482
|
+
"doc_class": "canonical_doc",
|
|
1483
|
+
"generated_flag": False,
|
|
1484
|
+
"historical_flag": False,
|
|
1485
|
+
"discovery": "confluence_crawl_page",
|
|
1486
|
+
}
|
|
1487
|
+
)
|
|
1488
|
+
|
|
1489
|
+
if options.emit_tables:
|
|
1490
|
+
for table in _extract_tables(html):
|
|
1491
|
+
key = f"{page_id}|table|{table.get('table_index')}"
|
|
1492
|
+
tables.append(
|
|
1493
|
+
{
|
|
1494
|
+
"table_id": _sha256_text(key)[:16],
|
|
1495
|
+
"page_id": page_id,
|
|
1496
|
+
"page_url": page_url,
|
|
1497
|
+
"title": title,
|
|
1498
|
+
"depth": depth,
|
|
1499
|
+
"parent_id": parent_id,
|
|
1500
|
+
"space_key": space_key,
|
|
1501
|
+
"page_version": page_version,
|
|
1502
|
+
"page_updated": page_updated,
|
|
1503
|
+
"ancestor_titles": ancestor_titles,
|
|
1504
|
+
"table_index": table.get("table_index"),
|
|
1505
|
+
"headers": table.get("headers"),
|
|
1506
|
+
"rows": table.get("rows"),
|
|
1507
|
+
"row_count": table.get("row_count"),
|
|
1508
|
+
"col_count": table.get("col_count"),
|
|
1509
|
+
}
|
|
1510
|
+
)
|
|
1511
|
+
|
|
1512
|
+
if options.emit_mentions:
|
|
1513
|
+
for index, mention in enumerate(_extract_mentions(html, base_url=base_url)):
|
|
1514
|
+
key = f"{page_id}|mention|{mention.get('type')}|{mention.get('value')}|{mention.get('url') or ''}"
|
|
1515
|
+
mentions.append(
|
|
1516
|
+
{
|
|
1517
|
+
"mention_id": _sha256_text(key)[:16],
|
|
1518
|
+
"page_id": page_id,
|
|
1519
|
+
"page_url": page_url,
|
|
1520
|
+
"title": title,
|
|
1521
|
+
"depth": depth,
|
|
1522
|
+
"parent_id": parent_id,
|
|
1523
|
+
"space_key": space_key,
|
|
1524
|
+
"page_version": page_version,
|
|
1525
|
+
"page_updated": page_updated,
|
|
1526
|
+
"ancestor_titles": ancestor_titles,
|
|
1527
|
+
"mention_index": index,
|
|
1528
|
+
**mention,
|
|
1529
|
+
}
|
|
1530
|
+
)
|
|
1531
|
+
|
|
1532
|
+
if options.include_attachments:
|
|
1533
|
+
try:
|
|
1534
|
+
attachment_results = http.get_attachments(page_id)
|
|
1535
|
+
except Exception as exc:
|
|
1536
|
+
attachments.append(
|
|
1537
|
+
{
|
|
1538
|
+
"attachment_id": None,
|
|
1539
|
+
"page_id": page_id,
|
|
1540
|
+
"title": None,
|
|
1541
|
+
"status": "error",
|
|
1542
|
+
"error": str(exc),
|
|
1543
|
+
}
|
|
1544
|
+
)
|
|
1545
|
+
attachment_results = []
|
|
1546
|
+
|
|
1547
|
+
for att in attachment_results:
|
|
1548
|
+
if not isinstance(att, dict):
|
|
1549
|
+
continue
|
|
1550
|
+
attachment_id = att.get("id") or att.get("attachment_id")
|
|
1551
|
+
if attachment_id is None:
|
|
1552
|
+
continue
|
|
1553
|
+
attachment_id = str(attachment_id)
|
|
1554
|
+
if attachment_id in downloaded_attachments:
|
|
1555
|
+
continue
|
|
1556
|
+
downloaded_attachments.add(attachment_id)
|
|
1557
|
+
|
|
1558
|
+
att_title = str(att.get("title") or att.get("name") or f"attachment_{attachment_id}")
|
|
1559
|
+
out_path = attachments_dir / _safe_attachment_filename(
|
|
1560
|
+
attachment_id, att_title, fallback_suffix=_guess_attachment_suffix(att)
|
|
1561
|
+
)
|
|
1562
|
+
|
|
1563
|
+
try:
|
|
1564
|
+
content = http.get_attachment_content(attachment_id)
|
|
1565
|
+
out_path.write_bytes(content)
|
|
1566
|
+
file_sha256 = _sha256_bytes(content)
|
|
1567
|
+
status = "downloaded"
|
|
1568
|
+
error = None
|
|
1569
|
+
except Exception as exc:
|
|
1570
|
+
file_sha256 = None
|
|
1571
|
+
status = "error"
|
|
1572
|
+
error = str(exc)
|
|
1573
|
+
|
|
1574
|
+
version_obj = att.get("version") or {}
|
|
1575
|
+
attachment_version = version_obj.get("number") if isinstance(version_obj, dict) else None
|
|
1576
|
+
extensions = att.get("extensions") or {}
|
|
1577
|
+
attachment_size = None
|
|
1578
|
+
if isinstance(extensions, dict):
|
|
1579
|
+
attachment_size = extensions.get("fileSize") or extensions.get("size")
|
|
1580
|
+
|
|
1581
|
+
attachments.append(
|
|
1582
|
+
{
|
|
1583
|
+
"attachment_id": attachment_id,
|
|
1584
|
+
"page_id": page_id,
|
|
1585
|
+
"title": att_title,
|
|
1586
|
+
"status": status,
|
|
1587
|
+
"file_path": str(out_path) if status == "downloaded" else None,
|
|
1588
|
+
"file_sha256": file_sha256,
|
|
1589
|
+
"file_size_bytes": out_path.stat().st_size
|
|
1590
|
+
if status == "downloaded" and out_path.exists()
|
|
1591
|
+
else None,
|
|
1592
|
+
"attachment_version": attachment_version,
|
|
1593
|
+
"attachment_size": attachment_size,
|
|
1594
|
+
"error": error,
|
|
1595
|
+
}
|
|
1596
|
+
)
|
|
1597
|
+
|
|
1598
|
+
for child_id, child_parent, child_depth in result.children:
|
|
1599
|
+
if child_id in visited_pages:
|
|
1600
|
+
continue
|
|
1601
|
+
discovered_page_ids.add(child_id)
|
|
1602
|
+
queue.append((child_id, child_parent, child_depth))
|
|
1603
|
+
|
|
1604
|
+
_emit_runtime_event(
|
|
1605
|
+
options,
|
|
1606
|
+
"crawl_progress",
|
|
1607
|
+
root_page_id=resolved_root_id,
|
|
1608
|
+
pages_discovered=len(discovered_page_ids),
|
|
1609
|
+
pages_fetched=pages_fetched,
|
|
1610
|
+
pages_failed=pages_failed,
|
|
1611
|
+
pages_skipped_unchanged=skipped_unchanged,
|
|
1612
|
+
pages_skipped_total=skipped_unchanged + skipped_already_visited,
|
|
1613
|
+
pending_queue_size=len(queue),
|
|
1614
|
+
effective_page_concurrency=effective_page_concurrency,
|
|
1615
|
+
effective_concurrency_signal=effective_concurrency_signal,
|
|
1616
|
+
throttle_pressure_score=throttle_pressure_score,
|
|
1617
|
+
)
|
|
1618
|
+
|
|
1619
|
+
if options.emit_corpus and previous_corpus_chunks_by_page:
|
|
1620
|
+
should_merge_unchanged_corpus = pages_fetched > 0 or bool(full_reconciliation_deleted_ids)
|
|
1621
|
+
if should_merge_unchanged_corpus:
|
|
1622
|
+
emitted_chunk_ids = {
|
|
1623
|
+
str(chunk.get("chunk_id") or "").strip()
|
|
1624
|
+
for chunk in corpus_chunks
|
|
1625
|
+
if isinstance(chunk, dict) and str(chunk.get("chunk_id") or "").strip()
|
|
1626
|
+
}
|
|
1627
|
+
for page_id in sorted(unchanged_page_ids):
|
|
1628
|
+
for previous_chunk in previous_corpus_chunks_by_page.get(page_id, []):
|
|
1629
|
+
if not isinstance(previous_chunk, dict):
|
|
1630
|
+
continue
|
|
1631
|
+
chunk_id = str(previous_chunk.get("chunk_id") or "").strip()
|
|
1632
|
+
if chunk_id and chunk_id in emitted_chunk_ids:
|
|
1633
|
+
continue
|
|
1634
|
+
corpus_chunks.append(dict(previous_chunk))
|
|
1635
|
+
if chunk_id:
|
|
1636
|
+
emitted_chunk_ids.add(chunk_id)
|
|
1637
|
+
|
|
1638
|
+
if structure_first_pending_queue:
|
|
1639
|
+
queued_page_ids = {queued_page_id for queued_page_id, _, _ in queue}
|
|
1640
|
+
for pending_page_id, pending_parent_id, pending_depth in structure_first_pending_queue:
|
|
1641
|
+
if pending_page_id in visited_pages or pending_page_id in queued_page_ids:
|
|
1642
|
+
continue
|
|
1643
|
+
queue.append((pending_page_id, pending_parent_id, pending_depth))
|
|
1644
|
+
queued_page_ids.add(pending_page_id)
|
|
1645
|
+
if queue:
|
|
1646
|
+
termination_reason = "max_nodes_reached"
|
|
1647
|
+
|
|
1648
|
+
# Update and save crawl state (TSK-188)
|
|
1649
|
+
http_telemetry: dict[str, Any] | None = None
|
|
1650
|
+
if callable(telemetry_getter):
|
|
1651
|
+
try:
|
|
1652
|
+
telemetry_payload = telemetry_getter()
|
|
1653
|
+
if isinstance(telemetry_payload, dict):
|
|
1654
|
+
http_telemetry = telemetry_payload
|
|
1655
|
+
except Exception:
|
|
1656
|
+
http_telemetry = None
|
|
1657
|
+
if http_telemetry is None:
|
|
1658
|
+
http_telemetry = {}
|
|
1659
|
+
|
|
1660
|
+
elapsed_seconds = max(0.001, time.perf_counter() - crawl_started_at)
|
|
1661
|
+
throughput_pages_per_second = round(nodes_processed / elapsed_seconds, 3)
|
|
1662
|
+
latency_metrics = http_telemetry.get("latency_ms") if isinstance(http_telemetry, dict) else None
|
|
1663
|
+
p95_page_latency_ms = None
|
|
1664
|
+
if isinstance(latency_metrics, dict):
|
|
1665
|
+
operation_latency = latency_metrics.get("operation")
|
|
1666
|
+
if isinstance(operation_latency, dict):
|
|
1667
|
+
p95_value = operation_latency.get("p95")
|
|
1668
|
+
if isinstance(p95_value, (float, int)):
|
|
1669
|
+
p95_page_latency_ms = round(float(p95_value), 3)
|
|
1670
|
+
|
|
1671
|
+
retry_events_total = None
|
|
1672
|
+
if isinstance(http_telemetry, dict):
|
|
1673
|
+
retry_events_list = http_telemetry.get("recent_retry_events")
|
|
1674
|
+
if isinstance(retry_events_list, list):
|
|
1675
|
+
retry_events_total = len(retry_events_list)
|
|
1676
|
+
|
|
1677
|
+
if isinstance(http_telemetry, dict):
|
|
1678
|
+
http_telemetry["inflight_workers"] = max_inflight_workers
|
|
1679
|
+
http_telemetry["queue_depth"] = len(queue)
|
|
1680
|
+
http_telemetry["pages_per_second"] = throughput_pages_per_second
|
|
1681
|
+
http_telemetry["retry_events"] = retry_events_total
|
|
1682
|
+
http_telemetry["p95_page_latency_ms"] = p95_page_latency_ms
|
|
1683
|
+
adaptive_throttle = http_telemetry.get("adaptive_throttle")
|
|
1684
|
+
if isinstance(adaptive_throttle, dict):
|
|
1685
|
+
signal_value = adaptive_throttle.get("effective_concurrency_signal")
|
|
1686
|
+
if isinstance(signal_value, (float, int)):
|
|
1687
|
+
effective_concurrency_signal = float(signal_value)
|
|
1688
|
+
pressure_value = adaptive_throttle.get("pressure_score")
|
|
1689
|
+
if isinstance(pressure_value, (float, int)):
|
|
1690
|
+
throttle_pressure_score = float(pressure_value)
|
|
1691
|
+
|
|
1692
|
+
crawl_state.last_sync_timestamp = _now_iso()
|
|
1693
|
+
pages_skipped = skipped_unchanged + skipped_already_visited
|
|
1694
|
+
progress_event = {
|
|
1695
|
+
"pages_discovered": len(discovered_page_ids),
|
|
1696
|
+
"pages_fetched": pages_fetched,
|
|
1697
|
+
"pages_failed": pages_failed,
|
|
1698
|
+
"pages_skipped": pages_skipped,
|
|
1699
|
+
"pages_skipped_unchanged": skipped_unchanged,
|
|
1700
|
+
"throttle": {
|
|
1701
|
+
"configured_page_concurrency": page_concurrency,
|
|
1702
|
+
"effective_page_concurrency": effective_page_concurrency,
|
|
1703
|
+
"effective_concurrency_signal": effective_concurrency_signal,
|
|
1704
|
+
"pressure_score": throttle_pressure_score,
|
|
1705
|
+
},
|
|
1706
|
+
}
|
|
1707
|
+
|
|
1708
|
+
crawl_state.pending_queue = list(queue)
|
|
1709
|
+
crawl_state.completed = len(queue) == 0
|
|
1710
|
+
if crawl_state.completed:
|
|
1711
|
+
termination_reason = "queue_exhausted"
|
|
1712
|
+
crawl_state.run_diagnostics = {
|
|
1713
|
+
"termination_reason": termination_reason,
|
|
1714
|
+
"nodes_processed": nodes_processed,
|
|
1715
|
+
"skipped_unchanged": skipped_unchanged,
|
|
1716
|
+
"skipped_already_visited": skipped_already_visited,
|
|
1717
|
+
"pages_fetched": pages_fetched,
|
|
1718
|
+
"pages_failed": pages_failed,
|
|
1719
|
+
"pages_discovered": len(discovered_page_ids),
|
|
1720
|
+
"pages_skipped": pages_skipped,
|
|
1721
|
+
"pending_queue_size": len(queue),
|
|
1722
|
+
"page_concurrency": page_concurrency,
|
|
1723
|
+
"staged_mode": staged_mode,
|
|
1724
|
+
"incremental_cql": bool(options.incremental_cql),
|
|
1725
|
+
"rate_limit_budget": int(options.rate_limit_budget),
|
|
1726
|
+
"full_reconciliation_interval_days": max(1, int(options.full_reconciliation_interval_days)),
|
|
1727
|
+
"last_sync_timestamp": crawl_state.last_sync_timestamp,
|
|
1728
|
+
"last_full_reconciliation_timestamp": crawl_state.last_full_reconciliation_timestamp,
|
|
1729
|
+
"incremental_overlap_minutes": max(0, int(options.incremental_overlap_minutes)),
|
|
1730
|
+
"incremental_known_page_count": incremental_known_page_count,
|
|
1731
|
+
"incremental_candidate_count": len(incremental_candidate_ids),
|
|
1732
|
+
"incremental_high_water_mark_ran": incremental_high_water_mark_ran,
|
|
1733
|
+
"incremental_high_water_mark_skipped": incremental_high_water_mark_skipped,
|
|
1734
|
+
"incremental_high_water_mark_skip_reason": incremental_high_water_mark_skip_reason,
|
|
1735
|
+
"full_reconciliation_ran": full_reconciliation_ran,
|
|
1736
|
+
"full_reconciliation_deleted_ids": full_reconciliation_deleted_ids,
|
|
1737
|
+
"full_reconciliation_skipped": full_reconciliation_skipped,
|
|
1738
|
+
"full_reconciliation_skip_reason": full_reconciliation_skip_reason,
|
|
1739
|
+
"full_reconciliation_known_page_count": full_reconciliation_known_page_count,
|
|
1740
|
+
"structure_pre_discovered_nodes": structure_pre_discovered_nodes if use_structure_first else None,
|
|
1741
|
+
"progress_event": progress_event,
|
|
1742
|
+
"telemetry": http_telemetry,
|
|
1743
|
+
"http_telemetry": http_telemetry,
|
|
1744
|
+
}
|
|
1745
|
+
_save_crawl_state(crawl_state, state_path)
|
|
1746
|
+
|
|
1747
|
+
payload: dict[str, Any] = {
|
|
1748
|
+
"root_ref": root_ref,
|
|
1749
|
+
"root_page_id": resolved_root_id,
|
|
1750
|
+
"out_dir": str(output_dir),
|
|
1751
|
+
"generated_at": _now_iso(),
|
|
1752
|
+
"body_format": body_format,
|
|
1753
|
+
"chunk_max_chars": chunk_max_chars,
|
|
1754
|
+
"crawl_mode": {
|
|
1755
|
+
"page_concurrency": page_concurrency,
|
|
1756
|
+
"staged_mode": staged_mode,
|
|
1757
|
+
"incremental_cql": bool(options.incremental_cql),
|
|
1758
|
+
"rate_limit_budget": int(options.rate_limit_budget),
|
|
1759
|
+
"incremental_overlap_minutes": max(0, int(options.incremental_overlap_minutes)),
|
|
1760
|
+
"full_reconciliation_interval_days": max(1, int(options.full_reconciliation_interval_days)),
|
|
1761
|
+
},
|
|
1762
|
+
"page": root_meta,
|
|
1763
|
+
"child_pages": child_pages,
|
|
1764
|
+
"attachments": attachments,
|
|
1765
|
+
"corpus_path": None,
|
|
1766
|
+
"link_graph_path": None,
|
|
1767
|
+
"tables_path": None,
|
|
1768
|
+
"mentions_path": None,
|
|
1769
|
+
"progress_event": progress_event,
|
|
1770
|
+
"resume_state": {
|
|
1771
|
+
"skipped_unchanged": skipped_unchanged,
|
|
1772
|
+
"skipped_already_visited": skipped_already_visited,
|
|
1773
|
+
"pages_fetched": pages_fetched,
|
|
1774
|
+
"pages_failed": pages_failed,
|
|
1775
|
+
"pages_discovered": len(discovered_page_ids),
|
|
1776
|
+
"pages_skipped": pages_skipped,
|
|
1777
|
+
"pages_skipped_unchanged": skipped_unchanged,
|
|
1778
|
+
"nodes_processed": nodes_processed,
|
|
1779
|
+
"completed": crawl_state.completed,
|
|
1780
|
+
"termination_reason": termination_reason,
|
|
1781
|
+
"pending_queue_size": len(queue),
|
|
1782
|
+
"state_path": str(state_path),
|
|
1783
|
+
"page_concurrency": page_concurrency,
|
|
1784
|
+
"staged_mode": staged_mode,
|
|
1785
|
+
"last_sync_timestamp": crawl_state.last_sync_timestamp,
|
|
1786
|
+
"last_full_reconciliation_timestamp": crawl_state.last_full_reconciliation_timestamp,
|
|
1787
|
+
},
|
|
1788
|
+
"telemetry": http_telemetry,
|
|
1789
|
+
"http_telemetry": http_telemetry,
|
|
1790
|
+
}
|
|
1791
|
+
|
|
1792
|
+
if options.emit_corpus:
|
|
1793
|
+
corpus_path = output_dir / "corpus.jsonl"
|
|
1794
|
+
if corpus_chunks or not (options.resume and corpus_path.exists()):
|
|
1795
|
+
with corpus_path.open("w", encoding="utf-8") as fp:
|
|
1796
|
+
for item in corpus_chunks:
|
|
1797
|
+
fp.write(json.dumps(item, ensure_ascii=False) + "\n")
|
|
1798
|
+
payload["corpus_path"] = str(corpus_path)
|
|
1799
|
+
|
|
1800
|
+
if options.emit_link_graph:
|
|
1801
|
+
graph_path = output_dir / "link-graph.json"
|
|
1802
|
+
if link_edges or not (options.resume and graph_path.exists()):
|
|
1803
|
+
graph_payload = {
|
|
1804
|
+
"generated_at": payload["generated_at"],
|
|
1805
|
+
"root_page_id": resolved_root_id,
|
|
1806
|
+
"edges": [edge for edge in link_edges if edge.get("target_url")],
|
|
1807
|
+
}
|
|
1808
|
+
graph_path.write_text(json.dumps(graph_payload, ensure_ascii=False, indent=2), encoding="utf-8")
|
|
1809
|
+
payload["link_graph_path"] = str(graph_path)
|
|
1810
|
+
|
|
1811
|
+
if options.emit_tables:
|
|
1812
|
+
tables_path = output_dir / "tables.jsonl"
|
|
1813
|
+
if tables or not (options.resume and tables_path.exists()):
|
|
1814
|
+
with tables_path.open("w", encoding="utf-8") as fp:
|
|
1815
|
+
for item in tables:
|
|
1816
|
+
fp.write(json.dumps(item, ensure_ascii=False) + "\n")
|
|
1817
|
+
payload["tables_path"] = str(tables_path)
|
|
1818
|
+
|
|
1819
|
+
if options.emit_mentions:
|
|
1820
|
+
mentions_path = output_dir / "mentions.jsonl"
|
|
1821
|
+
if mentions or not (options.resume and mentions_path.exists()):
|
|
1822
|
+
with mentions_path.open("w", encoding="utf-8") as fp:
|
|
1823
|
+
for item in mentions:
|
|
1824
|
+
fp.write(json.dumps(item, ensure_ascii=False) + "\n")
|
|
1825
|
+
payload["mentions_path"] = str(mentions_path)
|
|
1826
|
+
|
|
1827
|
+
resolved_manifest_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
|
|
1828
|
+
payload["manifest_path"] = str(resolved_manifest_path)
|
|
1829
|
+
|
|
1830
|
+
# Create bundle zip if requested (TSK-189)
|
|
1831
|
+
if options.bundle:
|
|
1832
|
+
bundle_path = _create_bundle_zip(output_dir, resolved_root_id, resolved_manifest_path)
|
|
1833
|
+
payload["bundle_path"] = str(bundle_path)
|
|
1834
|
+
|
|
1835
|
+
return payload
|