@ngocsangairvds/vsaf 3.1.27 → 3.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (303) hide show
  1. package/package.json +2 -2
  2. package/src/global.js +70 -10
  3. package/tools/skills/vds-scripts-skill/.openskills.json +6 -0
  4. package/tools/skills/vds-scripts-skill/QUALITY.md +44 -0
  5. package/tools/skills/vds-scripts-skill/SKILL.md +135 -0
  6. package/tools/skills/vds-scripts-skill/references/audit-commands.md +171 -0
  7. package/tools/skills/vds-scripts-skill/references/capability-index.md +34 -0
  8. package/tools/skills/vds-scripts-skill/references/development-commands.md +12 -0
  9. package/tools/skills/vds-scripts-skill/references/google-sheets.md +73 -0
  10. package/tools/skills/vds-scripts-skill/references/integration-commands.md +17 -0
  11. package/tools/skills/vds-scripts-skill/references/platform-bootstrap.md +31 -0
  12. package/tools/skills/vds-scripts-skill/references/specialist-routing.md +14 -0
  13. package/tools/skills/vds-scripts-skill/references/validation-commands.md +15 -0
  14. package/tools/skills/vsaf-build/SKILL.md +32 -2
  15. package/tools/skills/vsaf-ship/SKILL.md +41 -10
  16. package/tools/skills/vsaf-test/SKILL.md +8 -0
  17. package/tools/vds-scripts/.mcp.json +11 -0
  18. package/tools/vds-scripts/.secrets.baseline +133 -0
  19. package/tools/vds-scripts/AGENTS.md +152 -0
  20. package/tools/vds-scripts/CLAUDE.md +101 -0
  21. package/tools/vds-scripts/CLI_COMMAND_OPTIMIZATION.md +156 -0
  22. package/tools/vds-scripts/PACKAGE_P125B_IMPLEMENTATION_SUMMARY.md +131 -0
  23. package/tools/vds-scripts/PROJECT_COMPLETION_SUMMARY.md +45 -0
  24. package/tools/vds-scripts/README.md +97 -0
  25. package/tools/vds-scripts/bitbucket_manifest_mapping.toml +34 -0
  26. package/tools/vds-scripts/bitbucket_orchestrator/ARCHITECTURE_ANALYSIS.md +258 -0
  27. package/tools/vds-scripts/bitbucket_orchestrator/BITBUCKET_API_PRACTICES.md +393 -0
  28. package/tools/vds-scripts/bitbucket_orchestrator/EVALUATION_REPORT.md +61 -0
  29. package/tools/vds-scripts/bitbucket_orchestrator/FEATURES.md +908 -0
  30. package/tools/vds-scripts/bitbucket_orchestrator/README.md +687 -0
  31. package/tools/vds-scripts/bitbucket_orchestrator/pyproject.toml +40 -0
  32. package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/__init__.py +20 -0
  33. package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/async_client.py +657 -0
  34. package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/cli.py +2108 -0
  35. package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/client.py +2534 -0
  36. package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/config.py +171 -0
  37. package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/errors.py +67 -0
  38. package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/factory.py +185 -0
  39. package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/protocols.py +244 -0
  40. package/tools/vds-scripts/bitbucket_orchestrator/tests/__init__.py +8 -0
  41. package/tools/vds-scripts/bitbucket_orchestrator/tests/conftest.py +65 -0
  42. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_advanced_search.py +151 -0
  43. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_async_client.py +546 -0
  44. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_branch_permissions.py +145 -0
  45. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_cli.py +115 -0
  46. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client.py +157 -0
  47. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_branch_conditions.py +79 -0
  48. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_code_advanced.py +163 -0
  49. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_code_file.py +32 -0
  50. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_deployment_environments.py +194 -0
  51. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_issues.py +164 -0
  52. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_pipelines_advanced.py +179 -0
  53. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_pr_blockers.py +119 -0
  54. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_repository_variables.py +156 -0
  55. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_code.py +98 -0
  56. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_code_advanced.py +282 -0
  57. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_code_insights.py +335 -0
  58. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_conditions.py +147 -0
  59. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_config.py +131 -0
  60. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_deployment_env.py +352 -0
  61. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_factory.py +371 -0
  62. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_fork_operations.py +204 -0
  63. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_issue_cli.py +261 -0
  64. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_pipeline_advanced.py +270 -0
  65. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_pr_blocker.py +204 -0
  66. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_protocols.py +334 -0
  67. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_repo_settings.py +343 -0
  68. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_repo_variables.py +270 -0
  69. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_webhooks.py +189 -0
  70. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_workspace.py +233 -0
  71. package/tools/vds-scripts/bitbucket_orchestrator/uv.lock +742 -0
  72. package/tools/vds-scripts/confluence_orchestrator/Dockerfile +19 -0
  73. package/tools/vds-scripts/confluence_orchestrator/README.md +412 -0
  74. package/tools/vds-scripts/confluence_orchestrator/SYNC_SCRIPTS.md +127 -0
  75. package/tools/vds-scripts/confluence_orchestrator/SYNC_STANDARDIZATION.md +108 -0
  76. package/tools/vds-scripts/confluence_orchestrator/pyproject.toml +48 -0
  77. package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/__init__.py +20 -0
  78. package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/cli.py +2532 -0
  79. package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/config.py +175 -0
  80. package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/content.py +290 -0
  81. package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/content_v2.py +94 -0
  82. package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/crawl_tree.py +1835 -0
  83. package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/errors.py +80 -0
  84. package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/eventing.py +109 -0
  85. package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/http.py +1114 -0
  86. package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/orchestration.py +165 -0
  87. package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/reporting.py +78 -0
  88. package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/tree.py +121 -0
  89. package/tools/vds-scripts/confluence_orchestrator/sync_pdfs_from_markdown.py +213 -0
  90. package/tools/vds-scripts/confluence_orchestrator/sync_pdfs_to_confluence.py +305 -0
  91. package/tools/vds-scripts/confluence_orchestrator/sync_png_attachments.py +305 -0
  92. package/tools/vds-scripts/confluence_orchestrator/tests/__init__.py +0 -0
  93. package/tools/vds-scripts/confluence_orchestrator/tests/conftest.py +8 -0
  94. package/tools/vds-scripts/confluence_orchestrator/tests/test_advanced_content.py +224 -0
  95. package/tools/vds-scripts/confluence_orchestrator/tests/test_advanced_search.py +188 -0
  96. package/tools/vds-scripts/confluence_orchestrator/tests/test_cache_management.py +247 -0
  97. package/tools/vds-scripts/confluence_orchestrator/tests/test_cli.py +499 -0
  98. package/tools/vds-scripts/confluence_orchestrator/tests/test_config.py +83 -0
  99. package/tools/vds-scripts/confluence_orchestrator/tests/test_content.py +186 -0
  100. package/tools/vds-scripts/confluence_orchestrator/tests/test_content_flags.py +27 -0
  101. package/tools/vds-scripts/confluence_orchestrator/tests/test_crawl_tree.py +2250 -0
  102. package/tools/vds-scripts/confluence_orchestrator/tests/test_draft_management.py +223 -0
  103. package/tools/vds-scripts/confluence_orchestrator/tests/test_eventing.py +71 -0
  104. package/tools/vds-scripts/confluence_orchestrator/tests/test_eventing_chaos.py +37 -0
  105. package/tools/vds-scripts/confluence_orchestrator/tests/test_eventing_rate_limit.py +44 -0
  106. package/tools/vds-scripts/confluence_orchestrator/tests/test_eventing_timeout.py +49 -0
  107. package/tools/vds-scripts/confluence_orchestrator/tests/test_export.py +230 -0
  108. package/tools/vds-scripts/confluence_orchestrator/tests/test_history.py +204 -0
  109. package/tools/vds-scripts/confluence_orchestrator/tests/test_http.py +117 -0
  110. package/tools/vds-scripts/confluence_orchestrator/tests/test_orchestration.py +91 -0
  111. package/tools/vds-scripts/confluence_orchestrator/tests/test_reporting.py +24 -0
  112. package/tools/vds-scripts/confluence_orchestrator/tests/test_search_cql.py +34 -0
  113. package/tools/vds-scripts/confluence_orchestrator/tests/test_space_management.py +237 -0
  114. package/tools/vds-scripts/confluence_orchestrator/tests/test_space_permissions.py +332 -0
  115. package/tools/vds-scripts/confluence_orchestrator/tests/test_user_group_management.py +388 -0
  116. package/tools/vds-scripts/confluence_orchestrator/uv.lock +1023 -0
  117. package/tools/vds-scripts/git_orchestrator/ENHANCEMENT_SUMMARY.md +119 -0
  118. package/tools/vds-scripts/git_orchestrator/README.md +280 -0
  119. package/tools/vds-scripts/git_orchestrator/VERIFICATION_REPORT.md +152 -0
  120. package/tools/vds-scripts/git_orchestrator/pyproject.toml +35 -0
  121. package/tools/vds-scripts/git_orchestrator/src/vds_git_orchestrator/__init__.py +7 -0
  122. package/tools/vds-scripts/git_orchestrator/src/vds_git_orchestrator/__main__.py +4 -0
  123. package/tools/vds-scripts/git_orchestrator/src/vds_git_orchestrator/cli.py +847 -0
  124. package/tools/vds-scripts/git_orchestrator/src/vds_git_orchestrator/logging_config.py +63 -0
  125. package/tools/vds-scripts/git_orchestrator/src/vds_git_orchestrator/manifest.py +129 -0
  126. package/tools/vds-scripts/git_orchestrator/src/vds_git_orchestrator/orchestrator.py +819 -0
  127. package/tools/vds-scripts/git_orchestrator/src/vds_git_orchestrator/reporting.py +53 -0
  128. package/tools/vds-scripts/git_orchestrator/tests/__init__.py +0 -0
  129. package/tools/vds-scripts/git_orchestrator/tests/test_cli_settings.py +21 -0
  130. package/tools/vds-scripts/git_orchestrator/tests/test_integration.py +74 -0
  131. package/tools/vds-scripts/git_orchestrator/tests/test_manifest.py +79 -0
  132. package/tools/vds-scripts/git_orchestrator/tests/test_orchestrator.py +204 -0
  133. package/tools/vds-scripts/git_orchestrator/tests/test_public_api.py +236 -0
  134. package/tools/vds-scripts/git_orchestrator/tests/test_resilience.py +345 -0
  135. package/tools/vds-scripts/git_orchestrator/uv.lock +271 -0
  136. package/tools/vds-scripts/jira_orchestrator/README.md +770 -0
  137. package/tools/vds-scripts/jira_orchestrator/pyproject.toml +39 -0
  138. package/tools/vds-scripts/jira_orchestrator/src/vds_jira_orchestrator/__init__.py +1 -0
  139. package/tools/vds-scripts/jira_orchestrator/src/vds_jira_orchestrator/adapter.py +1320 -0
  140. package/tools/vds-scripts/jira_orchestrator/src/vds_jira_orchestrator/cli.py +2271 -0
  141. package/tools/vds-scripts/jira_orchestrator/src/vds_jira_orchestrator/config.py +138 -0
  142. package/tools/vds-scripts/jira_orchestrator/src/vds_jira_orchestrator/errors.py +67 -0
  143. package/tools/vds-scripts/jira_orchestrator/src/vds_jira_orchestrator/reporting.py +65 -0
  144. package/tools/vds-scripts/jira_orchestrator/tests/__init__.py +1 -0
  145. package/tools/vds-scripts/jira_orchestrator/tests/conftest.py +86 -0
  146. package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_agile_list_payloads.py +54 -0
  147. package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_bulk_operations.py +69 -0
  148. package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_components.py +57 -0
  149. package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_createmeta.py +45 -0
  150. package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_dashboard.py +117 -0
  151. package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_issue_properties.py +54 -0
  152. package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_permissions_compat.py +42 -0
  153. package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_reindex.py +42 -0
  154. package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_remote_links.py +76 -0
  155. package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_transitions.py +91 -0
  156. package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_user_management.py +110 -0
  157. package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_version_management.py +133 -0
  158. package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_watchers.py +41 -0
  159. package/tools/vds-scripts/jira_orchestrator/tests/test_advanced_search.py +164 -0
  160. package/tools/vds-scripts/jira_orchestrator/tests/test_agile.py +256 -0
  161. package/tools/vds-scripts/jira_orchestrator/tests/test_application_properties.py +193 -0
  162. package/tools/vds-scripts/jira_orchestrator/tests/test_backlog.py +91 -0
  163. package/tools/vds-scripts/jira_orchestrator/tests/test_bulk_operations.py +277 -0
  164. package/tools/vds-scripts/jira_orchestrator/tests/test_cli.py +106 -0
  165. package/tools/vds-scripts/jira_orchestrator/tests/test_components.py +106 -0
  166. package/tools/vds-scripts/jira_orchestrator/tests/test_config.py +164 -0
  167. package/tools/vds-scripts/jira_orchestrator/tests/test_dashboard.py +122 -0
  168. package/tools/vds-scripts/jira_orchestrator/tests/test_discover_fields.py +207 -0
  169. package/tools/vds-scripts/jira_orchestrator/tests/test_filter_management.py +333 -0
  170. package/tools/vds-scripts/jira_orchestrator/tests/test_issue_archiving.py +164 -0
  171. package/tools/vds-scripts/jira_orchestrator/tests/test_issue_links.py +257 -0
  172. package/tools/vds-scripts/jira_orchestrator/tests/test_issue_properties.py +171 -0
  173. package/tools/vds-scripts/jira_orchestrator/tests/test_link_types.py +314 -0
  174. package/tools/vds-scripts/jira_orchestrator/tests/test_parse_set.py +37 -0
  175. package/tools/vds-scripts/jira_orchestrator/tests/test_permissions.py +273 -0
  176. package/tools/vds-scripts/jira_orchestrator/tests/test_reindex.py +81 -0
  177. package/tools/vds-scripts/jira_orchestrator/tests/test_remote_links.py +254 -0
  178. package/tools/vds-scripts/jira_orchestrator/tests/test_security_schemes.py +170 -0
  179. package/tools/vds-scripts/jira_orchestrator/tests/test_transitions_changelog.py +114 -0
  180. package/tools/vds-scripts/jira_orchestrator/tests/test_user_management.py +226 -0
  181. package/tools/vds-scripts/jira_orchestrator/tests/test_version_management.py +339 -0
  182. package/tools/vds-scripts/jira_orchestrator/tests/test_watchers.py +101 -0
  183. package/tools/vds-scripts/jira_orchestrator/tests/test_worklog.py +223 -0
  184. package/tools/vds-scripts/jira_orchestrator/uv.lock +738 -0
  185. package/tools/vds-scripts/mcp_server/Dockerfile +34 -0
  186. package/tools/vds-scripts/mcp_server/README.md +140 -0
  187. package/tools/vds-scripts/mcp_server/pyproject.toml +42 -0
  188. package/tools/vds-scripts/mcp_server/src/vds_mcp_server/__init__.py +4 -0
  189. package/tools/vds-scripts/mcp_server/src/vds_mcp_server/config.py +36 -0
  190. package/tools/vds-scripts/mcp_server/src/vds_mcp_server/server.py +66 -0
  191. package/tools/vds-scripts/mcp_server/src/vds_mcp_server/tools/__init__.py +14 -0
  192. package/tools/vds-scripts/mcp_server/src/vds_mcp_server/tools/bitbucket_tools.py +47 -0
  193. package/tools/vds-scripts/mcp_server/src/vds_mcp_server/tools/confluence_tools.py +59 -0
  194. package/tools/vds-scripts/mcp_server/src/vds_mcp_server/tools/git_tools.py +71 -0
  195. package/tools/vds-scripts/mcp_server/src/vds_mcp_server/tools/jira_tools.py +63 -0
  196. package/tools/vds-scripts/mcp_server/tests/__init__.py +2 -0
  197. package/tools/vds-scripts/mcp_server/tests/conftest.py +29 -0
  198. package/tools/vds-scripts/mcp_server/tests/unit/__init__.py +2 -0
  199. package/tools/vds-scripts/mcp_server/tests/unit/test_bitbucket_tools.py +25 -0
  200. package/tools/vds-scripts/mcp_server/tests/unit/test_confluence_tools.py +25 -0
  201. package/tools/vds-scripts/mcp_server/tests/unit/test_git_tools.py +32 -0
  202. package/tools/vds-scripts/mcp_server/tests/unit/test_jira_tools.py +32 -0
  203. package/tools/vds-scripts/mcp_server/tests/verification/__init__.py +2 -0
  204. package/tools/vds-scripts/mcp_server/tests/verification/test_mcp_confluence_tools.py +40 -0
  205. package/tools/vds-scripts/mcp_server/tests/verification/test_mcp_jira_tools.py +37 -0
  206. package/tools/vds-scripts/mcp_server/tests/verification/test_mcp_tool_registration.py +47 -0
  207. package/tools/vds-scripts/mcp_server/uv.lock +1032 -0
  208. package/tools/vds-scripts/mypy.ini +5 -0
  209. package/tools/vds-scripts/pyproject.toml +29 -0
  210. package/tools/vds-scripts/repo-manifest.yaml +273 -0
  211. package/tools/vds-scripts/repo-manifest.yaml.example +25 -0
  212. package/tools/vds-scripts/scripts/BRD-Validation-API.postman_collection.json +706 -0
  213. package/tools/vds-scripts/scripts/BRD-Validation-README.md +308 -0
  214. package/tools/vds-scripts/scripts/README.md +162 -0
  215. package/tools/vds-scripts/scripts/bootstrap_uv.sh +30 -0
  216. package/tools/vds-scripts/scripts/brd-validation-environment.json +51 -0
  217. package/tools/vds-scripts/scripts/brd-validation-test-results.json +13023 -0
  218. package/tools/vds-scripts/scripts/brd_coverage_report.json +276 -0
  219. package/tools/vds-scripts/scripts/create_memory_session.py +35 -0
  220. package/tools/vds-scripts/scripts/deployment/load_docker_images_offline.sh +90 -0
  221. package/tools/vds-scripts/scripts/final_completion_report.md +139 -0
  222. package/tools/vds-scripts/scripts/folder_structure_report.json +321 -0
  223. package/tools/vds-scripts/scripts/generate_completion_report.py +125 -0
  224. package/tools/vds-scripts/scripts/generate_intellij_modules.py +150 -0
  225. package/tools/vds-scripts/scripts/link_integrity_report.json +807 -0
  226. package/tools/vds-scripts/scripts/move_audit_artifact_pages.py +255 -0
  227. package/tools/vds-scripts/scripts/move_audit_artifact_pages_rest.py +165 -0
  228. package/tools/vds-scripts/scripts/move_wrong_dept_pages.py +216 -0
  229. package/tools/vds-scripts/scripts/save_intellij_memories.py +120 -0
  230. package/tools/vds-scripts/scripts/save_memories_to_vds_ai.py +83 -0
  231. package/tools/vds-scripts/scripts/save_memories_vds_style.py +129 -0
  232. package/tools/vds-scripts/scripts/search_intellij_memories.py +50 -0
  233. package/tools/vds-scripts/scripts/setup_intellij_workspace.py +65 -0
  234. package/tools/vds-scripts/scripts/target-state-automation/README.md +89 -0
  235. package/tools/vds-scripts/scripts/target-state-automation/confluence_sync_coordinator.sh +27 -0
  236. package/tools/vds-scripts/scripts/target-state-automation/coordination.sh +114 -0
  237. package/tools/vds-scripts/scripts/target-state-automation/diagram_coordinator.sh +25 -0
  238. package/tools/vds-scripts/scripts/target-state-automation/docs_root.sh +22 -0
  239. package/tools/vds-scripts/scripts/target-state-automation/generate_diagrams.sh +22 -0
  240. package/tools/vds-scripts/scripts/target-state-automation/markdown_coordinator.sh +25 -0
  241. package/tools/vds-scripts/scripts/target-state-automation/progress_dashboard.sh +17 -0
  242. package/tools/vds-scripts/scripts/target-state-automation/schema_coordinator.sh +25 -0
  243. package/tools/vds-scripts/scripts/target-state-automation/sync_confluence.sh +30 -0
  244. package/tools/vds-scripts/scripts/target-state-automation/update_dependencies.sh +19 -0
  245. package/tools/vds-scripts/scripts/target-state-automation/validate_links.sh +86 -0
  246. package/tools/vds-scripts/scripts/target-state-automation/validate_markdown.sh +52 -0
  247. package/tools/vds-scripts/scripts/target-state-automation/validate_schemas.sh +26 -0
  248. package/tools/vds-scripts/scripts/target-state-automation/validate_structure.sh +98 -0
  249. package/tools/vds-scripts/scripts/update_modules_xml.py +190 -0
  250. package/tools/vds-scripts/scripts/uv-workspace-alignment-verification-2026-03-25.md +128 -0
  251. package/tools/vds-scripts/scripts/validate_brd_coverage.py +179 -0
  252. package/tools/vds-scripts/scripts/validate_folder_structure.py +240 -0
  253. package/tools/vds-scripts/scripts/validate_link_integrity.py +272 -0
  254. package/tools/vds-scripts/scripts/vds_sh_helpers.sh +180 -0
  255. package/tools/vds-scripts/scripts/verification/phase2_portable_paths_ubuntu_docker.sh +26 -0
  256. package/tools/vds-scripts/scripts/worktree_uv.sh +48 -0
  257. package/tools/vds-scripts/uv.lock +8 -0
  258. package/tools/vds-scripts/vds_cli/README.md +126 -0
  259. package/tools/vds-scripts/vds_cli/VERIFICATION_REPORT.md +41 -0
  260. package/tools/vds-scripts/vds_cli/pyproject.toml +38 -0
  261. package/tools/vds-scripts/vds_cli/src/vds_cli/__init__.py +3 -0
  262. package/tools/vds-scripts/vds_cli/src/vds_cli/cli.py +173 -0
  263. package/tools/vds-scripts/vds_cli/src/vds_cli/docs_sync.py +1203 -0
  264. package/tools/vds-scripts/vds_cli/src/vds_cli/env.py +41 -0
  265. package/tools/vds-scripts/vds_cli/src/vds_cli/google_sheets_orchestrator/__init__.py +3 -0
  266. package/tools/vds-scripts/vds_cli/src/vds_cli/google_sheets_orchestrator/google_sheets_orchestrator.py +198 -0
  267. package/tools/vds-scripts/vds_cli/src/vds_cli/router.py +93 -0
  268. package/tools/vds-scripts/vds_cli/src/vds_cli/sync_api.py +647 -0
  269. package/tools/vds-scripts/vds_cli/src/vds_cli/sync_service.py +266 -0
  270. package/tools/vds-scripts/vds_cli/tests/__init__.py +2 -0
  271. package/tools/vds-scripts/vds_cli/tests/conftest.py +49 -0
  272. package/tools/vds-scripts/vds_cli/tests/unit/__init__.py +2 -0
  273. package/tools/vds-scripts/vds_cli/tests/unit/test_cli.py +143 -0
  274. package/tools/vds-scripts/vds_cli/tests/unit/test_docs_sync.py +422 -0
  275. package/tools/vds-scripts/vds_cli/tests/unit/test_env.py +51 -0
  276. package/tools/vds-scripts/vds_cli/tests/unit/test_router.py +72 -0
  277. package/tools/vds-scripts/vds_cli/tests/unit/test_sync_api.py +357 -0
  278. package/tools/vds-scripts/vds_cli/tests/unit/test_sync_service.py +160 -0
  279. package/tools/vds-scripts/vds_cli/tests/verification/__init__.py +2 -0
  280. package/tools/vds-scripts/vds_cli/tests/verification/test_bitbucket_real.py +33 -0
  281. package/tools/vds-scripts/vds_cli/tests/verification/test_confluence_real.py +35 -0
  282. package/tools/vds-scripts/vds_cli/tests/verification/test_jira_real.py +41 -0
  283. package/tools/vds-scripts/vds_cli/uv.lock +524 -0
  284. package/tools/vds-scripts/vds_cli_common/README.md +190 -0
  285. package/tools/vds-scripts/vds_cli_common/pyproject.toml +92 -0
  286. package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/__init__.py +34 -0
  287. package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/completers.py +139 -0
  288. package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/context.py +201 -0
  289. package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/env.py +119 -0
  290. package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/errors.py +318 -0
  291. package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/output.py +284 -0
  292. package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/paths.py +78 -0
  293. package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/testing.py +213 -0
  294. package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/version.py +85 -0
  295. package/tools/vds-scripts/vds_cli_common/tests/__init__.py +1 -0
  296. package/tools/vds-scripts/vds_cli_common/tests/test_completers.py +148 -0
  297. package/tools/vds-scripts/vds_cli_common/tests/test_context.py +192 -0
  298. package/tools/vds-scripts/vds_cli_common/tests/test_env.py +102 -0
  299. package/tools/vds-scripts/vds_cli_common/tests/test_errors.py +186 -0
  300. package/tools/vds-scripts/vds_cli_common/tests/test_output.py +229 -0
  301. package/tools/vds-scripts/vds_cli_common/tests/test_paths.py +61 -0
  302. package/tools/vds-scripts/vds_cli_common/tests/test_testing.py +138 -0
  303. package/tools/vds-scripts/vds_cli_common/tests/test_version.py +64 -0
@@ -0,0 +1,1835 @@
1
+ """Confluence tree crawling and on-disk export helpers.
2
+
3
+ This module is intentionally dependency-light (std lib + BeautifulSoup) so the output
4
+ can be used as input for downstream LLM/document analysis pipelines.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import hashlib
10
+ import json
11
+ import re
12
+ import time
13
+ import zipfile
14
+ from collections.abc import Callable
15
+ from concurrent.futures import Future, ThreadPoolExecutor
16
+ from dataclasses import dataclass, field
17
+ from datetime import UTC, datetime, timedelta
18
+ from pathlib import Path
19
+ from typing import Any
20
+ from urllib.parse import parse_qs, unquote, urlparse
21
+
22
+ from bs4 import BeautifulSoup
23
+
24
+ from .http import ConfluenceClient
25
+
26
+ _JIRA_KEY_RE = re.compile(r"\b[A-Z][A-Z0-9]+-\d+\b")
27
+ _MIME_SUFFIX_MAP: dict[str, str] = {
28
+ "application/json": ".json",
29
+ "application/msword": ".doc",
30
+ "application/pdf": ".pdf",
31
+ "application/vnd.jgraph.mxfile": ".drawio",
32
+ "application/vnd.ms-excel": ".xls",
33
+ "application/vnd.ms-powerpoint": ".ppt",
34
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
35
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
36
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
37
+ "image/gif": ".gif",
38
+ "image/jpeg": ".jpg",
39
+ "image/png": ".png",
40
+ "image/svg+xml": ".svg",
41
+ "text/csv": ".csv",
42
+ "text/plain": ".txt",
43
+ }
44
+ def _now_iso() -> str:
45
+ return datetime.now(UTC).isoformat()
46
+
47
+
48
+ def _sha256_bytes(data: bytes) -> str:
49
+ return hashlib.sha256(data).hexdigest()
50
+
51
+
52
+ def _sha256_text(text: str) -> str:
53
+ return _sha256_bytes(text.encode("utf-8"))
54
+
55
+
56
+ def _parse_iso_timestamp(value: str | None) -> datetime | None:
57
+ if not value:
58
+ return None
59
+ cleaned = value.strip()
60
+ if not cleaned:
61
+ return None
62
+ if cleaned.endswith("Z"):
63
+ cleaned = f"{cleaned[:-1]}+00:00"
64
+ try:
65
+ parsed = datetime.fromisoformat(cleaned)
66
+ except ValueError:
67
+ return None
68
+ if parsed.tzinfo is None:
69
+ return parsed.replace(tzinfo=UTC)
70
+ return parsed.astimezone(UTC)
71
+
72
+
73
+ def _safe_filename(name: str) -> str:
74
+ # Preserve unicode for VN titles, but remove path separators + control chars.
75
+ cleaned = name.replace("/", "-").replace("\\", "-").replace("\0", "")
76
+ cleaned = re.sub(r"[\r\n\t]+", " ", cleaned).strip()
77
+ return cleaned or "untitled"
78
+
79
+
80
+ def _normalise_suffix(value: str) -> str:
81
+ cleaned = (value or "").strip().lower()
82
+ if not cleaned:
83
+ return ""
84
+ return cleaned if cleaned.startswith(".") else f".{cleaned}"
85
+
86
+
87
def _guess_attachment_suffix(attachment: dict[str, Any]) -> str | None:
    """Guess a file suffix for an attachment record.

    Preference order:
      1. an explicit file extension under ``extensions``
         (``fileExtension`` / ``file_extension``);
      2. a media type under ``extensions`` (``mediaType`` / ``media_type``);
      3. a media type under ``metadata`` (same keys).

    Media types are translated through ``_MIME_SUFFIX_MAP``. Returns ``None``
    when no suffix can be determined.
    """
    ext_info = attachment.get("extensions") or {}
    meta_info = attachment.get("metadata") or {}

    mime = None
    if isinstance(ext_info, dict):
        declared = ext_info.get("fileExtension") or ext_info.get("file_extension")
        if isinstance(declared, str) and declared.strip():
            # An explicit extension wins over any media-type lookup.
            return _normalise_suffix(declared) or None
        mime = ext_info.get("mediaType") or ext_info.get("media_type")

    if not mime and isinstance(meta_info, dict):
        mime = meta_info.get("mediaType") or meta_info.get("media_type")

    if not isinstance(mime, str) or not mime.strip():
        return None
    return _MIME_SUFFIX_MAP.get(mime.strip().lower()) or None
108
+
109
+
110
def _safe_attachment_filename(attachment_id: str, title: str, *, fallback_suffix: str | None = None) -> str:
    """Build a stable on-disk filename for an attachment.

    The result has the form ``<attachment_id>_<stem><suffix>``.

    Args:
        attachment_id: Attachment identifier; always kept intact as the prefix
            so filenames stay stable.
        title: Human-readable attachment title; sanitised via ``_safe_filename``.
        fallback_suffix: Suffix to use when the title has none (normalised via
            ``_normalise_suffix``).

    Returns:
        The filename. The stem is capped at 180 characters and then trimmed
        further, if needed, so the UTF-8 encoded name fits in 255 bytes.
    """
    safe_title = _safe_filename(title)
    path = Path(safe_title)
    stem = path.stem or "attachment"
    suffix = path.suffix
    if not suffix and fallback_suffix:
        suffix = _normalise_suffix(fallback_suffix)

    # First cap the human-readable part by character count.
    max_stem_length = 180
    if len(stem) > max_stem_length:
        stem = stem[:max_stem_length].rstrip()

    # Typical filesystem limits are 255 *bytes*, not characters. Non-ASCII
    # titles (e.g. Vietnamese) take 2-4 bytes per character in UTF-8, so the
    # character cap alone can still overflow. Trim the stem to the remaining
    # byte budget; ASCII-only names are unaffected.
    max_name_bytes = 255
    prefix = f"{attachment_id}_"
    budget = (
        max_name_bytes
        - len(prefix.encode("utf-8", "surrogatepass"))
        - len(suffix.encode("utf-8", "surrogatepass"))
    )
    encoded_stem = stem.encode("utf-8", "surrogatepass")
    if len(encoded_stem) > budget:
        # errors="ignore" drops a multi-byte character cut in half by the slice.
        stem = encoded_stem[: max(budget, 0)].decode("utf-8", errors="ignore").rstrip()

    return f"{prefix}{stem}{suffix}"
125
+
126
+
127
+ def _escape_cql(value: str) -> str:
128
+ return value.replace("\\", "\\\\").replace('"', '\\"')
129
+
130
+
131
def resolve_page_ref(page_ref: str, http: ConfluenceClient) -> str:
    """Resolve a page reference to a numeric page ID.

    Supported forms:
    - numeric ID: "74014717"
    - viewpage: ".../pages/viewpage.action?pageId=74014717"
    - display URL: ".../display/TDOV/13.02.10.+BRD"

    Args:
        page_ref: Page ID or URL; surrounding whitespace is ignored.
        http: Client used to run a CQL search. It is only called for display
            URLs, where the ID has to be looked up by space and title.

    Returns:
        The page ID as a string of digits.

    Raises:
        ValueError: If ``page_ref`` is empty, is in an unsupported form, or is
            a display URL for which the search yields no numeric ID.
    """
    page_ref = (page_ref or "").strip()
    if not page_ref:
        raise ValueError("page_ref is empty")

    if page_ref.isdigit():
        return page_ref

    parsed = urlparse(page_ref)
    if parsed.query:
        qs = parse_qs(parsed.query)
        page_ids = qs.get("pageId") or qs.get("pageid")
        if page_ids and page_ids[0].isdigit():
            return page_ids[0]

    # /display/<SPACE>/<TITLE>
    match = re.search(r"/display/([^/]+)/(.+)$", parsed.path or "")
    if match:
        space = unquote(match.group(1))
        # Confluence "display" URLs often encode spaces as '+'. Convert '+'
        # BEFORE percent-decoding so an encoded literal plus ("%2B") survives
        # as "+" instead of being turned into a space.
        title = unquote(match.group(2).replace("+", " ")).strip()
        cql = f'space="{_escape_cql(space)}" and title="{_escape_cql(title)}" and type=page'
        result = http.search_cql(cql, limit=5)
        results = result.get("results", []) if isinstance(result, dict) else []
        for item in results:
            if not isinstance(item, dict):
                continue
            # Confluence CQL results usually wrap content under `content`.
            raw_id = item.get("id")
            if raw_id is None and isinstance(item.get("content"), dict):
                raw_id = item["content"].get("id")
            if raw_id is not None and str(raw_id).isdigit():
                return str(raw_id)

        raise ValueError(f"Unable to resolve display URL to page ID (space={space!r}, title={title!r})")

    raise ValueError(f"Unsupported page reference: {page_ref}")
175
+
176
+
177
+ @dataclass(slots=True)
178
+ class CrawlOptions:
179
+ max_depth: int = 3
180
+ include_attachments: bool = True
181
+ emit_corpus: bool = False
182
+ emit_link_graph: bool = False
183
+ emit_tables: bool = False
184
+ emit_mentions: bool = False
185
+ body_format: str = "view"
186
+ chunk_max_chars: int | None = 4000
187
+ resume: bool = False
188
+ max_nodes: int | None = 500
189
+ bundle: bool = False
190
+ page_concurrency: int = 5
191
+ staged_mode: str = "off"
192
+ incremental_cql: bool = True
193
+ incremental_overlap_minutes: int = 5
194
+ full_reconciliation_interval_days: int = 7
195
+ rate_limit_budget: int = 50_000
196
+ runtime_event_sink: Callable[[str, dict[str, Any]], None] | None = None
197
+
198
+
199
+ def _emit_runtime_event(options: CrawlOptions, event: str, **payload: Any) -> None:
200
+ if not callable(options.runtime_event_sink):
201
+ return
202
+ try:
203
+ options.runtime_event_sink(event, payload)
204
+ except Exception:
205
+ return
206
+
207
+
208
+ def _create_bundle_zip(output_dir: Path, root_page_id: str, manifest_path: Path) -> Path:
209
+ """Create a zip bundle of all crawl artifacts for offline handoff (TSK-189)."""
210
+ timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
211
+ bundle_name = f"crawl_bundle_{root_page_id}_{timestamp}.zip"
212
+ bundle_path = output_dir / bundle_name
213
+
214
+ with zipfile.ZipFile(bundle_path, "w", zipfile.ZIP_DEFLATED) as zf:
215
+ # Add all HTML files
216
+ for html_file in output_dir.glob("*.html"):
217
+ zf.write(html_file, html_file.name)
218
+
219
+ # Add attachments directory
220
+ attachments_dir = output_dir / "attachments"
221
+ if attachments_dir.exists():
222
+ for att_file in attachments_dir.iterdir():
223
+ if att_file.is_file():
224
+ zf.write(att_file, f"attachments/{att_file.name}")
225
+
226
+ # Add artifact files
227
+ artifact_files = [
228
+ "corpus.jsonl",
229
+ "link-graph.json",
230
+ "tables.jsonl",
231
+ "mentions.jsonl",
232
+ "crawl_state.json",
233
+ ]
234
+ for artifact_name in artifact_files:
235
+ artifact_path = output_dir / artifact_name
236
+ if artifact_path.exists():
237
+ zf.write(artifact_path, artifact_name)
238
+
239
+ # Add manifest
240
+ if manifest_path.exists():
241
+ zf.write(manifest_path, manifest_path.name)
242
+
243
+ return bundle_path
244
+
245
+
246
+ @dataclass(slots=True)
247
+ class CrawlState:
248
+ """Persistent crawl state for resume/incremental crawling (TSK-188)."""
249
+
250
+ root_page_id: str
251
+ body_format: str
252
+ visited_pages: dict[str, dict[str, Any]] # page_id -> {version, html_sha256, visited_at}
253
+ pending_queue: list[tuple[str, str | None, int]] # [(page_id, parent_id, depth), ...]
254
+ started_at: str
255
+ last_updated_at: str
256
+ completed: bool = False
257
+ last_sync_timestamp: str | None = None
258
+ last_full_reconciliation_timestamp: str | None = None
259
+ run_diagnostics: dict[str, Any] = field(default_factory=dict)
260
+
261
+ def to_dict(self) -> dict[str, Any]:
262
+ """Serialize state to JSON-compatible dict."""
263
+ return {
264
+ "root_page_id": self.root_page_id,
265
+ "body_format": self.body_format,
266
+ "visited_pages": self.visited_pages,
267
+ "pending_queue": [list(item) for item in self.pending_queue],
268
+ "started_at": self.started_at,
269
+ "last_updated_at": self.last_updated_at,
270
+ "completed": self.completed,
271
+ "last_sync_timestamp": self.last_sync_timestamp,
272
+ "last_full_reconciliation_timestamp": self.last_full_reconciliation_timestamp,
273
+ "run_diagnostics": self.run_diagnostics,
274
+ }
275
+
276
+ @classmethod
277
+ def from_dict(cls, data: dict[str, Any]) -> CrawlState:
278
+ """Deserialize state from JSON-compatible dict."""
279
+ return cls(
280
+ root_page_id=data["root_page_id"],
281
+ body_format=data.get("body_format", "view"),
282
+ visited_pages=data.get("visited_pages", {}),
283
+ pending_queue=[tuple(item) for item in data.get("pending_queue", [])],
284
+ started_at=data.get("started_at", _now_iso()),
285
+ last_updated_at=data.get("last_updated_at", _now_iso()),
286
+ completed=data.get("completed", False),
287
+ last_sync_timestamp=data.get("last_sync_timestamp"),
288
+ last_full_reconciliation_timestamp=data.get("last_full_reconciliation_timestamp"),
289
+ run_diagnostics=data.get("run_diagnostics", {}),
290
+ )
291
+
292
+ def is_page_unchanged(self, page_id: str, version: int | None, html_sha256: str | None) -> bool:
293
+ """Check if a page is unchanged since last crawl."""
294
+ if page_id not in self.visited_pages:
295
+ return False
296
+ prev = self.visited_pages[page_id]
297
+ prev_version = prev.get("version")
298
+ prev_hash = prev.get("html_sha256")
299
+ # If both version and hash match, page is unchanged
300
+ if prev_version is not None and version is not None and prev_version == version:
301
+ return True
302
+ if prev_hash is not None and html_sha256 is not None and prev_hash == html_sha256:
303
+ return True
304
+ return False
305
+
306
+ def record_page(self, page_id: str, version: int | None, html_sha256: str | None) -> None:
307
+ """Record a visited page."""
308
+ self.visited_pages[page_id] = {
309
+ "version": version,
310
+ "html_sha256": html_sha256,
311
+ "visited_at": _now_iso(),
312
+ }
313
+ self.last_updated_at = _now_iso()
314
+
315
+
316
+ @dataclass(slots=True)
317
+ class _FetchedPage:
318
+ page_id: str
319
+ parent_id: str | None
320
+ depth: int
321
+ status: str # fetched|unchanged|error
322
+ page_payload: dict[str, Any] | None = None
323
+ version_payload: dict[str, Any] | None = None
324
+ children: list[tuple[str, str | None, int]] = field(default_factory=list)
325
+ error: str | None = None
326
+
327
+
328
+ def _load_crawl_state(state_path: Path) -> CrawlState | None:
329
+ """Load crawl state from disk if it exists."""
330
+ if not state_path.exists():
331
+ return None
332
+ try:
333
+ data = json.loads(state_path.read_text(encoding="utf-8"))
334
+ return CrawlState.from_dict(data)
335
+ except (json.JSONDecodeError, KeyError, TypeError):
336
+ return None
337
+
338
+
339
def _save_crawl_state(state: CrawlState, state_path: Path) -> None:
    """Persist crawl state to disk.

    Refreshes ``state.last_updated_at`` and then writes the serialized state
    to a uniquely named temporary sibling file, which is renamed over
    ``state_path``. An interrupted write therefore cannot leave a truncated
    state file behind, which would otherwise make the next resume start from
    scratch.
    """
    import uuid

    state.last_updated_at = _now_iso()
    serialized = json.dumps(state.to_dict(), ensure_ascii=False, indent=2)

    # Unique name so overlapping saves never share a temp file.
    tmp_path = state_path.with_name(f"{state_path.name}.{uuid.uuid4().hex}.tmp")
    try:
        tmp_path.write_text(serialized, encoding="utf-8")
        tmp_path.replace(state_path)
    finally:
        # No-op after a successful rename; cleans up after a failed write.
        tmp_path.unlink(missing_ok=True)
343
+
344
+
345
+ def _normalise_body_format(value: str | None) -> str:
346
+ normalised = (value or "view").strip().lower().replace("-", "_")
347
+ if normalised in {"view", "storage", "export_view"}:
348
+ return normalised
349
+ raise ValueError(f"Unsupported body format: {value!r} (expected: view|export_view|storage)")
350
+
351
+
352
+ def _normalise_page_concurrency(value: int | None) -> int:
353
+ if value is None:
354
+ return 5
355
+ return max(1, int(value))
356
+
357
+
358
+ def _normalise_staged_mode(value: str | None) -> str:
359
+ normalised = (value or "off").strip().lower().replace("-", "_")
360
+ if normalised in {"off", "structure_first"}:
361
+ return normalised
362
+ raise ValueError(f"Unsupported staged mode: {value!r} (expected: off|structure_first)")
363
+
364
+
365
+ def _split_text_preserving_words(text: str, max_chars: int) -> list[str]:
366
+ """Split a long string into <= max_chars chunks, preferring whitespace boundaries."""
367
+ stripped = (text or "").strip()
368
+ if not stripped:
369
+ return []
370
+ if max_chars <= 0:
371
+ return [stripped]
372
+ if len(stripped) <= max_chars:
373
+ return [stripped]
374
+
375
+ words = stripped.split()
376
+ parts: list[str] = []
377
+ current: list[str] = []
378
+ current_len = 0
379
+
380
+ def flush() -> None:
381
+ nonlocal current, current_len
382
+ if not current:
383
+ return
384
+ parts.append(" ".join(current))
385
+ current = []
386
+ current_len = 0
387
+
388
+ for word in words:
389
+ if not current:
390
+ current = [word]
391
+ current_len = len(word)
392
+ elif current_len + 1 + len(word) <= max_chars:
393
+ current.append(word)
394
+ current_len += 1 + len(word)
395
+ else:
396
+ flush()
397
+ if len(word) <= max_chars:
398
+ current = [word]
399
+ current_len = len(word)
400
+ else:
401
+ # Pathological case: a single token exceeds max_chars; hard-split.
402
+ for offset in range(0, len(word), max_chars):
403
+ parts.append(word[offset : offset + max_chars])
404
+
405
+ flush()
406
+ return parts
407
+
408
+
409
+ def _pack_segments(segments: list[str], max_chars: int) -> list[str]:
410
+ if max_chars <= 0:
411
+ joined = "\n".join([seg for seg in segments if seg.strip()]).strip()
412
+ return [joined] if joined else []
413
+
414
+ packed: list[str] = []
415
+ current: list[str] = []
416
+ current_len = 0
417
+
418
+ def flush() -> None:
419
+ nonlocal current, current_len
420
+ if not current:
421
+ return
422
+ packed.append("\n".join(current).strip())
423
+ current = []
424
+ current_len = 0
425
+
426
+ for seg in segments:
427
+ seg = (seg or "").strip()
428
+ if not seg:
429
+ continue
430
+ sep_len = 1 if current else 0 # newline
431
+ if not current or current_len + sep_len + len(seg) <= max_chars:
432
+ if current:
433
+ current_len += 1
434
+ current.append(seg)
435
+ current_len += len(seg)
436
+ continue
437
+
438
+ flush()
439
+ current = [seg]
440
+ current_len = len(seg)
441
+
442
+ flush()
443
+ return [chunk for chunk in packed if chunk.strip()]
444
+
445
+
446
+ def _extract_target_page_id(target_url: str | None) -> str | None:
447
+ if not target_url:
448
+ return None
449
+
450
+ parsed = urlparse(target_url)
451
+ if parsed.query:
452
+ qs = parse_qs(parsed.query)
453
+ for key, values in qs.items():
454
+ if key.lower() != "pageid" or not values:
455
+ continue
456
+ candidate = values[0]
457
+ if isinstance(candidate, str) and candidate.isdigit():
458
+ return candidate
459
+
460
+ match = re.search(r"/pages/(\d+)(?:/|$)", parsed.path or "")
461
+ if match:
462
+ return match.group(1)
463
+ return None
464
+
465
+
466
+ def _classify_target_type(target_url: str | None) -> str | None:
467
+ if not target_url:
468
+ return None
469
+
470
+ url = target_url.strip()
471
+ if not url:
472
+ return None
473
+ if url.startswith("mailto:"):
474
+ return "email"
475
+
476
+ parsed = urlparse(url)
477
+ host = (parsed.netloc or "").lower()
478
+ path = parsed.path or ""
479
+
480
+ if "/display/" in path or "viewpage.action" in path or re.search(r"/pages/\d+(?:/|$)", path):
481
+ return "confluence_page"
482
+ if "jira" in host or "/browse/" in path:
483
+ return "jira"
484
+ if "bitbucket" in host or "/scm/" in path:
485
+ return "bitbucket"
486
+ if parsed.scheme in {"http", "https"}:
487
+ return "external"
488
+ return "external"
489
+
490
+
491
+ def _extract_jira_key_from_url(url: str) -> str | None:
492
+ parsed = urlparse(url)
493
+ match = re.search(r"/browse/([A-Z][A-Z0-9]+-\d+)", parsed.path or "")
494
+ if match:
495
+ return match.group(1)
496
+ return None
497
+
498
+
499
def _extract_links(html: str, *, base_url: str | None = None) -> list[dict[str, Any]]:
    """Collect anchor targets from an HTML fragment.

    Without ``base_url`` the result is the raw list of
    ``{"href": ..., "text": ...}`` entries for every ``<a>`` with a non-blank
    href. With ``base_url`` the result is instead a list of
    ``{"url": ..., "text": ...}`` entries: hrefs starting with ``#`` are
    skipped, hrefs starting with ``/`` are prefixed with the base (trailing
    slash removed), everything else is kept as written, and URL fragments are
    removed. Note the different key name between the two modes.
    """
    soup = BeautifulSoup(html, "html.parser")

    raw_links: list[dict[str, Any]] = []
    for anchor in soup.find_all("a"):
        target = anchor.get("href")
        target = str(target).strip() if target else ""
        if not target:
            continue
        label = anchor.get_text(" ", strip=True) or None
        raw_links.append({"href": target, "text": label})

    if not base_url:
        return raw_links

    root = base_url.rstrip("/")
    resolved: list[dict[str, Any]] = []
    for link in raw_links:
        target = link["href"]
        if target.startswith("#"):
            # Same-page anchors carry no target of their own.
            continue
        # Root-relative hrefs get the base prepended; absolute http(s) URLs and
        # other relative forms are kept as-is (best effort).
        absolute = f"{root}{target}" if target.startswith("/") else target
        # Drop fragments for canonicalization
        canonical = urlparse(absolute)._replace(fragment="").geturl()
        resolved.append({"url": canonical, "text": link.get("text")})
    return resolved
532
+
533
+
534
def _chunk_html_by_headings(html: str, *, max_chars: int | None = None) -> list[dict[str, Any]]:
    """Chunk HTML into heading-scoped blocks for JSONL corpus output.

    Walks ``h1``-``h6``, ``p``, ``li`` and ``pre`` elements in document order.
    Text is buffered until the next heading (or the end of the document) and
    then emitted as chunk dicts with ``heading_path``, ``text`` and
    ``ordinal``. With a positive integer ``max_chars`` the buffered text is
    split and packed into several chunks per section; otherwise each section
    yields at most one chunk with ordinal 0.
    """
    soup = BeautifulSoup(html, "html.parser")
    limit = max_chars if isinstance(max_chars, int) and max_chars > 0 else None

    chunks: list[dict[str, Any]] = []
    heading_path: list[str] = []
    pending: list[str] = []

    def emit_pending() -> None:
        """Turn the buffered lines into chunk(s) under the current heading path."""
        lines = [entry for entry in pending if entry.strip()]
        pending.clear()
        if not lines:
            return

        segments: list[str] = []
        for entry in lines:
            if limit and len(entry) > limit:
                segments.extend(_split_text_preserving_words(entry, limit))
            else:
                segments.append(entry.strip())

        if limit is None:
            body = "\n".join(seg for seg in segments if seg.strip()).strip()
            if body:
                chunks.append({"heading_path": list(heading_path), "text": body, "ordinal": 0})
            return

        for ordinal, body in enumerate(_pack_segments(segments, limit)):
            chunks.append({"heading_path": list(heading_path), "text": body, "ordinal": ordinal})

    for node in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "pre"]):
        tag = node.name.lower()
        content = node.get_text(" ", strip=True)

        if tag.startswith("h"):
            # A heading closes the previous section even when it has no text.
            emit_pending()
            if content:
                level = int(tag[1])
                # Keep only ancestors shallower than this heading, then push it.
                del heading_path[max(0, level - 1) :]
                heading_path.append(content)
            continue

        if content:
            pending.append(content)

    emit_pending()
    return chunks
585
+
586
+
587
def _extract_tables(html: str) -> list[dict[str, Any]]:
    """Extract every ``<table>`` in the HTML as a dict of headers and rows.

    Headers come from the first row of ``<thead>`` when present; otherwise the
    first collected row is used as headers if the table's first ``<tr>``
    contains a ``<th>``. In both cases the header row is excluded from
    ``rows``.

    Each result has ``table_index``, ``headers`` (``None`` when no header row
    was identified), ``rows``, ``row_count`` and ``col_count`` (the widest
    row).

    NOTE(review): empty cells are dropped from headers and rows, so columns of
    a row with blank cells shift left, and ``find_all("tr")`` also picks up
    rows of nested tables. Both are kept as-is here; confirm whether
    downstream consumers rely on that.
    """
    soup = BeautifulSoup(html, "html.parser")
    extracted: list[dict[str, Any]] = []

    for index, table in enumerate(soup.find_all("table")):
        headers: list[str] | None = None
        header_row = None
        rows: list[list[str]] = []

        # Prefer explicit <thead> headers when present.
        thead = table.find("thead")
        if thead is not None:
            header_row = thead.find("tr")
            if header_row is not None:
                header_cells = header_row.find_all(["th", "td"])
                header_texts = [cell.get_text(" ", strip=True) for cell in header_cells]
                headers = [h for h in header_texts if h]

        # Extract all row cells.
        for tr in table.find_all("tr"):
            if header_row is not None and tr is header_row:
                # Already consumed as headers; without this the header row
                # would be reported a second time as the first data row.
                continue
            cells = tr.find_all(["th", "td"])
            if not cells:
                continue
            row = [cell.get_text(" ", strip=True) for cell in cells]
            row = [cell for cell in row if cell]
            if row:
                rows.append(row)

        # If we did not find a <thead>, treat the first row with <th> as headers.
        if headers is None and rows:
            first_tr = table.find("tr")
            if first_tr is not None and first_tr.find("th") is not None:
                headers = rows[0]
                rows = rows[1:]

        extracted.append(
            {
                "table_index": index,
                "headers": headers,
                "rows": rows,
                "row_count": len(rows),
                "col_count": max((len(row) for row in rows), default=0),
            }
        )

    return extracted
632
+
633
+
634
def _extract_mentions(html: str, *, base_url: str | None = None) -> list[dict[str, Any]]:
    """Extract link-based and plain-text mentions from an HTML fragment.

    Two sources are combined:

    1. Every link returned by ``_extract_links`` becomes a mention whose type
       comes from ``_classify_target_type`` and whose value is the page ID,
       Jira key or e-mail address when one can be extracted, else the URL.
    2. Every distinct match of ``_JIRA_KEY_RE`` in the visible text becomes a
       ``jira_key`` mention without a URL.

    The result is de-duplicated on ``(type, value, url)`` and sorted by the
    same triple, so the output order is deterministic.
    """
    soup = BeautifulSoup(html, "html.parser")
    extracted: list[dict[str, Any]] = []

    # 1) Link-based mentions.
    for link in _extract_links(html, base_url=base_url):
        # _extract_links keys its entries by "url" only when base_url is
        # given; without it the entries are keyed by "href". Read both so that
        # links are not silently dropped when no base URL is available.
        url = str(link.get("url") or link.get("href") or "").strip()
        if not url or url.startswith("#"):
            # Same-page anchors are not mentions (the base_url mode already
            # filters them out).
            continue

        mention_type = _classify_target_type(url)
        if mention_type == "confluence_page":
            value = _extract_target_page_id(url) or url
        elif mention_type == "jira":
            value = _extract_jira_key_from_url(url) or url
        elif mention_type == "email":
            value = url.removeprefix("mailto:").strip() or url
        else:
            value = url

        extracted.append(
            {
                "type": mention_type or "external",
                "value": value,
                "url": url,
                "anchor_text": link.get("text"),
                "source": "link",
            }
        )

    # 2) Plain-text Jira keys (best-effort).
    plain_text = soup.get_text(" ", strip=True)
    for key in sorted(set(_JIRA_KEY_RE.findall(plain_text or ""))):
        extracted.append(
            {
                "type": "jira_key",
                "value": key,
                "url": None,
                "anchor_text": None,
                "source": "text",
            }
        )

    # Deduplicate mentions by stable key and keep deterministic ordering.
    seen: set[tuple[str, str, str | None]] = set()
    unique: list[dict[str, Any]] = []
    for entry in extracted:
        mention_type = str(entry.get("type") or "").strip() or "external"
        value = str(entry.get("value") or "").strip()
        url = entry.get("url")
        url = str(url).strip() if isinstance(url, str) else None
        if not value:
            continue
        dedupe_key = (mention_type, value, url)
        if dedupe_key in seen:
            continue
        seen.add(dedupe_key)
        unique.append({**entry, "type": mention_type, "value": value, "url": url})

    unique.sort(key=lambda m: (m.get("type") or "", m.get("value") or "", m.get("url") or ""))
    return unique
695
+
696
+
697
+ def crawl_tree_to_disk(
698
+ *,
699
+ http: ConfluenceClient,
700
+ root_ref: str,
701
+ output_dir: Path,
702
+ options: CrawlOptions,
703
+ manifest_path: Path | None = None,
704
+ ) -> dict[str, Any]:
705
+ """Crawl a Confluence page tree and download content/attachments to disk.
706
+
707
+ Supports resume/incremental crawling (TSK-188):
708
+ - resume=True: Load state from previous run, skip unchanged pages
709
+ - max_nodes: Limit total pages processed (safety for large trees)
710
+ """
711
+ body_format = _normalise_body_format(options.body_format)
712
+ page_concurrency = _normalise_page_concurrency(options.page_concurrency)
713
+ staged_mode = _normalise_staged_mode(options.staged_mode)
714
+ use_structure_first = staged_mode == "structure_first"
715
+ chunk_max_chars = options.chunk_max_chars if isinstance(options.chunk_max_chars, int) else None
716
+ resolved_root_id = resolve_page_ref(root_ref, http)
717
+ output_dir = output_dir.resolve()
718
+ output_dir.mkdir(parents=True, exist_ok=True)
719
+ attachments_dir = output_dir / "attachments"
720
+ attachments_dir.mkdir(exist_ok=True)
721
+ resolved_manifest_path = manifest_path or (output_dir / "crawl_manifest.json")
722
+
723
+ def _meta_from_existing_html(
724
+ *,
725
+ page_id: str,
726
+ parent_id: str | None,
727
+ depth: int,
728
+ page_payload: dict[str, Any] | None,
729
+ ) -> dict[str, Any]:
730
+ html_candidates = sorted(output_dir.glob(f"{page_id}_*.html"))
731
+ html_path = html_candidates[0] if html_candidates else None
732
+ title = None
733
+ page_url = None
734
+ page_version = None
735
+ page_updated = None
736
+ space_key = None
737
+ if isinstance(page_payload, dict):
738
+ raw_title = page_payload.get("title")
739
+ if raw_title:
740
+ title = str(raw_title)
741
+ links = page_payload.get("_links")
742
+ if isinstance(links, dict):
743
+ base_url = str(links.get("base") or "")
744
+ web_ui = str(links.get("webui") or "")
745
+ page_url = f"{base_url}{web_ui}" if base_url and web_ui else None
746
+ version = page_payload.get("version")
747
+ if isinstance(version, dict):
748
+ page_version = version.get("number")
749
+ page_updated = version.get("when")
750
+ space = page_payload.get("space")
751
+ if isinstance(space, dict):
752
+ raw_space = space.get("key")
753
+ if raw_space:
754
+ space_key = str(raw_space)
755
+ if title is None and html_path is not None:
756
+ title = html_path.stem.split("_", 1)[1] if "_" in html_path.stem else html_path.stem
757
+
758
+ meta: dict[str, Any] = {
759
+ "page_id": page_id,
760
+ "title": title,
761
+ "page_url": page_url,
762
+ "parent_id": parent_id,
763
+ "depth": depth,
764
+ "status": "downloaded" if html_path is not None else "unchanged",
765
+ "html_path": str(html_path) if html_path is not None else None,
766
+ "html_sha256": None,
767
+ "html_size_bytes": None,
768
+ "source_format": "html-view",
769
+ "space_key": space_key,
770
+ "page_version": page_version,
771
+ "page_updated": page_updated,
772
+ "ancestor_ids": [],
773
+ "ancestor_titles": [],
774
+ "error": None,
775
+ }
776
+ if html_path is not None and html_path.exists():
777
+ html_bytes = html_path.read_bytes()
778
+ meta["html_sha256"] = _sha256_bytes(html_bytes)
779
+ meta["html_size_bytes"] = html_path.stat().st_size
780
+ return meta
781
+
782
+ previous_page_meta_by_id: dict[str, dict[str, Any]] = {}
783
+ previous_attachments_by_page: dict[str, list[dict[str, Any]]] = {}
784
+ if options.resume and resolved_manifest_path.exists():
785
+ try:
786
+ loaded = json.loads(resolved_manifest_path.read_text(encoding="utf-8"))
787
+ if isinstance(loaded, dict):
788
+ root_meta = loaded.get("page")
789
+ if isinstance(root_meta, dict):
790
+ root_id = root_meta.get("page_id")
791
+ if root_id:
792
+ previous_page_meta_by_id[str(root_id)] = root_meta
793
+ raw_children = loaded.get("child_pages")
794
+ if isinstance(raw_children, list):
795
+ for child in raw_children:
796
+ if not isinstance(child, dict):
797
+ continue
798
+ child_id = child.get("page_id")
799
+ if child_id:
800
+ previous_page_meta_by_id[str(child_id)] = child
801
+ raw_attachments = loaded.get("attachments")
802
+ if isinstance(raw_attachments, list):
803
+ for attachment in raw_attachments:
804
+ if not isinstance(attachment, dict):
805
+ continue
806
+ page_id = attachment.get("page_id")
807
+ if page_id is None:
808
+ continue
809
+ previous_attachments_by_page.setdefault(str(page_id), []).append(attachment)
810
+ except Exception:
811
+ previous_page_meta_by_id = {}
812
+ previous_attachments_by_page = {}
813
+
814
+ previous_corpus_chunks_by_page: dict[str, list[dict[str, Any]]] = {}
815
+ if options.resume and options.emit_corpus:
816
+ previous_corpus_path = output_dir / "corpus.jsonl"
817
+ if previous_corpus_path.exists():
818
+ try:
819
+ with previous_corpus_path.open("r", encoding="utf-8") as previous_corpus_fp:
820
+ for raw_line in previous_corpus_fp:
821
+ line = raw_line.strip()
822
+ if not line:
823
+ continue
824
+ try:
825
+ payload = json.loads(line)
826
+ except json.JSONDecodeError:
827
+ continue
828
+ if not isinstance(payload, dict):
829
+ continue
830
+ page_id = str(payload.get("page_id") or "").strip()
831
+ if not page_id:
832
+ continue
833
+ previous_corpus_chunks_by_page.setdefault(page_id, []).append(payload)
834
+ except Exception:
835
+ previous_corpus_chunks_by_page = {}
836
+
837
+ # State file for resume support (TSK-188)
838
+ state_path = output_dir / "crawl_state.json"
839
+ crawl_state: CrawlState | None = None
840
+ skipped_unchanged: int = 0
841
+ skipped_already_visited: int = 0
842
+ nodes_processed: int = 0
843
+ pages_fetched: int = 0
844
+ pages_failed: int = 0
845
+ termination_reason = "queue_exhausted"
846
+ unchanged_page_ids: set[str] = set()
847
+
848
+ # Track previously visited pages for skip-unchanged logic (separate from current run)
849
+ previous_visited: dict[str, dict[str, Any]] = {}
850
+
851
+ if options.resume:
852
+ crawl_state = _load_crawl_state(state_path)
853
+ if crawl_state is not None:
854
+ # Validate state matches current crawl parameters
855
+ if crawl_state.root_page_id != resolved_root_id:
856
+ # Root changed, start fresh
857
+ crawl_state = None
858
+ elif crawl_state.body_format != body_format:
859
+ # Format changed, start fresh
860
+ crawl_state = None
861
+ elif crawl_state.completed:
862
+ # Previous crawl completed - keep visited_pages for skip-unchanged checks
863
+ # but reset queue to re-crawl from root
864
+ previous_visited = dict(crawl_state.visited_pages)
865
+ previous_sync_timestamp = crawl_state.last_sync_timestamp
866
+ previous_full_reconciliation_timestamp = crawl_state.last_full_reconciliation_timestamp
867
+ crawl_state = CrawlState(
868
+ root_page_id=resolved_root_id,
869
+ body_format=body_format,
870
+ visited_pages=previous_visited, # Keep for version checks
871
+ pending_queue=[(resolved_root_id, None, 0)],
872
+ started_at=_now_iso(),
873
+ last_updated_at=_now_iso(),
874
+ completed=False,
875
+ last_sync_timestamp=previous_sync_timestamp,
876
+ last_full_reconciliation_timestamp=previous_full_reconciliation_timestamp,
877
+ )
878
+ else:
879
+ # Incomplete crawl - restore visited pages for skip logic
880
+ previous_visited = dict(crawl_state.visited_pages)
881
+
882
+ # Initialize state if needed
883
+ if crawl_state is None:
884
+ crawl_state = CrawlState(
885
+ root_page_id=resolved_root_id,
886
+ body_format=body_format,
887
+ visited_pages={},
888
+ pending_queue=[(resolved_root_id, None, 0)],
889
+ started_at=_now_iso(),
890
+ last_updated_at=_now_iso(),
891
+ completed=False,
892
+ )
893
+
894
+ # For resume mode, track pages visited in current session to avoid re-processing
895
+ visited_pages: set[str] = set()
896
+ downloaded_attachments: set[str] = set()
897
+
898
+ queue: list[tuple[str, str | None, int]] = list(crawl_state.pending_queue)
899
+ discovered_page_ids: set[str] = {resolved_root_id}
900
+ for queued_page_id, _, _ in queue:
901
+ discovered_page_ids.add(queued_page_id)
902
+
903
+ root_meta: dict[str, Any] | None = None
904
+ child_pages: list[dict[str, Any]] = []
905
+ attachments: list[dict[str, Any]] = []
906
+ link_edges: list[dict[str, Any]] = []
907
+ corpus_chunks: list[dict[str, Any]] = []
908
+ tables: list[dict[str, Any]] = []
909
+ mentions: list[dict[str, Any]] = []
910
+ telemetry_getter = getattr(http, "get_runtime_telemetry", None)
911
+ configure_rate_limit_budget = getattr(http, "configure_rate_limit_budget", None)
912
+ if callable(configure_rate_limit_budget):
913
+ try:
914
+ configure_rate_limit_budget(int(options.rate_limit_budget))
915
+ except Exception:
916
+ pass
917
+
918
+ def _children_for(page_id: str, depth: int) -> list[tuple[str, str | None, int]]:
919
+ if options.max_depth >= 0 and depth >= options.max_depth:
920
+ return []
921
+ try:
922
+ child_payload = http.get_child_pages(page_id)
923
+ except Exception:
924
+ return []
925
+ collected: list[tuple[str, str | None, int]] = []
926
+ for child in child_payload:
927
+ if not isinstance(child, dict):
928
+ continue
929
+ child_id = child.get("id")
930
+ if child_id is None:
931
+ continue
932
+ collected.append((str(child_id), page_id, depth + 1))
933
+ return collected
934
+
935
+ body_expands: list[str]
936
+ if body_format == "export_view":
937
+ body_expands = ["body.export_view", "body.view", "body.storage"]
938
+ elif body_format == "storage":
939
+ body_expands = ["body.storage", "body.view"]
940
+ else:
941
+ body_expands = ["body.view", "body.storage"]
942
+
943
+ def _fetch_page(node: tuple[str, str | None, int]) -> _FetchedPage:
944
+ page_id, parent_id, depth = node
945
+ children: list[tuple[str, str | None, int]] = []
946
+ children_future: Future[list[tuple[str, str | None, int]]] | None = None
947
+ children_executor: ThreadPoolExecutor | None = None
948
+ if not use_structure_first and (options.max_depth < 0 or depth < options.max_depth):
949
+ children_executor = ThreadPoolExecutor(max_workers=1)
950
+ children_future = children_executor.submit(_children_for, page_id, depth)
951
+
952
+ try:
953
+ if options.resume and page_id in crawl_state.visited_pages:
954
+ try:
955
+ version_check = http.get_page(page_id, expand=["version"])
956
+ except Exception as exc:
957
+ if children_future is not None:
958
+ children = children_future.result()
959
+ return _FetchedPage(
960
+ page_id=page_id,
961
+ parent_id=parent_id,
962
+ depth=depth,
963
+ status="error",
964
+ error=str(exc),
965
+ children=children,
966
+ )
967
+ check_version = (version_check.get("version") or {}).get("number")
968
+ if crawl_state.is_page_unchanged(page_id, check_version, None):
969
+ if children_future is not None:
970
+ children = children_future.result()
971
+ return _FetchedPage(
972
+ page_id=page_id,
973
+ parent_id=parent_id,
974
+ depth=depth,
975
+ status="unchanged",
976
+ version_payload=version_check,
977
+ children=children,
978
+ )
979
+
980
+ try:
981
+ page_payload = http.get_page(page_id, expand=[*body_expands, "version", "space", "ancestors"])
982
+ except Exception as exc:
983
+ if children_future is not None:
984
+ children = children_future.result()
985
+ return _FetchedPage(
986
+ page_id=page_id,
987
+ parent_id=parent_id,
988
+ depth=depth,
989
+ status="error",
990
+ error=str(exc),
991
+ children=children,
992
+ )
993
+
994
+ if children_future is not None:
995
+ children = children_future.result()
996
+
997
+ return _FetchedPage(
998
+ page_id=page_id,
999
+ parent_id=parent_id,
1000
+ depth=depth,
1001
+ status="fetched",
1002
+ page_payload=page_payload,
1003
+ children=children,
1004
+ )
1005
+ finally:
1006
+ if children_executor is not None:
1007
+ children_executor.shutdown(wait=True)
1008
+
1009
+ def _extract_page_id_from_search_item(item: Any) -> str | None:
1010
+ if not isinstance(item, dict):
1011
+ return None
1012
+ raw_id = item.get("id")
1013
+ if raw_id is None and isinstance(item.get("content"), dict):
1014
+ raw_id = item["content"].get("id")
1015
+ if raw_id is None:
1016
+ return None
1017
+ page_id = str(raw_id).strip()
1018
+ return page_id if page_id.isdigit() else None
1019
+
1020
+ search_by_space_and_type = getattr(http, "search_by_space_and_type", None)
1021
+
1022
+ def _search_space_page_ids(
1023
+ *,
1024
+ space_key: str,
1025
+ last_modified_after: datetime | None = None,
1026
+ overlap_minutes: int = 0,
1027
+ ) -> tuple[set[str], bool]:
1028
+ if not callable(search_by_space_and_type):
1029
+ return set(), False
1030
+
1031
+ page_ids: set[str] = set()
1032
+ start = 0
1033
+ limit = 100
1034
+ successful = False
1035
+ max_pages = 500
1036
+
1037
+ for _ in range(max_pages):
1038
+ response = search_by_space_and_type(
1039
+ space_key=space_key,
1040
+ content_type="page",
1041
+ limit=limit,
1042
+ start=start,
1043
+ expand=["version"],
1044
+ last_modified_after=last_modified_after,
1045
+ overlap_minutes=overlap_minutes,
1046
+ )
1047
+ if not isinstance(response, dict):
1048
+ break
1049
+ successful = True
1050
+ results = response.get("results")
1051
+ if not isinstance(results, list) or not results:
1052
+ break
1053
+ for item in results:
1054
+ page_id = _extract_page_id_from_search_item(item)
1055
+ if page_id:
1056
+ page_ids.add(page_id)
1057
+ if len(results) < limit:
1058
+ break
1059
+ start += limit
1060
+
1061
+ return page_ids, successful
1062
+
1063
+ def _resolve_known_space_key() -> str | None:
1064
+ root_meta = previous_page_meta_by_id.get(resolved_root_id)
1065
+ if not isinstance(root_meta, dict):
1066
+ return None
1067
+ raw_space_key = root_meta.get("space_key")
1068
+ if not isinstance(raw_space_key, str):
1069
+ return None
1070
+ space_key = raw_space_key.strip()
1071
+ return space_key or None
1072
+
1073
+ def _discover_structure_first_tree(
1074
+ initial_queue: list[tuple[str, str | None, int]],
1075
+ ) -> tuple[list[tuple[str, str | None, int]], list[tuple[str, str | None, int]], set[str]]:
1076
+ ordered: list[tuple[str, str | None, int]] = []
1077
+ pending: list[tuple[str, str | None, int]] = []
1078
+ discovered: set[str] = set()
1079
+ pending_ids: set[str] = set()
1080
+ seen: set[str] = set()
1081
+ current_level: list[tuple[str, str | None, int]] = list(initial_queue)
1082
+ for page_id, _, _ in current_level:
1083
+ discovered.add(page_id)
1084
+
1085
+ while current_level:
1086
+ level_nodes: list[tuple[str, str | None, int]] = []
1087
+ for page_id, parent_id, depth in current_level:
1088
+ if page_id in seen:
1089
+ continue
1090
+ if options.max_nodes is not None and len(ordered) >= options.max_nodes:
1091
+ if page_id not in pending_ids:
1092
+ pending.append((page_id, parent_id, depth))
1093
+ pending_ids.add(page_id)
1094
+ continue
1095
+ seen.add(page_id)
1096
+ ordered.append((page_id, parent_id, depth))
1097
+ level_nodes.append((page_id, parent_id, depth))
1098
+
1099
+ if options.max_nodes is not None and len(ordered) >= options.max_nodes:
1100
+ for page_id, parent_id, depth in current_level:
1101
+ if page_id in seen or page_id in pending_ids:
1102
+ continue
1103
+ pending.append((page_id, parent_id, depth))
1104
+ pending_ids.add(page_id)
1105
+ break
1106
+
1107
+ parents_for_children: list[tuple[str, int]] = []
1108
+ for page_id, _, depth in level_nodes:
1109
+ if options.max_depth >= 0 and depth >= options.max_depth:
1110
+ continue
1111
+ parents_for_children.append((page_id, depth))
1112
+
1113
+ if not parents_for_children:
1114
+ current_level = []
1115
+ continue
1116
+
1117
+ child_batches: list[list[tuple[str, str | None, int]]]
1118
+ if len(parents_for_children) == 1:
1119
+ parent_id, parent_depth = parents_for_children[0]
1120
+ child_batches = [_children_for(parent_id, parent_depth)]
1121
+ else:
1122
+ max_workers = min(page_concurrency, len(parents_for_children))
1123
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
1124
+ futures: list[Future[list[tuple[str, str | None, int]]]] = [
1125
+ executor.submit(_children_for, parent_id, parent_depth)
1126
+ for parent_id, parent_depth in parents_for_children
1127
+ ]
1128
+ child_batches = [future.result() for future in futures]
1129
+
1130
+ next_level: list[tuple[str, str | None, int]] = []
1131
+ next_level_ids: set[str] = set()
1132
+ for children in child_batches:
1133
+ for child_id, child_parent, child_depth in children:
1134
+ if child_id in seen or child_id in next_level_ids:
1135
+ continue
1136
+ discovered.add(child_id)
1137
+ next_level_ids.add(child_id)
1138
+ next_level.append((child_id, child_parent, child_depth))
1139
+ current_level = next_level
1140
+
1141
+ return ordered, pending, discovered
1142
+
1143
+ incremental_candidate_ids: set[str] = set()
1144
+ incremental_high_water_mark_ran = False
1145
+ incremental_high_water_mark_skipped = False
1146
+ incremental_high_water_mark_skip_reason: str | None = None
1147
+ incremental_known_page_count = 0
1148
+ full_reconciliation_deleted_ids: list[str] = []
1149
+ full_reconciliation_ran = False
1150
+ full_reconciliation_skipped = False
1151
+ full_reconciliation_skip_reason: str | None = None
1152
+ full_reconciliation_known_page_count = 0
1153
+ space_key_for_incremental = _resolve_known_space_key()
1154
+ if options.resume and options.incremental_cql and callable(search_by_space_and_type):
1155
+ known_page_ids = set(crawl_state.visited_pages.keys())
1156
+ incremental_known_page_count = len(known_page_ids)
1157
+ last_sync_dt = _parse_iso_timestamp(crawl_state.last_sync_timestamp)
1158
+ if space_key_for_incremental and last_sync_dt is not None:
1159
+ if incremental_known_page_count <= 1:
1160
+ incremental_high_water_mark_skipped = True
1161
+ incremental_high_water_mark_skip_reason = "known_pages_lte_1"
1162
+ else:
1163
+ incremental_high_water_mark_ran = True
1164
+ try:
1165
+ incremental_ids, incremental_ok = _search_space_page_ids(
1166
+ space_key=space_key_for_incremental,
1167
+ last_modified_after=last_sync_dt,
1168
+ overlap_minutes=max(0, int(options.incremental_overlap_minutes)),
1169
+ )
1170
+ except Exception:
1171
+ incremental_ids, incremental_ok = set(), False
1172
+ if incremental_ok:
1173
+ incremental_candidate_ids = {page_id for page_id in incremental_ids if page_id in known_page_ids}
1174
+ if incremental_candidate_ids:
1175
+ queued_ids = {queued_page_id for queued_page_id, _, _ in queue}
1176
+ for page_id in sorted(incremental_candidate_ids):
1177
+ if page_id in queued_ids:
1178
+ continue
1179
+ previous_meta = previous_page_meta_by_id.get(page_id) or {}
1180
+ parent_id = previous_meta.get("parent_id")
1181
+ if parent_id is not None:
1182
+ parent_id = str(parent_id)
1183
+ depth = previous_meta.get("depth")
1184
+ if not isinstance(depth, int) or depth < 0:
1185
+ depth = 0
1186
+ queue.append((page_id, parent_id, depth))
1187
+ queued_ids.add(page_id)
1188
+ discovered_page_ids.add(page_id)
1189
+
1190
+ full_reconciliation_due = False
1191
+ last_full_reconciliation_dt = _parse_iso_timestamp(crawl_state.last_full_reconciliation_timestamp)
1192
+ if last_full_reconciliation_dt is None:
1193
+ full_reconciliation_due = True
1194
+ else:
1195
+ full_reconciliation_due = (
1196
+ datetime.now(UTC)
1197
+ - last_full_reconciliation_dt
1198
+ >= timedelta(days=max(1, int(options.full_reconciliation_interval_days)))
1199
+ )
1200
+
1201
+ if space_key_for_incremental and full_reconciliation_due:
1202
+ known_page_ids = set(crawl_state.visited_pages.keys())
1203
+ full_reconciliation_known_page_count = len(known_page_ids)
1204
+ reconciliation_marker_timestamp = _now_iso()
1205
+ if full_reconciliation_known_page_count <= 1:
1206
+ full_reconciliation_skipped = True
1207
+ full_reconciliation_skip_reason = "known_pages_lte_1"
1208
+ crawl_state.last_full_reconciliation_timestamp = reconciliation_marker_timestamp
1209
+ else:
1210
+ try:
1211
+ remote_page_ids, reconciliation_ok = _search_space_page_ids(space_key=space_key_for_incremental)
1212
+ except Exception:
1213
+ remote_page_ids, reconciliation_ok = set(), False
1214
+ if reconciliation_ok:
1215
+ full_reconciliation_ran = True
1216
+ full_reconciliation_deleted_ids = sorted(known_page_ids - remote_page_ids)
1217
+ if full_reconciliation_deleted_ids:
1218
+ for page_id in full_reconciliation_deleted_ids:
1219
+ crawl_state.visited_pages.pop(page_id, None)
1220
+ previous_visited.pop(page_id, None)
1221
+ previous_page_meta_by_id.pop(page_id, None)
1222
+ previous_attachments_by_page.pop(page_id, None)
1223
+ discovered_page_ids.discard(page_id)
1224
+ _emit_runtime_event(
1225
+ options,
1226
+ "pages_deleted",
1227
+ root_page_id=resolved_root_id,
1228
+ deleted_count=len(full_reconciliation_deleted_ids),
1229
+ page_ids=full_reconciliation_deleted_ids,
1230
+ )
1231
+ crawl_state.last_full_reconciliation_timestamp = reconciliation_marker_timestamp
1232
+
1233
+ structure_first_pending_queue: list[tuple[str, str | None, int]] = []
1234
+ structure_pre_discovered_nodes = 0
1235
+ if use_structure_first:
1236
+ queue, structure_first_pending_queue, structure_discovered_ids = _discover_structure_first_tree(queue)
1237
+ structure_pre_discovered_nodes = len(queue) + len(structure_first_pending_queue)
1238
+ discovered_page_ids.update(structure_discovered_ids)
1239
+
1240
+ crawl_started_at = time.perf_counter()
1241
+ max_inflight_workers = 0
1242
+ effective_page_concurrency = page_concurrency
1243
+ effective_concurrency_signal: float | None = None
1244
+ throttle_pressure_score: float | None = None
1245
+
1246
+ while queue:
1247
+ if options.max_nodes is not None and nodes_processed >= options.max_nodes:
1248
+ termination_reason = "max_nodes_reached"
1249
+ break
1250
+
1251
+ if options.max_nodes is None:
1252
+ remaining = len(queue)
1253
+ else:
1254
+ remaining = min(len(queue), options.max_nodes - nodes_processed)
1255
+ if remaining <= 0:
1256
+ termination_reason = "max_nodes_reached"
1257
+ break
1258
+ runtime_page_concurrency = page_concurrency
1259
+ if callable(telemetry_getter):
1260
+ try:
1261
+ telemetry_snapshot = telemetry_getter()
1262
+ except Exception:
1263
+ telemetry_snapshot = None
1264
+ if isinstance(telemetry_snapshot, dict):
1265
+ throttle = telemetry_snapshot.get("adaptive_throttle")
1266
+ if isinstance(throttle, dict):
1267
+ signal = throttle.get("effective_concurrency_signal")
1268
+ if isinstance(signal, (float, int)):
1269
+ effective_concurrency_signal = float(signal)
1270
+ scaled = int(round(page_concurrency * float(signal)))
1271
+ runtime_page_concurrency = max(1, min(page_concurrency, scaled))
1272
+ pressure_score = throttle.get("pressure_score")
1273
+ if isinstance(pressure_score, (float, int)):
1274
+ throttle_pressure_score = float(pressure_score)
1275
+ effective_page_concurrency = runtime_page_concurrency
1276
+ batch_size = min(runtime_page_concurrency, remaining)
1277
+ max_inflight_workers = max(max_inflight_workers, batch_size)
1278
+
1279
+ batch: list[tuple[str, str | None, int]] = []
1280
+ while queue and len(batch) < batch_size:
1281
+ page_id, parent_id, depth = queue.pop(0)
1282
+ if page_id in visited_pages:
1283
+ skipped_already_visited += 1
1284
+ continue
1285
+ visited_pages.add(page_id)
1286
+ nodes_processed += 1
1287
+ batch.append((page_id, parent_id, depth))
1288
+
1289
+ crawl_state.pending_queue = list(queue)
1290
+ if not batch:
1291
+ continue
1292
+
1293
+ fetched: list[_FetchedPage] = []
1294
+ if batch_size == 1:
1295
+ fetched = [_fetch_page(node) for node in batch]
1296
+ else:
1297
+ with ThreadPoolExecutor(max_workers=batch_size) as executor:
1298
+ futures: dict[Future[_FetchedPage], int] = {
1299
+ executor.submit(_fetch_page, node): idx for idx, node in enumerate(batch)
1300
+ }
1301
+ fetched_by_index: dict[int, _FetchedPage] = {}
1302
+ for future, idx in futures.items():
1303
+ fetched_by_index[idx] = future.result()
1304
+ fetched = [fetched_by_index[idx] for idx in range(len(batch))]
1305
+
1306
+ for result in fetched:
1307
+ page_id = result.page_id
1308
+ parent_id = result.parent_id
1309
+ depth = result.depth
1310
+
1311
+ if result.status == "unchanged":
1312
+ skipped_unchanged += 1
1313
+ unchanged_page_ids.add(page_id)
1314
+ previous_meta = previous_page_meta_by_id.get(page_id)
1315
+ if previous_meta is None:
1316
+ previous_meta = _meta_from_existing_html(
1317
+ page_id=page_id,
1318
+ parent_id=parent_id,
1319
+ depth=depth,
1320
+ page_payload=result.version_payload if isinstance(result.version_payload, dict) else None,
1321
+ )
1322
+ if previous_meta is not None:
1323
+ if depth == 0:
1324
+ root_meta = previous_meta
1325
+ else:
1326
+ child_pages.append(previous_meta)
1327
+ if options.include_attachments:
1328
+ for previous_attachment in previous_attachments_by_page.get(page_id, []):
1329
+ attachment_id = previous_attachment.get("attachment_id")
1330
+ if attachment_id is not None:
1331
+ downloaded_attachments.add(str(attachment_id))
1332
+ attachments.append(previous_attachment)
1333
+ for child_id, child_parent, child_depth in result.children:
1334
+ if child_id in visited_pages:
1335
+ continue
1336
+ discovered_page_ids.add(child_id)
1337
+ queue.append((child_id, child_parent, child_depth))
1338
+ continue
1339
+
1340
+ if result.status == "error":
1341
+ pages_failed += 1
1342
+ meta = {
1343
+ "page_id": page_id,
1344
+ "title": None,
1345
+ "page_url": None,
1346
+ "parent_id": parent_id,
1347
+ "depth": depth,
1348
+ "status": "error",
1349
+ "error": result.error,
1350
+ }
1351
+ if depth == 0:
1352
+ root_meta = meta
1353
+ else:
1354
+ child_pages.append(meta)
1355
+ continue
1356
+
1357
+ pages_fetched += 1
1358
+ page = result.page_payload if isinstance(result.page_payload, dict) else {}
1359
+ title = str(page.get("title") or "untitled")
1360
+ safe_title = _safe_filename(title)
1361
+ body_obj = page.get("body") or {}
1362
+ view_html = (body_obj.get("view") or {}).get("value", "")
1363
+ export_html = (body_obj.get("export_view") or {}).get("value", "")
1364
+ storage_html = (body_obj.get("storage") or {}).get("value", "")
1365
+ candidates = {
1366
+ "view": view_html if isinstance(view_html, str) else "",
1367
+ "export_view": export_html if isinstance(export_html, str) else "",
1368
+ "storage": storage_html if isinstance(storage_html, str) else "",
1369
+ }
1370
+ chosen_html = candidates.get(body_format, "") or ""
1371
+ chosen_format = body_format
1372
+ if not chosen_html.strip():
1373
+ fallback_order = (
1374
+ ["view", "storage", "export_view"]
1375
+ if body_format == "view"
1376
+ else ["storage", "view", "export_view"]
1377
+ if body_format == "storage"
1378
+ else ["export_view", "view", "storage"]
1379
+ )
1380
+ for fmt in fallback_order:
1381
+ candidate = candidates.get(fmt, "") or ""
1382
+ if isinstance(candidate, str) and candidate.strip():
1383
+ chosen_html = candidate
1384
+ chosen_format = fmt
1385
+ break
1386
+
1387
+ html = chosen_html if isinstance(chosen_html, str) else ""
1388
+ source_format = f"html-{chosen_format.replace('_', '-')}"
1389
+
1390
+ links = page.get("_links", {}) if isinstance(page.get("_links"), dict) else {}
1391
+ base_url = str(links.get("base") or "")
1392
+ web_ui = str(links.get("webui") or "")
1393
+ page_url = f"{base_url}{web_ui}" if base_url and web_ui else None
1394
+
1395
+ version = page.get("version") or {}
1396
+ page_version = version.get("number") if isinstance(version, dict) else None
1397
+ page_updated = version.get("when") if isinstance(version, dict) else None
1398
+ space_key = (page.get("space") or {}).get("key") if isinstance(page.get("space"), dict) else None
1399
+
1400
+ ancestors = page.get("ancestors") or []
1401
+ ancestor_ids: list[str] = []
1402
+ ancestor_titles: list[str] = []
1403
+ if isinstance(ancestors, list):
1404
+ for entry in ancestors:
1405
+ if not isinstance(entry, dict):
1406
+ continue
1407
+ if entry.get("id") is not None:
1408
+ ancestor_ids.append(str(entry["id"]))
1409
+ if entry.get("title"):
1410
+ ancestor_titles.append(str(entry["title"]))
1411
+
1412
+ html_path = output_dir / f"{page_id}_{safe_title}.html"
1413
+ html_path.write_text(html, encoding="utf-8")
1414
+ html_bytes = html.encode("utf-8")
1415
+ html_sha256 = _sha256_bytes(html_bytes)
1416
+ crawl_state.record_page(page_id, page_version, html_sha256)
1417
+
1418
+ meta = {
1419
+ "page_id": page_id,
1420
+ "title": title,
1421
+ "page_url": page_url,
1422
+ "parent_id": parent_id,
1423
+ "depth": depth,
1424
+ "status": "downloaded",
1425
+ "html_path": str(html_path),
1426
+ "html_sha256": html_sha256,
1427
+ "html_size_bytes": html_path.stat().st_size,
1428
+ "source_format": source_format,
1429
+ "space_key": space_key,
1430
+ "page_version": page_version,
1431
+ "page_updated": page_updated,
1432
+ "ancestor_ids": ancestor_ids,
1433
+ "ancestor_titles": ancestor_titles,
1434
+ "error": None,
1435
+ }
1436
+ if depth == 0:
1437
+ root_meta = meta
1438
+ else:
1439
+ child_pages.append(meta)
1440
+
1441
+ if options.emit_link_graph:
1442
+ for link in _extract_links(html, base_url=base_url):
1443
+ target_url = link.get("url")
1444
+ target_type = _classify_target_type(str(target_url)) if target_url else None
1445
+ target_page_id = _extract_target_page_id(str(target_url)) if target_url else None
1446
+ edge: dict[str, Any] = {
1447
+ "source_page_id": page_id,
1448
+ "source_page_url": page_url,
1449
+ "target_url": target_url,
1450
+ "anchor_text": link.get("text"),
1451
+ }
1452
+ if target_type:
1453
+ edge["target_type"] = target_type
1454
+ if target_type == "confluence_page" and target_page_id:
1455
+ edge["target_page_id"] = target_page_id
1456
+ link_edges.append(edge)
1457
+
1458
+ if options.emit_corpus:
1459
+ for chunk in _chunk_html_by_headings(html, max_chars=chunk_max_chars):
1460
+ heading_path = chunk.get("heading_path") or []
1461
+ ordinal = chunk.get("ordinal") if isinstance(chunk.get("ordinal"), int) else 0
1462
+ base_key = f"{page_id}|{'/'.join(heading_path)}"
1463
+ key = base_key if ordinal == 0 else f"{base_key}|{ordinal}"
1464
+ corpus_chunks.append(
1465
+ {
1466
+ "chunk_id": _sha256_text(key)[:16],
1467
+ "page_id": page_id,
1468
+ "page_url": page_url,
1469
+ "title": title,
1470
+ "depth": depth,
1471
+ "parent_id": parent_id,
1472
+ "ancestors": ancestor_ids,
1473
+ "ancestor_ids": ancestor_ids,
1474
+ "space_key": space_key,
1475
+ "page_version": page_version,
1476
+ "page_updated": page_updated,
1477
+ "ancestor_titles": ancestor_titles,
1478
+ "heading_path": heading_path,
1479
+ "chunk_ordinal": ordinal,
1480
+ "text": chunk.get("text"),
1481
+ "source_type": "confluence",
1482
+ "doc_class": "canonical_doc",
1483
+ "generated_flag": False,
1484
+ "historical_flag": False,
1485
+ "discovery": "confluence_crawl_page",
1486
+ }
1487
+ )
1488
+
1489
+ if options.emit_tables:
1490
+ for table in _extract_tables(html):
1491
+ key = f"{page_id}|table|{table.get('table_index')}"
1492
+ tables.append(
1493
+ {
1494
+ "table_id": _sha256_text(key)[:16],
1495
+ "page_id": page_id,
1496
+ "page_url": page_url,
1497
+ "title": title,
1498
+ "depth": depth,
1499
+ "parent_id": parent_id,
1500
+ "space_key": space_key,
1501
+ "page_version": page_version,
1502
+ "page_updated": page_updated,
1503
+ "ancestor_titles": ancestor_titles,
1504
+ "table_index": table.get("table_index"),
1505
+ "headers": table.get("headers"),
1506
+ "rows": table.get("rows"),
1507
+ "row_count": table.get("row_count"),
1508
+ "col_count": table.get("col_count"),
1509
+ }
1510
+ )
1511
+
1512
+ if options.emit_mentions:
1513
+ for index, mention in enumerate(_extract_mentions(html, base_url=base_url)):
1514
+ key = f"{page_id}|mention|{mention.get('type')}|{mention.get('value')}|{mention.get('url') or ''}"
1515
+ mentions.append(
1516
+ {
1517
+ "mention_id": _sha256_text(key)[:16],
1518
+ "page_id": page_id,
1519
+ "page_url": page_url,
1520
+ "title": title,
1521
+ "depth": depth,
1522
+ "parent_id": parent_id,
1523
+ "space_key": space_key,
1524
+ "page_version": page_version,
1525
+ "page_updated": page_updated,
1526
+ "ancestor_titles": ancestor_titles,
1527
+ "mention_index": index,
1528
+ **mention,
1529
+ }
1530
+ )
1531
+
1532
+ if options.include_attachments:
1533
+ try:
1534
+ attachment_results = http.get_attachments(page_id)
1535
+ except Exception as exc:
1536
+ attachments.append(
1537
+ {
1538
+ "attachment_id": None,
1539
+ "page_id": page_id,
1540
+ "title": None,
1541
+ "status": "error",
1542
+ "error": str(exc),
1543
+ }
1544
+ )
1545
+ attachment_results = []
1546
+
1547
+ for att in attachment_results:
1548
+ if not isinstance(att, dict):
1549
+ continue
1550
+ attachment_id = att.get("id") or att.get("attachment_id")
1551
+ if attachment_id is None:
1552
+ continue
1553
+ attachment_id = str(attachment_id)
1554
+ if attachment_id in downloaded_attachments:
1555
+ continue
1556
+ downloaded_attachments.add(attachment_id)
1557
+
1558
+ att_title = str(att.get("title") or att.get("name") or f"attachment_{attachment_id}")
1559
+ out_path = attachments_dir / _safe_attachment_filename(
1560
+ attachment_id, att_title, fallback_suffix=_guess_attachment_suffix(att)
1561
+ )
1562
+
1563
+ try:
1564
+ content = http.get_attachment_content(attachment_id)
1565
+ out_path.write_bytes(content)
1566
+ file_sha256 = _sha256_bytes(content)
1567
+ status = "downloaded"
1568
+ error = None
1569
+ except Exception as exc:
1570
+ file_sha256 = None
1571
+ status = "error"
1572
+ error = str(exc)
1573
+
1574
+ version_obj = att.get("version") or {}
1575
+ attachment_version = version_obj.get("number") if isinstance(version_obj, dict) else None
1576
+ extensions = att.get("extensions") or {}
1577
+ attachment_size = None
1578
+ if isinstance(extensions, dict):
1579
+ attachment_size = extensions.get("fileSize") or extensions.get("size")
1580
+
1581
+ attachments.append(
1582
+ {
1583
+ "attachment_id": attachment_id,
1584
+ "page_id": page_id,
1585
+ "title": att_title,
1586
+ "status": status,
1587
+ "file_path": str(out_path) if status == "downloaded" else None,
1588
+ "file_sha256": file_sha256,
1589
+ "file_size_bytes": out_path.stat().st_size
1590
+ if status == "downloaded" and out_path.exists()
1591
+ else None,
1592
+ "attachment_version": attachment_version,
1593
+ "attachment_size": attachment_size,
1594
+ "error": error,
1595
+ }
1596
+ )
1597
+
1598
+ for child_id, child_parent, child_depth in result.children:
1599
+ if child_id in visited_pages:
1600
+ continue
1601
+ discovered_page_ids.add(child_id)
1602
+ queue.append((child_id, child_parent, child_depth))
1603
+
1604
+ _emit_runtime_event(
1605
+ options,
1606
+ "crawl_progress",
1607
+ root_page_id=resolved_root_id,
1608
+ pages_discovered=len(discovered_page_ids),
1609
+ pages_fetched=pages_fetched,
1610
+ pages_failed=pages_failed,
1611
+ pages_skipped_unchanged=skipped_unchanged,
1612
+ pages_skipped_total=skipped_unchanged + skipped_already_visited,
1613
+ pending_queue_size=len(queue),
1614
+ effective_page_concurrency=effective_page_concurrency,
1615
+ effective_concurrency_signal=effective_concurrency_signal,
1616
+ throttle_pressure_score=throttle_pressure_score,
1617
+ )
1618
+
1619
+ if options.emit_corpus and previous_corpus_chunks_by_page:
1620
+ should_merge_unchanged_corpus = pages_fetched > 0 or bool(full_reconciliation_deleted_ids)
1621
+ if should_merge_unchanged_corpus:
1622
+ emitted_chunk_ids = {
1623
+ str(chunk.get("chunk_id") or "").strip()
1624
+ for chunk in corpus_chunks
1625
+ if isinstance(chunk, dict) and str(chunk.get("chunk_id") or "").strip()
1626
+ }
1627
+ for page_id in sorted(unchanged_page_ids):
1628
+ for previous_chunk in previous_corpus_chunks_by_page.get(page_id, []):
1629
+ if not isinstance(previous_chunk, dict):
1630
+ continue
1631
+ chunk_id = str(previous_chunk.get("chunk_id") or "").strip()
1632
+ if chunk_id and chunk_id in emitted_chunk_ids:
1633
+ continue
1634
+ corpus_chunks.append(dict(previous_chunk))
1635
+ if chunk_id:
1636
+ emitted_chunk_ids.add(chunk_id)
1637
+
1638
+ if structure_first_pending_queue:
1639
+ queued_page_ids = {queued_page_id for queued_page_id, _, _ in queue}
1640
+ for pending_page_id, pending_parent_id, pending_depth in structure_first_pending_queue:
1641
+ if pending_page_id in visited_pages or pending_page_id in queued_page_ids:
1642
+ continue
1643
+ queue.append((pending_page_id, pending_parent_id, pending_depth))
1644
+ queued_page_ids.add(pending_page_id)
1645
+ if queue:
1646
+ termination_reason = "max_nodes_reached"
1647
+
1648
+ # Update and save crawl state (TSK-188)
1649
+ http_telemetry: dict[str, Any] | None = None
1650
+ if callable(telemetry_getter):
1651
+ try:
1652
+ telemetry_payload = telemetry_getter()
1653
+ if isinstance(telemetry_payload, dict):
1654
+ http_telemetry = telemetry_payload
1655
+ except Exception:
1656
+ http_telemetry = None
1657
+ if http_telemetry is None:
1658
+ http_telemetry = {}
1659
+
1660
+ elapsed_seconds = max(0.001, time.perf_counter() - crawl_started_at)
1661
+ throughput_pages_per_second = round(nodes_processed / elapsed_seconds, 3)
1662
+ latency_metrics = http_telemetry.get("latency_ms") if isinstance(http_telemetry, dict) else None
1663
+ p95_page_latency_ms = None
1664
+ if isinstance(latency_metrics, dict):
1665
+ operation_latency = latency_metrics.get("operation")
1666
+ if isinstance(operation_latency, dict):
1667
+ p95_value = operation_latency.get("p95")
1668
+ if isinstance(p95_value, (float, int)):
1669
+ p95_page_latency_ms = round(float(p95_value), 3)
1670
+
1671
+ retry_events_total = None
1672
+ if isinstance(http_telemetry, dict):
1673
+ retry_events_list = http_telemetry.get("recent_retry_events")
1674
+ if isinstance(retry_events_list, list):
1675
+ retry_events_total = len(retry_events_list)
1676
+
1677
+ if isinstance(http_telemetry, dict):
1678
+ http_telemetry["inflight_workers"] = max_inflight_workers
1679
+ http_telemetry["queue_depth"] = len(queue)
1680
+ http_telemetry["pages_per_second"] = throughput_pages_per_second
1681
+ http_telemetry["retry_events"] = retry_events_total
1682
+ http_telemetry["p95_page_latency_ms"] = p95_page_latency_ms
1683
+ adaptive_throttle = http_telemetry.get("adaptive_throttle")
1684
+ if isinstance(adaptive_throttle, dict):
1685
+ signal_value = adaptive_throttle.get("effective_concurrency_signal")
1686
+ if isinstance(signal_value, (float, int)):
1687
+ effective_concurrency_signal = float(signal_value)
1688
+ pressure_value = adaptive_throttle.get("pressure_score")
1689
+ if isinstance(pressure_value, (float, int)):
1690
+ throttle_pressure_score = float(pressure_value)
1691
+
1692
+ crawl_state.last_sync_timestamp = _now_iso()
1693
+ pages_skipped = skipped_unchanged + skipped_already_visited
1694
+ progress_event = {
1695
+ "pages_discovered": len(discovered_page_ids),
1696
+ "pages_fetched": pages_fetched,
1697
+ "pages_failed": pages_failed,
1698
+ "pages_skipped": pages_skipped,
1699
+ "pages_skipped_unchanged": skipped_unchanged,
1700
+ "throttle": {
1701
+ "configured_page_concurrency": page_concurrency,
1702
+ "effective_page_concurrency": effective_page_concurrency,
1703
+ "effective_concurrency_signal": effective_concurrency_signal,
1704
+ "pressure_score": throttle_pressure_score,
1705
+ },
1706
+ }
1707
+
1708
+ crawl_state.pending_queue = list(queue)
1709
+ crawl_state.completed = len(queue) == 0
1710
+ if crawl_state.completed:
1711
+ termination_reason = "queue_exhausted"
1712
+ crawl_state.run_diagnostics = {
1713
+ "termination_reason": termination_reason,
1714
+ "nodes_processed": nodes_processed,
1715
+ "skipped_unchanged": skipped_unchanged,
1716
+ "skipped_already_visited": skipped_already_visited,
1717
+ "pages_fetched": pages_fetched,
1718
+ "pages_failed": pages_failed,
1719
+ "pages_discovered": len(discovered_page_ids),
1720
+ "pages_skipped": pages_skipped,
1721
+ "pending_queue_size": len(queue),
1722
+ "page_concurrency": page_concurrency,
1723
+ "staged_mode": staged_mode,
1724
+ "incremental_cql": bool(options.incremental_cql),
1725
+ "rate_limit_budget": int(options.rate_limit_budget),
1726
+ "full_reconciliation_interval_days": max(1, int(options.full_reconciliation_interval_days)),
1727
+ "last_sync_timestamp": crawl_state.last_sync_timestamp,
1728
+ "last_full_reconciliation_timestamp": crawl_state.last_full_reconciliation_timestamp,
1729
+ "incremental_overlap_minutes": max(0, int(options.incremental_overlap_minutes)),
1730
+ "incremental_known_page_count": incremental_known_page_count,
1731
+ "incremental_candidate_count": len(incremental_candidate_ids),
1732
+ "incremental_high_water_mark_ran": incremental_high_water_mark_ran,
1733
+ "incremental_high_water_mark_skipped": incremental_high_water_mark_skipped,
1734
+ "incremental_high_water_mark_skip_reason": incremental_high_water_mark_skip_reason,
1735
+ "full_reconciliation_ran": full_reconciliation_ran,
1736
+ "full_reconciliation_deleted_ids": full_reconciliation_deleted_ids,
1737
+ "full_reconciliation_skipped": full_reconciliation_skipped,
1738
+ "full_reconciliation_skip_reason": full_reconciliation_skip_reason,
1739
+ "full_reconciliation_known_page_count": full_reconciliation_known_page_count,
1740
+ "structure_pre_discovered_nodes": structure_pre_discovered_nodes if use_structure_first else None,
1741
+ "progress_event": progress_event,
1742
+ "telemetry": http_telemetry,
1743
+ "http_telemetry": http_telemetry,
1744
+ }
1745
+ _save_crawl_state(crawl_state, state_path)
1746
+
1747
+ payload: dict[str, Any] = {
1748
+ "root_ref": root_ref,
1749
+ "root_page_id": resolved_root_id,
1750
+ "out_dir": str(output_dir),
1751
+ "generated_at": _now_iso(),
1752
+ "body_format": body_format,
1753
+ "chunk_max_chars": chunk_max_chars,
1754
+ "crawl_mode": {
1755
+ "page_concurrency": page_concurrency,
1756
+ "staged_mode": staged_mode,
1757
+ "incremental_cql": bool(options.incremental_cql),
1758
+ "rate_limit_budget": int(options.rate_limit_budget),
1759
+ "incremental_overlap_minutes": max(0, int(options.incremental_overlap_minutes)),
1760
+ "full_reconciliation_interval_days": max(1, int(options.full_reconciliation_interval_days)),
1761
+ },
1762
+ "page": root_meta,
1763
+ "child_pages": child_pages,
1764
+ "attachments": attachments,
1765
+ "corpus_path": None,
1766
+ "link_graph_path": None,
1767
+ "tables_path": None,
1768
+ "mentions_path": None,
1769
+ "progress_event": progress_event,
1770
+ "resume_state": {
1771
+ "skipped_unchanged": skipped_unchanged,
1772
+ "skipped_already_visited": skipped_already_visited,
1773
+ "pages_fetched": pages_fetched,
1774
+ "pages_failed": pages_failed,
1775
+ "pages_discovered": len(discovered_page_ids),
1776
+ "pages_skipped": pages_skipped,
1777
+ "pages_skipped_unchanged": skipped_unchanged,
1778
+ "nodes_processed": nodes_processed,
1779
+ "completed": crawl_state.completed,
1780
+ "termination_reason": termination_reason,
1781
+ "pending_queue_size": len(queue),
1782
+ "state_path": str(state_path),
1783
+ "page_concurrency": page_concurrency,
1784
+ "staged_mode": staged_mode,
1785
+ "last_sync_timestamp": crawl_state.last_sync_timestamp,
1786
+ "last_full_reconciliation_timestamp": crawl_state.last_full_reconciliation_timestamp,
1787
+ },
1788
+ "telemetry": http_telemetry,
1789
+ "http_telemetry": http_telemetry,
1790
+ }
1791
+
1792
+ if options.emit_corpus:
1793
+ corpus_path = output_dir / "corpus.jsonl"
1794
+ if corpus_chunks or not (options.resume and corpus_path.exists()):
1795
+ with corpus_path.open("w", encoding="utf-8") as fp:
1796
+ for item in corpus_chunks:
1797
+ fp.write(json.dumps(item, ensure_ascii=False) + "\n")
1798
+ payload["corpus_path"] = str(corpus_path)
1799
+
1800
+ if options.emit_link_graph:
1801
+ graph_path = output_dir / "link-graph.json"
1802
+ if link_edges or not (options.resume and graph_path.exists()):
1803
+ graph_payload = {
1804
+ "generated_at": payload["generated_at"],
1805
+ "root_page_id": resolved_root_id,
1806
+ "edges": [edge for edge in link_edges if edge.get("target_url")],
1807
+ }
1808
+ graph_path.write_text(json.dumps(graph_payload, ensure_ascii=False, indent=2), encoding="utf-8")
1809
+ payload["link_graph_path"] = str(graph_path)
1810
+
1811
+ if options.emit_tables:
1812
+ tables_path = output_dir / "tables.jsonl"
1813
+ if tables or not (options.resume and tables_path.exists()):
1814
+ with tables_path.open("w", encoding="utf-8") as fp:
1815
+ for item in tables:
1816
+ fp.write(json.dumps(item, ensure_ascii=False) + "\n")
1817
+ payload["tables_path"] = str(tables_path)
1818
+
1819
+ if options.emit_mentions:
1820
+ mentions_path = output_dir / "mentions.jsonl"
1821
+ if mentions or not (options.resume and mentions_path.exists()):
1822
+ with mentions_path.open("w", encoding="utf-8") as fp:
1823
+ for item in mentions:
1824
+ fp.write(json.dumps(item, ensure_ascii=False) + "\n")
1825
+ payload["mentions_path"] = str(mentions_path)
1826
+
1827
+ resolved_manifest_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
1828
+ payload["manifest_path"] = str(resolved_manifest_path)
1829
+
1830
+ # Create bundle zip if requested (TSK-189)
1831
+ if options.bundle:
1832
+ bundle_path = _create_bundle_zip(output_dir, resolved_root_id, resolved_manifest_path)
1833
+ payload["bundle_path"] = str(bundle_path)
1834
+
1835
+ return payload