@ngocsangairvds/vsaf 3.1.27 → 3.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (303)
  1. package/package.json +2 -2
  2. package/src/global.js +65 -39
  3. package/tools/skills/vds-scripts-skill/.openskills.json +6 -0
  4. package/tools/skills/vds-scripts-skill/QUALITY.md +44 -0
  5. package/tools/skills/vds-scripts-skill/SKILL.md +135 -0
  6. package/tools/skills/vds-scripts-skill/references/audit-commands.md +171 -0
  7. package/tools/skills/vds-scripts-skill/references/capability-index.md +34 -0
  8. package/tools/skills/vds-scripts-skill/references/development-commands.md +12 -0
  9. package/tools/skills/vds-scripts-skill/references/google-sheets.md +73 -0
  10. package/tools/skills/vds-scripts-skill/references/integration-commands.md +17 -0
  11. package/tools/skills/vds-scripts-skill/references/platform-bootstrap.md +31 -0
  12. package/tools/skills/vds-scripts-skill/references/specialist-routing.md +14 -0
  13. package/tools/skills/vds-scripts-skill/references/validation-commands.md +15 -0
  14. package/tools/skills/vsaf-build/SKILL.md +32 -2
  15. package/tools/skills/vsaf-ship/SKILL.md +41 -10
  16. package/tools/skills/vsaf-test/SKILL.md +8 -0
  17. package/tools/vds-scripts/.mcp.json +11 -0
  18. package/tools/vds-scripts/.secrets.baseline +133 -0
  19. package/tools/vds-scripts/AGENTS.md +152 -0
  20. package/tools/vds-scripts/CLAUDE.md +101 -0
  21. package/tools/vds-scripts/CLI_COMMAND_OPTIMIZATION.md +156 -0
  22. package/tools/vds-scripts/PACKAGE_P125B_IMPLEMENTATION_SUMMARY.md +131 -0
  23. package/tools/vds-scripts/PROJECT_COMPLETION_SUMMARY.md +45 -0
  24. package/tools/vds-scripts/README.md +97 -0
  25. package/tools/vds-scripts/bitbucket_manifest_mapping.toml +34 -0
  26. package/tools/vds-scripts/bitbucket_orchestrator/ARCHITECTURE_ANALYSIS.md +258 -0
  27. package/tools/vds-scripts/bitbucket_orchestrator/BITBUCKET_API_PRACTICES.md +393 -0
  28. package/tools/vds-scripts/bitbucket_orchestrator/EVALUATION_REPORT.md +61 -0
  29. package/tools/vds-scripts/bitbucket_orchestrator/FEATURES.md +908 -0
  30. package/tools/vds-scripts/bitbucket_orchestrator/README.md +687 -0
  31. package/tools/vds-scripts/bitbucket_orchestrator/pyproject.toml +40 -0
  32. package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/__init__.py +20 -0
  33. package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/async_client.py +657 -0
  34. package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/cli.py +2108 -0
  35. package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/client.py +2534 -0
  36. package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/config.py +171 -0
  37. package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/errors.py +67 -0
  38. package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/factory.py +185 -0
  39. package/tools/vds-scripts/bitbucket_orchestrator/src/vds_bitbucket_orchestrator/protocols.py +244 -0
  40. package/tools/vds-scripts/bitbucket_orchestrator/tests/__init__.py +8 -0
  41. package/tools/vds-scripts/bitbucket_orchestrator/tests/conftest.py +65 -0
  42. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_advanced_search.py +151 -0
  43. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_async_client.py +546 -0
  44. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_branch_permissions.py +145 -0
  45. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_cli.py +115 -0
  46. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client.py +157 -0
  47. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_branch_conditions.py +79 -0
  48. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_code_advanced.py +163 -0
  49. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_code_file.py +32 -0
  50. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_deployment_environments.py +194 -0
  51. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_issues.py +164 -0
  52. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_pipelines_advanced.py +179 -0
  53. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_pr_blockers.py +119 -0
  54. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_client_repository_variables.py +156 -0
  55. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_code.py +98 -0
  56. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_code_advanced.py +282 -0
  57. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_code_insights.py +335 -0
  58. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_conditions.py +147 -0
  59. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_config.py +131 -0
  60. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_deployment_env.py +352 -0
  61. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_factory.py +371 -0
  62. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_fork_operations.py +204 -0
  63. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_issue_cli.py +261 -0
  64. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_pipeline_advanced.py +270 -0
  65. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_pr_blocker.py +204 -0
  66. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_protocols.py +334 -0
  67. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_repo_settings.py +343 -0
  68. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_repo_variables.py +270 -0
  69. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_webhooks.py +189 -0
  70. package/tools/vds-scripts/bitbucket_orchestrator/tests/test_workspace.py +233 -0
  71. package/tools/vds-scripts/bitbucket_orchestrator/uv.lock +742 -0
  72. package/tools/vds-scripts/confluence_orchestrator/Dockerfile +19 -0
  73. package/tools/vds-scripts/confluence_orchestrator/README.md +412 -0
  74. package/tools/vds-scripts/confluence_orchestrator/SYNC_SCRIPTS.md +127 -0
  75. package/tools/vds-scripts/confluence_orchestrator/SYNC_STANDARDIZATION.md +108 -0
  76. package/tools/vds-scripts/confluence_orchestrator/pyproject.toml +48 -0
  77. package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/__init__.py +20 -0
  78. package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/cli.py +2532 -0
  79. package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/config.py +175 -0
  80. package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/content.py +290 -0
  81. package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/content_v2.py +94 -0
  82. package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/crawl_tree.py +1835 -0
  83. package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/errors.py +80 -0
  84. package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/eventing.py +109 -0
  85. package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/http.py +1114 -0
  86. package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/orchestration.py +165 -0
  87. package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/reporting.py +78 -0
  88. package/tools/vds-scripts/confluence_orchestrator/src/confluence_orchestrator/tree.py +121 -0
  89. package/tools/vds-scripts/confluence_orchestrator/sync_pdfs_from_markdown.py +213 -0
  90. package/tools/vds-scripts/confluence_orchestrator/sync_pdfs_to_confluence.py +305 -0
  91. package/tools/vds-scripts/confluence_orchestrator/sync_png_attachments.py +305 -0
  92. package/tools/vds-scripts/confluence_orchestrator/tests/__init__.py +0 -0
  93. package/tools/vds-scripts/confluence_orchestrator/tests/conftest.py +8 -0
  94. package/tools/vds-scripts/confluence_orchestrator/tests/test_advanced_content.py +224 -0
  95. package/tools/vds-scripts/confluence_orchestrator/tests/test_advanced_search.py +188 -0
  96. package/tools/vds-scripts/confluence_orchestrator/tests/test_cache_management.py +247 -0
  97. package/tools/vds-scripts/confluence_orchestrator/tests/test_cli.py +499 -0
  98. package/tools/vds-scripts/confluence_orchestrator/tests/test_config.py +83 -0
  99. package/tools/vds-scripts/confluence_orchestrator/tests/test_content.py +186 -0
  100. package/tools/vds-scripts/confluence_orchestrator/tests/test_content_flags.py +27 -0
  101. package/tools/vds-scripts/confluence_orchestrator/tests/test_crawl_tree.py +2250 -0
  102. package/tools/vds-scripts/confluence_orchestrator/tests/test_draft_management.py +223 -0
  103. package/tools/vds-scripts/confluence_orchestrator/tests/test_eventing.py +71 -0
  104. package/tools/vds-scripts/confluence_orchestrator/tests/test_eventing_chaos.py +37 -0
  105. package/tools/vds-scripts/confluence_orchestrator/tests/test_eventing_rate_limit.py +44 -0
  106. package/tools/vds-scripts/confluence_orchestrator/tests/test_eventing_timeout.py +49 -0
  107. package/tools/vds-scripts/confluence_orchestrator/tests/test_export.py +230 -0
  108. package/tools/vds-scripts/confluence_orchestrator/tests/test_history.py +204 -0
  109. package/tools/vds-scripts/confluence_orchestrator/tests/test_http.py +117 -0
  110. package/tools/vds-scripts/confluence_orchestrator/tests/test_orchestration.py +91 -0
  111. package/tools/vds-scripts/confluence_orchestrator/tests/test_reporting.py +24 -0
  112. package/tools/vds-scripts/confluence_orchestrator/tests/test_search_cql.py +34 -0
  113. package/tools/vds-scripts/confluence_orchestrator/tests/test_space_management.py +237 -0
  114. package/tools/vds-scripts/confluence_orchestrator/tests/test_space_permissions.py +332 -0
  115. package/tools/vds-scripts/confluence_orchestrator/tests/test_user_group_management.py +388 -0
  116. package/tools/vds-scripts/confluence_orchestrator/uv.lock +1023 -0
  117. package/tools/vds-scripts/git_orchestrator/ENHANCEMENT_SUMMARY.md +119 -0
  118. package/tools/vds-scripts/git_orchestrator/README.md +280 -0
  119. package/tools/vds-scripts/git_orchestrator/VERIFICATION_REPORT.md +152 -0
  120. package/tools/vds-scripts/git_orchestrator/pyproject.toml +35 -0
  121. package/tools/vds-scripts/git_orchestrator/src/vds_git_orchestrator/__init__.py +7 -0
  122. package/tools/vds-scripts/git_orchestrator/src/vds_git_orchestrator/__main__.py +4 -0
  123. package/tools/vds-scripts/git_orchestrator/src/vds_git_orchestrator/cli.py +847 -0
  124. package/tools/vds-scripts/git_orchestrator/src/vds_git_orchestrator/logging_config.py +63 -0
  125. package/tools/vds-scripts/git_orchestrator/src/vds_git_orchestrator/manifest.py +129 -0
  126. package/tools/vds-scripts/git_orchestrator/src/vds_git_orchestrator/orchestrator.py +819 -0
  127. package/tools/vds-scripts/git_orchestrator/src/vds_git_orchestrator/reporting.py +53 -0
  128. package/tools/vds-scripts/git_orchestrator/tests/__init__.py +0 -0
  129. package/tools/vds-scripts/git_orchestrator/tests/test_cli_settings.py +21 -0
  130. package/tools/vds-scripts/git_orchestrator/tests/test_integration.py +74 -0
  131. package/tools/vds-scripts/git_orchestrator/tests/test_manifest.py +79 -0
  132. package/tools/vds-scripts/git_orchestrator/tests/test_orchestrator.py +204 -0
  133. package/tools/vds-scripts/git_orchestrator/tests/test_public_api.py +236 -0
  134. package/tools/vds-scripts/git_orchestrator/tests/test_resilience.py +345 -0
  135. package/tools/vds-scripts/git_orchestrator/uv.lock +271 -0
  136. package/tools/vds-scripts/jira_orchestrator/README.md +770 -0
  137. package/tools/vds-scripts/jira_orchestrator/pyproject.toml +39 -0
  138. package/tools/vds-scripts/jira_orchestrator/src/vds_jira_orchestrator/__init__.py +1 -0
  139. package/tools/vds-scripts/jira_orchestrator/src/vds_jira_orchestrator/adapter.py +1320 -0
  140. package/tools/vds-scripts/jira_orchestrator/src/vds_jira_orchestrator/cli.py +2271 -0
  141. package/tools/vds-scripts/jira_orchestrator/src/vds_jira_orchestrator/config.py +138 -0
  142. package/tools/vds-scripts/jira_orchestrator/src/vds_jira_orchestrator/errors.py +67 -0
  143. package/tools/vds-scripts/jira_orchestrator/src/vds_jira_orchestrator/reporting.py +65 -0
  144. package/tools/vds-scripts/jira_orchestrator/tests/__init__.py +1 -0
  145. package/tools/vds-scripts/jira_orchestrator/tests/conftest.py +86 -0
  146. package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_agile_list_payloads.py +54 -0
  147. package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_bulk_operations.py +69 -0
  148. package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_components.py +57 -0
  149. package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_createmeta.py +45 -0
  150. package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_dashboard.py +117 -0
  151. package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_issue_properties.py +54 -0
  152. package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_permissions_compat.py +42 -0
  153. package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_reindex.py +42 -0
  154. package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_remote_links.py +76 -0
  155. package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_transitions.py +91 -0
  156. package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_user_management.py +110 -0
  157. package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_version_management.py +133 -0
  158. package/tools/vds-scripts/jira_orchestrator/tests/test_adapter_watchers.py +41 -0
  159. package/tools/vds-scripts/jira_orchestrator/tests/test_advanced_search.py +164 -0
  160. package/tools/vds-scripts/jira_orchestrator/tests/test_agile.py +256 -0
  161. package/tools/vds-scripts/jira_orchestrator/tests/test_application_properties.py +193 -0
  162. package/tools/vds-scripts/jira_orchestrator/tests/test_backlog.py +91 -0
  163. package/tools/vds-scripts/jira_orchestrator/tests/test_bulk_operations.py +277 -0
  164. package/tools/vds-scripts/jira_orchestrator/tests/test_cli.py +106 -0
  165. package/tools/vds-scripts/jira_orchestrator/tests/test_components.py +106 -0
  166. package/tools/vds-scripts/jira_orchestrator/tests/test_config.py +164 -0
  167. package/tools/vds-scripts/jira_orchestrator/tests/test_dashboard.py +122 -0
  168. package/tools/vds-scripts/jira_orchestrator/tests/test_discover_fields.py +207 -0
  169. package/tools/vds-scripts/jira_orchestrator/tests/test_filter_management.py +333 -0
  170. package/tools/vds-scripts/jira_orchestrator/tests/test_issue_archiving.py +164 -0
  171. package/tools/vds-scripts/jira_orchestrator/tests/test_issue_links.py +257 -0
  172. package/tools/vds-scripts/jira_orchestrator/tests/test_issue_properties.py +171 -0
  173. package/tools/vds-scripts/jira_orchestrator/tests/test_link_types.py +314 -0
  174. package/tools/vds-scripts/jira_orchestrator/tests/test_parse_set.py +37 -0
  175. package/tools/vds-scripts/jira_orchestrator/tests/test_permissions.py +273 -0
  176. package/tools/vds-scripts/jira_orchestrator/tests/test_reindex.py +81 -0
  177. package/tools/vds-scripts/jira_orchestrator/tests/test_remote_links.py +254 -0
  178. package/tools/vds-scripts/jira_orchestrator/tests/test_security_schemes.py +170 -0
  179. package/tools/vds-scripts/jira_orchestrator/tests/test_transitions_changelog.py +114 -0
  180. package/tools/vds-scripts/jira_orchestrator/tests/test_user_management.py +226 -0
  181. package/tools/vds-scripts/jira_orchestrator/tests/test_version_management.py +339 -0
  182. package/tools/vds-scripts/jira_orchestrator/tests/test_watchers.py +101 -0
  183. package/tools/vds-scripts/jira_orchestrator/tests/test_worklog.py +223 -0
  184. package/tools/vds-scripts/jira_orchestrator/uv.lock +738 -0
  185. package/tools/vds-scripts/mcp_server/Dockerfile +34 -0
  186. package/tools/vds-scripts/mcp_server/README.md +140 -0
  187. package/tools/vds-scripts/mcp_server/pyproject.toml +42 -0
  188. package/tools/vds-scripts/mcp_server/src/vds_mcp_server/__init__.py +4 -0
  189. package/tools/vds-scripts/mcp_server/src/vds_mcp_server/config.py +36 -0
  190. package/tools/vds-scripts/mcp_server/src/vds_mcp_server/server.py +66 -0
  191. package/tools/vds-scripts/mcp_server/src/vds_mcp_server/tools/__init__.py +14 -0
  192. package/tools/vds-scripts/mcp_server/src/vds_mcp_server/tools/bitbucket_tools.py +47 -0
  193. package/tools/vds-scripts/mcp_server/src/vds_mcp_server/tools/confluence_tools.py +59 -0
  194. package/tools/vds-scripts/mcp_server/src/vds_mcp_server/tools/git_tools.py +71 -0
  195. package/tools/vds-scripts/mcp_server/src/vds_mcp_server/tools/jira_tools.py +63 -0
  196. package/tools/vds-scripts/mcp_server/tests/__init__.py +2 -0
  197. package/tools/vds-scripts/mcp_server/tests/conftest.py +29 -0
  198. package/tools/vds-scripts/mcp_server/tests/unit/__init__.py +2 -0
  199. package/tools/vds-scripts/mcp_server/tests/unit/test_bitbucket_tools.py +25 -0
  200. package/tools/vds-scripts/mcp_server/tests/unit/test_confluence_tools.py +25 -0
  201. package/tools/vds-scripts/mcp_server/tests/unit/test_git_tools.py +32 -0
  202. package/tools/vds-scripts/mcp_server/tests/unit/test_jira_tools.py +32 -0
  203. package/tools/vds-scripts/mcp_server/tests/verification/__init__.py +2 -0
  204. package/tools/vds-scripts/mcp_server/tests/verification/test_mcp_confluence_tools.py +40 -0
  205. package/tools/vds-scripts/mcp_server/tests/verification/test_mcp_jira_tools.py +37 -0
  206. package/tools/vds-scripts/mcp_server/tests/verification/test_mcp_tool_registration.py +47 -0
  207. package/tools/vds-scripts/mcp_server/uv.lock +1032 -0
  208. package/tools/vds-scripts/mypy.ini +5 -0
  209. package/tools/vds-scripts/pyproject.toml +29 -0
  210. package/tools/vds-scripts/repo-manifest.yaml +273 -0
  211. package/tools/vds-scripts/repo-manifest.yaml.example +25 -0
  212. package/tools/vds-scripts/scripts/BRD-Validation-API.postman_collection.json +706 -0
  213. package/tools/vds-scripts/scripts/BRD-Validation-README.md +308 -0
  214. package/tools/vds-scripts/scripts/README.md +162 -0
  215. package/tools/vds-scripts/scripts/bootstrap_uv.sh +30 -0
  216. package/tools/vds-scripts/scripts/brd-validation-environment.json +51 -0
  217. package/tools/vds-scripts/scripts/brd-validation-test-results.json +13023 -0
  218. package/tools/vds-scripts/scripts/brd_coverage_report.json +276 -0
  219. package/tools/vds-scripts/scripts/create_memory_session.py +35 -0
  220. package/tools/vds-scripts/scripts/deployment/load_docker_images_offline.sh +90 -0
  221. package/tools/vds-scripts/scripts/final_completion_report.md +139 -0
  222. package/tools/vds-scripts/scripts/folder_structure_report.json +321 -0
  223. package/tools/vds-scripts/scripts/generate_completion_report.py +125 -0
  224. package/tools/vds-scripts/scripts/generate_intellij_modules.py +150 -0
  225. package/tools/vds-scripts/scripts/link_integrity_report.json +807 -0
  226. package/tools/vds-scripts/scripts/move_audit_artifact_pages.py +255 -0
  227. package/tools/vds-scripts/scripts/move_audit_artifact_pages_rest.py +165 -0
  228. package/tools/vds-scripts/scripts/move_wrong_dept_pages.py +216 -0
  229. package/tools/vds-scripts/scripts/save_intellij_memories.py +120 -0
  230. package/tools/vds-scripts/scripts/save_memories_to_vds_ai.py +83 -0
  231. package/tools/vds-scripts/scripts/save_memories_vds_style.py +129 -0
  232. package/tools/vds-scripts/scripts/search_intellij_memories.py +50 -0
  233. package/tools/vds-scripts/scripts/setup_intellij_workspace.py +65 -0
  234. package/tools/vds-scripts/scripts/target-state-automation/README.md +89 -0
  235. package/tools/vds-scripts/scripts/target-state-automation/confluence_sync_coordinator.sh +27 -0
  236. package/tools/vds-scripts/scripts/target-state-automation/coordination.sh +114 -0
  237. package/tools/vds-scripts/scripts/target-state-automation/diagram_coordinator.sh +25 -0
  238. package/tools/vds-scripts/scripts/target-state-automation/docs_root.sh +22 -0
  239. package/tools/vds-scripts/scripts/target-state-automation/generate_diagrams.sh +22 -0
  240. package/tools/vds-scripts/scripts/target-state-automation/markdown_coordinator.sh +25 -0
  241. package/tools/vds-scripts/scripts/target-state-automation/progress_dashboard.sh +17 -0
  242. package/tools/vds-scripts/scripts/target-state-automation/schema_coordinator.sh +25 -0
  243. package/tools/vds-scripts/scripts/target-state-automation/sync_confluence.sh +30 -0
  244. package/tools/vds-scripts/scripts/target-state-automation/update_dependencies.sh +19 -0
  245. package/tools/vds-scripts/scripts/target-state-automation/validate_links.sh +86 -0
  246. package/tools/vds-scripts/scripts/target-state-automation/validate_markdown.sh +52 -0
  247. package/tools/vds-scripts/scripts/target-state-automation/validate_schemas.sh +26 -0
  248. package/tools/vds-scripts/scripts/target-state-automation/validate_structure.sh +98 -0
  249. package/tools/vds-scripts/scripts/update_modules_xml.py +190 -0
  250. package/tools/vds-scripts/scripts/uv-workspace-alignment-verification-2026-03-25.md +128 -0
  251. package/tools/vds-scripts/scripts/validate_brd_coverage.py +179 -0
  252. package/tools/vds-scripts/scripts/validate_folder_structure.py +240 -0
  253. package/tools/vds-scripts/scripts/validate_link_integrity.py +272 -0
  254. package/tools/vds-scripts/scripts/vds_sh_helpers.sh +180 -0
  255. package/tools/vds-scripts/scripts/verification/phase2_portable_paths_ubuntu_docker.sh +26 -0
  256. package/tools/vds-scripts/scripts/worktree_uv.sh +48 -0
  257. package/tools/vds-scripts/uv.lock +8 -0
  258. package/tools/vds-scripts/vds_cli/README.md +126 -0
  259. package/tools/vds-scripts/vds_cli/VERIFICATION_REPORT.md +41 -0
  260. package/tools/vds-scripts/vds_cli/pyproject.toml +38 -0
  261. package/tools/vds-scripts/vds_cli/src/vds_cli/__init__.py +3 -0
  262. package/tools/vds-scripts/vds_cli/src/vds_cli/cli.py +173 -0
  263. package/tools/vds-scripts/vds_cli/src/vds_cli/docs_sync.py +1203 -0
  264. package/tools/vds-scripts/vds_cli/src/vds_cli/env.py +41 -0
  265. package/tools/vds-scripts/vds_cli/src/vds_cli/google_sheets_orchestrator/__init__.py +3 -0
  266. package/tools/vds-scripts/vds_cli/src/vds_cli/google_sheets_orchestrator/google_sheets_orchestrator.py +198 -0
  267. package/tools/vds-scripts/vds_cli/src/vds_cli/router.py +93 -0
  268. package/tools/vds-scripts/vds_cli/src/vds_cli/sync_api.py +647 -0
  269. package/tools/vds-scripts/vds_cli/src/vds_cli/sync_service.py +266 -0
  270. package/tools/vds-scripts/vds_cli/tests/__init__.py +2 -0
  271. package/tools/vds-scripts/vds_cli/tests/conftest.py +49 -0
  272. package/tools/vds-scripts/vds_cli/tests/unit/__init__.py +2 -0
  273. package/tools/vds-scripts/vds_cli/tests/unit/test_cli.py +143 -0
  274. package/tools/vds-scripts/vds_cli/tests/unit/test_docs_sync.py +422 -0
  275. package/tools/vds-scripts/vds_cli/tests/unit/test_env.py +51 -0
  276. package/tools/vds-scripts/vds_cli/tests/unit/test_router.py +72 -0
  277. package/tools/vds-scripts/vds_cli/tests/unit/test_sync_api.py +357 -0
  278. package/tools/vds-scripts/vds_cli/tests/unit/test_sync_service.py +160 -0
  279. package/tools/vds-scripts/vds_cli/tests/verification/__init__.py +2 -0
  280. package/tools/vds-scripts/vds_cli/tests/verification/test_bitbucket_real.py +33 -0
  281. package/tools/vds-scripts/vds_cli/tests/verification/test_confluence_real.py +35 -0
  282. package/tools/vds-scripts/vds_cli/tests/verification/test_jira_real.py +41 -0
  283. package/tools/vds-scripts/vds_cli/uv.lock +524 -0
  284. package/tools/vds-scripts/vds_cli_common/README.md +190 -0
  285. package/tools/vds-scripts/vds_cli_common/pyproject.toml +92 -0
  286. package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/__init__.py +34 -0
  287. package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/completers.py +139 -0
  288. package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/context.py +201 -0
  289. package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/env.py +119 -0
  290. package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/errors.py +318 -0
  291. package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/output.py +284 -0
  292. package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/paths.py +78 -0
  293. package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/testing.py +213 -0
  294. package/tools/vds-scripts/vds_cli_common/src/vds_cli_common/version.py +85 -0
  295. package/tools/vds-scripts/vds_cli_common/tests/__init__.py +1 -0
  296. package/tools/vds-scripts/vds_cli_common/tests/test_completers.py +148 -0
  297. package/tools/vds-scripts/vds_cli_common/tests/test_context.py +192 -0
  298. package/tools/vds-scripts/vds_cli_common/tests/test_env.py +102 -0
  299. package/tools/vds-scripts/vds_cli_common/tests/test_errors.py +186 -0
  300. package/tools/vds-scripts/vds_cli_common/tests/test_output.py +229 -0
  301. package/tools/vds-scripts/vds_cli_common/tests/test_paths.py +61 -0
  302. package/tools/vds-scripts/vds_cli_common/tests/test_testing.py +138 -0
  303. package/tools/vds-scripts/vds_cli_common/tests/test_version.py +64 -0
@@ -0,0 +1,2250 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import json
5
+ import threading
6
+ import time
7
+ import zipfile
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ import pytest
12
+ from confluence_orchestrator.cli import app
13
+ from confluence_orchestrator.crawl_tree import CrawlOptions, CrawlState, _create_bundle_zip, crawl_tree_to_disk
14
+ from typer.testing import CliRunner
15
+
16
# Module-level Typer CliRunner; the tests below call runner.invoke(app, [...]) to drive the CLI.
runner = CliRunner()
17
+
18
+
19
def test_crawl_options_phase80_defaults() -> None:
    """A default-constructed ``CrawlOptions`` exposes the expected limit values."""
    defaults = CrawlOptions()
    expected_values = {
        "max_depth": 3,
        "max_nodes": 500,
        "page_concurrency": 5,
    }
    # Check the fields one by one so a failure names the offending attribute.
    for field_name, expected in expected_values.items():
        assert getattr(defaults, field_name) == expected, field_name
24
+
25
+
26
def test_crawl_tree_downloads_pages_and_attachments(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """Crawl one numeric root page at depth 0 and verify every requested artifact.

    A stub client serves a single page (containing a table, a user mention and
    several links) plus one attachment. The test then checks that the page HTML
    and attachment files exist and inspects the corpus, tables, mentions and
    link-graph outputs whose paths the CLI reports on stdout.
    """
    # Rendered "view" body returned by the stub for the root page.
    root_view_html = "".join(
        [
            "<h1>Intro</h1>",
            "<p>Hello</p>",
            '<a class="confluence-userlink" data-username="john.doe" href="/display/~john.doe">@John Doe</a>',
            "<table><tr><th>H1</th><th>H2</th></tr><tr><td>A</td><td>B</td></tr></table>",
            '<a href="/display/X/Y">Y</a>',
            '<a href="/pages/viewpage.action?pageId=777">P777</a>',
            '<a href="/pages/888/Test">P888</a>',
        ]
    )

    class DummyClient:
        """Stub HTTP client that serves exactly one page ("1") and one attachment ("a1")."""

        def search_cql(self, cql: str, limit: int = 25, expand: list[str] | None = None, **_: Any) -> dict[str, Any]:
            # A numeric root must be resolved without any CQL search.
            raise AssertionError(f"search_cql should not be called for numeric root (got {cql})")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            assert page_id == "1"
            assert expand is not None
            body = {
                "view": {"value": root_view_html},
                "storage": {"value": "<p>fallback</p>"},
            }
            return {
                "id": page_id,
                "title": "Root",
                "body": body,
                "_links": {"base": "http://example", "webui": "/display/SPACE/Root"},
                "space": {"key": "SPACE"},
                "version": {"number": 1, "when": "2026-01-01T00:00:00.000+00:00"},
                "ancestors": [{"id": "10", "title": "Ancestor"}],
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            assert page_id == "1"
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            assert page_id == "1"
            attachment = {
                "id": "a1",
                "title": "file.txt",
                "version": {"number": 2},
                "extensions": {"fileSize": 3},
            }
            return [attachment]

        def get_attachment_content(self, attachment_id: str) -> bytes:
            assert attachment_id == "a1"
            return b"abc"

    stub_client = DummyClient()

    def _stub_factory(settings, server):
        # Replacement for the CLI's client builder: always hand back the stub.
        return stub_client

    monkeypatch.setattr("confluence_orchestrator.cli._build_http_client", _stub_factory)

    def _load_jsonl(location: Any) -> list[dict[str, Any]]:
        """Parse every non-blank line of a JSON-lines file."""
        raw_lines = Path(location).read_text(encoding="utf-8").splitlines()
        return [json.loads(raw) for raw in raw_lines if raw.strip()]

    out_dir = tmp_path / "out"
    cli_args = ["--report-dir", str(tmp_path), "crawl-tree", "1", "--out", str(out_dir), "--depth", "0"]
    cli_args += ["--emit-corpus", "--emit-link-graph", "--emit-tables", "--emit-mentions"]
    outcome = runner.invoke(app, cli_args)

    assert outcome.exit_code == 0, outcome.output
    summary = json.loads(outcome.stdout)

    # Files written to disk and artifact paths reported in the summary.
    assert summary["root_page_id"] == "1"
    assert (out_dir / "1_Root.html").exists()
    assert (out_dir / "attachments" / "a1_file.txt").exists()
    assert summary["corpus_path"]
    assert summary["link_graph_path"]
    assert summary["tables_path"]
    assert summary["mentions_path"]

    # Corpus: the root record carries depth and ancestry metadata.
    corpus_records = _load_jsonl(summary["corpus_path"])
    assert corpus_records
    root_record = corpus_records[0]
    assert root_record["depth"] == 0
    assert root_record["parent_id"] is None
    assert root_record["ancestors"] == ["10"]
    assert root_record["ancestor_titles"] == ["Ancestor"]

    tables_path = Path(summary["tables_path"])
    mentions_path = Path(summary["mentions_path"])
    assert tables_path.exists()
    assert mentions_path.exists()

    # Tables: the first line describes the single table on the page.
    first_table_line = tables_path.read_text(encoding="utf-8").splitlines()[0]
    table_record = json.loads(first_table_line)
    assert table_record["page_id"] == "1"
    assert table_record["table_index"] == 0
    assert table_record["headers"] == ["H1", "H2"]
    assert table_record["rows"] == [["A", "B"]]

    # Mentions: all belong to page "1" and include the pageId=777 link.
    mention_records = _load_jsonl(mentions_path)
    assert mention_records
    for mention in mention_records:
        assert mention["page_id"] == "1"
    page_777_mentions = [
        mention
        for mention in mention_records
        if mention.get("type") == "confluence_page" and mention.get("value") == "777"
    ]
    assert page_777_mentions

    # Link graph: index the edges that resolved to a target page id.
    link_graph = json.loads(Path(summary["link_graph_path"]).read_text(encoding="utf-8"))
    edges_by_target: dict[Any, Any] = {}
    for edge in link_graph["edges"]:
        target_id = edge.get("target_page_id")
        if target_id:
            edges_by_target[target_id] = edge
    assert edges_by_target["777"]["target_type"] == "confluence_page"
    assert edges_by_target["888"]["target_type"] == "confluence_page"
144
+
145
def test_crawl_tree_appends_attachment_suffix_when_missing(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """An attachment titled without an extension is saved with a ``.pdf`` suffix.

    The stub attachment is titled ``diagram`` and declares the media type
    ``application/pdf``; the crawl is expected to write ``a2_diagram.pdf``.
    """

    class DummyClient:
        """Stub HTTP client serving one page and one extension-less attachment."""

        def search_cql(self, cql: str, limit: int = 25, expand: list[str] | None = None, **_: Any) -> dict[str, Any]:
            # A numeric root must be resolved without any CQL search.
            raise AssertionError(f"search_cql should not be called for numeric root (got {cql})")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            assert page_id == "1"
            page: dict[str, Any] = {"id": page_id, "title": "Root"}
            page["body"] = {"view": {"value": "<p>ok</p>"}, "storage": {"value": ""}}
            page["_links"] = {"base": "http://example", "webui": "/display/SPACE/Root"}
            return page

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            extensionless = {
                "id": "a2",
                "title": "diagram",
                "extensions": {"fileSize": 3, "mediaType": "application/pdf"},
            }
            return [extensionless]

        def get_attachment_content(self, attachment_id: str) -> bytes:
            assert attachment_id == "a2"
            return b"abc"

    stub_client = DummyClient()

    def _stub_factory(settings, server):
        # Replacement for the CLI's client builder: always hand back the stub.
        return stub_client

    monkeypatch.setattr("confluence_orchestrator.cli._build_http_client", _stub_factory)

    out_dir = tmp_path / "out"
    cli_args = ["--report-dir", str(tmp_path), "crawl-tree", "1"]
    cli_args += ["--out", str(out_dir), "--depth", "0", "--emit-corpus"]
    outcome = runner.invoke(app, cli_args)

    assert outcome.exit_code == 0, outcome.output
    expected_file = out_dir / "attachments" / "a2_diagram.pdf"
    assert expected_file.exists()
196
+
197
+
198
def test_crawl_tree_infers_server_from_root_url(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """A ``viewpage.action?pageId=`` URL root selects the server passed to the client factory.

    The patched factory records the ``server`` argument it receives; the test
    expects ``"external"`` for host ``10.254.136.35:8090``.
    """

    class FakeConfluence:
        def search_cql(self, cql: str, limit: int = 25, expand: list[str] | None = None, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called for pageId URL root")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            assert page_id == "1"
            page_body = {"view": {"value": "<p>ok</p>"}, "storage": {"value": ""}}
            page_links = {"base": "http://example", "webui": "/display/SPACE/Root"}
            return {"id": page_id, "title": "Root", "body": page_body, "_links": page_links}

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            raise AssertionError("no attachments expected")

    fake = FakeConfluence()
    captured: dict[str, str] = {}

    def _recording_factory(settings, server: str):
        # Remember which server alias the CLI resolved before handing back the stub.
        captured["server"] = server
        return fake

    monkeypatch.setattr("confluence_orchestrator.cli._build_http_client", _recording_factory)

    target_dir = tmp_path / "out"
    cli_args = [
        "--report-dir",
        str(tmp_path),
        "crawl-tree",
        "http://10.254.136.35:8090/pages/viewpage.action?pageId=1",
        "--out",
        str(target_dir),
        "--depth",
        "0",
        "--no-include-attachments",
    ]
    invocation = runner.invoke(app, cli_args)

    assert invocation.exit_code == 0, invocation.output
    # Original author's note: 10.254.136.35 is one of the two old external
    # Confluence servers, and both map to the "external" alias.
    assert captured["server"] == "external"
249
+
250
+
251
def test_crawl_tree_does_not_emit_tables_or_mentions_by_default(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """Without extra flags, no tables/mentions JSONL files are reported or written.

    The served page contains both a user-link anchor and a table, so the absence
    of ``tables.jsonl`` / ``mentions.jsonl`` cannot be explained by empty input.
    """

    class FakeConfluence:
        def search_cql(self, cql: str, limit: int = 25, expand: list[str] | None = None, **_: Any) -> dict[str, Any]:
            raise AssertionError(f"search_cql should not be called for numeric root (got {cql})")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            assert page_id == "1"
            view_markup = "".join(
                (
                    '<a class="confluence-userlink" data-username="john.doe">@John Doe</a>',
                    "<table><tr><td>only</td></tr></table>",
                )
            )
            page_body = {"view": {"value": view_markup}, "storage": {"value": ""}}
            page_links = {"base": "http://example", "webui": "/display/SPACE/Root"}
            return {"id": page_id, "title": "Root", "body": page_body, "_links": page_links}

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            raise AssertionError("no attachments expected")

    fake = FakeConfluence()
    monkeypatch.setattr("confluence_orchestrator.cli._build_http_client", lambda settings, server: fake)

    target_dir = tmp_path / "out"
    cli_args = [
        "--report-dir",
        str(tmp_path),
        "crawl-tree",
        "1",
        "--out",
        str(target_dir),
        "--depth",
        "0",
        "--no-include-attachments",
    ]
    invocation = runner.invoke(app, cli_args)

    assert invocation.exit_code == 0, invocation.output
    summary = json.loads(invocation.stdout)
    # Both the reported paths and the files themselves must be absent.
    assert summary["tables_path"] is None
    assert summary["mentions_path"] is None
    assert not (target_dir / "tables.jsonl").exists()
    assert not (target_dir / "mentions.jsonl").exists()
309
+
310
+
311
def test_crawl_tree_resolves_display_url(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """A ``/display/<SPACE>/<Title>`` URL root is resolved to a page id via CQL.

    The stub checks that the CQL names space ``TDOV`` and title ``My Page``,
    answers with id ``42``, and the test then expects that id in the payload
    and a ``42_My Page.html`` file in the output directory.
    """

    class FakeConfluence:
        def search_cql(self, cql: str, limit: int = 25, expand: list[str] | None = None, **_: Any) -> dict[str, Any]:
            assert 'space="TDOV"' in cql
            assert 'title="My Page"' in cql
            return {"results": [{"id": "42"}]}

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            assert page_id == "42"
            # Here the content lives in the storage body; the view body is empty.
            page_body = {"storage": {"value": "<p>ok</p>"}, "view": {"value": ""}}
            page_links = {"base": "http://example", "webui": "/display/TDOV/My+Page"}
            return {"id": page_id, "title": "My Page", "body": page_body, "_links": page_links}

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            raise AssertionError("no attachments expected")

    fake = FakeConfluence()
    monkeypatch.setattr("confluence_orchestrator.cli._build_http_client", lambda settings, server: fake)

    target_dir = tmp_path / "out"
    cli_args = [
        "--report-dir",
        str(tmp_path),
        "crawl-tree",
        "http://example/display/TDOV/My+Page",
        "--out",
        str(target_dir),
        "--depth",
        "0",
        "--no-include-attachments",
    ]
    invocation = runner.invoke(app, cli_args)

    assert invocation.exit_code == 0, invocation.output
    summary = json.loads(invocation.stdout)
    assert summary["root_page_id"] == "42"
    saved_html = target_dir / "42_My Page.html"
    assert saved_html.exists()
359
+
360
+
361
def test_crawl_tree_splits_corpus_chunks_by_max_chars(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """``--chunk-max-chars 50`` splits a long page into several bounded corpus entries.

    Checks, in order: more than one entry, every ``text`` at most 50 characters,
    ``chunk_ordinal`` values covering ``0..len-1``, fixed metadata on the first
    entry, and the expected SHA-256-derived ``chunk_id`` of the first two entries.
    """
    filler = "word " * 200  # > 50 chars

    class FakeConfluence:
        def search_cql(self, cql: str, limit: int = 25, expand: list[str] | None = None, **_: Any) -> dict[str, Any]:
            raise AssertionError(f"search_cql should not be called for numeric root (got {cql})")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            assert page_id == "1"
            page_body = {"view": {"value": f"<h1>Intro</h1><p>{filler}</p>"}, "storage": {"value": ""}}
            page_links = {"base": "http://example", "webui": "/display/SPACE/Root"}
            return {"id": page_id, "title": "Root", "body": page_body, "_links": page_links}

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            raise AssertionError("no attachments expected")

    fake = FakeConfluence()
    monkeypatch.setattr("confluence_orchestrator.cli._build_http_client", lambda settings, server: fake)

    target_dir = tmp_path / "out"
    cli_args = [
        "--report-dir",
        str(tmp_path),
        "crawl-tree",
        "1",
        "--out",
        str(target_dir),
        "--depth",
        "0",
        "--no-include-attachments",
        "--emit-corpus",
        "--chunk-max-chars",
        "50",
    ]
    invocation = runner.invoke(app, cli_args)

    assert invocation.exit_code == 0, invocation.output
    summary = json.loads(invocation.stdout)
    corpus_lines = Path(summary["corpus_path"]).read_text(encoding="utf-8").splitlines()
    entries = [json.loads(raw) for raw in corpus_lines if raw.strip()]

    assert len(entries) > 1
    assert all(len(item.get("text") or "") <= 50 for item in entries)
    assert {item.get("chunk_ordinal") for item in entries} == set(range(len(entries)))

    head = entries[0]
    assert head["source_type"] == "confluence"
    assert head["doc_class"] == "canonical_doc"
    assert head["generated_flag"] is False
    assert head["historical_flag"] is False
    assert head["discovery"] == "confluence_crawl_page"

    def _short_digest(key: str) -> str:
        # Chunk ids are the first 16 hex characters of the key's SHA-256.
        return hashlib.sha256(key.encode("utf-8")).hexdigest()[:16]

    section_key = "1|Intro"
    assert entries[0]["chunk_id"] == _short_digest(section_key)
    # The follow-on chunk of the same section appends "|1" to the key.
    assert entries[1]["chunk_id"] == _short_digest(f"{section_key}|1")
426
+
427
+
428
def test_crawl_tree_uses_export_view_when_requested(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """``--body-format export_view`` requests and saves the export-view body.

    The stub asserts that ``body.export_view`` is in the ``expand`` list and
    serves three distinguishable bodies; the saved HTML must contain the
    export one and the payload must report ``html-export-view``.
    """

    class FakeConfluence:
        def search_cql(self, cql: str, limit: int = 25, expand: list[str] | None = None, **_: Any) -> dict[str, Any]:
            raise AssertionError(f"search_cql should not be called for numeric root (got {cql})")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            assert page_id == "1"
            assert expand is not None
            assert "body.export_view" in expand
            # Each representation carries different text so the chosen one is identifiable.
            page_body = {
                "view": {"value": "<p>view</p>"},
                "export_view": {"value": "<p>export</p>"},
                "storage": {"value": "<p>storage</p>"},
            }
            page_links = {"base": "http://example", "webui": "/display/SPACE/Root"}
            return {"id": page_id, "title": "Root", "body": page_body, "_links": page_links}

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            raise AssertionError("no attachments expected")

    fake = FakeConfluence()
    monkeypatch.setattr("confluence_orchestrator.cli._build_http_client", lambda settings, server: fake)

    target_dir = tmp_path / "out"
    cli_args = [
        "--report-dir",
        str(tmp_path),
        "crawl-tree",
        "1",
        "--out",
        str(target_dir),
        "--depth",
        "0",
        "--no-include-attachments",
        "--body-format",
        "export_view",
    ]
    invocation = runner.invoke(app, cli_args)

    assert invocation.exit_code == 0, invocation.output
    summary = json.loads(invocation.stdout)
    page_info = summary["page"]
    saved_html = Path(page_info["html_path"]).read_text(encoding="utf-8")
    assert "export" in saved_html
    assert page_info["source_format"] == "html-export-view"
483
+
484
+
485
def test_crawl_tree_concurrency_preserves_deterministic_order(tmp_path: Path) -> None:
    """Child-page and corpus ordering match between ``page_concurrency`` 1 and 4.

    Page "2" is made slower than page "3" so that, with several workers, the
    fetches would finish out of order; the reported order must still be 2, 3.
    """
    # Force non-deterministic completion order when concurrency > 1.
    fetch_delays = {"2": 0.04, "3": 0.01}

    class FakeConfluence:
        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            pause = fetch_delays.get(page_id)
            if pause is not None:
                time.sleep(pause)
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": {"view": {"value": f"<h1>{page_id}</h1><p>Content {page_id}</p>"}},
                "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
                "version": {"number": int(page_id)},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return [{"id": "2"}, {"id": "3"}] if page_id == "1" else []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    shared_client = FakeConfluence()

    def _crawl(dirname: str, workers: int) -> dict[str, Any]:
        return crawl_tree_to_disk(
            http=shared_client,
            root_ref="1",
            output_dir=tmp_path / dirname,
            options=CrawlOptions(max_depth=1, include_attachments=False, emit_corpus=True, page_concurrency=workers),
        )

    sequential = _crawl("seq", 1)
    concurrent = _crawl("concurrent", 4)

    assert [child["page_id"] for child in sequential["child_pages"]] == ["2", "3"]
    assert [child["page_id"] for child in concurrent["child_pages"]] == ["2", "3"]

    def _corpus_page_ids(run: dict[str, Any]) -> list[Any]:
        rows = Path(run["corpus_path"]).read_text(encoding="utf-8").splitlines()
        return [json.loads(row)["page_id"] for row in rows]

    assert _corpus_page_ids(sequential) == _corpus_page_ids(concurrent)
535
+
536
+
537
def test_crawl_tree_parallelizes_child_discovery_with_page_fetch(tmp_path: Path) -> None:
    """``get_page`` and ``get_child_pages`` calls overlap in time during a crawl.

    Each stub method registers itself as active under a lock, sleeps, then
    deregisters. Whenever one kind of call starts while the other kind is
    active, an overlap is counted; the test needs at least one.
    """
    guard = threading.Lock()
    # Shared counters, only touched while holding ``guard``.
    tally = {"overlaps": 0, "page_active": 0, "children_active": 0}

    class FakeConfluence:
        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            with guard:
                tally["page_active"] += 1
                if tally["children_active"] > 0:
                    tally["overlaps"] += 1
            # Sleep outside the lock so the other call kind can start meanwhile.
            time.sleep(0.03)
            with guard:
                tally["page_active"] -= 1
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": {"view": {"value": f"<p>Content {page_id}</p>"}},
                "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
                "version": {"number": int(page_id)},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            with guard:
                tally["children_active"] += 1
                if tally["page_active"] > 0:
                    tally["overlaps"] += 1
            time.sleep(0.03)
            with guard:
                tally["children_active"] -= 1
            return [{"id": "2"}] if page_id == "1" else []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    # Note: page_concurrency is 1 here, yet an overlap is still expected.
    crawl_options = CrawlOptions(max_depth=1, include_attachments=False, page_concurrency=1)
    outcome = crawl_tree_to_disk(
        http=FakeConfluence(),
        root_ref="1",
        output_dir=tmp_path / "out",
        options=crawl_options,
    )

    assert outcome["child_pages"][0]["page_id"] == "2"
    assert tally["overlaps"] > 0
592
+
593
+
594
def test_crawl_tree_concurrent_throughput_exceeds_2x_sequential(tmp_path: Path) -> None:
    """A crawl with ``page_concurrency=5`` takes at most half the time of one with 1.

    Every ``get_page`` call sleeps 80 ms and the root has six children, so the
    run is dominated by page fetches.
    NOTE(review): this compares wall-clock measurements and may be sensitive to
    scheduler load on a busy machine.
    """

    class FakeConfluence:
        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            time.sleep(0.08)
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": {"view": {"value": f"<p>Content {page_id}</p>"}},
                "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
                "version": {"number": int(page_id)},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            if page_id != "1":
                return []
            return [{"id": str(child)} for child in range(2, 8)]

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    shared_client = FakeConfluence()

    def _timed_crawl(dirname: str, workers: int) -> float:
        # Returns the elapsed wall-clock seconds of one full crawl.
        began = time.perf_counter()
        crawl_tree_to_disk(
            http=shared_client,
            root_ref="1",
            output_dir=tmp_path / dirname,
            options=CrawlOptions(max_depth=1, include_attachments=False, page_concurrency=workers),
        )
        return time.perf_counter() - began

    sequential_elapsed = _timed_crawl("seq", 1)
    concurrent_elapsed = _timed_crawl("concurrent", 5)

    assert concurrent_elapsed * 2 <= sequential_elapsed
640
+
641
+
642
def test_crawl_tree_structure_first_mode_pre_discovers_before_fetch(tmp_path: Path) -> None:
    """In ``structure_first`` mode the first client call is child discovery on the root.

    The stub logs every ``get_page`` / ``get_child_pages`` call. With
    ``max_nodes=2`` the run must stop early, report one child page ("2"), a
    non-empty pending queue, and echo the crawl mode settings back.
    """
    events: list[tuple[str, str]] = []

    class FakeConfluence:
        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            events.append(("page", page_id))
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": {"view": {"value": f"<p>Content {page_id}</p>"}},
                "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
                "version": {"number": int(page_id)},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            events.append(("children", page_id))
            # Tree shape: 1 -> (2, 3), 2 -> (4); built per call so callers get fresh lists.
            tree = {"1": [{"id": "2"}, {"id": "3"}], "2": [{"id": "4"}]}
            return tree.get(page_id, [])

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    crawl_options = CrawlOptions(
        max_depth=2,
        include_attachments=False,
        page_concurrency=2,
        staged_mode="structure_first",
        max_nodes=2,
    )
    outcome = crawl_tree_to_disk(
        http=FakeConfluence(),
        root_ref="1",
        output_dir=tmp_path / "out",
        options=crawl_options,
    )

    first_fetch_at = next(position for position, (kind, _) in enumerate(events) if kind == "page")
    # The log only ever holds "page" and "children" events, so this check holds
    # by construction; the assertion on events[0] below is what pins the ordering.
    assert all(kind == "children" for kind, _ in events[:first_fetch_at])
    assert events[0] == ("children", "1")
    assert [child["page_id"] for child in outcome["child_pages"]] == ["2"]

    resume_info = outcome["resume_state"]
    assert resume_info["termination_reason"] == "max_nodes_reached"
    assert resume_info["pending_queue_size"] > 0

    mode_info = outcome["crawl_mode"]
    assert mode_info["staged_mode"] == "structure_first"
    assert mode_info["page_concurrency"] == 2
694
+
695
+
696
+ # =============================================================================
697
+ # TSK-188: Resume/Incremental Crawling Tests
698
+ # =============================================================================
699
+
700
+
701
def test_crawl_state_serialization() -> None:
    """Round-trip a populated ``CrawlState`` through ``to_dict`` / ``from_dict``."""
    visited = {
        "123": {"version": 5, "html_sha256": "abc123", "visited_at": "2026-01-01T00:00:00Z"},
        "456": {"version": 3, "html_sha256": "def456", "visited_at": "2026-01-01T00:01:00Z"},
    }
    queue = [("789", "123", 1), ("101", "123", 1)]
    original = CrawlState(
        root_page_id="123",
        body_format="view",
        visited_pages=visited,
        pending_queue=queue,
        started_at="2026-01-01T00:00:00Z",
        last_updated_at="2026-01-01T00:05:00Z",
        completed=False,
        last_sync_timestamp="2026-01-01T00:05:00Z",
        last_full_reconciliation_timestamp="2026-01-01T00:00:00Z",
    )

    # --- to_dict: the plain-dict form exposes the same fields ---
    dumped = original.to_dict()
    assert dumped["root_page_id"] == "123"
    assert len(dumped["visited_pages"]) == 2
    assert len(dumped["pending_queue"]) == 2
    assert dumped["completed"] is False
    assert dumped["last_sync_timestamp"] == "2026-01-01T00:05:00Z"
    assert dumped["last_full_reconciliation_timestamp"] == "2026-01-01T00:00:00Z"

    # --- from_dict: rebuilding yields an equivalent state ---
    rebuilt = CrawlState.from_dict(dumped)
    assert rebuilt.root_page_id == "123"
    assert rebuilt.body_format == "view"
    assert len(rebuilt.visited_pages) == 2
    assert rebuilt.visited_pages["123"]["version"] == 5
    assert len(rebuilt.pending_queue) == 2
    # Queue entries compare equal to tuples after the round trip.
    assert rebuilt.pending_queue[0] == ("789", "123", 1)
    assert rebuilt.last_sync_timestamp == "2026-01-01T00:05:00Z"
    assert rebuilt.last_full_reconciliation_timestamp == "2026-01-01T00:00:00Z"
737
+
738
+
739
def test_crawl_state_is_page_unchanged_by_version() -> None:
    """``is_page_unchanged`` compares version numbers when no hash is supplied."""
    timestamp = "2026-01-01T00:00:00Z"
    known_page = {"version": 5, "html_sha256": "abc", "visited_at": timestamp}
    tracker = CrawlState(
        root_page_id="1",
        body_format="view",
        visited_pages={"1": known_page},
        pending_queue=[],
        started_at=timestamp,
        last_updated_at=timestamp,
    )

    # Recorded page, identical version -> unchanged.
    assert tracker.is_page_unchanged("1", 5, None) is True
    # Recorded page, newer version -> changed.
    assert tracker.is_page_unchanged("1", 6, None) is False
    # Page never visited -> treated as changed.
    assert tracker.is_page_unchanged("2", 5, None) is False
756
+
757
+
758
def test_crawl_state_is_page_unchanged_by_hash() -> None:
    """``is_page_unchanged`` compares content hashes when no version is recorded."""
    timestamp = "2026-01-01T00:00:00Z"
    # The stored record has no version, only a content hash.
    known_page = {"version": None, "html_sha256": "abc123", "visited_at": timestamp}
    tracker = CrawlState(
        root_page_id="1",
        body_format="view",
        visited_pages={"1": known_page},
        pending_queue=[],
        started_at=timestamp,
        last_updated_at=timestamp,
    )

    # Matching hash -> unchanged.
    assert tracker.is_page_unchanged("1", None, "abc123") is True
    # Mismatching hash -> changed.
    assert tracker.is_page_unchanged("1", None, "xyz789") is False
773
+
774
+
775
def test_crawl_tree_max_nodes_limits_pages(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """``--max-nodes 3`` stops the crawl after three pages and leaves work pending.

    The root has five children, so six pages are reachable; the test expects
    exactly three ``get_page`` calls and an incomplete resume state.
    """
    fetched_ids: list[str] = []

    class FakeConfluence:
        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            fetched_ids.append(page_id)
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": {"view": {"value": f"<p>Content {page_id}</p>"}},
                "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
                "version": {"number": 1},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            if page_id != "1":
                return []
            # Root has 5 children (ids 2..6).
            return [{"id": str(child)} for child in range(2, 7)]

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    fake = FakeConfluence()
    monkeypatch.setattr("confluence_orchestrator.cli._build_http_client", lambda settings, server: fake)

    target_dir = tmp_path / "out"
    cli_args = [
        "--report-dir",
        str(tmp_path),
        "crawl-tree",
        "1",
        "--out",
        str(target_dir),
        "--depth",
        "2",
        "--no-include-attachments",
        "--max-nodes",
        "3",
    ]
    invocation = runner.invoke(app, cli_args)

    assert invocation.exit_code == 0, invocation.output
    resume_info = json.loads(invocation.stdout)["resume_state"]

    # Exactly three nodes processed, with the remainder still queued.
    assert resume_info["nodes_processed"] == 3
    assert resume_info["completed"] is False
    assert resume_info["termination_reason"] == "max_nodes_reached"
    assert resume_info["pending_queue_size"] > 0
    assert len(fetched_ids) == 3
835
+
836
+
837
def test_crawl_tree_cli_accepts_crawl_max_aliases(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """``--crawl-max-depth`` / ``--crawl-max-nodes`` are accepted by ``crawl-tree``.

    With depth 0 only the root is processed even though it has a child; the
    payload is also expected to report a ``page_concurrency`` of 5 when the
    flag is not given.
    """

    class FakeConfluence:
        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": {"view": {"value": "<p>Content</p>"}},
                "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
                "version": {"number": 1},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            if page_id == "1":
                return [{"id": "2"}]
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    # A fresh stub is created on every factory call.
    monkeypatch.setattr("confluence_orchestrator.cli._build_http_client", lambda settings, server: FakeConfluence())

    target_dir = tmp_path / "out"
    cli_args = [
        "--report-dir",
        str(tmp_path),
        "crawl-tree",
        "1",
        "--out",
        str(target_dir),
        "--crawl-max-depth",
        "0",
        "--crawl-max-nodes",
        "10",
        "--no-include-attachments",
    ]
    invocation = runner.invoke(app, cli_args)

    assert invocation.exit_code == 0, invocation.output
    summary = json.loads(invocation.stdout)
    assert summary["resume_state"]["nodes_processed"] == 1
    assert summary["crawl_mode"]["page_concurrency"] == 5
884
+
885
+
886
def test_crawl_tree_rejects_depth_out_of_range(tmp_path: Path) -> None:
    """``--depth -1`` is refused with a non-zero exit and an option error message."""
    target_dir = tmp_path / "out"
    cli_args = [
        "--report-dir",
        str(tmp_path),
        "crawl-tree",
        "1",
        "--out",
        str(target_dir),
        "--depth",
        "-1",
    ]
    invocation = runner.invoke(app, cli_args)

    # No HTTP client is patched in: the command must fail before needing one.
    assert invocation.exit_code != 0
    assert "Invalid value for '--depth'" in invocation.output
904
+
905
+
906
def test_crawl_tree_resume_saves_state(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """A ``--resume`` crawl writes ``crawl_state.json`` describing the finished run."""

    class FakeConfluence:
        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": {"view": {"value": f"<p>Content {page_id}</p>"}},
                "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
                "version": {"number": 1},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    fake = FakeConfluence()
    monkeypatch.setattr("confluence_orchestrator.cli._build_http_client", lambda settings, server: fake)

    target_dir = tmp_path / "out"
    cli_args = [
        "--report-dir",
        str(tmp_path),
        "crawl-tree",
        "1",
        "--out",
        str(target_dir),
        "--depth",
        "0",
        "--no-include-attachments",
        "--resume",
    ]
    invocation = runner.invoke(app, cli_args)

    assert invocation.exit_code == 0, invocation.output

    # The state file must be present in the output directory.
    state_file = target_dir / "crawl_state.json"
    assert state_file.exists()

    persisted = json.loads(state_file.read_text(encoding="utf-8"))
    assert persisted["root_page_id"] == "1"
    assert persisted["completed"] is True
    assert "1" in persisted["visited_pages"]

    diagnostics = persisted["run_diagnostics"]
    assert diagnostics["termination_reason"] == "queue_exhausted"
    assert diagnostics["pending_queue_size"] == 0
963
+
964
+
965
def test_crawl_tree_resume_skips_unchanged_pages(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """Test that resume mode skips pages that haven't changed.

    The same single-page tree is crawled twice into the same output directory
    with ``resume=True``. The page version stays at 1, so the second run must
    report one skipped-unchanged page in both ``resume_state`` and
    ``progress_event``.

    ``monkeypatch`` is accepted but not used; it is kept so the signature is
    unchanged.

    The stub no longer records calls: the previous ``call_count`` dict always
    incremented key ``"1"`` regardless of ``page_id`` and, like the
    ``pages_fetched`` list, was never asserted on. Fetch-level expectations are
    exercised by ``test_crawl_tree_resume_avoids_full_fetch_for_unchanged_pages``.
    """

    class DummyClient:
        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": {"view": {"value": f"<p>Content {page_id}</p>"}},
                "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
                "version": {"number": 1},  # Version stays at 1
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    dummy = DummyClient()
    out_dir = tmp_path / "out"

    # First crawl: populates the saved state for page "1".
    result1 = crawl_tree_to_disk(
        http=dummy,
        root_ref="1",
        output_dir=out_dir,
        options=CrawlOptions(max_depth=0, include_attachments=False, resume=True),
    )

    assert result1["resume_state"]["nodes_processed"] == 1

    # Second crawl with resume - should skip unchanged page
    result2 = crawl_tree_to_disk(
        http=dummy,
        root_ref="1",
        output_dir=out_dir,
        options=CrawlOptions(max_depth=0, include_attachments=False, resume=True),
    )

    # Page should be skipped (only version check, not full fetch)
    assert result2["resume_state"]["skipped_unchanged"] == 1
    assert result2["progress_event"]["pages_skipped_unchanged"] == 1
1019
+
1020
+
1021
def test_crawl_tree_resume_avoids_full_fetch_for_unchanged_pages(tmp_path: Path) -> None:
    """Phase 27 TSK-275.1: unchanged pages should only use version checks on warm runs.

    The stub separates a version-only request (``expand`` holding just
    ``"version"``) from any other ``get_page`` call, which counts as a full
    fetch. The cold run must fully fetch all three pages; the warm run must
    skip all three without a single full fetch.
    """
    full_fetches: list[str] = []

    def _page_payload(page_id: str, markup: str) -> dict[str, Any]:
        return {
            "id": page_id,
            "title": f"Page {page_id}",
            "body": {"view": {"value": markup}},
            "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
            "version": {"number": 1},
        }

    class FakeConfluence:
        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            version_only = bool(expand) and len(expand) == 1 and "version" in expand
            if version_only:
                # Lightweight probe: empty body, not recorded as a full fetch.
                return _page_payload(page_id, "")
            full_fetches.append(page_id)
            return _page_payload(page_id, f"<p>Content {page_id}</p>")

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return [{"id": "100"}, {"id": "200"}] if page_id == "88716667" else []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    target_dir = tmp_path / "out"
    shared_client = FakeConfluence()

    def _crawl() -> dict[str, Any]:
        return crawl_tree_to_disk(
            http=shared_client,
            root_ref="88716667",
            output_dir=target_dir,
            options=CrawlOptions(max_depth=1, include_attachments=False, resume=True),
        )

    # Cold run: root plus two children are fetched in full.
    _crawl()
    assert len(full_fetches) == 3

    # Warm run over the same output directory.
    full_fetches.clear()
    warm = _crawl()
    assert warm["resume_state"]["skipped_unchanged"] == 3
    assert full_fetches == []
1077
+
1078
+
1079
def test_crawl_tree_resume_runs_version_pass_before_full_fetch(tmp_path: Path) -> None:
    """On a resumed crawl, a page is version-checked before it is fully fetched.

    After a first run, page "2" is bumped to version 2. The second run must
    skip page "1" (version check only) and log a ``version`` call for page "2"
    ahead of its ``full`` call.
    """
    events: list[tuple[str, str]] = []
    current_versions = {"1": 1, "2": 1}

    class FakeConfluence:
        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            links = {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"}
            if expand == ["version"]:
                events.append(("version", page_id))
                return {
                    "id": page_id,
                    "title": f"Page {page_id}",
                    "body": {"view": {"value": ""}},
                    "_links": links,
                    "version": {"number": current_versions[page_id]},
                }

            # Anything other than a pure version probe counts as a full fetch.
            events.append(("full", page_id))
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": {"view": {"value": f"<p>Content {page_id}</p>"}},
                "_links": links,
                "space": {"key": "SPACE"},
                "version": {"number": current_versions[page_id]},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return [{"id": "2"}] if page_id == "1" else []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    target_dir = tmp_path / "out"
    shared_client = FakeConfluence()

    def _crawl() -> dict[str, Any]:
        return crawl_tree_to_disk(
            http=shared_client,
            root_ref="1",
            output_dir=target_dir,
            options=CrawlOptions(max_depth=1, include_attachments=False, resume=True),
        )

    # First run seeds the saved state; its calls are not of interest.
    _crawl()

    events.clear()
    current_versions["2"] = 2  # only the child changes between runs
    second = _crawl()

    assert second["resume_state"]["skipped_unchanged"] == 1
    assert ("version", "1") in events
    assert ("full", "1") not in events
    assert events.index(("version", "2")) < events.index(("full", "2"))
1141
+
1142
+
1143
def test_crawl_tree_resume_refreshes_only_changed_page(tmp_path: Path) -> None:
    """Phase 27 TSK-275.2: a resumed crawl fully fetches only the page whose version changed."""
    fetched_in_full: list[str] = []
    page_versions = {"88716667": 1, "100": 1, "200": 1}

    class DummyClient:
        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            # Version-only requests are not counted as full fetches.
            if expand and len(expand) == 1 and "version" in expand:
                body_html = ""
            else:
                fetched_in_full.append(page_id)
                body_html = f"<p>Content {page_id} v{page_versions[page_id]}</p>"
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": {"view": {"value": body_html}},
                "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
                "version": {"number": page_versions[page_id]},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return [{"id": "100"}, {"id": "200"}] if page_id == "88716667" else []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    target_dir = tmp_path / "out"
    stub = DummyClient()

    def run_crawl() -> Any:
        return crawl_tree_to_disk(
            http=stub,
            root_ref="88716667",
            output_dir=target_dir,
            options=CrawlOptions(max_depth=1, include_attachments=False, resume=True),
        )

    run_crawl()

    # Mutate a single child, then count full fetches of the second run only.
    fetched_in_full.clear()
    page_versions["200"] = 2
    warm = run_crawl()

    assert warm["resume_state"]["skipped_unchanged"] == 2
    assert fetched_in_full == ["200"]
1200
+
1201
+
1202
def test_crawl_tree_resume_emit_corpus_preserves_unchanged_pages_on_partial_refresh(tmp_path: Path) -> None:
    """A partial refresh with ``emit_corpus`` keeps rows for unchanged pages.

    After bumping only page "3", the corpus written by the second run must
    still cover pages "1", "2" and "3", and at least one row for page "3"
    must contain the "v2" marker from the refreshed body.
    """
    page_versions = {"1": 1, "2": 1, "3": 1}

    class DummyClient:
        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            if expand and len(expand) == 1 and "version" in expand:
                body_html = ""
            else:
                body_html = f"<p>Content {page_id} v{page_versions[page_id]}</p>"
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": {"view": {"value": body_html}},
                "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
                "version": {"number": page_versions[page_id]},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return [{"id": "2"}, {"id": "3"}] if page_id == "1" else []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    target_dir = tmp_path / "out"
    stub = DummyClient()

    def run_crawl() -> Any:
        return crawl_tree_to_disk(
            http=stub,
            root_ref="1",
            output_dir=target_dir,
            options=CrawlOptions(max_depth=1, include_attachments=False, resume=True, emit_corpus=True),
        )

    first = run_crawl()
    assert first["corpus_path"]

    page_versions["3"] = 2
    second = run_crawl()
    assert second["resume_state"]["skipped_unchanged"] == 2

    # The corpus is read as one JSON document per non-blank line.
    rows = []
    for raw_line in Path(str(second["corpus_path"])).read_text(encoding="utf-8").splitlines():
        if raw_line.strip():
            rows.append(json.loads(raw_line))

    seen_ids = {str(row.get("page_id") or "") for row in rows if isinstance(row, dict)}
    assert seen_ids == {"1", "2", "3"}

    texts_for_three = [str(row.get("text") or "") for row in rows if str(row.get("page_id") or "") == "3"]
    assert any("v2" in text for text in texts_for_three)
1263
+
1264
+
1265
def test_crawl_tree_cold_vs_warm_call_profile_for_root_88716667(tmp_path: Path) -> None:
    """Phase 27 TSK-275.3 benchmark proxy: the warm run issues fewer full fetches than the cold run."""
    fetched_in_full: list[str] = []

    class DummyClient:
        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            # Only requests other than the version-only probe count as full fetches.
            if expand and len(expand) == 1 and "version" in expand:
                body_html = ""
            else:
                fetched_in_full.append(page_id)
                body_html = f"<p>Content {page_id}</p>"
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": {"view": {"value": body_html}},
                "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
                "version": {"number": 1},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return [{"id": "100"}, {"id": "200"}, {"id": "300"}] if page_id == "88716667" else []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    target_dir = tmp_path / "out"
    stub = DummyClient()

    def run_crawl() -> Any:
        return crawl_tree_to_disk(
            http=stub,
            root_ref="88716667",
            output_dir=target_dir,
            options=CrawlOptions(max_depth=1, include_attachments=False, resume=True),
        )

    run_crawl()
    cold_count = len(fetched_in_full)
    assert cold_count == 4  # root plus three children

    fetched_in_full.clear()
    run_crawl()
    warm_count = len(fetched_in_full)
    assert warm_count < cold_count
1322
+
1323
+
1324
def test_crawl_tree_resume_state_reports_queue_exhausted(tmp_path: Path) -> None:
    """A root-only crawl reports completion with reason ``queue_exhausted`` and an empty queue."""

    class DummyClient:
        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            links = {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"}
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": {"view": {"value": f"<p>Content {page_id}</p>"}},
                "_links": links,
                "version": {"number": 1},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    outcome = crawl_tree_to_disk(
        http=DummyClient(),
        root_ref="1",
        output_dir=tmp_path / "out",
        options=CrawlOptions(max_depth=0, include_attachments=False, resume=True),
    )

    resume_state = outcome["resume_state"]
    assert resume_state["completed"] is True
    assert resume_state["termination_reason"] == "queue_exhausted"
    assert resume_state["pending_queue_size"] == 0
1357
+
1358
+
1359
def test_crawl_tree_resume_skip_unchanged_with_concurrency(tmp_path: Path) -> None:
    """With ``page_concurrency=3``, the resumed run issues fewer full fetches than the first run."""
    fetched_in_full: list[str] = []

    class DummyClient:
        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            # Version-only probes are excluded from the full-fetch tally.
            if expand and len(expand) == 1 and "version" in expand:
                body_html = ""
            else:
                fetched_in_full.append(page_id)
                body_html = f"<p>Content {page_id}</p>"
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": {"view": {"value": body_html}},
                "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
                "version": {"number": 1},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return [{"id": "2"}, {"id": "3"}, {"id": "4"}] if page_id == "1" else []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    target_dir = tmp_path / "out"
    stub = DummyClient()

    def run_crawl() -> Any:
        return crawl_tree_to_disk(
            http=stub,
            root_ref="1",
            output_dir=target_dir,
            options=CrawlOptions(max_depth=1, include_attachments=False, resume=True, page_concurrency=3),
        )

    run_crawl()
    cold_count = len(fetched_in_full)
    assert cold_count == 4  # root plus three children

    fetched_in_full.clear()
    run_crawl()
    assert len(fetched_in_full) < cold_count
1414
+
1415
+
1416
def test_crawl_tree_resume_supports_incremental_high_water_mark_and_reconciliation(tmp_path: Path) -> None:
    """A resumed crawl runs both the incremental and the full reconciliation search.

    Child "2" disappears between the two runs. The persisted state must then
    drop it from ``visited_pages``, list it as deleted in the run diagnostics,
    and record both sync timestamps. Incremental searches (those passing
    ``last_modified_after``) must use ``overlap_minutes == 5``.
    """
    recorded_searches: list[dict[str, Any]] = []
    deleted = {"value": False}

    class DummyClient:
        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            version_num = 1
            payload: dict[str, Any] = {"id": page_id, "title": f"Page {page_id}"}
            if expand != ["version"]:
                # Full fetches additionally carry body, links and space.
                payload["body"] = {"view": {"value": f"<p>Content {page_id}</p>"}}
                payload["_links"] = {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"}
                payload["space"] = {"key": "SPACE"}
            payload["version"] = {"number": version_num}
            return payload

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            if deleted["value"] or page_id != "1":
                return []
            return [{"id": "2"}]

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

        def search_by_space_and_type(
            self,
            space_key: str | None = None,
            content_type: str | None = None,
            *,
            limit: int = 25,
            start: int = 0,
            expand: list[str] | None = None,
            last_modified_after: Any = None,
            overlap_minutes: int = 0,
            **_: Any,
        ) -> dict[str, Any]:
            recorded_searches.append(
                dict(
                    space_key=space_key,
                    content_type=content_type,
                    limit=limit,
                    start=start,
                    last_modified_after=last_modified_after,
                    overlap_minutes=overlap_minutes,
                )
            )
            # Incremental queries report nothing changed; full queries list the live pages.
            if last_modified_after is not None:
                found: list[str] = []
            elif deleted["value"]:
                found = ["1"]
            else:
                found = ["1", "2"]
            return {"results": [{"id": page} for page in found]}

    target_dir = tmp_path / "out"
    stub = DummyClient()

    def run_crawl() -> Any:
        return crawl_tree_to_disk(
            http=stub,
            root_ref="1",
            output_dir=target_dir,
            options=CrawlOptions(max_depth=1, include_attachments=False, resume=True),
        )

    run_crawl()

    deleted["value"] = True
    recorded_searches.clear()
    outcome = run_crawl()

    persisted = json.loads((target_dir / "crawl_state.json").read_text(encoding="utf-8"))
    assert "2" not in persisted["visited_pages"]
    assert persisted["last_sync_timestamp"] is not None
    assert persisted["last_full_reconciliation_timestamp"] is not None

    diagnostics = persisted["run_diagnostics"]
    assert diagnostics["full_reconciliation_ran"] is True
    assert diagnostics["full_reconciliation_deleted_ids"] == ["2"]
    assert diagnostics["full_reconciliation_skipped"] is False
    assert diagnostics["full_reconciliation_skip_reason"] is None
    assert diagnostics["incremental_known_page_count"] == 2
    assert diagnostics["incremental_high_water_mark_ran"] is True
    assert diagnostics["incremental_high_water_mark_skipped"] is False
    assert diagnostics["incremental_high_water_mark_skip_reason"] is None

    incremental = [entry for entry in recorded_searches if entry["last_modified_after"] is not None]
    assert incremental
    assert all(entry["overlap_minutes"] == 5 for entry in incremental)
    full = [entry for entry in recorded_searches if entry["last_modified_after"] is None]
    assert full

    assert outcome["resume_state"]["last_sync_timestamp"] is not None
    assert outcome["resume_state"]["last_full_reconciliation_timestamp"] is not None
1520
+
1521
+
1522
def test_crawl_tree_resume_skips_full_reconciliation_for_root_only_state(tmp_path: Path) -> None:
    """With only the root page known, a resumed crawl issues no space searches.

    Both the incremental and the full reconciliation must be reported as
    skipped with reason ``known_pages_lte_1``, and the stub's
    ``search_by_space_and_type`` must not be called during the second run.
    """
    recorded_searches: list[dict[str, Any]] = []

    class DummyClient:
        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            payload: dict[str, Any] = {"id": page_id, "title": f"Page {page_id}"}
            if expand != ["version"]:
                # Full fetches additionally carry body, links and space.
                payload["body"] = {"view": {"value": f"<p>Content {page_id}</p>"}}
                payload["_links"] = {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"}
                payload["space"] = {"key": "SPACE"}
            payload["version"] = {"number": 1}
            return payload

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

        def search_by_space_and_type(
            self,
            space_key: str | None = None,
            content_type: str | None = None,
            *,
            limit: int = 25,
            start: int = 0,
            expand: list[str] | None = None,
            last_modified_after: Any = None,
            overlap_minutes: int = 0,
            **_: Any,
        ) -> dict[str, Any]:
            recorded_searches.append(
                dict(
                    space_key=space_key,
                    content_type=content_type,
                    limit=limit,
                    start=start,
                    last_modified_after=last_modified_after,
                    overlap_minutes=overlap_minutes,
                )
            )
            found = [] if last_modified_after is not None else [{"id": "1"}]
            return {"results": found}

    target_dir = tmp_path / "out"
    stub = DummyClient()

    def run_crawl() -> Any:
        return crawl_tree_to_disk(
            http=stub,
            root_ref="1",
            output_dir=target_dir,
            options=CrawlOptions(max_depth=0, include_attachments=False, resume=True),
        )

    run_crawl()

    recorded_searches.clear()
    outcome = run_crawl()

    persisted = json.loads((target_dir / "crawl_state.json").read_text(encoding="utf-8"))
    assert persisted["last_full_reconciliation_timestamp"] is not None

    diagnostics = persisted["run_diagnostics"]
    assert diagnostics["full_reconciliation_ran"] is False
    assert diagnostics["full_reconciliation_skipped"] is True
    assert diagnostics["full_reconciliation_skip_reason"] == "known_pages_lte_1"
    assert diagnostics["incremental_known_page_count"] == 1
    assert diagnostics["incremental_high_water_mark_ran"] is False
    assert diagnostics["incremental_high_water_mark_skipped"] is True
    assert diagnostics["incremental_high_water_mark_skip_reason"] == "known_pages_lte_1"

    incremental = [entry for entry in recorded_searches if entry["last_modified_after"] is not None]
    assert not incremental
    full = [entry for entry in recorded_searches if entry["last_modified_after"] is None]
    assert not full

    assert outcome["resume_state"]["last_full_reconciliation_timestamp"] is not None
1610
+
1611
+
1612
def test_crawl_tree_exposes_http_telemetry_in_manifest_and_state(tmp_path: Path) -> None:
    """Client telemetry shows up in the crawl result and in ``crawl_state.json``.

    Values returned by the stub's ``get_runtime_telemetry`` must be readable
    under both ``http_telemetry`` and ``telemetry``, in the returned result
    and in the persisted ``run_diagnostics``.
    """

    class DummyClient:
        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            links = {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"}
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": {"view": {"value": f"<p>Content {page_id}</p>"}},
                "_links": links,
                "version": {"number": 1},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

        def get_runtime_telemetry(self) -> dict[str, Any]:
            throttle = {"effective_concurrency_signal": 0.73, "pressure_score": 0.27}
            retries = [{"reason": "http_429"}, {"reason": "timeout"}]
            return {
                "retries_total": 2,
                "rate_limit_total": 1,
                "timeout_total": 1,
                "adaptive_throttle": throttle,
                "recent_retry_events": retries,
            }

    target_dir = tmp_path / "out"
    outcome = crawl_tree_to_disk(
        http=DummyClient(),
        root_ref="1",
        output_dir=target_dir,
        options=CrawlOptions(max_depth=0, include_attachments=False, resume=True),
    )

    assert outcome["http_telemetry"]["retries_total"] == 2
    assert outcome["telemetry"]["retries_total"] == 2
    assert outcome["http_telemetry"]["adaptive_throttle"]["effective_concurrency_signal"] == 0.73
    assert outcome["resume_state"]["completed"] is True

    persisted = json.loads((target_dir / "crawl_state.json").read_text(encoding="utf-8"))
    diagnostics = persisted["run_diagnostics"]
    assert diagnostics["http_telemetry"]["timeout_total"] == 1
    assert diagnostics["telemetry"]["timeout_total"] == 1
    assert diagnostics["progress_event"]["pages_discovered"] == 1
1661
+
1662
+
1663
def test_crawl_tree_emits_crawl_progress_runtime_events(tmp_path: Path) -> None:
    """The last ``crawl_progress`` event agrees with the result's ``progress_event`` counters."""
    captured: list[tuple[str, dict[str, Any]]] = []

    class DummyClient:
        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            links = {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"}
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": {"view": {"value": f"<p>Content {page_id}</p>"}},
                "_links": links,
                "version": {"number": 1},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return [{"id": "2"}, {"id": "3"}] if page_id == "1" else []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    def _capture_runtime_event(event: str, payload: dict[str, Any]) -> None:
        captured.append((event, payload))

    crawl_options = CrawlOptions(
        max_depth=1,
        include_attachments=False,
        page_concurrency=2,
        runtime_event_sink=_capture_runtime_event,
    )
    outcome = crawl_tree_to_disk(
        http=DummyClient(),
        root_ref="1",
        output_dir=tmp_path / "out",
        options=crawl_options,
    )

    progress = [payload for name, payload in captured if name == "crawl_progress"]
    assert progress

    final_event = progress[-1]
    assert final_event["pages_discovered"] == outcome["progress_event"]["pages_discovered"]
    assert final_event["pages_fetched"] == outcome["progress_event"]["pages_fetched"]
    assert final_event["pages_failed"] == outcome["progress_event"]["pages_failed"]
    assert final_event["pages_skipped_unchanged"] == outcome["progress_event"]["pages_skipped_unchanged"]
1711
+
1712
+
1713
def test_crawl_tree_emits_pages_deleted_event(tmp_path: Path) -> None:
    """A resumed crawl emits ``pages_deleted`` naming the child that disappeared.

    Child "2" is removed between the two runs; the last ``pages_deleted``
    event of the second run must report ``deleted_count == 1`` and
    ``page_ids == ["2"]``.
    """
    captured: list[tuple[str, dict[str, Any]]] = []
    # NOTE(review): search calls are recorded and cleared, but nothing asserts on them.
    recorded_searches: list[dict[str, Any]] = []
    deleted = {"value": False}

    class DummyClient:
        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            payload: dict[str, Any] = {"id": page_id, "title": f"Page {page_id}"}
            if expand != ["version"]:
                # Full fetches additionally carry body, links and space.
                payload["body"] = {"view": {"value": f"<p>Content {page_id}</p>"}}
                payload["_links"] = {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"}
                payload["space"] = {"key": "SPACE"}
            payload["version"] = {"number": 1}
            return payload

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            if deleted["value"] or page_id != "1":
                return []
            return [{"id": "2"}]

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

        def search_by_space_and_type(
            self,
            space_key: str | None = None,
            content_type: str | None = None,
            *,
            limit: int = 25,
            start: int = 0,
            expand: list[str] | None = None,
            last_modified_after: Any = None,
            overlap_minutes: int = 0,
            **_: Any,
        ) -> dict[str, Any]:
            recorded_searches.append(
                dict(
                    space_key=space_key,
                    content_type=content_type,
                    limit=limit,
                    start=start,
                    last_modified_after=last_modified_after,
                    overlap_minutes=overlap_minutes,
                )
            )
            if last_modified_after is not None:
                found: list[str] = []
            elif deleted["value"]:
                found = ["1"]
            else:
                found = ["1", "2"]
            return {"results": [{"id": page} for page in found]}

    def _capture_runtime_event(event: str, payload: dict[str, Any]) -> None:
        captured.append((event, payload))

    target_dir = tmp_path / "out"
    stub = DummyClient()

    def run_crawl() -> Any:
        return crawl_tree_to_disk(
            http=stub,
            root_ref="1",
            output_dir=target_dir,
            options=CrawlOptions(
                max_depth=1,
                include_attachments=False,
                resume=True,
                runtime_event_sink=_capture_runtime_event,
            ),
        )

    run_crawl()

    # Drop everything recorded by the first run before removing the child.
    deleted["value"] = True
    captured.clear()
    recorded_searches.clear()

    run_crawl()

    deletions = [payload for name, payload in captured if name == "pages_deleted"]
    assert deletions
    assert deletions[-1]["deleted_count"] == 1
    assert deletions[-1]["page_ids"] == ["2"]
1810
+
1811
+
1812
def test_crawl_tree_throughput_metrics_and_adaptive_signal(tmp_path: Path) -> None:
    """Throughput metrics and the throttle block are derived from client telemetry.

    The stub reports ``effective_concurrency_signal`` 0.2 with a configured
    ``page_concurrency`` of 4; the result is expected to show an effective
    page concurrency of 1 and to echo the p95 latency and retry-event count.
    """

    class DummyClient:
        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            links = {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"}
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": {"view": {"value": f"<p>Content {page_id}</p>"}},
                "_links": links,
                "version": {"number": 1},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return [{"id": "2"}, {"id": "3"}] if page_id == "1" else []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

        def get_runtime_telemetry(self) -> dict[str, Any]:
            return {
                "adaptive_throttle": {"effective_concurrency_signal": 0.2, "pressure_score": 0.8},
                "latency_ms": {"operation": {"p95": 123.4}},
                "recent_retry_events": [{"reason": "http_429"}, {"reason": "timeout"}],
            }

    outcome = crawl_tree_to_disk(
        http=DummyClient(),
        root_ref="1",
        output_dir=tmp_path / "out",
        options=CrawlOptions(max_depth=1, include_attachments=False, page_concurrency=4),
    )

    http_metrics = outcome["http_telemetry"]
    assert http_metrics["inflight_workers"] == 1
    assert http_metrics["queue_depth"] == 0
    assert http_metrics["pages_per_second"] > 0
    assert http_metrics["retry_events"] == 2
    assert http_metrics["p95_page_latency_ms"] == 123.4

    expected_throttle = {
        "configured_page_concurrency": 4,
        "effective_page_concurrency": 1,
        "effective_concurrency_signal": 0.2,
        "pressure_score": 0.8,
    }
    expected_progress = {
        "pages_discovered": 3,
        "pages_fetched": 3,
        "pages_failed": 0,
        "pages_skipped": 0,
        "pages_skipped_unchanged": 0,
        "throttle": expected_throttle,
    }
    assert outcome["progress_event"] == expected_progress
1872
+
1873
+
1874
def test_crawl_tree_resume_continues_from_pending_queue(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """A crawl capped by ``max_nodes`` is left incomplete and finishes on resume."""
    # NOTE(review): requests are recorded and cleared below, but nothing asserts on them.
    requested_ids: list[str] = []

    class DummyClient:
        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            requested_ids.append(page_id)
            # Each page reports its own numeric id as its version number.
            version_number = int(page_id)
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": {"view": {"value": f"<p>Content {page_id}</p>"}},
                "_links": {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"},
                "version": {"number": version_number},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return [{"id": "2"}, {"id": "3"}, {"id": "4"}] if page_id == "1" else []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    stub = DummyClient()
    target_dir = tmp_path / "out"

    # First pass is capped at two nodes, so the crawl must stop before finishing.
    capped = crawl_tree_to_disk(
        http=stub,
        root_ref="1",
        output_dir=target_dir,
        options=CrawlOptions(max_depth=1, include_attachments=False, resume=True, max_nodes=2),
    )

    assert capped["resume_state"]["nodes_processed"] == 2
    assert capped["resume_state"]["completed"] is False

    requested_ids.clear()

    # Second pass has no node cap and is expected to finish the crawl.
    resumed = crawl_tree_to_disk(
        http=stub,
        root_ref="1",
        output_dir=target_dir,
        options=CrawlOptions(max_depth=1, include_attachments=False, resume=True),
    )

    assert resumed["resume_state"]["completed"] is True
1929
+
1930
+
1931
def test_crawl_tree_cli_resume_flag(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """The ``crawl-tree`` command accepts ``--resume`` and reports the state file path."""

    class DummyClient:
        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            links = {"base": "http://example", "webui": "/display/SPACE/Page"}
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": {"view": {"value": "<p>Content</p>"}},
                "_links": links,
                "version": {"number": 1},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    stub = DummyClient()
    # Swap the CLI's HTTP client factory so the command talks to the stub.
    monkeypatch.setattr("confluence_orchestrator.cli._build_http_client", lambda settings, server: stub)

    target_dir = tmp_path / "out"
    cli_args = ["--report-dir", str(tmp_path), "crawl-tree", "1"]
    cli_args += ["--out", str(target_dir), "--depth", "0"]
    cli_args += ["--no-include-attachments", "--resume"]

    invocation = runner.invoke(app, cli_args)

    assert invocation.exit_code == 0, invocation.output
    report = json.loads(invocation.stdout)
    assert "resume_state" in report
    assert report["resume_state"]["state_path"] == str(target_dir / "crawl_state.json")
1982
+
1983
+
1984
def test_crawl_tree_cli_accepts_phase27_compat_flags(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """Phase 27 compatibility: ``crawl-tree`` accepts the cache/content-root flags.

    Invokes the command with ``--cache-mode refresh``, ``--cache-ttl-hours 48``,
    ``--incremental``, ``--content-root`` and ``--project-storage-key``, then
    checks that each value is reported under the ``cache`` key of the JSON
    payload and that ``effective_resume`` is reported as ``True``.
    """

    class StubClient:
        """In-memory stand-in for the HTTP client used by the crawl."""

        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            # Any CQL search fails the test outright.
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            rendered_body = {"view": {"value": "<p>Content</p>"}}
            links = {"base": "http://example", "webui": "/display/SPACE/Page"}
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": rendered_body,
                "_links": links,
                "version": {"number": 1},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    def _fake_build_http_client(settings: Any, server: Any) -> StubClient:
        # A fresh stub is built on every call to the factory.
        return StubClient()

    monkeypatch.setattr("confluence_orchestrator.cli._build_http_client", _fake_build_http_client)

    target_dir = tmp_path / "out"
    content_root = tmp_path / "content-root"
    cli_args = [
        "--report-dir", str(tmp_path),
        "crawl-tree", "1",
        "--out", str(target_dir),
        "--depth", "0",
        "--no-include-attachments",
        "--cache-mode", "refresh",
        "--cache-ttl-hours", "48",
        "--incremental",
        "--content-root", str(content_root),
        "--project-storage-key", "project-x",
    ]
    outcome = runner.invoke(app, cli_args)

    assert outcome.exit_code == 0, outcome.output
    cache_info = json.loads(outcome.stdout)["cache"]
    assert cache_info["mode"] == "refresh"
    assert cache_info["ttl_hours"] == 48
    # Identity checks: the flags must be real booleans, not merely truthy values.
    assert cache_info["incremental"] is True
    assert cache_info["effective_resume"] is True
    assert cache_info["content_root"] == str(content_root)
    assert cache_info["project_storage_key"] == "project-x"
2045
+
2046
+
2047
def test_crawl_tree_cli_accepts_concurrency_and_staged_mode_flags(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """Check that ``crawl-tree`` accepts ``--page-concurrency`` and ``--staged-mode``.

    The command is run with ``--page-concurrency 3 --staged-mode structure-first``.
    The JSON payload must report both under ``crawl_mode``; the staged mode is
    expected in its underscore form, ``structure_first``.
    """

    class StubClient:
        """In-memory stand-in for the HTTP client used by the crawl."""

        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            # Any CQL search fails the test outright.
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            rendered_body = {"view": {"value": "<p>Content</p>"}}
            links = {"base": "http://example", "webui": "/display/SPACE/Page"}
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": rendered_body,
                "_links": links,
                "version": {"number": 1},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    def _fake_build_http_client(settings: Any, server: Any) -> StubClient:
        # A fresh stub is built on every call to the factory.
        return StubClient()

    monkeypatch.setattr("confluence_orchestrator.cli._build_http_client", _fake_build_http_client)

    target_dir = tmp_path / "out"
    cli_args = [
        "--report-dir", str(tmp_path),
        "crawl-tree", "1",
        "--out", str(target_dir),
        "--depth", "0",
        "--no-include-attachments",
        "--page-concurrency", "3",
        "--staged-mode", "structure-first",
    ]
    outcome = runner.invoke(app, cli_args)

    assert outcome.exit_code == 0, outcome.output
    crawl_mode = json.loads(outcome.stdout)["crawl_mode"]
    assert crawl_mode["page_concurrency"] == 3
    assert crawl_mode["staged_mode"] == "structure_first"
2098
+
2099
+
2100
+ # =============================================================================
2101
+ # TSK-189: Bundle Zip Tests
2102
+ # =============================================================================
2103
+
2104
+
2105
def test_create_bundle_zip_contains_all_artifacts(tmp_path: Path) -> None:
    """Check that the bundle zip holds every artifact placed in the output directory.

    A fake crawl output (two HTML pages, one attachment, the JSONL/JSON side
    files, the state file and the manifest) is written to disk, then
    ``_create_bundle_zip`` is called for root page ``"123"``. The archive must
    exist, end in ``.zip``, carry ``crawl_bundle_123_`` in its file name, and
    list every fixture file by its path relative to the output directory.
    """
    out_dir = tmp_path / "out"
    out_dir.mkdir()
    attachments_dir = out_dir / "attachments"
    attachments_dir.mkdir()

    # Create mock files: archive member name -> file content.
    fixture_files = {
        "123_Root.html": "<p>Root page</p>",
        "456_Child.html": "<p>Child page</p>",
        "attachments/a1_file.txt": "attachment content",
        "corpus.jsonl": '{"chunk_id": "abc"}\n',
        "link-graph.json": '{"edges": []}',
        "tables.jsonl": '{"table_id": "t1"}\n',
        "mentions.jsonl": '{"mention_id": "m1"}\n',
        "crawl_state.json": '{"completed": true}',
        "crawl_manifest.json": '{"root_page_id": "123"}',
    }
    for relative_name, content in fixture_files.items():
        (out_dir / relative_name).write_text(content)
    manifest_path = out_dir / "crawl_manifest.json"

    # Create bundle
    archive = _create_bundle_zip(out_dir, "123", manifest_path)

    assert archive.exists()
    assert archive.suffix == ".zip"
    assert "crawl_bundle_123_" in archive.name

    # Verify contents
    with zipfile.ZipFile(archive, "r") as bundle:
        members = bundle.namelist()
        for relative_name in fixture_files:
            assert relative_name in members
2143
+
2144
+
2145
def test_crawl_tree_bundle_option(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """Check that ``crawl-tree --bundle`` produces a zip archive of the crawl output.

    The stub client serves one page with a single attachment. The command is
    run with ``--emit-corpus``, ``--emit-link-graph`` and ``--bundle``; the JSON
    payload must expose ``bundle_path``, that file must exist, and the archive
    must contain at least one ``.html`` member plus ``corpus.jsonl``,
    ``link-graph.json`` and ``crawl_manifest.json``.
    """

    class StubClient:
        """In-memory stand-in for the HTTP client used by the crawl."""

        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            # Any CQL search fails the test outright.
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            rendered_body = {"view": {"value": f"<p>Content {page_id}</p>"}}
            links = {"base": "http://example", "webui": f"/display/SPACE/Page{page_id}"}
            return {
                "id": page_id,
                "title": f"Page {page_id}",
                "body": rendered_body,
                "_links": links,
                "version": {"number": 1},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            attachment = {"id": "a1", "title": "file.txt", "version": {"number": 1}}
            return [attachment]

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b"attachment content"

    client = StubClient()

    def _fake_build_http_client(settings: Any, server: Any) -> StubClient:
        # Always hand back the same stub instance, whatever the settings are.
        return client

    monkeypatch.setattr("confluence_orchestrator.cli._build_http_client", _fake_build_http_client)

    target_dir = tmp_path / "out"
    cli_args = [
        "--report-dir", str(tmp_path),
        "crawl-tree", "1",
        "--out", str(target_dir),
        "--depth", "0",
        "--emit-corpus",
        "--emit-link-graph",
        "--bundle",
    ]
    outcome = runner.invoke(app, cli_args)

    assert outcome.exit_code == 0, outcome.output
    report = json.loads(outcome.stdout)

    # Bundle should be created
    assert "bundle_path" in report
    archive = Path(report["bundle_path"])
    assert archive.exists()

    # Verify bundle contents
    with zipfile.ZipFile(archive, "r") as bundle:
        members = bundle.namelist()
        html_members = [member for member in members if member.endswith(".html")]
        assert html_members
        for required in ("corpus.jsonl", "link-graph.json", "crawl_manifest.json"):
            assert required in members
2206
+
2207
+
2208
def test_crawl_tree_bundle_with_no_attachments(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """Check that bundling still works when the crawl has no attachments.

    Calls ``crawl_tree_to_disk`` directly with ``bundle=True`` and
    ``include_attachments=False`` against a stub serving a single page titled
    ``Root``. The result must expose an existing ``bundle_path`` whose archive
    contains ``1_Root.html`` and ``crawl_manifest.json``.
    """

    class StubClient:
        """In-memory stand-in for the HTTP client used by the crawl."""

        def search_cql(self, cql: str, **_: Any) -> dict[str, Any]:
            # Any CQL search fails the test outright.
            raise AssertionError("search_cql should not be called")

        def get_page(self, page_id: str, expand: list[str] | None = None) -> dict[str, Any]:
            rendered_body = {"view": {"value": "<p>Content</p>"}}
            links = {"base": "http://example", "webui": "/display/SPACE/Root"}
            return {
                "id": page_id,
                "title": "Root",
                "body": rendered_body,
                "_links": links,
                "version": {"number": 1},
            }

        def get_child_pages(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachments(self, page_id: str) -> list[dict[str, Any]]:
            return []

        def get_attachment_content(self, attachment_id: str) -> bytes:
            return b""

    client = StubClient()
    target_dir = tmp_path / "out"
    crawl_options = CrawlOptions(max_depth=0, include_attachments=False, bundle=True)

    summary = crawl_tree_to_disk(
        http=client,
        root_ref="1",
        output_dir=target_dir,
        options=crawl_options,
    )

    assert "bundle_path" in summary
    archive = Path(summary["bundle_path"])
    assert archive.exists()

    with zipfile.ZipFile(archive, "r") as bundle:
        members = bundle.namelist()
        for required in ("1_Root.html", "crawl_manifest.json"):
            assert required in members