hpc-agent 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294) hide show
  1. hpc_agent-0.3.0/LICENSE +21 -0
  2. hpc_agent-0.3.0/PKG-INFO +313 -0
  3. hpc_agent-0.3.0/README.md +277 -0
  4. hpc_agent-0.3.0/pyproject.toml +155 -0
  5. hpc_agent-0.3.0/setup.cfg +4 -0
  6. hpc_agent-0.3.0/src/hpc_agent/__init__.py +295 -0
  7. hpc_agent-0.3.0/src/hpc_agent/__main__.py +8 -0
  8. hpc_agent-0.3.0/src/hpc_agent/_internal/__init__.py +1 -0
  9. hpc_agent-0.3.0/src/hpc_agent/_internal/io.py +224 -0
  10. hpc_agent-0.3.0/src/hpc_agent/_internal/layout.py +175 -0
  11. hpc_agent-0.3.0/src/hpc_agent/_internal/lifecycle.py +147 -0
  12. hpc_agent-0.3.0/src/hpc_agent/_internal/operations.py +251 -0
  13. hpc_agent-0.3.0/src/hpc_agent/_internal/playbook.py +144 -0
  14. hpc_agent-0.3.0/src/hpc_agent/_internal/plugins.py +89 -0
  15. hpc_agent-0.3.0/src/hpc_agent/_internal/primitive.py +387 -0
  16. hpc_agent-0.3.0/src/hpc_agent/_internal/schema.py +160 -0
  17. hpc_agent-0.3.0/src/hpc_agent/_internal/session/__init__.py +85 -0
  18. hpc_agent-0.3.0/src/hpc_agent/_internal/session/index.py +237 -0
  19. hpc_agent-0.3.0/src/hpc_agent/_internal/session/journal.py +210 -0
  20. hpc_agent-0.3.0/src/hpc_agent/_internal/session/run_record.py +247 -0
  21. hpc_agent-0.3.0/src/hpc_agent/_internal/telemetry.py +137 -0
  22. hpc_agent-0.3.0/src/hpc_agent/_internal/time.py +50 -0
  23. hpc_agent-0.3.0/src/hpc_agent/_internal/version.py +115 -0
  24. hpc_agent-0.3.0/src/hpc_agent/_schema_models/__init__.py +24 -0
  25. hpc_agent-0.3.0/src/hpc_agent/_schema_models/_shared.py +145 -0
  26. hpc_agent-0.3.0/src/hpc_agent/_schema_models/actions/__init__.py +0 -0
  27. hpc_agent-0.3.0/src/hpc_agent/_schema_models/actions/build_executor.py +19 -0
  28. hpc_agent-0.3.0/src/hpc_agent/_schema_models/actions/build_submit_spec.py +54 -0
  29. hpc_agent-0.3.0/src/hpc_agent/_schema_models/actions/build_tasks_py.py +102 -0
  30. hpc_agent-0.3.0/src/hpc_agent/_schema_models/actions/cluster_reduce.py +36 -0
  31. hpc_agent-0.3.0/src/hpc_agent/_schema_models/actions/combine_wave.py +34 -0
  32. hpc_agent-0.3.0/src/hpc_agent/_schema_models/actions/interview.py +300 -0
  33. hpc_agent-0.3.0/src/hpc_agent/_schema_models/actions/resubmit.py +79 -0
  34. hpc_agent-0.3.0/src/hpc_agent/_schema_models/actions/submit.py +41 -0
  35. hpc_agent-0.3.0/src/hpc_agent/_schema_models/actions/update_run_constraints.py +62 -0
  36. hpc_agent-0.3.0/src/hpc_agent/_schema_models/fixtures/__init__.py +0 -0
  37. hpc_agent-0.3.0/src/hpc_agent/_schema_models/fixtures/axes.py +84 -0
  38. hpc_agent-0.3.0/src/hpc_agent/_schema_models/fixtures/campaign_manifest.py +64 -0
  39. hpc_agent-0.3.0/src/hpc_agent/_schema_models/fixtures/envelope.py +69 -0
  40. hpc_agent-0.3.0/src/hpc_agent/_schema_models/fixtures/stages.py +68 -0
  41. hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/__init__.py +0 -0
  42. hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/campaign.py +55 -0
  43. hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/campaign_health.py +39 -0
  44. hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/capabilities.py +87 -0
  45. hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/clusters.py +49 -0
  46. hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/decide_monitor_arm.py +63 -0
  47. hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/discover.py +20 -0
  48. hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/failures.py +60 -0
  49. hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/find_prior_run.py +31 -0
  50. hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/inspect_cluster.py +35 -0
  51. hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/list_in_flight.py +34 -0
  52. hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/monitor_summary.py +42 -0
  53. hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/recall.py +156 -0
  54. hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/recommend_partition.py +66 -0
  55. hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/reconcile.py +30 -0
  56. hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/runtime_prior.py +47 -0
  57. hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/status.py +46 -0
  58. hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/suggest_setup_action.py +30 -0
  59. hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/summarize_submit_plan.py +21 -0
  60. hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/verify_aggregation_complete.py +44 -0
  61. hpc_agent-0.3.0/src/hpc_agent/_schema_models/validators/__init__.py +0 -0
  62. hpc_agent-0.3.0/src/hpc_agent/_schema_models/validators/preflight.py +22 -0
  63. hpc_agent-0.3.0/src/hpc_agent/_schema_models/validators/validate_executor_signatures.py +52 -0
  64. hpc_agent-0.3.0/src/hpc_agent/_schema_models/validators/validate_input_dataset.py +44 -0
  65. hpc_agent-0.3.0/src/hpc_agent/_schema_models/validators/validate_self_qos_limit.py +49 -0
  66. hpc_agent-0.3.0/src/hpc_agent/_schema_models/validators/validate_stochastic_marker.py +72 -0
  67. hpc_agent-0.3.0/src/hpc_agent/_schema_models/validators/validate_walltime_against_history.py +41 -0
  68. hpc_agent-0.3.0/src/hpc_agent/_schema_models/workflows/__init__.py +0 -0
  69. hpc_agent-0.3.0/src/hpc_agent/_schema_models/workflows/aggregate_flow.py +125 -0
  70. hpc_agent-0.3.0/src/hpc_agent/_schema_models/workflows/monitor_flow.py +128 -0
  71. hpc_agent-0.3.0/src/hpc_agent/_schema_models/workflows/submit_flow.py +173 -0
  72. hpc_agent-0.3.0/src/hpc_agent/_schema_models/workflows/submit_flow_batch.py +92 -0
  73. hpc_agent-0.3.0/src/hpc_agent/_schema_models/workflows/validate_campaign.py +167 -0
  74. hpc_agent-0.3.0/src/hpc_agent/_schema_models/workflows/verify_canary.py +46 -0
  75. hpc_agent-0.3.0/src/hpc_agent/agent_assets.py +119 -0
  76. hpc_agent-0.3.0/src/hpc_agent/agent_cli.py +2808 -0
  77. hpc_agent-0.3.0/src/hpc_agent/atoms/__init__.py +7 -0
  78. hpc_agent-0.3.0/src/hpc_agent/atoms/aggregation_invariants.py +205 -0
  79. hpc_agent-0.3.0/src/hpc_agent/atoms/axes_init.py +93 -0
  80. hpc_agent-0.3.0/src/hpc_agent/atoms/build_executor.py +73 -0
  81. hpc_agent-0.3.0/src/hpc_agent/atoms/build_submit_spec.py +239 -0
  82. hpc_agent-0.3.0/src/hpc_agent/atoms/build_tasks_py.py +552 -0
  83. hpc_agent-0.3.0/src/hpc_agent/atoms/build_template.py +159 -0
  84. hpc_agent-0.3.0/src/hpc_agent/atoms/campaign_advance.py +103 -0
  85. hpc_agent-0.3.0/src/hpc_agent/atoms/campaign_budget.py +126 -0
  86. hpc_agent-0.3.0/src/hpc_agent/atoms/campaign_converged.py +168 -0
  87. hpc_agent-0.3.0/src/hpc_agent/atoms/campaign_health.py +208 -0
  88. hpc_agent-0.3.0/src/hpc_agent/atoms/campaign_init.py +120 -0
  89. hpc_agent-0.3.0/src/hpc_agent/atoms/campaign_list.py +43 -0
  90. hpc_agent-0.3.0/src/hpc_agent/atoms/campaign_replay.py +72 -0
  91. hpc_agent-0.3.0/src/hpc_agent/atoms/campaign_status.py +42 -0
  92. hpc_agent-0.3.0/src/hpc_agent/atoms/canary_verify.py +293 -0
  93. hpc_agent-0.3.0/src/hpc_agent/atoms/capabilities.py +90 -0
  94. hpc_agent-0.3.0/src/hpc_agent/atoms/cluster_reduce.py +298 -0
  95. hpc_agent-0.3.0/src/hpc_agent/atoms/clusters.py +77 -0
  96. hpc_agent-0.3.0/src/hpc_agent/atoms/failures.py +159 -0
  97. hpc_agent-0.3.0/src/hpc_agent/atoms/interview.py +299 -0
  98. hpc_agent-0.3.0/src/hpc_agent/atoms/list_in_flight.py +75 -0
  99. hpc_agent-0.3.0/src/hpc_agent/atoms/logs.py +111 -0
  100. hpc_agent-0.3.0/src/hpc_agent/atoms/monitor_arm.py +259 -0
  101. hpc_agent-0.3.0/src/hpc_agent/atoms/monitor_summary.py +175 -0
  102. hpc_agent-0.3.0/src/hpc_agent/atoms/plan_throughput.py +118 -0
  103. hpc_agent-0.3.0/src/hpc_agent/atoms/preflight.py +121 -0
  104. hpc_agent-0.3.0/src/hpc_agent/atoms/recall.py +455 -0
  105. hpc_agent-0.3.0/src/hpc_agent/atoms/recommend_partition.py +159 -0
  106. hpc_agent-0.3.0/src/hpc_agent/atoms/setup_actions.py +248 -0
  107. hpc_agent-0.3.0/src/hpc_agent/atoms/submit_plan_summary.py +130 -0
  108. hpc_agent-0.3.0/src/hpc_agent/atoms/validate_executor_signatures.py +240 -0
  109. hpc_agent-0.3.0/src/hpc_agent/atoms/validate_input_dataset.py +283 -0
  110. hpc_agent-0.3.0/src/hpc_agent/atoms/validate_self_qos_limit.py +110 -0
  111. hpc_agent-0.3.0/src/hpc_agent/atoms/validate_stochastic_marker.py +122 -0
  112. hpc_agent-0.3.0/src/hpc_agent/atoms/validate_walltime_against_history.py +218 -0
  113. hpc_agent-0.3.0/src/hpc_agent/campaign/__init__.py +31 -0
  114. hpc_agent-0.3.0/src/hpc_agent/campaign/cursor.py +130 -0
  115. hpc_agent-0.3.0/src/hpc_agent/campaign/dirs.py +34 -0
  116. hpc_agent-0.3.0/src/hpc_agent/campaign/manifest.py +123 -0
  117. hpc_agent-0.3.0/src/hpc_agent/config/clusters.yaml +114 -0
  118. hpc_agent-0.3.0/src/hpc_agent/errors.py +255 -0
  119. hpc_agent-0.3.0/src/hpc_agent/executor_cli.py +192 -0
  120. hpc_agent-0.3.0/src/hpc_agent/flows/__init__.py +16 -0
  121. hpc_agent-0.3.0/src/hpc_agent/flows/aggregate_flow.py +378 -0
  122. hpc_agent-0.3.0/src/hpc_agent/flows/monitor_flow.py +613 -0
  123. hpc_agent-0.3.0/src/hpc_agent/flows/resubmit_flow.py +578 -0
  124. hpc_agent-0.3.0/src/hpc_agent/flows/submit_flow.py +610 -0
  125. hpc_agent-0.3.0/src/hpc_agent/flows/validate_campaign.py +201 -0
  126. hpc_agent-0.3.0/src/hpc_agent/hooks/__init__.py +9 -0
  127. hpc_agent-0.3.0/src/hpc_agent/hooks/install.py +196 -0
  128. hpc_agent-0.3.0/src/hpc_agent/hooks/monitor_armed_check.py +258 -0
  129. hpc_agent-0.3.0/src/hpc_agent/infra/__init__.py +14 -0
  130. hpc_agent-0.3.0/src/hpc_agent/infra/backends/__init__.py +389 -0
  131. hpc_agent-0.3.0/src/hpc_agent/infra/backends/_remote_base.py +67 -0
  132. hpc_agent-0.3.0/src/hpc_agent/infra/backends/query.py +454 -0
  133. hpc_agent-0.3.0/src/hpc_agent/infra/backends/sge.py +177 -0
  134. hpc_agent-0.3.0/src/hpc_agent/infra/backends/sge_remote.py +69 -0
  135. hpc_agent-0.3.0/src/hpc_agent/infra/backends/slurm.py +196 -0
  136. hpc_agent-0.3.0/src/hpc_agent/infra/backends/slurm_remote.py +88 -0
  137. hpc_agent-0.3.0/src/hpc_agent/infra/cache.py +143 -0
  138. hpc_agent-0.3.0/src/hpc_agent/infra/clusters.py +424 -0
  139. hpc_agent-0.3.0/src/hpc_agent/infra/gpu.py +432 -0
  140. hpc_agent-0.3.0/src/hpc_agent/infra/inspect/__init__.py +172 -0
  141. hpc_agent-0.3.0/src/hpc_agent/infra/inspect/_common.py +181 -0
  142. hpc_agent-0.3.0/src/hpc_agent/infra/inspect/_persist.py +203 -0
  143. hpc_agent-0.3.0/src/hpc_agent/infra/inspect/sge.py +184 -0
  144. hpc_agent-0.3.0/src/hpc_agent/infra/inspect/slurm.py +340 -0
  145. hpc_agent-0.3.0/src/hpc_agent/infra/parsing.py +247 -0
  146. hpc_agent-0.3.0/src/hpc_agent/infra/remote.py +890 -0
  147. hpc_agent-0.3.0/src/hpc_agent/infra/slurm_reservations.py +311 -0
  148. hpc_agent-0.3.0/src/hpc_agent/integration/__init__.py +87 -0
  149. hpc_agent-0.3.0/src/hpc_agent/mapreduce/__init__.py +1 -0
  150. hpc_agent-0.3.0/src/hpc_agent/mapreduce/combiner.py +390 -0
  151. hpc_agent-0.3.0/src/hpc_agent/mapreduce/dispatch.py +609 -0
  152. hpc_agent-0.3.0/src/hpc_agent/mapreduce/metrics_io.py +109 -0
  153. hpc_agent-0.3.0/src/hpc_agent/mapreduce/reduce/__init__.py +23 -0
  154. hpc_agent-0.3.0/src/hpc_agent/mapreduce/reduce/classify.py +120 -0
  155. hpc_agent-0.3.0/src/hpc_agent/mapreduce/reduce/history.py +161 -0
  156. hpc_agent-0.3.0/src/hpc_agent/mapreduce/reduce/metrics.py +277 -0
  157. hpc_agent-0.3.0/src/hpc_agent/mapreduce/reduce/rollup.py +95 -0
  158. hpc_agent-0.3.0/src/hpc_agent/mapreduce/reduce/status.py +680 -0
  159. hpc_agent-0.3.0/src/hpc_agent/mapreduce/reduce/tui.py +576 -0
  160. hpc_agent-0.3.0/src/hpc_agent/mapreduce/templates/runtime/common/gpu_preamble.sh +85 -0
  161. hpc_agent-0.3.0/src/hpc_agent/mapreduce/templates/runtime/common/hpc_preamble.sh +231 -0
  162. hpc_agent-0.3.0/src/hpc_agent/mapreduce/templates/runtime/sge/cpu_array.sh +82 -0
  163. hpc_agent-0.3.0/src/hpc_agent/mapreduce/templates/runtime/sge/gpu_array.sh +90 -0
  164. hpc_agent-0.3.0/src/hpc_agent/mapreduce/templates/runtime/slurm/cpu_array.slurm +88 -0
  165. hpc_agent-0.3.0/src/hpc_agent/mapreduce/templates/runtime/slurm/gpu_array.slurm +95 -0
  166. hpc_agent-0.3.0/src/hpc_agent/mapreduce/templates/scaffolds/cli_dispatcher.py +154 -0
  167. hpc_agent-0.3.0/src/hpc_agent/mapreduce/templates/scaffolds/executor_template.py +174 -0
  168. hpc_agent-0.3.0/src/hpc_agent/mapreduce/templates/scaffolds/tasks_example.py +109 -0
  169. hpc_agent-0.3.0/src/hpc_agent/operations.json +620 -0
  170. hpc_agent-0.3.0/src/hpc_agent/planning/__init__.py +15 -0
  171. hpc_agent-0.3.0/src/hpc_agent/planning/axes.py +384 -0
  172. hpc_agent-0.3.0/src/hpc_agent/planning/constraints.py +64 -0
  173. hpc_agent-0.3.0/src/hpc_agent/planning/resubmit_batching.py +215 -0
  174. hpc_agent-0.3.0/src/hpc_agent/planning/stages.py +128 -0
  175. hpc_agent-0.3.0/src/hpc_agent/planning/throughput.py +188 -0
  176. hpc_agent-0.3.0/src/hpc_agent/py.typed +0 -0
  177. hpc_agent-0.3.0/src/hpc_agent/runner/__init__.py +73 -0
  178. hpc_agent-0.3.0/src/hpc_agent/runner/_ssh.py +26 -0
  179. hpc_agent-0.3.0/src/hpc_agent/runner/aggregate.py +193 -0
  180. hpc_agent-0.3.0/src/hpc_agent/runner/combine.py +78 -0
  181. hpc_agent-0.3.0/src/hpc_agent/runner/failure_signatures.py +206 -0
  182. hpc_agent-0.3.0/src/hpc_agent/runner/failures.py +267 -0
  183. hpc_agent-0.3.0/src/hpc_agent/runner/logs.py +71 -0
  184. hpc_agent-0.3.0/src/hpc_agent/runner/reconcile.py +219 -0
  185. hpc_agent-0.3.0/src/hpc_agent/runner/resubmit.py +129 -0
  186. hpc_agent-0.3.0/src/hpc_agent/runner/status.py +123 -0
  187. hpc_agent-0.3.0/src/hpc_agent/runner/submit.py +160 -0
  188. hpc_agent-0.3.0/src/hpc_agent/runner/update_constraints.py +173 -0
  189. hpc_agent-0.3.0/src/hpc_agent/schemas/aggregate_flow.input.json +69 -0
  190. hpc_agent-0.3.0/src/hpc_agent/schemas/aggregate_flow.output.json +81 -0
  191. hpc_agent-0.3.0/src/hpc_agent/schemas/axes.json +75 -0
  192. hpc_agent-0.3.0/src/hpc_agent/schemas/build_executor.output.json +30 -0
  193. hpc_agent-0.3.0/src/hpc_agent/schemas/build_submit_spec.input.json +266 -0
  194. hpc_agent-0.3.0/src/hpc_agent/schemas/build_tasks_py.input.json +184 -0
  195. hpc_agent-0.3.0/src/hpc_agent/schemas/campaign.output.json +113 -0
  196. hpc_agent-0.3.0/src/hpc_agent/schemas/campaign_health.input.json +59 -0
  197. hpc_agent-0.3.0/src/hpc_agent/schemas/campaign_health.output.json +94 -0
  198. hpc_agent-0.3.0/src/hpc_agent/schemas/campaign_manifest.json +246 -0
  199. hpc_agent-0.3.0/src/hpc_agent/schemas/capabilities.output.json +243 -0
  200. hpc_agent-0.3.0/src/hpc_agent/schemas/cluster_reduce.output.json +52 -0
  201. hpc_agent-0.3.0/src/hpc_agent/schemas/clusters_describe.output.json +40 -0
  202. hpc_agent-0.3.0/src/hpc_agent/schemas/clusters_list.output.json +50 -0
  203. hpc_agent-0.3.0/src/hpc_agent/schemas/combine_wave.output.json +59 -0
  204. hpc_agent-0.3.0/src/hpc_agent/schemas/decide_monitor_arm.input.json +89 -0
  205. hpc_agent-0.3.0/src/hpc_agent/schemas/decide_monitor_arm.output.json +90 -0
  206. hpc_agent-0.3.0/src/hpc_agent/schemas/discover.output.json +57 -0
  207. hpc_agent-0.3.0/src/hpc_agent/schemas/envelope.json +156 -0
  208. hpc_agent-0.3.0/src/hpc_agent/schemas/failures.output.json +143 -0
  209. hpc_agent-0.3.0/src/hpc_agent/schemas/find_prior_run.output.json +116 -0
  210. hpc_agent-0.3.0/src/hpc_agent/schemas/inspect_cluster.output.json +170 -0
  211. hpc_agent-0.3.0/src/hpc_agent/schemas/interview.input.json +550 -0
  212. hpc_agent-0.3.0/src/hpc_agent/schemas/interview.output.json +92 -0
  213. hpc_agent-0.3.0/src/hpc_agent/schemas/list_in_flight.output.json +90 -0
  214. hpc_agent-0.3.0/src/hpc_agent/schemas/monitor_flow.input.json +51 -0
  215. hpc_agent-0.3.0/src/hpc_agent/schemas/monitor_flow.output.json +79 -0
  216. hpc_agent-0.3.0/src/hpc_agent/schemas/monitor_summary.output.json +54 -0
  217. hpc_agent-0.3.0/src/hpc_agent/schemas/preflight.output.json +58 -0
  218. hpc_agent-0.3.0/src/hpc_agent/schemas/recall.input.json +82 -0
  219. hpc_agent-0.3.0/src/hpc_agent/schemas/recall.output.json +502 -0
  220. hpc_agent-0.3.0/src/hpc_agent/schemas/recommend_partition.input.json +80 -0
  221. hpc_agent-0.3.0/src/hpc_agent/schemas/recommend_partition.output.json +43 -0
  222. hpc_agent-0.3.0/src/hpc_agent/schemas/reconcile.output.json +53 -0
  223. hpc_agent-0.3.0/src/hpc_agent/schemas/resubmit.input.json +144 -0
  224. hpc_agent-0.3.0/src/hpc_agent/schemas/runtime_prior.output.json +158 -0
  225. hpc_agent-0.3.0/src/hpc_agent/schemas/stages.input.json +244 -0
  226. hpc_agent-0.3.0/src/hpc_agent/schemas/status.output.json +109 -0
  227. hpc_agent-0.3.0/src/hpc_agent/schemas/submit.input.json +89 -0
  228. hpc_agent-0.3.0/src/hpc_agent/schemas/submit.output.json +37 -0
  229. hpc_agent-0.3.0/src/hpc_agent/schemas/submit_flow.input.json +194 -0
  230. hpc_agent-0.3.0/src/hpc_agent/schemas/submit_flow.output.json +73 -0
  231. hpc_agent-0.3.0/src/hpc_agent/schemas/submit_flow_batch.input.json +244 -0
  232. hpc_agent-0.3.0/src/hpc_agent/schemas/submit_flow_batch.output.json +98 -0
  233. hpc_agent-0.3.0/src/hpc_agent/schemas/suggest_setup_action.output.json +62 -0
  234. hpc_agent-0.3.0/src/hpc_agent/schemas/summarize_submit_plan.output.json +28 -0
  235. hpc_agent-0.3.0/src/hpc_agent/schemas/update_run_constraints.input.json +43 -0
  236. hpc_agent-0.3.0/src/hpc_agent/schemas/update_run_constraints.output.json +37 -0
  237. hpc_agent-0.3.0/src/hpc_agent/schemas/validate_campaign.input.json +164 -0
  238. hpc_agent-0.3.0/src/hpc_agent/schemas/validate_campaign.output.json +120 -0
  239. hpc_agent-0.3.0/src/hpc_agent/schemas/validate_executor_signatures.input.json +37 -0
  240. hpc_agent-0.3.0/src/hpc_agent/schemas/validate_executor_signatures.output.json +99 -0
  241. hpc_agent-0.3.0/src/hpc_agent/schemas/validate_input_dataset.input.json +41 -0
  242. hpc_agent-0.3.0/src/hpc_agent/schemas/validate_input_dataset.output.json +99 -0
  243. hpc_agent-0.3.0/src/hpc_agent/schemas/validate_self_qos_limit.input.json +52 -0
  244. hpc_agent-0.3.0/src/hpc_agent/schemas/validate_self_qos_limit.output.json +99 -0
  245. hpc_agent-0.3.0/src/hpc_agent/schemas/validate_stochastic_marker.input.json +26 -0
  246. hpc_agent-0.3.0/src/hpc_agent/schemas/validate_stochastic_marker.output.json +107 -0
  247. hpc_agent-0.3.0/src/hpc_agent/schemas/validate_walltime_against_history.input.json +48 -0
  248. hpc_agent-0.3.0/src/hpc_agent/schemas/validate_walltime_against_history.output.json +99 -0
  249. hpc_agent-0.3.0/src/hpc_agent/schemas/verify_aggregation_complete.output.json +94 -0
  250. hpc_agent-0.3.0/src/hpc_agent/schemas/verify_canary.output.json +65 -0
  251. hpc_agent-0.3.0/src/hpc_agent/state/__init__.py +14 -0
  252. hpc_agent-0.3.0/src/hpc_agent/state/discover.py +479 -0
  253. hpc_agent-0.3.0/src/hpc_agent/state/runs.py +649 -0
  254. hpc_agent-0.3.0/src/hpc_agent/state/runtime_prior.py +498 -0
  255. hpc_agent-0.3.0/src/hpc_agent/state/user_profiles.py +392 -0
  256. hpc_agent-0.3.0/src/hpc_agent/template/__init__.py +110 -0
  257. hpc_agent-0.3.0/src/hpc_agent/template/_runtime.py +266 -0
  258. hpc_agent-0.3.0/src/hpc_agent/template/axis.py +159 -0
  259. hpc_agent-0.3.0/src/hpc_agent/template/discover.py +182 -0
  260. hpc_agent-0.3.0/src/hpc_agent/template/elision.py +192 -0
  261. hpc_agent-0.3.0/src/hpc_agent/template/notebook.py +151 -0
  262. hpc_agent-0.3.0/src/hpc_agent/template/plan.py +158 -0
  263. hpc_agent-0.3.0/src/hpc_agent/template/reduce.py +67 -0
  264. hpc_agent-0.3.0/src/hpc_agent/template/register.py +20 -0
  265. hpc_agent-0.3.0/src/hpc_agent/template/scaffold/Makefile.tmpl +8 -0
  266. hpc_agent-0.3.0/src/hpc_agent/template/scaffold/ci.yml.tmpl +28 -0
  267. hpc_agent-0.3.0/src/hpc_agent/template/scaffold/conftest.py.tmpl +25 -0
  268. hpc_agent-0.3.0/src/hpc_agent/template/scaffold/experiment.ipynb.tmpl +57 -0
  269. hpc_agent-0.3.0/src/hpc_agent/template/scaffold/pre-commit-config.yaml.tmpl +12 -0
  270. hpc_agent-0.3.0/src/hpc_agent/template/scaffold/pyproject.toml.tmpl +24 -0
  271. hpc_agent-0.3.0/src/hpc_agent/template/scaffold/scaffold.py.tmpl +113 -0
  272. hpc_agent-0.3.0/src/hpc_agent/template/scaffold/template.mk.tmpl +29 -0
  273. hpc_agent-0.3.0/src/hpc_agent/template/series.py +32 -0
  274. hpc_agent-0.3.0/src/hpc_agent/template/signature.py +318 -0
  275. hpc_agent-0.3.0/src/hpc_agent.egg-info/PKG-INFO +313 -0
  276. hpc_agent-0.3.0/src/hpc_agent.egg-info/SOURCES.txt +292 -0
  277. hpc_agent-0.3.0/src/hpc_agent.egg-info/dependency_links.txt +1 -0
  278. hpc_agent-0.3.0/src/hpc_agent.egg-info/entry_points.txt +2 -0
  279. hpc_agent-0.3.0/src/hpc_agent.egg-info/requires.txt +17 -0
  280. hpc_agent-0.3.0/src/hpc_agent.egg-info/top_level.txt +2 -0
  281. hpc_agent-0.3.0/src/slash_commands/__init__.py +0 -0
  282. hpc_agent-0.3.0/src/slash_commands/commands/aggregate-hpc.md +52 -0
  283. hpc_agent-0.3.0/src/slash_commands/commands/campaign-hpc.md +34 -0
  284. hpc_agent-0.3.0/src/slash_commands/commands/hpc-axes-init.md +31 -0
  285. hpc_agent-0.3.0/src/slash_commands/commands/monitor-hpc.md +49 -0
  286. hpc_agent-0.3.0/src/slash_commands/commands/preflight.md +19 -0
  287. hpc_agent-0.3.0/src/slash_commands/commands/submit-hpc.md +69 -0
  288. hpc_agent-0.3.0/src/slash_commands/commands/validate-campaign.md +101 -0
  289. hpc_agent-0.3.0/src/slash_commands/skills/hpc-aggregate/SKILL.md +71 -0
  290. hpc_agent-0.3.0/src/slash_commands/skills/hpc-build-executor/SKILL.md +57 -0
  291. hpc_agent-0.3.0/src/slash_commands/skills/hpc-campaign/SKILL.md +63 -0
  292. hpc_agent-0.3.0/src/slash_commands/skills/hpc-preflight/SKILL.md +32 -0
  293. hpc_agent-0.3.0/src/slash_commands/skills/hpc-status/SKILL.md +53 -0
  294. hpc_agent-0.3.0/src/slash_commands/skills/hpc-submit/SKILL.md +259 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 hpc-agent contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,313 @@
1
+ Metadata-Version: 2.4
2
+ Name: hpc-agent
3
+ Version: 0.3.0
4
+ Summary: HPC orchestrator for Claude Code and external agent harnesses
5
+ License: MIT
6
+ Project-URL: Homepage, https://github.com/jamesdchen/hpc-agent
7
+ Project-URL: Repository, https://github.com/jamesdchen/hpc-agent
8
+ Project-URL: Issues, https://github.com/jamesdchen/hpc-agent/issues
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Environment :: Console
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Operating System :: POSIX
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Topic :: System :: Distributed Computing
17
+ Requires-Python: >=3.10
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: pyyaml>=6.0
21
+ Requires-Dist: jsonschema>=4.18
22
+ Requires-Dist: referencing>=0.30
23
+ Requires-Dist: pydantic>=2.6
24
+ Provides-Extra: dev
25
+ Requires-Dist: ruff>=0.4; extra == "dev"
26
+ Requires-Dist: mypy>=1.10; extra == "dev"
27
+ Requires-Dist: pytest>=8.0; extra == "dev"
28
+ Requires-Dist: pytest-xdist>=3.8; extra == "dev"
29
+ Requires-Dist: pytest-cov>=5.0; extra == "dev"
30
+ Requires-Dist: hypothesis>=6.100; extra == "dev"
31
+ Requires-Dist: types-PyYAML>=6.0; extra == "dev"
32
+ Requires-Dist: pre-commit>=3.5; extra == "dev"
33
+ Provides-Extra: tui
34
+ Requires-Dist: rich>=13.0; extra == "tui"
35
+ Dynamic: license-file
36
+
37
+ # hpc-agent
38
+
39
+ HPC orchestrator for array-batch experiments on SGE/SLURM clusters. Two surfaces over one core:
40
+
41
+ - **Slash commands for humans** in Claude Code (`/submit-hpc`, `/monitor-hpc`, `/aggregate-hpc`, `/campaign-hpc`, `/preflight`) — interactive markdown templates in `slash_commands/commands/*.md` that walk you through choosing a cluster and authoring `.hpc/tasks.py`. Executor scaffolding is folded into `/submit-hpc` Step 1; preflight is folded into `/submit-hpc` Step 6b as an idempotent gate (with `/preflight` still available as a standalone diagnostic).
42
+ - **CLI for agents and automation** (`hpc-agent <subcommand>`) — JSON-in, JSON-out, exit codes. Designed to be invoked via a `Bash`-style tool by external orchestrators. This is a POSIX-native agent surface: any tool that can shell out and parse JSON can drive a cluster — see [`docs/reference/agent-surface.md`](docs/reference/agent-surface.md). For integrators: [`docs/integrations/CONTRACT.md`](docs/integrations/CONTRACT.md).
43
+
44
+ Both surfaces invoke `hpc-agent <subcommand>`. The slash commands are pure markdown that orchestrate the binary; the binary's atomic-ops layer (`hpc_agent.runner`) ensures cross-surface state — in-flight runs, journal records under `~/.claude/hpc/<repo_hash>/` — is shared automatically.
45
+
46
+ ## Quick Start
47
+
48
+ ### For humans (Claude Code)
49
+
50
+ ```bash
51
+ pip install hpc-agent # or `pip install -e .` from a checkout
52
+ hpc-agent setup # copy commands + skills, wire the Stop hooks
53
+ ```
54
+ `hpc-agent setup` copies the bundled slash commands into
55
+ `~/.claude/commands/` and the skills into `~/.claude/skills/`, then
56
+ installs hpc-agent's Stop hooks — all idempotent, so re-running is
57
+ safe. Both asset trees ship inside the package, so this works the same
58
+ from a wheel install or an editable checkout. Pass `--no-hooks` to
59
+ skip the hook step or `--dry-run` to preview. Every command
60
+ (`/preflight`, `/submit-hpc`, `/monitor-hpc`, `/aggregate-hpc`,
61
+ `/campaign-hpc`, `/hpc-axes-init`) and skill ships inside the package.
62
+
63
+ Once installed:
64
+
65
+ - `/preflight` (optional) — verify SSH agent + cluster reachability. `/submit-hpc` auto-runs this as a cached gate, so you only need it for ad-hoc diagnostics.
66
+ - `/submit-hpc` — answer prompts about cluster, executor, grid params. Scaffolds the executor inline if none exists.
67
+ - `/monitor-hpc` to monitor, `/aggregate-hpc` to collect results.
68
+
69
+ ### For agents and automation
70
+
71
+ ```bash
72
+ pip install hpc-agent
73
+ hpc-agent preflight --cluster hoffman2 # health check
74
+ hpc-agent interview --spec intent.json --campaign-dir <d> # persist campaign intent next to tasks.py
75
+ hpc-agent recall --root ~/experiments --task-kind <kind> # query past interviews for next-interview grounding
76
+ hpc-agent submit --spec spec.json # JSON envelope on stdout
77
+ hpc-agent status --run-id <id> # one-shot snapshot; poll as needed
78
+ hpc-agent aggregate --run-id <id> --wave 1 # combiner + result pull
79
+ ```
80
+ Stdout is a single-line JSON envelope: `{"ok": true, "idempotent": ..., "data": {...}}` or `{"ok": false, "error_code": ..., "retry_safe": ..., "remediation": ...}`. Exit codes: 0 ok, 1 user error, 2 cluster/network, 3 internal. Full schema in [`docs/reference/cli-spec.md`](docs/reference/cli-spec.md); JSON Schema files for runtime validation under `hpc_agent/schemas/`.
81
+
82
+ ### For integrators
83
+
84
+ hpc-agent is `Bash`-invokable from any agent harness with a JSON
85
+ parser. See **[`docs/integrations/CONTRACT.md`](docs/integrations/CONTRACT.md)**
86
+ for the full contract: the spawn env block,
87
+ `error_code` → retry policy table, the `find-prior-run` → `submit` →
88
+ `monitor-summary` → `verify-aggregation-complete` workflow, the
89
+ `.hpc/tasks.py` boundary, and the executor import allowlist.
90
+
91
+ The canonical reference for `.hpc/tasks.py` is shipped inside the
92
+ package at
93
+ [`src/hpc_agent/mapreduce/templates/scaffolds/tasks_example.py`](src/hpc_agent/mapreduce/templates/scaffolds/tasks_example.py).
94
+ It demonstrates three patterns (Cartesian product, chunking by row
95
+ count, date-window backtests) inline. Integrators locate it at runtime
96
+ via `from hpc_agent import _PACKAGE_ROOT` or `rglob("tasks_example.py")`.
97
+
98
+ The most common first-time failure is the harness's default-empty
99
+ spawn env dropping `SSH_AUTH_SOCK`. `hpc-agent
100
+ status`/`aggregate`/`reconcile` fail fast with `error_code:
101
+ "ssh_unreachable"` (exit 2) instead of hanging on auth — run
102
+ `hpc-agent preflight` first to verify the spawn env. hpc-agent does
103
+ not kill cluster jobs by design (`settings.json` denies
104
+ `scancel`/`qdel`); if the integrator decides a run is bad, stop
105
+ polling and let it expire.
106
+
107
+ ---
108
+
109
+ ## Standalone usage
110
+
111
+ ### Organize your experiment repo
112
+
113
+ Keep standalone executor scripts in a dedicated directory, separate from shared utilities:
114
+
115
+ ```
116
+ my_experiment/
117
+ ├── executors/ # or src/ — each file is a runnable experiment
118
+ │ ├── ml_ridge.py # python3 executors/ml_ridge.py --help
119
+ │ ├── ml_xgboost.py
120
+ │ └── dl_patchts.py
121
+ ├── lib/ # shared utilities (not executors)
122
+ │ ├── loading.py
123
+ │ └── transforms.py
124
+ └── data/
125
+ ```
126
+
127
+ Each executor accepts experiment-specific arguments (`--horizon`, `--start`, `--end`, `--features`, etc.). No HPC awareness is needed — all parameters arrive as CLI flags.
128
+
129
+ ### Run
130
+
131
+ ```
132
+ /preflight → verify SSH agent + cluster reachability before first submit
133
+ /submit-hpc → discovers executors, walks you through .hpc/tasks.py, syncs code, submits
134
+ /monitor-hpc → tracks completion per grid point, diagnoses failures, auto-resubmits
135
+ /aggregate-hpc → validates completeness, runs aggregation, downloads summaries
136
+ ```
137
+
138
+ **Example conversation:**
139
+
140
+ ```
141
+ You: /submit-hpc run ridge and xgboost with horizon=[1, 5, 25]
142
+
143
+ Claude: I found these executors in src/:
144
+ ml_ridge.py — --horizon, --start, --end, --output-file
145
+ ml_xgboost.py — --horizon, --start, --end, --output-file
146
+
147
+ Proposed plan:
148
+ Cluster: hoffman2 (SGE)
149
+ Grid: executor=[ml_ridge, ml_xgboost] × horizon=[1, 5, 25] → 6 grid points
150
+ Total: 6 tasks
151
+ Resources: 1 CPU, 16G, 4:00:00
152
+ Confirm?
153
+
154
+ You: yes
155
+
156
+ Claude: Submitted job 12345678 (6 tasks). Run /monitor-hpc to track progress.
157
+ ```
158
+
159
+ No config files required. Claude discovers your executors by reading their source and `--help`, then suggests resources conversationally based on the executor and your input.
160
+
161
+ ## How It Works
162
+
163
+ The boundary between hpc-agent and your experiment repo is documented in [`docs/reference/boundary-contract.md`](docs/reference/boundary-contract.md) and enforced by `tests/test_boundary_contract.py`.
164
+
165
+ 1. Claude reads your executor scripts and their `--help` output.
166
+ 2. You describe what to run in natural language — Claude walks you through writing `.hpc/tasks.py` once: a small Python module exposing `total()` and `resolve(task_id)` that returns the per-task kwargs. The file is committed to git and reused on every subsequent submit.
167
+ 3. A per-run sidecar `.hpc/runs/<run_id>.json` records the executor command, result-dir template, `cmd_sha`, and wave map for this particular submission.
168
+ 4. The framework executor `_hpc_dispatch.py` (zero deps, stdlib-only) is deployed to the cluster's `.hpc/` by `deploy_runtime`.
169
+ 5. The job template runs the dispatcher, which imports your `.hpc/tasks.py`, calls `resolve(task_id)`, formats the result_dir, and execs your executor command with kwargs as env vars.
170
+ 6. Your executor reads kwargs as ordinary env vars (uppercased + `HPC_KW_*`) — no HPC awareness needed.
171
+
172
+ ### Parallelism Model
173
+
174
+ The parallelization axis lives entirely in user code (`.hpc/tasks.py`). The framework is agnostic to whether you're doing a Cartesian grid, chunking by row count, date-window backtests, or something else — it just calls `total()` and `resolve(i)`. The canonical reference at `hpc_agent/mapreduce/templates/scaffolds/tasks_example.py` shows three patterns inline; the agent helps you keep whichever applies and delete the rest.
175
+
176
+ ### Memory across campaigns
177
+
178
+ Two primitives — `interview` and `recall` — close the loop between consecutive campaigns. The interview agent (Claude Code or any external orchestrator) persists structured intent (`goal`, `task_count`, `budget`, `abort_if`, `task_generator`, `cluster_target`, `transcript`, provenance) into `<campaign_dir>/interview.json` next to the materialized `tasks.py`. The next interview calls `recall --root <experiments-dir>` to query past intents, returning recency-sorted summaries plus a 3-tier rollup (counts/histograms/quantiles, optional walltime aggregation, optional per-generator parameter envelopes). Observed ranges only — reasoning over them stays in the calling agent.
179
+
180
+ See [`docs/workflows/memory-across-campaigns.md`](docs/workflows/memory-across-campaigns.md) for the full flow, including the `task_generator` typed materializer (5 shapes: `enumerated`, `cartesian_product`, `items_x_seeds`, `numeric_logspace`, `numeric_linspace`) and the `~/.hpc-agent/config.json:experiment_roots` default-root config.
181
+
182
+ ### Throughput Optimization
183
+
184
+ hpc-agent automatically optimizes job submissions for cluster constraints. When constraints are configured (max array size, walltime, concurrent job limits), the optimizer packs tasks into batched waves:
185
+
186
+ - Tasks are split into arrays of ≤max_array_size
187
+ - Arrays are grouped into waves of ≤max_concurrent_jobs
188
+ - Waves are staggered via scheduler dependencies (SLURM `--dependency`, SGE `-hold_jid`)
189
+ - Total wall-clock time is estimated when per-task duration is known
190
+
191
+ Configure constraints in `clusters.yaml` (cluster-level); per-experiment overrides resolved at `/submit-hpc` time are persisted to the run sidecar at `.hpc/runs/<run_id>.json`.
192
+
193
+ ## Commands
194
+
195
+ | Command | What it does |
196
+ |---------|-------------|
197
+ | `/preflight` | Standalone: verify SSH agent, ssh/rsync on PATH, clusters.yaml parses, cluster reachable. `/submit-hpc` auto-runs the same checks as a 24h-cached gate, so direct invocation is mostly for ad-hoc diagnostics. |
198
+ | `/submit-hpc` | Discover executors (scaffolds inline if none found), build grid conversationally, write `.hpc/tasks.py` with FLAGS dict + `.hpc/cli.py` dispatcher, sync code, submit array jobs |
199
+ | `/monitor-hpc` | Poll status, diagnose failures, auto-resubmit, self-schedule next check |
200
+ | `/aggregate-hpc` | Validate completeness, run aggregation on cluster, download summaries |
201
+ | `/campaign-hpc` | Closed-loop iteration: tag submits, read prior history, repeat `/submit-hpc campaign_id=<slug>` until the strategy stops. See [`docs/workflows/campaign.md`](docs/workflows/campaign.md). |
202
+ | `/hpc-axes-init` | Write `<experiment>/.hpc/axes.yaml` with the parallel-axis enumeration + homogeneity hint that drives the cold-start (and warm-path) array-axis picker. |
203
+
204
+ ### Primitives
205
+
206
+ The slash commands above compose ~50 primitives exposed as `hpc-agent <name>`. Full machine-readable catalog at `docs/generated/operations.md` (auto-regenerated). High-traffic ones for agent orchestration:
207
+
208
+ | Primitive | Replaces |
209
+ |---|---|
210
+ | `submit-flow` / `submit-flow-batch` | rsync + deploy + qsub + record (single or N-spec batch with shared rsync). Auto-dispatches when the spec is `{specs: [...]}`. |
211
+ | `monitor-flow` | Poll-and-combine loop the slash command's tick body wraps. |
212
+ | `aggregate-flow` | rsync_pull `_combiner/` + `reduce_partials` + optional summary pull + ingest runtime samples. |
213
+ | `build-submit-spec` | Resolved-interview-values → validated `submit_flow.input.json` spec. |
214
+ | `build-tasks-py` | Cartesian-product axes → `.hpc/tasks.py` from the canonical Pattern 1 template. |
215
+ | `discover-executors` / `discover-reducers` | Scan repo for executor scripts / aggregator scripts (find existing reducer instead of writing a fresh one). |
216
+ | `decide-monitor-arm` | Pick cron/loop/none + cadence + cron schedule + literal `armed:` line. |
217
+ | `monitor-summary` | Canonical user-facing tick summary (byte-stable framing). |
218
+ | `summarize-submit-plan` | Canonical pre-submit confirmation summary. |
219
+ | `verify-canary` | Wait + grep + output-check protocol for 1-task canary submissions. |
220
+ | `verify-aggregation-complete` | All-waves-combined / all-tasks-present / no-cross-run-contamination invariant report. |
221
+ | `suggest-setup-action` / `find-prior-run` | `/submit-hpc` Setup priority cascade + `cmd_sha` resume detection. |
222
+ | `prune-orphan-sidecars` | Clean half-baked sidecars from failed batches. |
223
+
224
+ `hpc-agent <name> --help` shows the per-primitive args; many take `--spec <path>` for a JSON input. See `docs/primitives/<name>.md` for the per-primitive contract (idempotency, side effects, error codes, schemas).
225
+
226
+ ## Configuration
227
+
228
+ ### `clusters.yaml` (required)
229
+
230
+ Cluster infrastructure definitions. Ships inside the package at `hpc_agent/config/clusters.yaml`. Override the active path with `HPC_CLUSTERS_CONFIG=/your/clusters.yaml` (useful for integrators who want to keep their cluster definitions outside the package):
231
+
232
+ ```yaml
233
+ hoffman2:
234
+ host: hoffman2.idre.ucla.edu
235
+ user: <your_user>
236
+ scheduler: sge
237
+ scratch: <your_scratch>
238
+ modules: [python/3.11.9]
239
+ conda_source: /u/local/apps/anaconda3/2024.06/etc/profile.d/conda.sh
240
+ conda_envs: [<your_env>] # optional — Claude presents these as options
241
+ gpu_types: [a100, h200, a6000]
242
+ ```
243
+
244
+ ### `~/.hpc-agent/config.json` (optional)
245
+
246
+ Per-user config for the `recall` primitive's default `--root`. List one or more directories under `experiment_roots` and `recall` walks them all when `--root` is omitted:
247
+
248
+ ```json
249
+ {
250
+ "experiment_roots": [
251
+ "/home/user/experiments",
252
+ "/scratch/user/campaigns"
253
+ ]
254
+ }
255
+ ```
256
+
257
+ The `--root` CLI flag still wins when set. If neither flag nor config is present, `recall` errors with `spec_invalid` rather than silently falling back to cwd.
258
+
259
+ ### Caching
260
+
261
+ Claude remembers your preferences (cluster, executor directory, environment, resources) across conversations via Claude Code memory. The `.hpc/runs/<run_id>.json` sidecars (paired with `.hpc/tasks.py`) serve as the submission record for monitoring and resubmission.
262
+
263
+ ## Job Templates
264
+
265
+ | Template | SGE | SLURM |
266
+ |----------|-----|-------|
267
+ | CPU array | `hpc_agent/mapreduce/templates/runtime/sge/cpu_array.sh` | `hpc_agent/mapreduce/templates/runtime/slurm/cpu_array.slurm` |
268
+ | GPU array | `hpc_agent/mapreduce/templates/runtime/sge/gpu_array.sh` | `hpc_agent/mapreduce/templates/runtime/slurm/gpu_array.slurm` |
269
+
270
+ Templates are parameterized via environment variables injected at submission time. Resolve paths via `hpc_agent.get_template_path(scheduler, template)`. The GPU template is used when the configured resources include `gpus`; otherwise the CPU template is used.
271
+
272
+ ## Supported Clusters
273
+
274
+ | Cluster | Institution | Scheduler |
275
+ |---------|------------|-----------|
276
+ | Hoffman2 | UCLA IDRE | SGE |
277
+ | Discovery | USC CARC | SLURM |
278
+
279
+ Cluster connection details are in `hpc_agent/config/clusters.yaml` (or whatever `HPC_CLUSTERS_CONFIG` points at).
280
+
281
+ ## Python API
282
+
283
+ ```python
284
+ from hpc_agent import (
285
+ # Framework subdirectory layout
286
+ framework_subdir, runs_subdir, tasks_path, load_tasks_module,
287
+ # Per-run sidecars
288
+ compute_cmd_sha, write_run_sidecar, read_run_sidecar,
289
+ find_run_by_cmd_sha, find_existing_runs,
290
+ # Cluster config
291
+ load_clusters_config, get_template_path, _PACKAGE_ROOT,
292
+ # Submission
293
+ ClusterConstraints, parse_constraints,
294
+ WorkloadSpec, compute_submission_plan, build_wave_map,
295
+ deploy_runtime, run_combiner_checked,
296
+ )
297
+ from hpc_agent.infra.backends import get_backend
298
+ ```
299
+
300
+ ## Development
301
+
302
+ ```bash
303
+ pip install -e '.[dev]'
304
+ pre-commit install # auto-runs ruff, frontmatter regen, index regen
305
+ pytest -q # 1400+ tests
306
+ ```
307
+
308
+ The pre-commit hook regenerates `docs/primitives/*.md` frontmatter,
309
+ `docs/primitives/README.md` catalog, and `docs/generated/operations.md`
310
+ from the `@primitive` registry, then auto-stages the result. Without it
311
+ you'll see CI fail on the corresponding `--check` gates and have to
312
+ push a follow-up `chore: regenerate ...` commit.
313
+
@@ -0,0 +1,277 @@
1
+ # hpc-agent
2
+
3
+ HPC orchestrator for array-batch experiments on SGE/SLURM clusters. Two surfaces over one core:
4
+
5
+ - **Slash commands for humans** in Claude Code (`/submit-hpc`, `/monitor-hpc`, `/aggregate-hpc`, `/campaign-hpc`, `/preflight`) — interactive markdown templates in `slash_commands/commands/*.md` that walk you through choosing a cluster and authoring `.hpc/tasks.py`. Executor scaffolding is folded into `/submit-hpc` Step 1; preflight is folded into `/submit-hpc` Step 6b as an idempotent gate (with `/preflight` still available as a standalone diagnostic).
6
+ - **CLI for agents and automation** (`hpc-agent <subcommand>`) — JSON-in, JSON-out, exit codes. Designed to be invoked via a `Bash`-style tool by external orchestrators. This is a POSIX-native agent surface: any tool that can shell out and parse JSON can drive a cluster — see [`docs/reference/agent-surface.md`](docs/reference/agent-surface.md). For integrators: [`docs/integrations/CONTRACT.md`](docs/integrations/CONTRACT.md).
7
+
8
+ Both surfaces invoke `hpc-agent <subcommand>`. The slash commands are pure markdown that orchestrate the binary; the binary's atomic-ops layer (`hpc_agent.runner`) ensures cross-surface state — in-flight runs, journal records under `~/.claude/hpc/<repo_hash>/` — is shared automatically.
9
+
10
+ ## Quick Start
11
+
12
+ ### For humans (Claude Code)
13
+
14
+ ```bash
15
+ pip install hpc-agent # or `pip install -e .` from a checkout
16
+ hpc-agent setup # copy commands + skills, wire the Stop hooks
17
+ ```
18
+ `hpc-agent setup` copies the bundled slash commands into
19
+ `~/.claude/commands/` and the skills into `~/.claude/skills/`, then
20
+ installs hpc-agent's Stop hooks — all idempotent, so re-running is
21
+ safe. Both asset trees ship inside the package, so this works the same
22
+ from a wheel install or an editable checkout. Pass `--no-hooks` to
23
+ skip the hook step or `--dry-run` to preview. Every command
24
+ (`/preflight`, `/submit-hpc`, `/monitor-hpc`, `/aggregate-hpc`,
25
+ `/campaign-hpc`, `/hpc-axes-init`) and skill ships inside the package.
26
+
27
+ Once installed:
28
+
29
+ - `/preflight` (optional) — verify SSH agent + cluster reachability. `/submit-hpc` auto-runs this as a cached gate, so you only need it for ad-hoc diagnostics.
30
+ - `/submit-hpc` — answer prompts about cluster, executor, grid params. Scaffolds the executor inline if none exists.
31
+ - `/monitor-hpc` to monitor, `/aggregate-hpc` to collect results.
32
+
33
+ ### For agents and automation
34
+
35
+ ```bash
36
+ pip install hpc-agent
37
+ hpc-agent preflight --cluster hoffman2 # health check
38
+ hpc-agent interview --spec intent.json --campaign-dir <d> # persist campaign intent next to tasks.py
39
+ hpc-agent recall --root ~/experiments --task-kind <kind> # query past interviews for next-interview grounding
40
+ hpc-agent submit --spec spec.json # JSON envelope on stdout
41
+ hpc-agent status --run-id <id> # one-shot snapshot; poll as needed
42
+ hpc-agent aggregate --run-id <id> --wave 1 # combiner + result pull
43
+ ```
44
+ Stdout is a single-line JSON envelope: `{"ok": true, "idempotent": ..., "data": {...}}` or `{"ok": false, "error_code": ..., "retry_safe": ..., "remediation": ...}`. Exit codes: 0 ok, 1 user error, 2 cluster/network, 3 internal. Full schema in [`docs/reference/cli-spec.md`](docs/reference/cli-spec.md); JSON Schema files for runtime validation under `hpc_agent/schemas/`.
45
+
46
+ ### For integrators
47
+
48
+ hpc-agent is `Bash`-invokable from any agent harness with a JSON
49
+ parser. See **[`docs/integrations/CONTRACT.md`](docs/integrations/CONTRACT.md)**
50
+ for the full contract: the spawn env block,
51
+ `error_code` → retry policy table, the `find-prior-run` → `submit` →
52
+ `monitor-summary` → `verify-aggregation-complete` workflow, the
53
+ `.hpc/tasks.py` boundary, and the executor import allowlist.
54
+
55
+ The canonical reference for `.hpc/tasks.py` is shipped inside the
56
+ package at
57
+ [`src/hpc_agent/mapreduce/templates/scaffolds/tasks_example.py`](src/hpc_agent/mapreduce/templates/scaffolds/tasks_example.py).
58
+ It demonstrates three patterns (Cartesian product, chunking by row
59
+ count, date-window backtests) inline. Integrators locate it at runtime
60
+ via `from hpc_agent import _PACKAGE_ROOT` or `rglob("tasks_example.py")`.
61
+
62
+ The most common first-time failure is the harness's default-empty
63
+ spawn env dropping `SSH_AUTH_SOCK`. `hpc-agent
64
+ status`/`aggregate`/`reconcile` fail fast with `error_code:
65
+ "ssh_unreachable"` (exit 2) instead of hanging on auth — run
66
+ `hpc-agent preflight` first to verify the spawn env. hpc-agent does
67
+ not kill cluster jobs by design (`settings.json` denies
68
+ `scancel`/`qdel`); if the integrator decides a run is bad, stop
69
+ polling and let it expire.
70
+
71
+ ---
72
+
73
+ ## Standalone usage
74
+
75
+ ### Organize your experiment repo
76
+
77
+ Keep standalone executor scripts in a dedicated directory, separate from shared utilities:
78
+
79
+ ```
80
+ my_experiment/
81
+ ├── executors/ # or src/ — each file is a runnable experiment
82
+ │ ├── ml_ridge.py # python3 executors/ml_ridge.py --help
83
+ │ ├── ml_xgboost.py
84
+ │ └── dl_patchts.py
85
+ ├── lib/ # shared utilities (not executors)
86
+ │ ├── loading.py
87
+ │ └── transforms.py
88
+ └── data/
89
+ ```
90
+
91
+ Each executor accepts experiment-specific arguments (`--horizon`, `--start`, `--end`, `--features`, etc.). No HPC awareness is needed — all parameters arrive as CLI flags.
92
+
93
+ ### Run
94
+
95
+ ```
96
+ /preflight → verify SSH agent + cluster reachability before first submit
97
+ /submit-hpc → discovers executors, walks you through .hpc/tasks.py, syncs code, submits
98
+ /monitor-hpc → tracks completion per grid point, diagnoses failures, auto-resubmits
99
+ /aggregate-hpc → validates completeness, runs aggregation, downloads summaries
100
+ ```
101
+
102
+ **Example conversation:**
103
+
104
+ ```
105
+ You: /submit-hpc run ridge and xgboost with horizon=[1, 5, 25]
106
+
107
+ Claude: I found these executors in src/:
108
+ ml_ridge.py — --horizon, --start, --end, --output-file
109
+ ml_xgboost.py — --horizon, --start, --end, --output-file
110
+
111
+ Proposed plan:
112
+ Cluster: hoffman2 (SGE)
113
+ Grid: executor=[ml_ridge, ml_xgboost] × horizon=[1, 5, 25] → 6 grid points
114
+ Total: 6 tasks
115
+ Resources: 1 CPU, 16G, 4:00:00
116
+ Confirm?
117
+
118
+ You: yes
119
+
120
+ Claude: Submitted job 12345678 (6 tasks). Run /monitor-hpc to track progress.
121
+ ```
122
+
123
+ No config files required. Claude discovers your executors by reading their source and `--help`, then suggests resources conversationally based on the executor and your input.
124
+
125
+ ## How It Works
126
+
127
+ The boundary between hpc-agent and your experiment repo is documented in [`docs/reference/boundary-contract.md`](docs/reference/boundary-contract.md) and enforced by `tests/test_boundary_contract.py`.
128
+
129
+ 1. Claude reads your executor scripts and their `--help` output.
130
+ 2. You describe what to run in natural language — Claude walks you through writing `.hpc/tasks.py` once: a small Python module exposing `total()` and `resolve(task_id)` that returns the per-task kwargs. The file is committed to git and reused on every subsequent submit.
131
+ 3. A per-run sidecar `.hpc/runs/<run_id>.json` records the executor command, result-dir template, `cmd_sha`, and wave map for this particular submission.
132
+ 4. The framework executor `_hpc_dispatch.py` (zero deps, stdlib-only) is deployed to the cluster's `.hpc/` by `deploy_runtime`.
133
+ 5. The job template runs the dispatcher, which imports your `.hpc/tasks.py`, calls `resolve(task_id)`, formats the result_dir, and execs your executor command with kwargs as env vars.
134
+ 6. Your executor reads kwargs as ordinary env vars (uppercased + `HPC_KW_*`) — no HPC awareness needed.
135
+
136
+ ### Parallelism Model
137
+
138
+ The parallelization axis lives entirely in user code (`.hpc/tasks.py`). The framework is agnostic to whether you're doing a Cartesian grid, chunking by row count, date-window backtests, or something else — it just calls `total()` and `resolve(i)`. The canonical reference at `hpc_agent/mapreduce/templates/scaffolds/tasks_example.py` shows three patterns inline; the agent helps you keep whichever applies and delete the rest.
139
+
140
+ ### Memory across campaigns
141
+
142
+ Two primitives — `interview` and `recall` — close the loop between consecutive campaigns. The interview agent (Claude Code or any external orchestrator) persists structured intent (`goal`, `task_count`, `budget`, `abort_if`, `task_generator`, `cluster_target`, `transcript`, provenance) into `<campaign_dir>/interview.json` next to the materialized `tasks.py`. The next interview calls `recall --root <experiments-dir>` to query past intents, returning recency-sorted summaries plus a 3-tier rollup (counts/histograms/quantiles, optional walltime aggregation, optional per-generator parameter envelopes). Observed ranges only — reasoning over them stays in the calling agent.
143
+
144
+ See [`docs/workflows/memory-across-campaigns.md`](docs/workflows/memory-across-campaigns.md) for the full flow, including the `task_generator` typed materializer (5 shapes: `enumerated`, `cartesian_product`, `items_x_seeds`, `numeric_logspace`, `numeric_linspace`) and the `~/.hpc-agent/config.json:experiment_roots` default-root config.
145
+
146
+ ### Throughput Optimization
147
+
148
+ hpc-agent automatically optimizes job submissions for cluster constraints. When constraints are configured (max array size, walltime, concurrent job limits), the optimizer packs tasks into batched waves:
149
+
150
+ - Tasks are split into arrays of ≤max_array_size
151
+ - Arrays are grouped into waves of ≤max_concurrent_jobs
152
+ - Waves are staggered via scheduler dependencies (SLURM `--dependency`, SGE `-hold_jid`)
153
+ - Total wall-clock time is estimated when per-task duration is known
154
+
155
+ Configure constraints in `clusters.yaml` (cluster-level); per-experiment overrides resolved at `/submit-hpc` time are persisted to the run sidecar at `.hpc/runs/<run_id>.json`.
156
+
157
+ ## Commands
158
+
159
+ | Command | What it does |
160
+ |---------|-------------|
161
+ | `/preflight` | Standalone: verify SSH agent, ssh/rsync on PATH, clusters.yaml parses, cluster reachable. `/submit-hpc` auto-runs the same checks as a 24h-cached gate, so direct invocation is mostly for ad-hoc diagnostics. |
162
+ | `/submit-hpc` | Discover executors (scaffolds inline if none found), build grid conversationally, write `.hpc/tasks.py` with FLAGS dict + `.hpc/cli.py` dispatcher, sync code, submit array jobs |
163
+ | `/monitor-hpc` | Poll status, diagnose failures, auto-resubmit, self-schedule next check |
164
+ | `/aggregate-hpc` | Validate completeness, run aggregation on cluster, download summaries |
165
+ | `/campaign-hpc` | Closed-loop iteration: tag submits, read prior history, repeat `/submit-hpc campaign_id=<slug>` until the strategy stops. See [`docs/workflows/campaign.md`](docs/workflows/campaign.md). |
166
+ | `/hpc-axes-init` | Write `<experiment>/.hpc/axes.yaml` with the parallel-axis enumeration + homogeneity hint that drives the cold-start (and warm-path) array-axis picker. |
167
+
168
+ ### Primitives
169
+
170
+ The slash commands above compose ~50 primitives exposed as `hpc-agent <name>`. Full machine-readable catalog at `docs/generated/operations.md` (auto-regenerated). High-traffic ones for agent orchestration:
171
+
172
+ | Primitive | Replaces |
173
+ |---|---|
174
+ | `submit-flow` / `submit-flow-batch` | rsync + deploy + qsub + record (single or N-spec batch with shared rsync). Auto-dispatches when the spec is `{specs: [...]}`. |
175
+ | `monitor-flow` | Poll-and-combine loop the slash command's tick body wraps. |
176
+ | `aggregate-flow` | rsync_pull `_combiner/` + `reduce_partials` + optional summary pull + ingest runtime samples. |
177
+ | `build-submit-spec` | Resolved-interview-values → validated `submit_flow.input.json` spec. |
178
+ | `build-tasks-py` | Cartesian-product axes → `.hpc/tasks.py` from the canonical Pattern 1 template. |
179
+ | `discover-executors` / `discover-reducers` | Scan repo for executor scripts / aggregator scripts (find existing reducer instead of writing a fresh one). |
180
+ | `decide-monitor-arm` | Pick cron/loop/none + cadence + cron schedule + literal `armed:` line. |
181
+ | `monitor-summary` | Canonical user-facing tick summary (byte-stable framing). |
182
+ | `summarize-submit-plan` | Canonical pre-submit confirmation summary. |
183
+ | `verify-canary` | Wait + grep + output-check protocol for 1-task canary submissions. |
184
+ | `verify-aggregation-complete` | All-waves-combined / all-tasks-present / no-cross-run-contamination invariant report. |
185
+ | `suggest-setup-action` / `find-prior-run` | `/submit-hpc` Setup priority cascade + `cmd_sha` resume detection. |
186
+ | `prune-orphan-sidecars` | Clean half-baked sidecars from failed batches. |
187
+
188
+ `hpc-agent <name> --help` shows the per-primitive args; many take `--spec <path>` for a JSON input. See `docs/primitives/<name>.md` for the per-primitive contract (idempotency, side effects, error codes, schemas).
189
+
190
+ ## Configuration
191
+
192
+ ### `clusters.yaml` (required)
193
+
194
+ Cluster infrastructure definitions. Ships inside the package at `hpc_agent/config/clusters.yaml`. Override the active path with `HPC_CLUSTERS_CONFIG=/your/clusters.yaml` (useful for integrators who want to keep their cluster definitions outside the package):
195
+
196
+ ```yaml
197
+ hoffman2:
198
+ host: hoffman2.idre.ucla.edu
199
+ user: <your_user>
200
+ scheduler: sge
201
+ scratch: <your_scratch>
202
+ modules: [python/3.11.9]
203
+ conda_source: /u/local/apps/anaconda3/2024.06/etc/profile.d/conda.sh
204
+ conda_envs: [<your_env>] # optional — Claude presents these as options
205
+ gpu_types: [a100, h200, a6000]
206
+ ```
207
+
208
+ ### `~/.hpc-agent/config.json` (optional)
209
+
210
+ Per-user config for the `recall` primitive's default `--root`. List one or more directories under `experiment_roots` and `recall` walks them all when `--root` is omitted:
211
+
212
+ ```json
213
+ {
214
+ "experiment_roots": [
215
+ "/home/user/experiments",
216
+ "/scratch/user/campaigns"
217
+ ]
218
+ }
219
+ ```
220
+
221
+ The `--root` CLI flag still wins when set. If neither flag nor config is present, `recall` errors with `spec_invalid` rather than silently falling back to cwd.
222
+
223
+ ### Caching
224
+
225
+ Claude remembers your preferences (cluster, executor directory, environment, resources) across conversations via Claude Code memory. The `.hpc/runs/<run_id>.json` sidecars (paired with `.hpc/tasks.py`) serve as the submission record for monitoring and resubmission.
226
+
227
+ ## Job Templates
228
+
229
+ | Template | SGE | SLURM |
230
+ |----------|-----|-------|
231
+ | CPU array | `hpc_agent/mapreduce/templates/runtime/sge/cpu_array.sh` | `hpc_agent/mapreduce/templates/runtime/slurm/cpu_array.slurm` |
232
+ | GPU array | `hpc_agent/mapreduce/templates/runtime/sge/gpu_array.sh` | `hpc_agent/mapreduce/templates/runtime/slurm/gpu_array.slurm` |
233
+
234
+ Templates are parameterized via environment variables injected at submission time. Resolve paths via `hpc_agent.get_template_path(scheduler, template)`. The GPU template is used when the configured resources include `gpus`; otherwise the CPU template is used.
235
+
236
+ ## Supported Clusters
237
+
238
+ | Cluster | Institution | Scheduler |
239
+ |---------|------------|-----------|
240
+ | Hoffman2 | UCLA IDRE | SGE |
241
+ | Discovery | USC CARC | SLURM |
242
+
243
+ Cluster connection details are in `hpc_agent/config/clusters.yaml` (or whatever `HPC_CLUSTERS_CONFIG` points at).
244
+
245
+ ## Python API
246
+
247
+ ```python
248
+ from hpc_agent import (
249
+ # Framework subdirectory layout
250
+ framework_subdir, runs_subdir, tasks_path, load_tasks_module,
251
+ # Per-run sidecars
252
+ compute_cmd_sha, write_run_sidecar, read_run_sidecar,
253
+ find_run_by_cmd_sha, find_existing_runs,
254
+ # Cluster config
255
+ load_clusters_config, get_template_path, _PACKAGE_ROOT,
256
+ # Submission
257
+ ClusterConstraints, parse_constraints,
258
+ WorkloadSpec, compute_submission_plan, build_wave_map,
259
+ deploy_runtime, run_combiner_checked,
260
+ )
261
+ from hpc_agent.infra.backends import get_backend
262
+ ```
263
+
264
+ ## Development
265
+
266
+ ```bash
267
+ pip install -e '.[dev]'
268
+ pre-commit install # auto-runs ruff, frontmatter regen, index regen
269
+ pytest -q # 1400+ tests
270
+ ```
271
+
272
+ The pre-commit hook regenerates `docs/primitives/*.md` frontmatter,
273
+ `docs/primitives/README.md` catalog, and `docs/generated/operations.md`
274
+ from the `@primitive` registry, then auto-stages the result. Without it
275
+ you'll see CI fail on the corresponding `--check` gates and have to
276
+ push a follow-up `chore: regenerate ...` commit.
277
+