hpc-agent 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hpc_agent-0.3.0/LICENSE +21 -0
- hpc_agent-0.3.0/PKG-INFO +313 -0
- hpc_agent-0.3.0/README.md +277 -0
- hpc_agent-0.3.0/pyproject.toml +155 -0
- hpc_agent-0.3.0/setup.cfg +4 -0
- hpc_agent-0.3.0/src/hpc_agent/__init__.py +295 -0
- hpc_agent-0.3.0/src/hpc_agent/__main__.py +8 -0
- hpc_agent-0.3.0/src/hpc_agent/_internal/__init__.py +1 -0
- hpc_agent-0.3.0/src/hpc_agent/_internal/io.py +224 -0
- hpc_agent-0.3.0/src/hpc_agent/_internal/layout.py +175 -0
- hpc_agent-0.3.0/src/hpc_agent/_internal/lifecycle.py +147 -0
- hpc_agent-0.3.0/src/hpc_agent/_internal/operations.py +251 -0
- hpc_agent-0.3.0/src/hpc_agent/_internal/playbook.py +144 -0
- hpc_agent-0.3.0/src/hpc_agent/_internal/plugins.py +89 -0
- hpc_agent-0.3.0/src/hpc_agent/_internal/primitive.py +387 -0
- hpc_agent-0.3.0/src/hpc_agent/_internal/schema.py +160 -0
- hpc_agent-0.3.0/src/hpc_agent/_internal/session/__init__.py +85 -0
- hpc_agent-0.3.0/src/hpc_agent/_internal/session/index.py +237 -0
- hpc_agent-0.3.0/src/hpc_agent/_internal/session/journal.py +210 -0
- hpc_agent-0.3.0/src/hpc_agent/_internal/session/run_record.py +247 -0
- hpc_agent-0.3.0/src/hpc_agent/_internal/telemetry.py +137 -0
- hpc_agent-0.3.0/src/hpc_agent/_internal/time.py +50 -0
- hpc_agent-0.3.0/src/hpc_agent/_internal/version.py +115 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/__init__.py +24 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/_shared.py +145 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/actions/__init__.py +0 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/actions/build_executor.py +19 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/actions/build_submit_spec.py +54 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/actions/build_tasks_py.py +102 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/actions/cluster_reduce.py +36 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/actions/combine_wave.py +34 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/actions/interview.py +300 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/actions/resubmit.py +79 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/actions/submit.py +41 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/actions/update_run_constraints.py +62 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/fixtures/__init__.py +0 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/fixtures/axes.py +84 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/fixtures/campaign_manifest.py +64 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/fixtures/envelope.py +69 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/fixtures/stages.py +68 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/__init__.py +0 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/campaign.py +55 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/campaign_health.py +39 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/capabilities.py +87 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/clusters.py +49 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/decide_monitor_arm.py +63 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/discover.py +20 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/failures.py +60 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/find_prior_run.py +31 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/inspect_cluster.py +35 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/list_in_flight.py +34 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/monitor_summary.py +42 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/recall.py +156 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/recommend_partition.py +66 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/reconcile.py +30 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/runtime_prior.py +47 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/status.py +46 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/suggest_setup_action.py +30 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/summarize_submit_plan.py +21 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/queries/verify_aggregation_complete.py +44 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/validators/__init__.py +0 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/validators/preflight.py +22 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/validators/validate_executor_signatures.py +52 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/validators/validate_input_dataset.py +44 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/validators/validate_self_qos_limit.py +49 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/validators/validate_stochastic_marker.py +72 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/validators/validate_walltime_against_history.py +41 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/workflows/__init__.py +0 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/workflows/aggregate_flow.py +125 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/workflows/monitor_flow.py +128 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/workflows/submit_flow.py +173 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/workflows/submit_flow_batch.py +92 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/workflows/validate_campaign.py +167 -0
- hpc_agent-0.3.0/src/hpc_agent/_schema_models/workflows/verify_canary.py +46 -0
- hpc_agent-0.3.0/src/hpc_agent/agent_assets.py +119 -0
- hpc_agent-0.3.0/src/hpc_agent/agent_cli.py +2808 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/__init__.py +7 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/aggregation_invariants.py +205 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/axes_init.py +93 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/build_executor.py +73 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/build_submit_spec.py +239 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/build_tasks_py.py +552 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/build_template.py +159 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/campaign_advance.py +103 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/campaign_budget.py +126 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/campaign_converged.py +168 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/campaign_health.py +208 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/campaign_init.py +120 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/campaign_list.py +43 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/campaign_replay.py +72 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/campaign_status.py +42 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/canary_verify.py +293 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/capabilities.py +90 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/cluster_reduce.py +298 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/clusters.py +77 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/failures.py +159 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/interview.py +299 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/list_in_flight.py +75 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/logs.py +111 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/monitor_arm.py +259 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/monitor_summary.py +175 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/plan_throughput.py +118 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/preflight.py +121 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/recall.py +455 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/recommend_partition.py +159 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/setup_actions.py +248 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/submit_plan_summary.py +130 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/validate_executor_signatures.py +240 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/validate_input_dataset.py +283 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/validate_self_qos_limit.py +110 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/validate_stochastic_marker.py +122 -0
- hpc_agent-0.3.0/src/hpc_agent/atoms/validate_walltime_against_history.py +218 -0
- hpc_agent-0.3.0/src/hpc_agent/campaign/__init__.py +31 -0
- hpc_agent-0.3.0/src/hpc_agent/campaign/cursor.py +130 -0
- hpc_agent-0.3.0/src/hpc_agent/campaign/dirs.py +34 -0
- hpc_agent-0.3.0/src/hpc_agent/campaign/manifest.py +123 -0
- hpc_agent-0.3.0/src/hpc_agent/config/clusters.yaml +114 -0
- hpc_agent-0.3.0/src/hpc_agent/errors.py +255 -0
- hpc_agent-0.3.0/src/hpc_agent/executor_cli.py +192 -0
- hpc_agent-0.3.0/src/hpc_agent/flows/__init__.py +16 -0
- hpc_agent-0.3.0/src/hpc_agent/flows/aggregate_flow.py +378 -0
- hpc_agent-0.3.0/src/hpc_agent/flows/monitor_flow.py +613 -0
- hpc_agent-0.3.0/src/hpc_agent/flows/resubmit_flow.py +578 -0
- hpc_agent-0.3.0/src/hpc_agent/flows/submit_flow.py +610 -0
- hpc_agent-0.3.0/src/hpc_agent/flows/validate_campaign.py +201 -0
- hpc_agent-0.3.0/src/hpc_agent/hooks/__init__.py +9 -0
- hpc_agent-0.3.0/src/hpc_agent/hooks/install.py +196 -0
- hpc_agent-0.3.0/src/hpc_agent/hooks/monitor_armed_check.py +258 -0
- hpc_agent-0.3.0/src/hpc_agent/infra/__init__.py +14 -0
- hpc_agent-0.3.0/src/hpc_agent/infra/backends/__init__.py +389 -0
- hpc_agent-0.3.0/src/hpc_agent/infra/backends/_remote_base.py +67 -0
- hpc_agent-0.3.0/src/hpc_agent/infra/backends/query.py +454 -0
- hpc_agent-0.3.0/src/hpc_agent/infra/backends/sge.py +177 -0
- hpc_agent-0.3.0/src/hpc_agent/infra/backends/sge_remote.py +69 -0
- hpc_agent-0.3.0/src/hpc_agent/infra/backends/slurm.py +196 -0
- hpc_agent-0.3.0/src/hpc_agent/infra/backends/slurm_remote.py +88 -0
- hpc_agent-0.3.0/src/hpc_agent/infra/cache.py +143 -0
- hpc_agent-0.3.0/src/hpc_agent/infra/clusters.py +424 -0
- hpc_agent-0.3.0/src/hpc_agent/infra/gpu.py +432 -0
- hpc_agent-0.3.0/src/hpc_agent/infra/inspect/__init__.py +172 -0
- hpc_agent-0.3.0/src/hpc_agent/infra/inspect/_common.py +181 -0
- hpc_agent-0.3.0/src/hpc_agent/infra/inspect/_persist.py +203 -0
- hpc_agent-0.3.0/src/hpc_agent/infra/inspect/sge.py +184 -0
- hpc_agent-0.3.0/src/hpc_agent/infra/inspect/slurm.py +340 -0
- hpc_agent-0.3.0/src/hpc_agent/infra/parsing.py +247 -0
- hpc_agent-0.3.0/src/hpc_agent/infra/remote.py +890 -0
- hpc_agent-0.3.0/src/hpc_agent/infra/slurm_reservations.py +311 -0
- hpc_agent-0.3.0/src/hpc_agent/integration/__init__.py +87 -0
- hpc_agent-0.3.0/src/hpc_agent/mapreduce/__init__.py +1 -0
- hpc_agent-0.3.0/src/hpc_agent/mapreduce/combiner.py +390 -0
- hpc_agent-0.3.0/src/hpc_agent/mapreduce/dispatch.py +609 -0
- hpc_agent-0.3.0/src/hpc_agent/mapreduce/metrics_io.py +109 -0
- hpc_agent-0.3.0/src/hpc_agent/mapreduce/reduce/__init__.py +23 -0
- hpc_agent-0.3.0/src/hpc_agent/mapreduce/reduce/classify.py +120 -0
- hpc_agent-0.3.0/src/hpc_agent/mapreduce/reduce/history.py +161 -0
- hpc_agent-0.3.0/src/hpc_agent/mapreduce/reduce/metrics.py +277 -0
- hpc_agent-0.3.0/src/hpc_agent/mapreduce/reduce/rollup.py +95 -0
- hpc_agent-0.3.0/src/hpc_agent/mapreduce/reduce/status.py +680 -0
- hpc_agent-0.3.0/src/hpc_agent/mapreduce/reduce/tui.py +576 -0
- hpc_agent-0.3.0/src/hpc_agent/mapreduce/templates/runtime/common/gpu_preamble.sh +85 -0
- hpc_agent-0.3.0/src/hpc_agent/mapreduce/templates/runtime/common/hpc_preamble.sh +231 -0
- hpc_agent-0.3.0/src/hpc_agent/mapreduce/templates/runtime/sge/cpu_array.sh +82 -0
- hpc_agent-0.3.0/src/hpc_agent/mapreduce/templates/runtime/sge/gpu_array.sh +90 -0
- hpc_agent-0.3.0/src/hpc_agent/mapreduce/templates/runtime/slurm/cpu_array.slurm +88 -0
- hpc_agent-0.3.0/src/hpc_agent/mapreduce/templates/runtime/slurm/gpu_array.slurm +95 -0
- hpc_agent-0.3.0/src/hpc_agent/mapreduce/templates/scaffolds/cli_dispatcher.py +154 -0
- hpc_agent-0.3.0/src/hpc_agent/mapreduce/templates/scaffolds/executor_template.py +174 -0
- hpc_agent-0.3.0/src/hpc_agent/mapreduce/templates/scaffolds/tasks_example.py +109 -0
- hpc_agent-0.3.0/src/hpc_agent/operations.json +620 -0
- hpc_agent-0.3.0/src/hpc_agent/planning/__init__.py +15 -0
- hpc_agent-0.3.0/src/hpc_agent/planning/axes.py +384 -0
- hpc_agent-0.3.0/src/hpc_agent/planning/constraints.py +64 -0
- hpc_agent-0.3.0/src/hpc_agent/planning/resubmit_batching.py +215 -0
- hpc_agent-0.3.0/src/hpc_agent/planning/stages.py +128 -0
- hpc_agent-0.3.0/src/hpc_agent/planning/throughput.py +188 -0
- hpc_agent-0.3.0/src/hpc_agent/py.typed +0 -0
- hpc_agent-0.3.0/src/hpc_agent/runner/__init__.py +73 -0
- hpc_agent-0.3.0/src/hpc_agent/runner/_ssh.py +26 -0
- hpc_agent-0.3.0/src/hpc_agent/runner/aggregate.py +193 -0
- hpc_agent-0.3.0/src/hpc_agent/runner/combine.py +78 -0
- hpc_agent-0.3.0/src/hpc_agent/runner/failure_signatures.py +206 -0
- hpc_agent-0.3.0/src/hpc_agent/runner/failures.py +267 -0
- hpc_agent-0.3.0/src/hpc_agent/runner/logs.py +71 -0
- hpc_agent-0.3.0/src/hpc_agent/runner/reconcile.py +219 -0
- hpc_agent-0.3.0/src/hpc_agent/runner/resubmit.py +129 -0
- hpc_agent-0.3.0/src/hpc_agent/runner/status.py +123 -0
- hpc_agent-0.3.0/src/hpc_agent/runner/submit.py +160 -0
- hpc_agent-0.3.0/src/hpc_agent/runner/update_constraints.py +173 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/aggregate_flow.input.json +69 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/aggregate_flow.output.json +81 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/axes.json +75 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/build_executor.output.json +30 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/build_submit_spec.input.json +266 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/build_tasks_py.input.json +184 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/campaign.output.json +113 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/campaign_health.input.json +59 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/campaign_health.output.json +94 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/campaign_manifest.json +246 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/capabilities.output.json +243 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/cluster_reduce.output.json +52 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/clusters_describe.output.json +40 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/clusters_list.output.json +50 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/combine_wave.output.json +59 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/decide_monitor_arm.input.json +89 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/decide_monitor_arm.output.json +90 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/discover.output.json +57 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/envelope.json +156 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/failures.output.json +143 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/find_prior_run.output.json +116 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/inspect_cluster.output.json +170 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/interview.input.json +550 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/interview.output.json +92 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/list_in_flight.output.json +90 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/monitor_flow.input.json +51 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/monitor_flow.output.json +79 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/monitor_summary.output.json +54 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/preflight.output.json +58 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/recall.input.json +82 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/recall.output.json +502 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/recommend_partition.input.json +80 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/recommend_partition.output.json +43 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/reconcile.output.json +53 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/resubmit.input.json +144 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/runtime_prior.output.json +158 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/stages.input.json +244 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/status.output.json +109 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/submit.input.json +89 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/submit.output.json +37 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/submit_flow.input.json +194 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/submit_flow.output.json +73 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/submit_flow_batch.input.json +244 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/submit_flow_batch.output.json +98 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/suggest_setup_action.output.json +62 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/summarize_submit_plan.output.json +28 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/update_run_constraints.input.json +43 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/update_run_constraints.output.json +37 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/validate_campaign.input.json +164 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/validate_campaign.output.json +120 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/validate_executor_signatures.input.json +37 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/validate_executor_signatures.output.json +99 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/validate_input_dataset.input.json +41 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/validate_input_dataset.output.json +99 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/validate_self_qos_limit.input.json +52 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/validate_self_qos_limit.output.json +99 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/validate_stochastic_marker.input.json +26 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/validate_stochastic_marker.output.json +107 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/validate_walltime_against_history.input.json +48 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/validate_walltime_against_history.output.json +99 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/verify_aggregation_complete.output.json +94 -0
- hpc_agent-0.3.0/src/hpc_agent/schemas/verify_canary.output.json +65 -0
- hpc_agent-0.3.0/src/hpc_agent/state/__init__.py +14 -0
- hpc_agent-0.3.0/src/hpc_agent/state/discover.py +479 -0
- hpc_agent-0.3.0/src/hpc_agent/state/runs.py +649 -0
- hpc_agent-0.3.0/src/hpc_agent/state/runtime_prior.py +498 -0
- hpc_agent-0.3.0/src/hpc_agent/state/user_profiles.py +392 -0
- hpc_agent-0.3.0/src/hpc_agent/template/__init__.py +110 -0
- hpc_agent-0.3.0/src/hpc_agent/template/_runtime.py +266 -0
- hpc_agent-0.3.0/src/hpc_agent/template/axis.py +159 -0
- hpc_agent-0.3.0/src/hpc_agent/template/discover.py +182 -0
- hpc_agent-0.3.0/src/hpc_agent/template/elision.py +192 -0
- hpc_agent-0.3.0/src/hpc_agent/template/notebook.py +151 -0
- hpc_agent-0.3.0/src/hpc_agent/template/plan.py +158 -0
- hpc_agent-0.3.0/src/hpc_agent/template/reduce.py +67 -0
- hpc_agent-0.3.0/src/hpc_agent/template/register.py +20 -0
- hpc_agent-0.3.0/src/hpc_agent/template/scaffold/Makefile.tmpl +8 -0
- hpc_agent-0.3.0/src/hpc_agent/template/scaffold/ci.yml.tmpl +28 -0
- hpc_agent-0.3.0/src/hpc_agent/template/scaffold/conftest.py.tmpl +25 -0
- hpc_agent-0.3.0/src/hpc_agent/template/scaffold/experiment.ipynb.tmpl +57 -0
- hpc_agent-0.3.0/src/hpc_agent/template/scaffold/pre-commit-config.yaml.tmpl +12 -0
- hpc_agent-0.3.0/src/hpc_agent/template/scaffold/pyproject.toml.tmpl +24 -0
- hpc_agent-0.3.0/src/hpc_agent/template/scaffold/scaffold.py.tmpl +113 -0
- hpc_agent-0.3.0/src/hpc_agent/template/scaffold/template.mk.tmpl +29 -0
- hpc_agent-0.3.0/src/hpc_agent/template/series.py +32 -0
- hpc_agent-0.3.0/src/hpc_agent/template/signature.py +318 -0
- hpc_agent-0.3.0/src/hpc_agent.egg-info/PKG-INFO +313 -0
- hpc_agent-0.3.0/src/hpc_agent.egg-info/SOURCES.txt +292 -0
- hpc_agent-0.3.0/src/hpc_agent.egg-info/dependency_links.txt +1 -0
- hpc_agent-0.3.0/src/hpc_agent.egg-info/entry_points.txt +2 -0
- hpc_agent-0.3.0/src/hpc_agent.egg-info/requires.txt +17 -0
- hpc_agent-0.3.0/src/hpc_agent.egg-info/top_level.txt +2 -0
- hpc_agent-0.3.0/src/slash_commands/__init__.py +0 -0
- hpc_agent-0.3.0/src/slash_commands/commands/aggregate-hpc.md +52 -0
- hpc_agent-0.3.0/src/slash_commands/commands/campaign-hpc.md +34 -0
- hpc_agent-0.3.0/src/slash_commands/commands/hpc-axes-init.md +31 -0
- hpc_agent-0.3.0/src/slash_commands/commands/monitor-hpc.md +49 -0
- hpc_agent-0.3.0/src/slash_commands/commands/preflight.md +19 -0
- hpc_agent-0.3.0/src/slash_commands/commands/submit-hpc.md +69 -0
- hpc_agent-0.3.0/src/slash_commands/commands/validate-campaign.md +101 -0
- hpc_agent-0.3.0/src/slash_commands/skills/hpc-aggregate/SKILL.md +71 -0
- hpc_agent-0.3.0/src/slash_commands/skills/hpc-build-executor/SKILL.md +57 -0
- hpc_agent-0.3.0/src/slash_commands/skills/hpc-campaign/SKILL.md +63 -0
- hpc_agent-0.3.0/src/slash_commands/skills/hpc-preflight/SKILL.md +32 -0
- hpc_agent-0.3.0/src/slash_commands/skills/hpc-status/SKILL.md +53 -0
- hpc_agent-0.3.0/src/slash_commands/skills/hpc-submit/SKILL.md +259 -0
hpc_agent-0.3.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 hpc-agent contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
hpc_agent-0.3.0/PKG-INFO
ADDED
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hpc-agent
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: HPC orchestrator for Claude Code and external agent harnesses
|
|
5
|
+
License: MIT
|
|
6
|
+
Project-URL: Homepage, https://github.com/jamesdchen/hpc-agent
|
|
7
|
+
Project-URL: Repository, https://github.com/jamesdchen/hpc-agent
|
|
8
|
+
Project-URL: Issues, https://github.com/jamesdchen/hpc-agent/issues
|
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
|
10
|
+
Classifier: Environment :: Console
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: POSIX
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Topic :: System :: Distributed Computing
|
|
17
|
+
Requires-Python: >=3.10
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
License-File: LICENSE
|
|
20
|
+
Requires-Dist: pyyaml>=6.0
|
|
21
|
+
Requires-Dist: jsonschema>=4.18
|
|
22
|
+
Requires-Dist: referencing>=0.30
|
|
23
|
+
Requires-Dist: pydantic>=2.6
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
26
|
+
Requires-Dist: mypy>=1.10; extra == "dev"
|
|
27
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
28
|
+
Requires-Dist: pytest-xdist>=3.8; extra == "dev"
|
|
29
|
+
Requires-Dist: pytest-cov>=5.0; extra == "dev"
|
|
30
|
+
Requires-Dist: hypothesis>=6.100; extra == "dev"
|
|
31
|
+
Requires-Dist: types-PyYAML>=6.0; extra == "dev"
|
|
32
|
+
Requires-Dist: pre-commit>=3.5; extra == "dev"
|
|
33
|
+
Provides-Extra: tui
|
|
34
|
+
Requires-Dist: rich>=13.0; extra == "tui"
|
|
35
|
+
Dynamic: license-file
|
|
36
|
+
|
|
37
|
+
# hpc-agent
|
|
38
|
+
|
|
39
|
+
HPC orchestrator for array-batch experiments on SGE/SLURM clusters. Two surfaces over one core:
|
|
40
|
+
|
|
41
|
+
- **Slash commands for humans** in Claude Code (`/submit-hpc`, `/monitor-hpc`, `/aggregate-hpc`, `/campaign-hpc`, `/preflight`) — interactive markdown templates in `slash_commands/commands/*.md` that walk you through choosing a cluster and authoring `.hpc/tasks.py`. Executor scaffolding is folded into `/submit-hpc` Step 1; preflight is folded into `/submit-hpc` Step 6b as an idempotent gate (with `/preflight` still available as a standalone diagnostic).
|
|
42
|
+
- **CLI for agents and automation** (`hpc-agent <subcommand>`) — JSON-in, JSON-out, exit codes. Designed to be invoked via a `Bash`-style tool by external orchestrators. This is a POSIX-native agent surface: any tool that can shell out and parse JSON can drive a cluster — see [`docs/reference/agent-surface.md`](docs/reference/agent-surface.md). For integrators: [`docs/integrations/CONTRACT.md`](docs/integrations/CONTRACT.md).
|
|
43
|
+
|
|
44
|
+
Both surfaces invoke `hpc-agent <subcommand>`. The slash commands are pure markdown that orchestrate the binary; the binary's atomic-ops layer (`hpc_agent.runner`) ensures cross-surface state — in-flight runs, journal records under `~/.claude/hpc/<repo_hash>/` — is shared automatically.
|
|
45
|
+
|
|
46
|
+
## Quick Start
|
|
47
|
+
|
|
48
|
+
### For humans (Claude Code)
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
pip install hpc-agent # or `pip install -e .` from a checkout
|
|
52
|
+
hpc-agent setup # copy commands + skills, wire the Stop hooks
|
|
53
|
+
```
|
|
54
|
+
`hpc-agent setup` copies the bundled slash commands into
|
|
55
|
+
`~/.claude/commands/` and the skills into `~/.claude/skills/`, then
|
|
56
|
+
installs hpc-agent's Stop hooks — all idempotent, so re-running is
|
|
57
|
+
safe. Both asset trees ship inside the package, so this works the same
|
|
58
|
+
from a wheel install or an editable checkout. Pass `--no-hooks` to
|
|
59
|
+
skip the hook step or `--dry-run` to preview. Every command
|
|
60
|
+
(`/preflight`, `/submit-hpc`, `/monitor-hpc`, `/aggregate-hpc`,
|
|
61
|
+
`/campaign-hpc`, `/hpc-axes-init`) and skill ships inside the package.
|
|
62
|
+
|
|
63
|
+
Once installed:
|
|
64
|
+
|
|
65
|
+
- `/preflight` (optional) — verify SSH agent + cluster reachability. `/submit-hpc` auto-runs this as a cached gate, so you only need it for ad-hoc diagnostics.
|
|
66
|
+
- `/submit-hpc` — answer prompts about cluster, executor, grid params. Scaffolds the executor inline if none exists.
|
|
67
|
+
- `/monitor-hpc` to monitor, `/aggregate-hpc` to collect results.
|
|
68
|
+
|
|
69
|
+
### For agents and automation
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
pip install hpc-agent
|
|
73
|
+
hpc-agent preflight --cluster hoffman2 # health check
|
|
74
|
+
hpc-agent interview --spec intent.json --campaign-dir <d> # persist campaign intent next to tasks.py
|
|
75
|
+
hpc-agent recall --root ~/experiments --task-kind <kind> # query past interviews for next-interview grounding
|
|
76
|
+
hpc-agent submit --spec spec.json # JSON envelope on stdout
|
|
77
|
+
hpc-agent status --run-id <id> # one-shot snapshot; poll as needed
|
|
78
|
+
hpc-agent aggregate --run-id <id> --wave 1 # combiner + result pull
|
|
79
|
+
```
|
|
80
|
+
Stdout is a single-line JSON envelope: `{"ok": true, "idempotent": ..., "data": {...}}` or `{"ok": false, "error_code": ..., "retry_safe": ..., "remediation": ...}`. Exit codes: 0 ok, 1 user error, 2 cluster/network, 3 internal. Full schema in [`docs/reference/cli-spec.md`](docs/reference/cli-spec.md); JSON Schema files for runtime validation under `hpc_agent/schemas/`.
|
|
81
|
+
|
|
82
|
+
### For integrators
|
|
83
|
+
|
|
84
|
+
hpc-agent is `Bash`-invokable from any agent harness with a JSON
|
|
85
|
+
parser. See **[`docs/integrations/CONTRACT.md`](docs/integrations/CONTRACT.md)**
|
|
86
|
+
for the full contract: the spawn env block,
|
|
87
|
+
`error_code` → retry policy table, the `find-prior-run` → `submit` →
|
|
88
|
+
`monitor-summary` → `verify-aggregation-complete` workflow, the
|
|
89
|
+
`.hpc/tasks.py` boundary, and the executor import allowlist.
|
|
90
|
+
|
|
91
|
+
The canonical reference for `.hpc/tasks.py` is shipped inside the
|
|
92
|
+
package at
|
|
93
|
+
[`src/hpc_agent/mapreduce/templates/scaffolds/tasks_example.py`](src/hpc_agent/mapreduce/templates/scaffolds/tasks_example.py).
|
|
94
|
+
It demonstrates three patterns (Cartesian product, chunking by row
|
|
95
|
+
count, date-window backtests) inline. Integrators locate it at runtime
|
|
96
|
+
via `from hpc_agent import _PACKAGE_ROOT` or `rglob("tasks_example.py")`.
|
|
97
|
+
|
|
98
|
+
The most common first-time failure is the harness's default-empty
|
|
99
|
+
spawn env dropping `SSH_AUTH_SOCK`. `hpc-agent
|
|
100
|
+
status`/`aggregate`/`reconcile` fail fast with `error_code:
|
|
101
|
+
"ssh_unreachable"` (exit 2) instead of hanging on auth — run
|
|
102
|
+
`hpc-agent preflight` first to verify the spawn env. hpc-agent does
|
|
103
|
+
not kill cluster jobs by design (`settings.json` denies
|
|
104
|
+
`scancel`/`qdel`); if the integrator decides a run is bad, stop
|
|
105
|
+
polling and let it expire.
|
|
106
|
+
|
|
107
|
+
---
|
|
108
|
+
|
|
109
|
+
## Standalone usage
|
|
110
|
+
|
|
111
|
+
### Organize your experiment repo
|
|
112
|
+
|
|
113
|
+
Keep standalone executor scripts in a dedicated directory, separate from shared utilities:
|
|
114
|
+
|
|
115
|
+
```
|
|
116
|
+
my_experiment/
|
|
117
|
+
├── executors/ # or src/ — each file is a runnable experiment
|
|
118
|
+
│ ├── ml_ridge.py # python3 executors/ml_ridge.py --help
|
|
119
|
+
│ ├── ml_xgboost.py
|
|
120
|
+
│ └── dl_patchts.py
|
|
121
|
+
├── lib/ # shared utilities (not executors)
|
|
122
|
+
│ ├── loading.py
|
|
123
|
+
│ └── transforms.py
|
|
124
|
+
└── data/
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
Each executor accepts experiment-specific arguments (`--horizon`, `--start`, `--end`, `--features`, etc.). No HPC awareness is needed — all parameters arrive as CLI flags.
|
|
128
|
+
|
|
129
|
+
### Run
|
|
130
|
+
|
|
131
|
+
```
|
|
132
|
+
/preflight → verify SSH agent + cluster reachability before first submit
|
|
133
|
+
/submit-hpc → discovers executors, walks you through .hpc/tasks.py, syncs code, submits
|
|
134
|
+
/monitor-hpc → tracks completion per grid point, diagnoses failures, auto-resubmits
|
|
135
|
+
/aggregate-hpc → validates completeness, runs aggregation, downloads summaries
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
**Example conversation:**
|
|
139
|
+
|
|
140
|
+
```
|
|
141
|
+
You: /submit-hpc run ridge and xgboost with horizon=[1, 5, 25]
|
|
142
|
+
|
|
143
|
+
Claude: I found these executors in src/:
|
|
144
|
+
ml_ridge.py — --horizon, --start, --end, --output-file
|
|
145
|
+
ml_xgboost.py — --horizon, --start, --end, --output-file
|
|
146
|
+
|
|
147
|
+
Proposed plan:
|
|
148
|
+
Cluster: hoffman2 (SGE)
|
|
149
|
+
Grid: executor=[ml_ridge, ml_xgboost] × horizon=[1, 5, 25] → 6 grid points
|
|
150
|
+
Total: 6 tasks
|
|
151
|
+
Resources: 1 CPU, 16G, 4:00:00
|
|
152
|
+
Confirm?
|
|
153
|
+
|
|
154
|
+
You: yes
|
|
155
|
+
|
|
156
|
+
Claude: Submitted job 12345678 (6 tasks). Run /monitor-hpc to track progress.
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
No config files required. Claude discovers your executors by reading their source and `--help`, then suggests resources conversationally based on the executor and your input.
|
|
160
|
+
|
|
161
|
+
## How It Works
|
|
162
|
+
|
|
163
|
+
The boundary between hpc-agent and your experiment repo is documented in [`docs/reference/boundary-contract.md`](docs/reference/boundary-contract.md) and enforced by `tests/test_boundary_contract.py`.
|
|
164
|
+
|
|
165
|
+
1. Claude reads your executor scripts and their `--help` output.
|
|
166
|
+
2. You describe what to run in natural language — Claude walks you through writing `.hpc/tasks.py` once: a small Python module exposing `total()` and `resolve(task_id)` that returns the per-task kwargs. The file is committed to git and reused on every subsequent submit.
|
|
167
|
+
3. A per-run sidecar `.hpc/runs/<run_id>.json` records the executor command, result-dir template, `cmd_sha`, and wave map for this particular submission.
|
|
168
|
+
4. The framework executor `_hpc_dispatch.py` (zero deps, stdlib-only) is deployed to the cluster's `.hpc/` by `deploy_runtime`.
|
|
169
|
+
5. The job template runs the dispatcher, which imports your `.hpc/tasks.py`, calls `resolve(task_id)`, formats the result_dir, and execs your executor command with kwargs as env vars.
|
|
170
|
+
6. Your executor reads kwargs as ordinary env vars (uppercased + `HPC_KW_*`) — no HPC awareness needed.
|
|
171
|
+
|
|
172
|
+
### Parallelism Model
|
|
173
|
+
|
|
174
|
+
The parallelization axis lives entirely in user code (`.hpc/tasks.py`). The framework is agnostic to whether you're doing a Cartesian grid, chunking by row count, date-window backtests, or something else — it just calls `total()` and `resolve(i)`. The canonical reference at `hpc_agent/mapreduce/templates/scaffolds/tasks_example.py` shows three patterns inline; the agent helps you keep whichever applies and delete the rest.
|
|
175
|
+
|
|
176
|
+
### Memory across campaigns
|
|
177
|
+
|
|
178
|
+
Two primitives — `interview` and `recall` — close the loop between consecutive campaigns. The interview agent (Claude Code or any external orchestrator) persists structured intent (`goal`, `task_count`, `budget`, `abort_if`, `task_generator`, `cluster_target`, `transcript`, provenance) into `<campaign_dir>/interview.json` next to the materialized `tasks.py`. The next interview calls `recall --root <experiments-dir>` to query past intents, returning recency-sorted summaries plus a 3-tier rollup (counts/histograms/quantiles, optional walltime aggregation, optional per-generator parameter envelopes). Observed ranges only — reasoning over them stays in the calling agent.
|
|
179
|
+
|
|
180
|
+
See [`docs/workflows/memory-across-campaigns.md`](docs/workflows/memory-across-campaigns.md) for the full flow, including the `task_generator` typed materializer (5 shapes: `enumerated`, `cartesian_product`, `items_x_seeds`, `numeric_logspace`, `numeric_linspace`) and the `~/.hpc-agent/config.json:experiment_roots` default-root config.
|
|
181
|
+
|
|
182
|
+
### Throughput Optimization
|
|
183
|
+
|
|
184
|
+
hpc-agent automatically optimizes job submissions for cluster constraints. When constraints are configured (max array size, walltime, concurrent job limits), the optimizer packs tasks into batched waves:
|
|
185
|
+
|
|
186
|
+
- Tasks are split into arrays of ≤max_array_size
|
|
187
|
+
- Arrays are grouped into waves of ≤max_concurrent_jobs
|
|
188
|
+
- Waves are staggered via scheduler dependencies (SLURM `--dependency`, SGE `-hold_jid`)
|
|
189
|
+
- Total wall-clock time is estimated when per-task duration is known
|
|
190
|
+
|
|
191
|
+
Configure constraints in `clusters.yaml` (cluster-level); per-experiment overrides resolved at `/submit-hpc` time are persisted to the run sidecar at `.hpc/runs/<run_id>.json`.
|
|
192
|
+
|
|
193
|
+
## Commands
|
|
194
|
+
|
|
195
|
+
| Command | What it does |
|
|
196
|
+
|---------|-------------|
|
|
197
|
+
| `/preflight` | Standalone: verify SSH agent, ssh/rsync on PATH, clusters.yaml parses, cluster reachable. `/submit-hpc` auto-runs the same checks as a 24h-cached gate, so direct invocation is mostly for ad-hoc diagnostics. |
|
|
198
|
+
| `/submit-hpc` | Discover executors (scaffolds inline if none found), build grid conversationally, write `.hpc/tasks.py` with FLAGS dict + `.hpc/cli.py` dispatcher, sync code, submit array jobs |
|
|
199
|
+
| `/monitor-hpc` | Poll status, diagnose failures, auto-resubmit, self-schedule next check |
|
|
200
|
+
| `/aggregate-hpc` | Validate completeness, run aggregation on cluster, download summaries |
|
|
201
|
+
| `/campaign-hpc` | Closed-loop iteration: tag submits, read prior history, repeat `/submit-hpc campaign_id=<slug>` until the strategy stops. See [`docs/workflows/campaign.md`](docs/workflows/campaign.md). |
|
|
202
|
+
| `/hpc-axes-init` | Write `<experiment>/.hpc/axes.yaml` with the parallel-axis enumeration + homogeneity hint that drives the cold-start (and warm-path) array-axis picker. |
|
|
203
|
+
|
|
204
|
+
### Primitives
|
|
205
|
+
|
|
206
|
+
The slash commands above compose ~50 primitives exposed as `hpc-agent <name>`. Full machine-readable catalog at `docs/generated/operations.md` (auto-regenerated). High-traffic ones for agent orchestration:
|
|
207
|
+
|
|
208
|
+
| Primitive | Replaces |
|
|
209
|
+
|---|---|
|
|
210
|
+
| `submit-flow` / `submit-flow-batch` | rsync + deploy + qsub + record (single or N-spec batch with shared rsync). Auto-dispatches when the spec is `{specs: [...]}`. |
|
|
211
|
+
| `monitor-flow` | Poll-and-combine loop the slash command's tick body wraps. |
|
|
212
|
+
| `aggregate-flow` | rsync_pull `_combiner/` + `reduce_partials` + optional summary pull + ingest runtime samples. |
|
|
213
|
+
| `build-submit-spec` | Resolved-interview-values → validated `submit_flow.input.json` spec. |
|
|
214
|
+
| `build-tasks-py` | Cartesian-product axes → `.hpc/tasks.py` from the canonical Pattern 1 template. |
|
|
215
|
+
| `discover-executors` / `discover-reducers` | Scan repo for executor scripts / aggregator scripts (find existing reducer instead of writing a fresh one). |
|
|
216
|
+
| `decide-monitor-arm` | Pick cron/loop/none + cadence + cron schedule + literal `armed:` line. |
|
|
217
|
+
| `monitor-summary` | Canonical user-facing tick summary (byte-stable framing). |
|
|
218
|
+
| `summarize-submit-plan` | Canonical pre-submit confirmation summary. |
|
|
219
|
+
| `verify-canary` | Wait + grep + output-check protocol for 1-task canary submissions. |
|
|
220
|
+
| `verify-aggregation-complete` | All-waves-combined / all-tasks-present / no-cross-run-contamination invariant report. |
|
|
221
|
+
| `suggest-setup-action` / `find-prior-run` | `/submit-hpc` Setup priority cascade + `cmd_sha` resume detection. |
|
|
222
|
+
| `prune-orphan-sidecars` | Clean half-baked sidecars from failed batches. |
|
|
223
|
+
|
|
224
|
+
`hpc-agent <name> --help` shows the per-primitive args; many take `--spec <path>` for a JSON input. See `docs/primitives/<name>.md` for the per-primitive contract (idempotency, side effects, error codes, schemas).
|
|
225
|
+
|
|
226
|
+
## Configuration
|
|
227
|
+
|
|
228
|
+
### `clusters.yaml` (required)
|
|
229
|
+
|
|
230
|
+
Cluster infrastructure definitions. Ships inside the package at `hpc_agent/config/clusters.yaml`. Override the active path with `HPC_CLUSTERS_CONFIG=/your/clusters.yaml` (useful for integrators who want to keep their cluster definitions outside the package):
|
|
231
|
+
|
|
232
|
+
```yaml
|
|
233
|
+
hoffman2:
|
|
234
|
+
host: hoffman2.idre.ucla.edu
|
|
235
|
+
user: <your_user>
|
|
236
|
+
scheduler: sge
|
|
237
|
+
scratch: <your_scratch>
|
|
238
|
+
modules: [python/3.11.9]
|
|
239
|
+
conda_source: /u/local/apps/anaconda3/2024.06/etc/profile.d/conda.sh
|
|
240
|
+
conda_envs: [<your_env>] # optional — Claude presents these as options
|
|
241
|
+
gpu_types: [a100, h200, a6000]
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
### `~/.hpc-agent/config.json` (optional)
|
|
245
|
+
|
|
246
|
+
Per-user config for the `recall` primitive's default `--root`. List one or more directories under `experiment_roots` and `recall` walks them all when `--root` is omitted:
|
|
247
|
+
|
|
248
|
+
```json
|
|
249
|
+
{
|
|
250
|
+
"experiment_roots": [
|
|
251
|
+
"/home/user/experiments",
|
|
252
|
+
"/scratch/user/campaigns"
|
|
253
|
+
]
|
|
254
|
+
}
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
The `--root` CLI flag still wins when set. If neither flag nor config is present, `recall` errors with `spec_invalid` rather than silently falling back to cwd.
|
|
258
|
+
|
|
259
|
+
### Caching
|
|
260
|
+
|
|
261
|
+
Claude remembers your preferences (cluster, executor directory, environment, resources) across conversations via Claude Code memory. The `.hpc/runs/<run_id>.json` sidecars (paired with `.hpc/tasks.py`) serve as the submission record for monitoring and resubmission.
|
|
262
|
+
|
|
263
|
+
## Job Templates
|
|
264
|
+
|
|
265
|
+
| Template | SGE | SLURM |
|
|
266
|
+
|----------|-----|-------|
|
|
267
|
+
| CPU array | `hpc_agent/mapreduce/templates/runtime/sge/cpu_array.sh` | `hpc_agent/mapreduce/templates/runtime/slurm/cpu_array.slurm` |
|
|
268
|
+
| GPU array | `hpc_agent/mapreduce/templates/runtime/sge/gpu_array.sh` | `hpc_agent/mapreduce/templates/runtime/slurm/gpu_array.slurm` |
|
|
269
|
+
|
|
270
|
+
Templates are parameterized via environment variables injected at submission time. Resolve paths via `hpc_agent.get_template_path(scheduler, template)`. The GPU template is used when the configured resources include `gpus`; otherwise the CPU template is used.
|
|
271
|
+
|
|
272
|
+
## Supported Clusters
|
|
273
|
+
|
|
274
|
+
| Cluster | Institution | Scheduler |
|
|
275
|
+
|---------|------------|-----------|
|
|
276
|
+
| Hoffman2 | UCLA IDRE | SGE |
|
|
277
|
+
| Discovery | USC CARC | SLURM |
|
|
278
|
+
|
|
279
|
+
Cluster connection details are in `hpc_agent/config/clusters.yaml` (or whatever `HPC_CLUSTERS_CONFIG` points at).
|
|
280
|
+
|
|
281
|
+
## Python API
|
|
282
|
+
|
|
283
|
+
```python
|
|
284
|
+
from hpc_agent import (
|
|
285
|
+
# Framework subdirectory layout
|
|
286
|
+
framework_subdir, runs_subdir, tasks_path, load_tasks_module,
|
|
287
|
+
# Per-run sidecars
|
|
288
|
+
compute_cmd_sha, write_run_sidecar, read_run_sidecar,
|
|
289
|
+
find_run_by_cmd_sha, find_existing_runs,
|
|
290
|
+
# Cluster config
|
|
291
|
+
load_clusters_config, get_template_path, _PACKAGE_ROOT,
|
|
292
|
+
# Submission
|
|
293
|
+
ClusterConstraints, parse_constraints,
|
|
294
|
+
WorkloadSpec, compute_submission_plan, build_wave_map,
|
|
295
|
+
deploy_runtime, run_combiner_checked,
|
|
296
|
+
)
|
|
297
|
+
from hpc_agent.infra.backends import get_backend
|
|
298
|
+
```
|
|
299
|
+
|
|
300
|
+
## Development
|
|
301
|
+
|
|
302
|
+
```bash
|
|
303
|
+
pip install -e '.[dev]'
|
|
304
|
+
pre-commit install # auto-runs ruff, frontmatter regen, index regen
|
|
305
|
+
pytest -q # 1400+ tests
|
|
306
|
+
```
|
|
307
|
+
|
|
308
|
+
The pre-commit hook regenerates `docs/primitives/*.md` frontmatter,
|
|
309
|
+
`docs/primitives/README.md` catalog, and `docs/generated/operations.md`
|
|
310
|
+
from the `@primitive` registry, then auto-stages the result. Without it
|
|
311
|
+
you'll see CI fail on the corresponding `--check` gates and have to
|
|
312
|
+
push a follow-up `chore: regenerate ...` commit.
|
|
313
|
+
|
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
# hpc-agent
|
|
2
|
+
|
|
3
|
+
HPC orchestrator for array-batch experiments on SGE/SLURM clusters. Two surfaces over one core:
|
|
4
|
+
|
|
5
|
+
- **Slash commands for humans** in Claude Code (`/submit-hpc`, `/monitor-hpc`, `/aggregate-hpc`, `/campaign-hpc`, `/preflight`) — interactive markdown templates in `slash_commands/commands/*.md` that walk you through choosing a cluster and authoring `.hpc/tasks.py`. Executor scaffolding is folded into `/submit-hpc` Step 1; preflight is folded into `/submit-hpc` Step 6b as an idempotent gate (with `/preflight` still available as a standalone diagnostic).
|
|
6
|
+
- **CLI for agents and automation** (`hpc-agent <subcommand>`) — JSON-in, JSON-out, exit codes. Designed to be invoked via a `Bash`-style tool by external orchestrators. This is a POSIX-native agent surface: any tool that can shell out and parse JSON can drive a cluster — see [`docs/reference/agent-surface.md`](docs/reference/agent-surface.md). For integrators: [`docs/integrations/CONTRACT.md`](docs/integrations/CONTRACT.md).
|
|
7
|
+
|
|
8
|
+
Both surfaces invoke `hpc-agent <subcommand>`. The slash commands are pure markdown that orchestrate the binary; the binary's atomic-ops layer (`hpc_agent.runner`) ensures cross-surface state — in-flight runs, journal records under `~/.claude/hpc/<repo_hash>/` — is shared automatically.
|
|
9
|
+
|
|
10
|
+
## Quick Start
|
|
11
|
+
|
|
12
|
+
### For humans (Claude Code)
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
pip install hpc-agent # or `pip install -e .` from a checkout
|
|
16
|
+
hpc-agent setup # copy commands + skills, wire the Stop hooks
|
|
17
|
+
```
|
|
18
|
+
`hpc-agent setup` copies the bundled slash commands into
|
|
19
|
+
`~/.claude/commands/` and the skills into `~/.claude/skills/`, then
|
|
20
|
+
installs hpc-agent's Stop hooks — all idempotent, so re-running is
|
|
21
|
+
safe. Both asset trees ship inside the package, so this works the same
|
|
22
|
+
from a wheel install or an editable checkout. Pass `--no-hooks` to
|
|
23
|
+
skip the hook step or `--dry-run` to preview. Every command
|
|
24
|
+
(`/preflight`, `/submit-hpc`, `/monitor-hpc`, `/aggregate-hpc`,
|
|
25
|
+
`/campaign-hpc`, `/hpc-axes-init`) and skill ships inside the package.
|
|
26
|
+
|
|
27
|
+
Once installed:
|
|
28
|
+
|
|
29
|
+
- `/preflight` (optional) — verify SSH agent + cluster reachability. `/submit-hpc` auto-runs this as a cached gate, so you only need it for ad-hoc diagnostics.
|
|
30
|
+
- `/submit-hpc` — answer prompts about cluster, executor, grid params. Scaffolds the executor inline if none exists.
|
|
31
|
+
- `/monitor-hpc` to monitor, `/aggregate-hpc` to collect results.
|
|
32
|
+
|
|
33
|
+
### For agents and automation
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install hpc-agent
|
|
37
|
+
hpc-agent preflight --cluster hoffman2 # health check
|
|
38
|
+
hpc-agent interview --spec intent.json --campaign-dir <d> # persist campaign intent next to tasks.py
|
|
39
|
+
hpc-agent recall --root ~/experiments --task-kind <kind> # query past interviews for next-interview grounding
|
|
40
|
+
hpc-agent submit --spec spec.json # JSON envelope on stdout
|
|
41
|
+
hpc-agent status --run-id <id> # one-shot snapshot; poll as needed
|
|
42
|
+
hpc-agent aggregate --run-id <id> --wave 1 # combiner + result pull
|
|
43
|
+
```
|
|
44
|
+
Stdout is a single-line JSON envelope: `{"ok": true, "idempotent": ..., "data": {...}}` or `{"ok": false, "error_code": ..., "retry_safe": ..., "remediation": ...}`. Exit codes: 0 ok, 1 user error, 2 cluster/network, 3 internal. Full schema in [`docs/reference/cli-spec.md`](docs/reference/cli-spec.md); JSON Schema files for runtime validation under `hpc_agent/schemas/`.
|
|
45
|
+
|
|
46
|
+
### For integrators
|
|
47
|
+
|
|
48
|
+
hpc-agent is `Bash`-invokable from any agent harness with a JSON
|
|
49
|
+
parser. See **[`docs/integrations/CONTRACT.md`](docs/integrations/CONTRACT.md)**
|
|
50
|
+
for the full contract: the spawn env block,
|
|
51
|
+
`error_code` → retry policy table, the `find-prior-run` → `submit` →
|
|
52
|
+
`monitor-summary` → `verify-aggregation-complete` workflow, the
|
|
53
|
+
`.hpc/tasks.py` boundary, and the executor import allowlist.
|
|
54
|
+
|
|
55
|
+
The canonical reference for `.hpc/tasks.py` is shipped inside the
|
|
56
|
+
package at
|
|
57
|
+
[`src/hpc_agent/mapreduce/templates/scaffolds/tasks_example.py`](src/hpc_agent/mapreduce/templates/scaffolds/tasks_example.py).
|
|
58
|
+
It demonstrates three patterns (Cartesian product, chunking by row
|
|
59
|
+
count, date-window backtests) inline. Integrators locate it at runtime
|
|
60
|
+
via `from hpc_agent import _PACKAGE_ROOT` or `rglob("tasks_example.py")`.
|
|
61
|
+
|
|
62
|
+
The most common first-time failure is the harness's default-empty
|
|
63
|
+
spawn env dropping `SSH_AUTH_SOCK`. `hpc-agent
|
|
64
|
+
status`/`aggregate`/`reconcile` fail fast with `error_code:
|
|
65
|
+
"ssh_unreachable"` (exit 2) instead of hanging on auth — run
|
|
66
|
+
`hpc-agent preflight` first to verify the spawn env. hpc-agent does
|
|
67
|
+
not kill cluster jobs by design (`settings.json` denies
|
|
68
|
+
`scancel`/`qdel`); if the integrator decides a run is bad, stop
|
|
69
|
+
polling and let it expire.
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
## Standalone usage
|
|
74
|
+
|
|
75
|
+
### Organize your experiment repo
|
|
76
|
+
|
|
77
|
+
Keep standalone executor scripts in a dedicated directory, separate from shared utilities:
|
|
78
|
+
|
|
79
|
+
```
|
|
80
|
+
my_experiment/
|
|
81
|
+
├── executors/ # or src/ — each file is a runnable experiment
|
|
82
|
+
│ ├── ml_ridge.py # python3 executors/ml_ridge.py --help
|
|
83
|
+
│ ├── ml_xgboost.py
|
|
84
|
+
│ └── dl_patchts.py
|
|
85
|
+
├── lib/ # shared utilities (not executors)
|
|
86
|
+
│ ├── loading.py
|
|
87
|
+
│ └── transforms.py
|
|
88
|
+
└── data/
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
Each executor accepts experiment-specific arguments (`--horizon`, `--start`, `--end`, `--features`, etc.). No HPC awareness is needed — all parameters arrive as CLI flags.
|
|
92
|
+
|
|
93
|
+
### Run
|
|
94
|
+
|
|
95
|
+
```
|
|
96
|
+
/preflight → verify SSH agent + cluster reachability before first submit
|
|
97
|
+
/submit-hpc → discovers executors, walks you through .hpc/tasks.py, syncs code, submits
|
|
98
|
+
/monitor-hpc → tracks completion per grid point, diagnoses failures, auto-resubmits
|
|
99
|
+
/aggregate-hpc → validates completeness, runs aggregation, downloads summaries
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
**Example conversation:**
|
|
103
|
+
|
|
104
|
+
```
|
|
105
|
+
You: /submit-hpc run ridge and xgboost with horizon=[1, 5, 25]
|
|
106
|
+
|
|
107
|
+
Claude: I found these executors in src/:
|
|
108
|
+
ml_ridge.py — --horizon, --start, --end, --output-file
|
|
109
|
+
ml_xgboost.py — --horizon, --start, --end, --output-file
|
|
110
|
+
|
|
111
|
+
Proposed plan:
|
|
112
|
+
Cluster: hoffman2 (SGE)
|
|
113
|
+
Grid: executor=[ml_ridge, ml_xgboost] × horizon=[1, 5, 25] → 6 grid points
|
|
114
|
+
Total: 6 tasks
|
|
115
|
+
Resources: 1 CPU, 16G, 4:00:00
|
|
116
|
+
Confirm?
|
|
117
|
+
|
|
118
|
+
You: yes
|
|
119
|
+
|
|
120
|
+
Claude: Submitted job 12345678 (6 tasks). Run /monitor-hpc to track progress.
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
No per-experiment config files required (the cluster definitions in `clusters.yaml` ship with the package). Claude discovers your executors by reading their source and `--help`, then suggests resources conversationally based on the executor and your input.
|
|
124
|
+
|
|
125
|
+
## How It Works
|
|
126
|
+
|
|
127
|
+
The boundary between hpc-agent and your experiment repo is documented in [`docs/reference/boundary-contract.md`](docs/reference/boundary-contract.md) and enforced by `tests/test_boundary_contract.py`.
|
|
128
|
+
|
|
129
|
+
1. Claude reads your executor scripts and their `--help` output.
|
|
130
|
+
2. You describe what to run in natural language — Claude walks you through writing `.hpc/tasks.py` once: a small Python module exposing `total()` and `resolve(task_id)` that returns the per-task kwargs. The file is committed to git and reused on every subsequent submit.
|
|
131
|
+
3. A per-run sidecar `.hpc/runs/<run_id>.json` records the executor command, result-dir template, `cmd_sha`, and wave map for this particular submission.
|
|
132
|
+
4. The framework executor `_hpc_dispatch.py` (zero deps, stdlib-only) is deployed to the cluster's `.hpc/` by `deploy_runtime`.
|
|
133
|
+
5. The job template runs the dispatcher, which imports your `.hpc/tasks.py`, calls `resolve(task_id)`, formats the result_dir, and execs your executor command with kwargs as env vars.
|
|
134
|
+
6. Your executor reads kwargs as ordinary env vars (uppercased + `HPC_KW_*`) — no HPC awareness needed.
|
|
135
|
+
|
|
136
|
+
### Parallelism Model
|
|
137
|
+
|
|
138
|
+
The parallelization axis lives entirely in user code (`.hpc/tasks.py`). The framework is agnostic to whether you're doing a Cartesian grid, chunking by row count, date-window backtests, or something else — it just calls `total()` and `resolve(i)`. The canonical reference at `hpc_agent/mapreduce/templates/scaffolds/tasks_example.py` shows three patterns inline; the agent helps you keep whichever applies and delete the rest.
|
|
139
|
+
|
|
140
|
+
### Memory across campaigns
|
|
141
|
+
|
|
142
|
+
Two primitives — `interview` and `recall` — close the loop between consecutive campaigns. The interview agent (Claude Code or any external orchestrator) persists structured intent (`goal`, `task_count`, `budget`, `abort_if`, `task_generator`, `cluster_target`, `transcript`, provenance) into `<campaign_dir>/interview.json` next to the materialized `tasks.py`. The next interview calls `recall --root <experiments-dir>` to query past intents, returning recency-sorted summaries plus a 3-tier rollup (counts/histograms/quantiles, optional walltime aggregation, optional per-generator parameter envelopes). Observed ranges only — reasoning over them stays in the calling agent.
|
|
143
|
+
|
|
144
|
+
See [`docs/workflows/memory-across-campaigns.md`](docs/workflows/memory-across-campaigns.md) for the full flow, including the `task_generator` typed materializer (5 shapes: `enumerated`, `cartesian_product`, `items_x_seeds`, `numeric_logspace`, `numeric_linspace`) and the `~/.hpc-agent/config.json:experiment_roots` default-root config.
|
|
145
|
+
|
|
146
|
+
### Throughput Optimization
|
|
147
|
+
|
|
148
|
+
hpc-agent automatically optimizes job submissions for cluster constraints. When constraints are configured (max array size, walltime, concurrent job limits), the optimizer packs tasks into batched waves:
|
|
149
|
+
|
|
150
|
+
- Tasks are split into arrays of ≤max_array_size
|
|
151
|
+
- Arrays are grouped into waves of ≤max_concurrent_jobs
|
|
152
|
+
- Waves are staggered via scheduler dependencies (SLURM `--dependency`, SGE `-hold_jid`)
|
|
153
|
+
- Total wall-clock time is estimated when per-task duration is known
|
|
154
|
+
|
|
155
|
+
Configure constraints in `clusters.yaml` (cluster-level); per-experiment overrides resolved at `/submit-hpc` time are persisted to the run sidecar at `.hpc/runs/<run_id>.json`.
|
|
156
|
+
|
|
157
|
+
## Commands
|
|
158
|
+
|
|
159
|
+
| Command | What it does |
|
|
160
|
+
|---------|-------------|
|
|
161
|
+
| `/preflight` | Standalone: verify SSH agent, ssh/rsync on PATH, clusters.yaml parses, cluster reachable. `/submit-hpc` auto-runs the same checks as a 24h-cached gate, so direct invocation is mostly for ad-hoc diagnostics. |
|
|
162
|
+
| `/submit-hpc` | Discover executors (scaffolds inline if none found), build grid conversationally, write `.hpc/tasks.py` with FLAGS dict + `.hpc/cli.py` dispatcher, sync code, submit array jobs |
|
|
163
|
+
| `/monitor-hpc` | Poll status, diagnose failures, auto-resubmit, self-schedule next check |
|
|
164
|
+
| `/aggregate-hpc` | Validate completeness, run aggregation on cluster, download summaries |
|
|
165
|
+
| `/campaign-hpc` | Closed-loop iteration: tag submits, read prior history, repeat `/submit-hpc campaign_id=<slug>` until the strategy stops. See [`docs/workflows/campaign.md`](docs/workflows/campaign.md). |
|
|
166
|
+
| `/hpc-axes-init` | Write `<experiment>/.hpc/axes.yaml` with the parallel-axis enumeration + homogeneity hint that drives the cold-start (and warm-path) array-axis picker. |
|
|
167
|
+
|
|
168
|
+
### Primitives
|
|
169
|
+
|
|
170
|
+
The slash commands above compose ~50 primitives exposed as `hpc-agent <name>`. Full machine-readable catalog at `docs/generated/operations.md` (auto-regenerated). High-traffic ones for agent orchestration:
|
|
171
|
+
|
|
172
|
+
| Primitive | Replaces |
|
|
173
|
+
|---|---|
|
|
174
|
+
| `submit-flow` / `submit-flow-batch` | rsync + deploy + qsub + record (single or N-spec batch with shared rsync). Auto-dispatches when the spec is `{specs: [...]}`. |
|
|
175
|
+
| `monitor-flow` | Poll-and-combine loop the slash command's tick body wraps. |
|
|
176
|
+
| `aggregate-flow` | rsync_pull `_combiner/` + `reduce_partials` + optional summary pull + ingest runtime samples. |
|
|
177
|
+
| `build-submit-spec` | Resolved-interview-values → validated `submit_flow.input.json` spec. |
|
|
178
|
+
| `build-tasks-py` | Cartesian-product axes → `.hpc/tasks.py` from the canonical Pattern 1 template. |
|
|
179
|
+
| `discover-executors` / `discover-reducers` | Scan repo for executor scripts / aggregator scripts (find existing reducer instead of writing a fresh one). |
|
|
180
|
+
| `decide-monitor-arm` | Pick cron/loop/none + cadence + cron schedule + literal `armed:` line. |
|
|
181
|
+
| `monitor-summary` | Canonical user-facing tick summary (byte-stable framing). |
|
|
182
|
+
| `summarize-submit-plan` | Canonical pre-submit confirmation summary. |
|
|
183
|
+
| `verify-canary` | Wait + grep + output-check protocol for 1-task canary submissions. |
|
|
184
|
+
| `verify-aggregation-complete` | All-waves-combined / all-tasks-present / no-cross-run-contamination invariant report. |
|
|
185
|
+
| `suggest-setup-action` / `find-prior-run` | `/submit-hpc` Setup priority cascade + `cmd_sha` resume detection. |
|
|
186
|
+
| `prune-orphan-sidecars` | Clean half-baked sidecars from failed batches. |
|
|
187
|
+
|
|
188
|
+
`hpc-agent <name> --help` shows the per-primitive args; many take `--spec <path>` for a JSON input. See `docs/primitives/<name>.md` for the per-primitive contract (idempotency, side effects, error codes, schemas).
|
|
189
|
+
|
|
190
|
+
## Configuration
|
|
191
|
+
|
|
192
|
+
### `clusters.yaml` (required)
|
|
193
|
+
|
|
194
|
+
Cluster infrastructure definitions. Ships inside the package at `hpc_agent/config/clusters.yaml`. Override the active path with `HPC_CLUSTERS_CONFIG=/your/clusters.yaml` (useful for integrators who want to keep their cluster definitions outside the package):
|
|
195
|
+
|
|
196
|
+
```yaml
|
|
197
|
+
hoffman2:
|
|
198
|
+
host: hoffman2.idre.ucla.edu
|
|
199
|
+
user: <your_user>
|
|
200
|
+
scheduler: sge
|
|
201
|
+
scratch: <your_scratch>
|
|
202
|
+
modules: [python/3.11.9]
|
|
203
|
+
conda_source: /u/local/apps/anaconda3/2024.06/etc/profile.d/conda.sh
|
|
204
|
+
conda_envs: [<your_env>] # optional — Claude presents these as options
|
|
205
|
+
gpu_types: [a100, h200, a6000]
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
### `~/.hpc-agent/config.json` (optional)
|
|
209
|
+
|
|
210
|
+
Per-user config for the `recall` primitive's default `--root`. List one or more directories under `experiment_roots` and `recall` walks them all when `--root` is omitted:
|
|
211
|
+
|
|
212
|
+
```json
|
|
213
|
+
{
|
|
214
|
+
"experiment_roots": [
|
|
215
|
+
"/home/user/experiments",
|
|
216
|
+
"/scratch/user/campaigns"
|
|
217
|
+
]
|
|
218
|
+
}
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
The `--root` CLI flag still wins when set. If neither flag nor config is present, `recall` errors with `spec_invalid` rather than silently falling back to cwd.
|
|
222
|
+
|
|
223
|
+
### Caching
|
|
224
|
+
|
|
225
|
+
Claude remembers your preferences (cluster, executor directory, environment, resources) across conversations via Claude Code memory. The `.hpc/runs/<run_id>.json` sidecars (paired with `.hpc/tasks.py`) serve as the submission record for monitoring and resubmission.
|
|
226
|
+
|
|
227
|
+
## Job Templates
|
|
228
|
+
|
|
229
|
+
| Template | SGE | SLURM |
|
|
230
|
+
|----------|-----|-------|
|
|
231
|
+
| CPU array | `hpc_agent/mapreduce/templates/runtime/sge/cpu_array.sh` | `hpc_agent/mapreduce/templates/runtime/slurm/cpu_array.slurm` |
|
|
232
|
+
| GPU array | `hpc_agent/mapreduce/templates/runtime/sge/gpu_array.sh` | `hpc_agent/mapreduce/templates/runtime/slurm/gpu_array.slurm` |
|
|
233
|
+
|
|
234
|
+
Templates are parameterized via environment variables injected at submission time. Resolve paths via `hpc_agent.get_template_path(scheduler, template)`. The GPU template is used when the configured resources include `gpus`; otherwise the CPU template is used.
|
|
235
|
+
|
|
236
|
+
## Supported Clusters
|
|
237
|
+
|
|
238
|
+
| Cluster | Institution | Scheduler |
|
|
239
|
+
|---------|------------|-----------|
|
|
240
|
+
| Hoffman2 | UCLA IDRE | SGE |
|
|
241
|
+
| Discovery | USC CARC | SLURM |
|
|
242
|
+
|
|
243
|
+
Cluster connection details are in `hpc_agent/config/clusters.yaml` (or whatever `HPC_CLUSTERS_CONFIG` points at).
|
|
244
|
+
|
|
245
|
+
## Python API
|
|
246
|
+
|
|
247
|
+
```python
|
|
248
|
+
from hpc_agent import (
|
|
249
|
+
# Framework subdirectory layout
|
|
250
|
+
framework_subdir, runs_subdir, tasks_path, load_tasks_module,
|
|
251
|
+
# Per-run sidecars
|
|
252
|
+
compute_cmd_sha, write_run_sidecar, read_run_sidecar,
|
|
253
|
+
find_run_by_cmd_sha, find_existing_runs,
|
|
254
|
+
# Cluster config
|
|
255
|
+
load_clusters_config, get_template_path, _PACKAGE_ROOT,
|
|
256
|
+
# Submission
|
|
257
|
+
ClusterConstraints, parse_constraints,
|
|
258
|
+
WorkloadSpec, compute_submission_plan, build_wave_map,
|
|
259
|
+
deploy_runtime, run_combiner_checked,
|
|
260
|
+
)
|
|
261
|
+
from hpc_agent.infra.backends import get_backend
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
## Development
|
|
265
|
+
|
|
266
|
+
```bash
|
|
267
|
+
pip install -e '.[dev]'
|
|
268
|
+
pre-commit install # auto-runs ruff, frontmatter regen, index regen
|
|
269
|
+
pytest -q # 1400+ tests
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
The pre-commit hook regenerates `docs/primitives/*.md` frontmatter,
|
|
273
|
+
`docs/primitives/README.md` catalog, and `docs/generated/operations.md`
|
|
274
|
+
from the `@primitive` registry, then auto-stages the result. Without it
|
|
275
|
+
you'll see CI fail on the corresponding `--check` gates and have to
|
|
276
|
+
push a follow-up `chore: regenerate ...` commit.
|
|
277
|
+
|