slurmforge 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (371) hide show
  1. slurmforge/__init__.py +7 -0
  2. slurmforge/cli/__init__.py +1 -0
  3. slurmforge/cli/common.py +95 -0
  4. slurmforge/cli/examples.py +50 -0
  5. slurmforge/cli/generate.py +102 -0
  6. slurmforge/cli/init.py +148 -0
  7. slurmforge/cli/init_wizard.py +80 -0
  8. slurmforge/cli/replay.py +102 -0
  9. slurmforge/cli/rerun.py +76 -0
  10. slurmforge/cli/status.py +85 -0
  11. slurmforge/cli/validate.py +108 -0
  12. slurmforge/errors.py +29 -0
  13. slurmforge/example_configs.py +70 -0
  14. slurmforge/examples/__init__.py +1 -0
  15. slurmforge/examples/adapter_hpc.yaml +67 -0
  16. slurmforge/examples/adapter_minimal.yaml +14 -0
  17. slurmforge/examples/adapter_starter.yaml +60 -0
  18. slurmforge/examples/command_hpc.yaml +63 -0
  19. slurmforge/examples/command_minimal.yaml +9 -0
  20. slurmforge/examples/command_starter.yaml +56 -0
  21. slurmforge/examples/model_registry.yaml +7 -0
  22. slurmforge/examples/registry_hpc.yaml +86 -0
  23. slurmforge/examples/registry_starter.yaml +64 -0
  24. slurmforge/examples/script_hpc.yaml +104 -0
  25. slurmforge/examples/script_starter.yaml +62 -0
  26. slurmforge/execution/__init__.py +1 -0
  27. slurmforge/execution/artifacts/__init__.py +23 -0
  28. slurmforge/execution/artifacts/api.py +18 -0
  29. slurmforge/execution/artifacts/cli.py +52 -0
  30. slurmforge/execution/artifacts/copier.py +40 -0
  31. slurmforge/execution/artifacts/discovery.py +61 -0
  32. slurmforge/execution/artifacts/manifest.py +24 -0
  33. slurmforge/execution/artifacts/sync.py +125 -0
  34. slurmforge/execution/run_plan/__init__.py +18 -0
  35. slurmforge/execution/run_plan/api.py +42 -0
  36. slurmforge/execution/run_plan/cli.py +32 -0
  37. slurmforge/execution/run_plan/helper_bins.py +25 -0
  38. slurmforge/execution/run_plan/loader.py +47 -0
  39. slurmforge/execution/run_plan/post_run.py +20 -0
  40. slurmforge/execution/run_plan/shell_runner.py +44 -0
  41. slurmforge/execution/write_attempt_result.py +59 -0
  42. slurmforge/execution/write_train_outputs.py +59 -0
  43. slurmforge/identity.py +19 -0
  44. slurmforge/launcher.py +36 -0
  45. slurmforge/model_support/__init__.py +23 -0
  46. slurmforge/model_support/argparse_introspect.py +129 -0
  47. slurmforge/model_support/catalog/__init__.py +27 -0
  48. slurmforge/model_support/catalog/api.py +24 -0
  49. slurmforge/model_support/catalog/canonicalize.py +24 -0
  50. slurmforge/model_support/catalog/codecs.py +24 -0
  51. slurmforge/model_support/catalog/merge.py +50 -0
  52. slurmforge/model_support/catalog/models.py +29 -0
  53. slurmforge/model_support/catalog/registry_loader.py +62 -0
  54. slurmforge/model_support/catalog/resolver.py +176 -0
  55. slurmforge/model_support/gpu_estimator.py +324 -0
  56. slurmforge/pipeline/__init__.py +29 -0
  57. slurmforge/pipeline/checkpoints/__init__.py +23 -0
  58. slurmforge/pipeline/checkpoints/api.py +21 -0
  59. slurmforge/pipeline/checkpoints/codec.py +22 -0
  60. slurmforge/pipeline/checkpoints/discovery.py +37 -0
  61. slurmforge/pipeline/checkpoints/models.py +26 -0
  62. slurmforge/pipeline/checkpoints/selection.py +46 -0
  63. slurmforge/pipeline/checkpoints/store.py +29 -0
  64. slurmforge/pipeline/compiler/__init__.py +23 -0
  65. slurmforge/pipeline/compiler/api.py +96 -0
  66. slurmforge/pipeline/compiler/base.py +83 -0
  67. slurmforge/pipeline/compiler/config_pass.py +77 -0
  68. slurmforge/pipeline/compiler/diagnostics.py +43 -0
  69. slurmforge/pipeline/compiler/engine.py +104 -0
  70. slurmforge/pipeline/compiler/flows/__init__.py +6 -0
  71. slurmforge/pipeline/compiler/flows/authoring/__init__.py +5 -0
  72. slurmforge/pipeline/compiler/flows/authoring/api.py +56 -0
  73. slurmforge/pipeline/compiler/flows/authoring/collect.py +35 -0
  74. slurmforge/pipeline/compiler/flows/authoring/context.py +32 -0
  75. slurmforge/pipeline/compiler/flows/authoring/identity.py +53 -0
  76. slurmforge/pipeline/compiler/flows/authoring/spec_builder.py +34 -0
  77. slurmforge/pipeline/compiler/flows/replay/__init__.py +5 -0
  78. slurmforge/pipeline/compiler/flows/replay/api.py +58 -0
  79. slurmforge/pipeline/compiler/flows/replay/collect.py +45 -0
  80. slurmforge/pipeline/compiler/flows/replay/context.py +90 -0
  81. slurmforge/pipeline/compiler/flows/replay/identity.py +47 -0
  82. slurmforge/pipeline/compiler/flows/replay/spec_builder.py +23 -0
  83. slurmforge/pipeline/compiler/planning_pass.py +71 -0
  84. slurmforge/pipeline/compiler/reporting.py +46 -0
  85. slurmforge/pipeline/compiler/reports/__init__.py +39 -0
  86. slurmforge/pipeline/compiler/reports/actions.py +23 -0
  87. slurmforge/pipeline/compiler/reports/builders.py +87 -0
  88. slurmforge/pipeline/compiler/reports/errors.py +19 -0
  89. slurmforge/pipeline/compiler/reports/models.py +74 -0
  90. slurmforge/pipeline/compiler/reports/summary.py +77 -0
  91. slurmforge/pipeline/compiler/reports/validator.py +14 -0
  92. slurmforge/pipeline/compiler/requests.py +86 -0
  93. slurmforge/pipeline/compiler/state.py +59 -0
  94. slurmforge/pipeline/config/__init__.py +13 -0
  95. slurmforge/pipeline/config/api.py +49 -0
  96. slurmforge/pipeline/config/assembly/__init__.py +45 -0
  97. slurmforge/pipeline/config/assembly/authoring/__init__.py +16 -0
  98. slurmforge/pipeline/config/assembly/authoring/builders.py +70 -0
  99. slurmforge/pipeline/config/assembly/authoring/expansion.py +66 -0
  100. slurmforge/pipeline/config/assembly/authoring/models.py +16 -0
  101. slurmforge/pipeline/config/assembly/authoring/shared.py +58 -0
  102. slurmforge/pipeline/config/assembly/authoring/validation.py +51 -0
  103. slurmforge/pipeline/config/assembly/batch_contract.py +20 -0
  104. slurmforge/pipeline/config/assembly/catalog.py +30 -0
  105. slurmforge/pipeline/config/assembly/eval.py +114 -0
  106. slurmforge/pipeline/config/assembly/experiment/__init__.py +9 -0
  107. slurmforge/pipeline/config/assembly/experiment/api.py +26 -0
  108. slurmforge/pipeline/config/assembly/experiment/assembler.py +64 -0
  109. slurmforge/pipeline/config/assembly/experiment/hints.py +14 -0
  110. slurmforge/pipeline/config/assembly/experiment/inputs.py +61 -0
  111. slurmforge/pipeline/config/assembly/experiment/sections.py +103 -0
  112. slurmforge/pipeline/config/assembly/output.py +28 -0
  113. slurmforge/pipeline/config/assembly/replay.py +31 -0
  114. slurmforge/pipeline/config/assembly/run/__init__.py +17 -0
  115. slurmforge/pipeline/config/assembly/run/adapter.py +56 -0
  116. slurmforge/pipeline/config/assembly/run/builder.py +67 -0
  117. slurmforge/pipeline/config/assembly/run/external_runtime.py +56 -0
  118. slurmforge/pipeline/config/assembly/run/model.py +47 -0
  119. slurmforge/pipeline/config/assembly/run/shared.py +29 -0
  120. slurmforge/pipeline/config/assembly/spec_builder.py +81 -0
  121. slurmforge/pipeline/config/codecs/__init__.py +25 -0
  122. slurmforge/pipeline/config/codecs/eval.py +82 -0
  123. slurmforge/pipeline/config/codecs/experiment.py +54 -0
  124. slurmforge/pipeline/config/codecs/model.py +20 -0
  125. slurmforge/pipeline/config/codecs/output.py +16 -0
  126. slurmforge/pipeline/config/codecs/run.py +48 -0
  127. slurmforge/pipeline/config/codecs/runtime.py +10 -0
  128. slurmforge/pipeline/config/constants.py +7 -0
  129. slurmforge/pipeline/config/mode_detection.py +36 -0
  130. slurmforge/pipeline/config/models/__init__.py +23 -0
  131. slurmforge/pipeline/config/models/eval.py +32 -0
  132. slurmforge/pipeline/config/models/experiment.py +91 -0
  133. slurmforge/pipeline/config/models/model.py +13 -0
  134. slurmforge/pipeline/config/models/output.py +10 -0
  135. slurmforge/pipeline/config/models/run.py +35 -0
  136. slurmforge/pipeline/config/models/runtime.py +16 -0
  137. slurmforge/pipeline/config/normalize/__init__.py +42 -0
  138. slurmforge/pipeline/config/normalize/artifacts.py +27 -0
  139. slurmforge/pipeline/config/normalize/cluster.py +33 -0
  140. slurmforge/pipeline/config/normalize/env.py +34 -0
  141. slurmforge/pipeline/config/normalize/launcher.py +50 -0
  142. slurmforge/pipeline/config/normalize/notify.py +37 -0
  143. slurmforge/pipeline/config/normalize/resources.py +58 -0
  144. slurmforge/pipeline/config/normalize/shared.py +19 -0
  145. slurmforge/pipeline/config/normalize/slurm_deps.py +72 -0
  146. slurmforge/pipeline/config/normalize/validation.py +40 -0
  147. slurmforge/pipeline/config/replay_payload.py +58 -0
  148. slurmforge/pipeline/config/runtime/__init__.py +53 -0
  149. slurmforge/pipeline/config/runtime/api.py +57 -0
  150. slurmforge/pipeline/config/runtime/codecs/__init__.py +20 -0
  151. slurmforge/pipeline/config/runtime/codecs/artifacts.py +14 -0
  152. slurmforge/pipeline/config/runtime/codecs/cluster.py +20 -0
  153. slurmforge/pipeline/config/runtime/codecs/env.py +14 -0
  154. slurmforge/pipeline/config/runtime/codecs/launcher.py +24 -0
  155. slurmforge/pipeline/config/runtime/codecs/notify.py +13 -0
  156. slurmforge/pipeline/config/runtime/codecs/resources.py +17 -0
  157. slurmforge/pipeline/config/runtime/codecs/validation.py +14 -0
  158. slurmforge/pipeline/config/runtime/defaults.py +30 -0
  159. slurmforge/pipeline/config/runtime/models/__init__.py +20 -0
  160. slurmforge/pipeline/config/runtime/models/artifacts.py +43 -0
  161. slurmforge/pipeline/config/runtime/models/cluster.py +21 -0
  162. slurmforge/pipeline/config/runtime/models/env.py +19 -0
  163. slurmforge/pipeline/config/runtime/models/launcher.py +24 -0
  164. slurmforge/pipeline/config/runtime/models/notify.py +10 -0
  165. slurmforge/pipeline/config/runtime/models/resources.py +14 -0
  166. slurmforge/pipeline/config/runtime/models/validation.py +11 -0
  167. slurmforge/pipeline/config/scalars.py +25 -0
  168. slurmforge/pipeline/config/utils.py +67 -0
  169. slurmforge/pipeline/config/validation/__init__.py +28 -0
  170. slurmforge/pipeline/config/validation/_helpers.py +59 -0
  171. slurmforge/pipeline/config/validation/advisory.py +94 -0
  172. slurmforge/pipeline/config/validation/api.py +34 -0
  173. slurmforge/pipeline/config/validation/authoring.py +41 -0
  174. slurmforge/pipeline/config/validation/completeness.py +170 -0
  175. slurmforge/pipeline/config/validation/correctness.py +114 -0
  176. slurmforge/pipeline/config/validation/definitions.py +234 -0
  177. slurmforge/pipeline/config/validation/messages.py +188 -0
  178. slurmforge/pipeline/config/validation/replay.py +32 -0
  179. slurmforge/pipeline/config/validation/sections.py +132 -0
  180. slurmforge/pipeline/config/validation/sweep_rules.py +72 -0
  181. slurmforge/pipeline/config/validation/traversal.py +34 -0
  182. slurmforge/pipeline/launch/__init__.py +15 -0
  183. slurmforge/pipeline/launch/cli_args.py +47 -0
  184. slurmforge/pipeline/launch/command_builder.py +43 -0
  185. slurmforge/pipeline/launch/strategies.py +69 -0
  186. slurmforge/pipeline/launch/types.py +16 -0
  187. slurmforge/pipeline/materialization/__init__.py +29 -0
  188. slurmforge/pipeline/materialization/api.py +87 -0
  189. slurmforge/pipeline/materialization/array_scripts.py +140 -0
  190. slurmforge/pipeline/materialization/blocks/__init__.py +1 -0
  191. slurmforge/pipeline/materialization/blocks/artifacts.py +67 -0
  192. slurmforge/pipeline/materialization/blocks/common.py +25 -0
  193. slurmforge/pipeline/materialization/blocks/env_setup.py +59 -0
  194. slurmforge/pipeline/materialization/blocks/eval.py +72 -0
  195. slurmforge/pipeline/materialization/blocks/finalize.py +22 -0
  196. slurmforge/pipeline/materialization/blocks/preamble.py +47 -0
  197. slurmforge/pipeline/materialization/blocks/preflight.py +68 -0
  198. slurmforge/pipeline/materialization/blocks/train.py +63 -0
  199. slurmforge/pipeline/materialization/blocks/train_outputs.py +48 -0
  200. slurmforge/pipeline/materialization/commit.py +16 -0
  201. slurmforge/pipeline/materialization/context.py +45 -0
  202. slurmforge/pipeline/materialization/grouping.py +110 -0
  203. slurmforge/pipeline/materialization/layout.py +49 -0
  204. slurmforge/pipeline/materialization/manifest_writer.py +57 -0
  205. slurmforge/pipeline/materialization/record_writer.py +136 -0
  206. slurmforge/pipeline/materialization/reporting.py +55 -0
  207. slurmforge/pipeline/materialization/run_assets.py +37 -0
  208. slurmforge/pipeline/materialization/shell_builder.py +33 -0
  209. slurmforge/pipeline/materialization/slurm_deps.py +17 -0
  210. slurmforge/pipeline/materialization/submit_writer.py +72 -0
  211. slurmforge/pipeline/planning/__init__.py +35 -0
  212. slurmforge/pipeline/planning/api.py +20 -0
  213. slurmforge/pipeline/planning/batch.py +62 -0
  214. slurmforge/pipeline/planning/batch_validator.py +42 -0
  215. slurmforge/pipeline/planning/codecs/__init__.py +32 -0
  216. slurmforge/pipeline/planning/codecs/diagnostics.py +23 -0
  217. slurmforge/pipeline/planning/codecs/resources.py +74 -0
  218. slurmforge/pipeline/planning/codecs/stages.py +99 -0
  219. slurmforge/pipeline/planning/contracts.py +58 -0
  220. slurmforge/pipeline/planning/enums.py +65 -0
  221. slurmforge/pipeline/planning/eval/__init__.py +5 -0
  222. slurmforge/pipeline/planning/eval/api.py +53 -0
  223. slurmforge/pipeline/planning/eval/command.py +59 -0
  224. slurmforge/pipeline/planning/eval/common.py +28 -0
  225. slurmforge/pipeline/planning/eval/launcher_merge.py +59 -0
  226. slurmforge/pipeline/planning/eval/script.py +79 -0
  227. slurmforge/pipeline/planning/external_command.py +34 -0
  228. slurmforge/pipeline/planning/fingerprint.py +32 -0
  229. slurmforge/pipeline/planning/identity.py +62 -0
  230. slurmforge/pipeline/planning/models/__init__.py +15 -0
  231. slurmforge/pipeline/planning/models/diagnostics.py +61 -0
  232. slurmforge/pipeline/planning/models/resources.py +98 -0
  233. slurmforge/pipeline/planning/models/stages.py +94 -0
  234. slurmforge/pipeline/planning/replay_builder.py +23 -0
  235. slurmforge/pipeline/planning/run/__init__.py +9 -0
  236. slurmforge/pipeline/planning/run/api.py +11 -0
  237. slurmforge/pipeline/planning/run/assembly.py +12 -0
  238. slurmforge/pipeline/planning/run/identity.py +38 -0
  239. slurmforge/pipeline/planning/run/plan_factory.py +78 -0
  240. slurmforge/pipeline/planning/run/planned_run_factory.py +52 -0
  241. slurmforge/pipeline/planning/run/stages.py +47 -0
  242. slurmforge/pipeline/planning/snapshot_builder.py +33 -0
  243. slurmforge/pipeline/planning/train/__init__.py +23 -0
  244. slurmforge/pipeline/planning/train/allocation.py +51 -0
  245. slurmforge/pipeline/planning/train/api.py +55 -0
  246. slurmforge/pipeline/planning/train/context.py +55 -0
  247. slurmforge/pipeline/planning/train/model_resolution.py +94 -0
  248. slurmforge/pipeline/planning/train/strategies/__init__.py +3 -0
  249. slurmforge/pipeline/planning/train/strategies/adapter.py +61 -0
  250. slurmforge/pipeline/planning/train/strategies/base.py +14 -0
  251. slurmforge/pipeline/planning/train/strategies/command.py +79 -0
  252. slurmforge/pipeline/planning/train/strategies/model_cli.py +26 -0
  253. slurmforge/pipeline/planning/train/strategies/scripted.py +87 -0
  254. slurmforge/pipeline/planning/train/topology.py +134 -0
  255. slurmforge/pipeline/planning/validation/__init__.py +11 -0
  256. slurmforge/pipeline/planning/validation/api.py +25 -0
  257. slurmforge/pipeline/planning/validation/common.py +28 -0
  258. slurmforge/pipeline/planning/validation/errors.py +15 -0
  259. slurmforge/pipeline/planning/validation/formatter.py +9 -0
  260. slurmforge/pipeline/planning/validation/passes/__init__.py +3 -0
  261. slurmforge/pipeline/planning/validation/passes/cli_args.py +53 -0
  262. slurmforge/pipeline/planning/validation/passes/resources.py +151 -0
  263. slurmforge/pipeline/planning/validation/passes/summary.py +21 -0
  264. slurmforge/pipeline/planning/validation/passes/topology.py +115 -0
  265. slurmforge/pipeline/planning/validation/policies.py +50 -0
  266. slurmforge/pipeline/planning/validator.py +9 -0
  267. slurmforge/pipeline/records/__init__.py +3 -0
  268. slurmforge/pipeline/records/api.py +57 -0
  269. slurmforge/pipeline/records/batch_io.py +42 -0
  270. slurmforge/pipeline/records/batch_paths.py +83 -0
  271. slurmforge/pipeline/records/codecs/__init__.py +41 -0
  272. slurmforge/pipeline/records/codecs/array_assignment.py +31 -0
  273. slurmforge/pipeline/records/codecs/metadata.py +27 -0
  274. slurmforge/pipeline/records/codecs/run_plan.py +107 -0
  275. slurmforge/pipeline/records/codecs/run_snapshot.py +42 -0
  276. slurmforge/pipeline/records/io_utils.py +20 -0
  277. slurmforge/pipeline/records/models/__init__.py +23 -0
  278. slurmforge/pipeline/records/models/array_assignment.py +19 -0
  279. slurmforge/pipeline/records/models/dispatch.py +30 -0
  280. slurmforge/pipeline/records/models/metadata.py +11 -0
  281. slurmforge/pipeline/records/models/run_plan.py +113 -0
  282. slurmforge/pipeline/records/models/run_snapshot.py +22 -0
  283. slurmforge/pipeline/records/replay_spec/__init__.py +14 -0
  284. slurmforge/pipeline/records/replay_spec/builders.py +31 -0
  285. slurmforge/pipeline/records/replay_spec/codecs.py +47 -0
  286. slurmforge/pipeline/records/replay_spec/model.py +17 -0
  287. slurmforge/pipeline/records/snapshot_io.py +19 -0
  288. slurmforge/pipeline/sources/__init__.py +28 -0
  289. slurmforge/pipeline/sources/api.py +18 -0
  290. slurmforge/pipeline/sources/authoring/__init__.py +10 -0
  291. slurmforge/pipeline/sources/authoring/collector.py +129 -0
  292. slurmforge/pipeline/sources/authoring/loader.py +36 -0
  293. slurmforge/pipeline/sources/authoring/models.py +20 -0
  294. slurmforge/pipeline/sources/failures.py +49 -0
  295. slurmforge/pipeline/sources/inference.py +31 -0
  296. slurmforge/pipeline/sources/models.py +115 -0
  297. slurmforge/pipeline/sources/replay/__init__.py +21 -0
  298. slurmforge/pipeline/sources/replay/checkpoint.py +27 -0
  299. slurmforge/pipeline/sources/replay/collector.py +63 -0
  300. slurmforge/pipeline/sources/replay/loaders.py +130 -0
  301. slurmforge/pipeline/sources/replay/models.py +30 -0
  302. slurmforge/pipeline/sources/replay/overrides.py +21 -0
  303. slurmforge/pipeline/sources/replay/relocation.py +91 -0
  304. slurmforge/pipeline/sources/replay/resume_patch.py +34 -0
  305. slurmforge/pipeline/sources/replay/retry_refs.py +39 -0
  306. slurmforge/pipeline/sources/replay/retry_selection.py +82 -0
  307. slurmforge/pipeline/sources/replay/selectors.py +57 -0
  308. slurmforge/pipeline/sources/replay/variants/__init__.py +3 -0
  309. slurmforge/pipeline/sources/replay/variants/batch.py +76 -0
  310. slurmforge/pipeline/sources/replay/variants/retry.py +100 -0
  311. slurmforge/pipeline/sources/replay/variants/run.py +57 -0
  312. slurmforge/pipeline/sources/replay/variants/snapshot.py +43 -0
  313. slurmforge/pipeline/status/__init__.py +42 -0
  314. slurmforge/pipeline/status/api.py +12 -0
  315. slurmforge/pipeline/status/builders.py +46 -0
  316. slurmforge/pipeline/status/classifier/__init__.py +12 -0
  317. slurmforge/pipeline/status/classifier/discovery.py +61 -0
  318. slurmforge/pipeline/status/classifier/patterns.py +28 -0
  319. slurmforge/pipeline/status/classifier/reader.py +31 -0
  320. slurmforge/pipeline/status/classifier/rules.py +70 -0
  321. slurmforge/pipeline/status/codecs/__init__.py +15 -0
  322. slurmforge/pipeline/status/codecs/api.py +11 -0
  323. slurmforge/pipeline/status/codecs/attempt_result.py +77 -0
  324. slurmforge/pipeline/status/codecs/execution_status.py +88 -0
  325. slurmforge/pipeline/status/codecs/path_fields.py +59 -0
  326. slurmforge/pipeline/status/lifecycle.py +144 -0
  327. slurmforge/pipeline/status/models.py +57 -0
  328. slurmforge/pipeline/status/paths.py +31 -0
  329. slurmforge/pipeline/status/reconcile.py +193 -0
  330. slurmforge/pipeline/status/slurm.py +107 -0
  331. slurmforge/pipeline/status/store.py +62 -0
  332. slurmforge/pipeline/train_outputs/__init__.py +21 -0
  333. slurmforge/pipeline/train_outputs/api.py +17 -0
  334. slurmforge/pipeline/train_outputs/cache.py +63 -0
  335. slurmforge/pipeline/train_outputs/codec.py +44 -0
  336. slurmforge/pipeline/train_outputs/contract.py +97 -0
  337. slurmforge/pipeline/train_outputs/discovery.py +71 -0
  338. slurmforge/pipeline/train_outputs/env_writer.py +48 -0
  339. slurmforge/pipeline/train_outputs/models.py +20 -0
  340. slurmforge/pipeline/train_outputs/paths.py +11 -0
  341. slurmforge/pipeline/train_outputs/selection.py +64 -0
  342. slurmforge/pipeline/utils/__init__.py +6 -0
  343. slurmforge/pipeline/utils/merge.py +14 -0
  344. slurmforge/pipeline/utils/schema.py +12 -0
  345. slurmforge/resource_io.py +26 -0
  346. slurmforge/starter_catalog.py +203 -0
  347. slurmforge/starter_projects.py +69 -0
  348. slurmforge/starter_templates/README.md.tmpl +42 -0
  349. slurmforge/starter_templates/__init__.py +1 -0
  350. slurmforge/starter_templates/adapter_train.py.tmpl +49 -0
  351. slurmforge/starter_templates/command_train.py.tmpl +46 -0
  352. slurmforge/starter_templates/eval.py.tmpl +45 -0
  353. slurmforge/starter_templates/hpc_train.py.tmpl +73 -0
  354. slurmforge/starter_templates/model_cli_train.py.tmpl +63 -0
  355. slurmforge/sweep/__init__.py +33 -0
  356. slurmforge/sweep/api.py +23 -0
  357. slurmforge/sweep/expansion.py +87 -0
  358. slurmforge/sweep/materialize.py +37 -0
  359. slurmforge/sweep/models.py +25 -0
  360. slurmforge/sweep/overrides.py +24 -0
  361. slurmforge/sweep/validation.py +171 -0
  362. slurmforge/templates/sbatch_array_group.sh.j2 +42 -0
  363. slurmforge/templates/sbatch_notify.sh.j2 +8 -0
  364. slurmforge/templating.py +16 -0
  365. slurmforge/text_safety.py +22 -0
  366. slurmforge-0.1.0.dist-info/METADATA +716 -0
  367. slurmforge-0.1.0.dist-info/RECORD +371 -0
  368. slurmforge-0.1.0.dist-info/WHEEL +5 -0
  369. slurmforge-0.1.0.dist-info/entry_points.txt +6 -0
  370. slurmforge-0.1.0.dist-info/licenses/LICENSE +21 -0
  371. slurmforge-0.1.0.dist-info/top_level.txt +1 -0
slurmforge/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ """Generic Slurm-oriented experiment orchestration package for slurmforge."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .identity import PACKAGE_NAME, __version__
6
+
7
+ __all__ = ["PACKAGE_NAME", "__version__"]
slurmforge/cli/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """CLI subcommands for slurmforge."""
slurmforge/cli/common.py ADDED
@@ -0,0 +1,95 @@
1
+ """Shared CLI argument builders and batch materialization helpers."""
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ from pathlib import Path
6
+
7
+ import yaml
8
+
9
+ from ..errors import ConfigContractError
10
+ from ..identity import __version__, regenerate_after_upgrade_note
11
+ from ..pipeline.materialization import MaterializationResult, materialize_batch, print_dry_run
12
+ from ..pipeline.planning import PlannedBatch
13
+ from ..sweep import deep_set, parse_override
14
+ from ..templating import build_template_env
15
+
16
+
17
def load_raw_cfg(config_path: Path) -> dict:
    """Read ``config_path`` as UTF-8 YAML and return its top-level mapping.

    No overrides are applied; the result is exactly what the file declares.

    Raises:
        ConfigContractError: if the parsed document is not a mapping
            (this includes an empty file, which parses to ``None``).
    """
    parsed = yaml.safe_load(config_path.read_text(encoding="utf-8"))
    if isinstance(parsed, dict):
        return parsed
    raise ConfigContractError(f"Config must be a YAML mapping: {config_path}")
24
+
25
+
26
def load_effective_cfg(config_path: Path, cli_overrides: list[str]) -> dict:
    """
    Build the effective config: the raw YAML with every CLI override applied.

    The result captures the user's declared intent (file contents merged with
    any ``--set`` overrides) and is the one config view that the pre-compile
    checks (completeness, correctness, advisory) are meant to inspect.

    The compiler is still handed ``(config_path, cli_overrides)`` separately,
    because it tracks where each override came from for diagnostics and replay.

    Overrides are applied in the order given, each one parsed with
    ``parse_override`` and written into the mapping with ``deep_set``.
    """
    effective = load_raw_cfg(config_path)
    for raw_override in cli_overrides:
        dotted_key, new_value = parse_override(raw_override)
        deep_set(effective, dotted_key, new_value)
    return effective
42
+
43
+
44
def add_config_override_args(parser: argparse.ArgumentParser) -> None:
    """Register the config-override options on ``parser``.

    Adds:
        ``--set``: repeatable; values accumulate into a list (default ``[]``).
        ``--project_root``: optional string, ``None`` when omitted.
    """
    option_table = (
        (
            "--set",
            {
                "action": "append",
                "default": [],
                "help": "Override config by dot-path, e.g. --set run.args.lr=0.004",
            },
        ),
        (
            "--project_root",
            {
                "default": None,
                "help": "Override project root used to resolve relative paths (default: config file directory)",
            },
        ),
    )
    for flag, options in option_table:
        parser.add_argument(flag, **options)
56
+
57
+
58
def add_dry_run_arg(parser: argparse.ArgumentParser) -> None:
    """Register the boolean ``--dry_run`` flag (``False`` unless passed) on ``parser``."""
    dry_run_help = "Expand runs and print commands without writing files"
    parser.add_argument("--dry_run", action="store_true", help=dry_run_help)
64
+
65
+
66
def add_common_args(parser: argparse.ArgumentParser) -> None:
    """Register the config-override options and then ``--dry_run`` on ``parser``."""
    for register in (add_config_override_args, add_dry_run_arg):
        register(parser)
69
+
70
+
71
def materialize_or_print_batch(
    *,
    planned_batch: PlannedBatch,
    dry_run: bool,
) -> MaterializationResult | None:
    """Materialize ``planned_batch``, or only print its run plans on a dry run.

    Returns:
        The result of ``materialize_batch`` for a real run, or ``None`` when
        ``dry_run`` is true (in which case ``print_dry_run`` receives the
        ``plan`` of every planned run and nothing else is done here).
    """
    if not dry_run:
        return materialize_batch(
            planned_batch=planned_batch,
            env=build_template_env(),
        )

    print_dry_run(entry.plan for entry in planned_batch.planned_runs)
    return None
85
+
86
+
87
def print_batch_ready(*, dispatch: MaterializationResult, sbatch_dir: Path, retry: bool = False) -> None:
    """Print the post-generation summary for a materialized batch.

    Args:
        dispatch: materialization result; its ``array_groups_meta``,
            ``submit_script`` and ``manifest_path`` are reported.
        sbatch_dir: directory shown as the location of the sbatch files.
        retry: when true, the first line is worded for a retry batch.
    """
    group_count = len(dispatch.array_groups_meta)
    headline = (
        f"[OK] Generated retry batch with {group_count} array sbatch file(s) in: {sbatch_dir}"
        if retry
        else f"[OK] Generated {group_count} array sbatch file(s) in: {sbatch_dir}"
    )
    print(headline)
    print(f"[OK] Generated by: slurmforge {__version__}")
    print(f"[OK] Submit all via: {dispatch.submit_script}")
    print(f"[OK] Batch manifest: {dispatch.manifest_path}")
    print(f"[NOTE] {regenerate_after_upgrade_note()}")
slurmforge/cli/examples.py ADDED
@@ -0,0 +1,50 @@
1
+ """``sforge examples`` -- list, show, or export shipped YAML reference examples."""
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ from pathlib import Path
6
+
7
+ from ..example_configs import export_example, list_example_catalog, read_example_text
8
+
9
+
10
def handle_examples_list(_args: argparse.Namespace) -> None:
    """Print one line per catalog entry: the name, plus its description when non-empty.

    Names are left-aligned in a 26-character column when a description follows.
    The parsed arguments are not used.
    """
    for name, description in list_example_catalog():
        print(f"{name:<26} {description}" if description else name)
16
+
17
+
18
def handle_examples_show(args: argparse.Namespace) -> None:
    """Print the text of the example named ``args.name`` without adding a trailing newline."""
    example_text = read_example_text(args.name)
    print(example_text, end="")
20
+
21
+
22
def handle_examples_export(args: argparse.Namespace) -> None:
    """Export the example ``args.name`` to ``args.out`` and report what ``export_example`` returned.

    ``args.force`` is forwarded unchanged as the ``force`` keyword.
    """
    written_path = export_example(args.name, Path(args.out), force=args.force)
    print(f"[OK] Wrote example config: {written_path}")
25
+
26
+
27
def add_subparser(subparsers: argparse._SubParsersAction[argparse.ArgumentParser]) -> None:
    """Register the ``examples`` command and its ``list`` / ``show`` / ``export`` subcommands.

    Each subcommand stores its handler function under the ``handler`` default so
    the caller can dispatch on it; one of the three subcommands is mandatory.
    """
    root = subparsers.add_parser(
        "examples",
        help="List, show, or export shipped raw YAML reference examples",
    )
    actions = root.add_subparsers(dest="examples_command", required=True)

    # examples list
    listing = actions.add_parser("list", help="List shipped raw YAML reference examples")
    listing.set_defaults(handler=handle_examples_list)

    # examples show <name>
    showing = actions.add_parser("show", help="Print one shipped raw YAML reference example")
    showing.add_argument("name", help="Example name, with or without .yaml suffix")
    showing.set_defaults(handler=handle_examples_show)

    # examples export <name> --out PATH [--force]
    exporting = actions.add_parser("export", help="Copy one shipped raw YAML reference example to a file")
    exporting.add_argument("name", help="Example name, with or without .yaml suffix")
    exporting.add_argument("--out", required=True, help="Destination path for the exported raw YAML example")
    exporting.add_argument(
        "--force",
        action="store_true",
        help="Overwrite the destination file if it already exists",
    )
    exporting.set_defaults(handler=handle_examples_export)
slurmforge/cli/generate.py ADDED
@@ -0,0 +1,102 @@
1
+ """``sforge generate`` -- expand an experiment config into sbatch arrays."""
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import datetime
6
+ from pathlib import Path
7
+
8
+ from ..pipeline.compiler import AuthoringSourceRequest, BatchCompileError, compile_source, iter_compile_report_lines
9
+ from ..pipeline.compiler.reports import require_success
10
+ from ..pipeline.config.validation.advisory import check_advisory
11
+ from ..pipeline.config.validation.completeness import assert_complete
12
+ from ..pipeline.config.validation.correctness import check_correctness
13
+ from ..pipeline.config.validation.messages import format_advisory_report, format_correctness_report
14
+ from .common import (
15
+ add_common_args,
16
+ load_effective_cfg,
17
+ materialize_or_print_batch,
18
+ print_batch_ready,
19
+ )
20
+
21
+
22
def render_generate(
    *,
    config_path: Path,
    cli_overrides: list[str],
    dry_run: bool,
    project_root_override: str | None,
) -> None:
    """Validate a config, compile it into a planned batch, then materialize or preview it.

    Steps, in order:
      1. Load the effective config (YAML plus ``--set`` overrides).
      2. Run the completeness gate (``assert_complete``).
      3. Run the correctness gate; any error is printed and aborts with exit code 1.
      4. Print advisory warnings, if any, and continue.
      5. Compile the source, print the compile report, and require success.
      6. Materialize the batch, or only print the plans when ``dry_run`` is true.

    Args:
        config_path: path to the experiment YAML; resolved before use.
        cli_overrides: raw ``--set`` strings, applied in order.
        dry_run: when true, nothing is written and no "batch ready" summary is printed.
        project_root_override: explicit project root; when ``None`` the directory
            of the resolved config file is used for the completeness check.

    Raises:
        SystemExit: with status 1 when ``check_correctness`` reports errors.
        BatchCompileError: any such error raised by ``require_success``
            propagates to the caller unchanged.
    """
    resolved_config_path = config_path.resolve()
    project_root = (
        Path(project_root_override).resolve()
        if project_root_override is not None
        else resolved_config_path.parent
    )

    effective_cfg = load_effective_cfg(resolved_config_path, cli_overrides)

    # Level 0: completeness gate (hard: null sentinels block generation).
    assert_complete(effective_cfg, config_path=resolved_config_path, project_root=project_root)

    # Level 1: correctness gate (hard: format/logic errors block generation).
    correctness_errors = check_correctness(effective_cfg, config_path=resolved_config_path)
    if correctness_errors:
        print(format_correctness_report(
            correctness_errors,
            config_path=resolved_config_path,
            force_flag_available=False,
        ))
        raise SystemExit(1)

    # Level 2: advisory (soft: warnings are printed, generation continues).
    advisory_warnings = check_advisory(effective_cfg, config_path=resolved_config_path)
    if advisory_warnings:
        print(format_advisory_report(advisory_warnings, config_path=resolved_config_path))
        print()

    # Compiler pipeline. The compiler gets the path and the raw overrides rather
    # than the merged config; the project root is passed through unresolved.
    report = compile_source(
        AuthoringSourceRequest(
            config_path=resolved_config_path,
            cli_overrides=tuple(cli_overrides),
            project_root=None if project_root_override is None else Path(project_root_override),
            # Microsecond-resolution local timestamp used as the fallback batch name.
            default_batch_name=datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f"),
        ),
    )
    for line in iter_compile_report_lines(report):
        print(line)

    # The report is printed before this call so diagnostics are visible even
    # when require_success raises; no handler is needed just to re-raise.
    planned_batch = require_success(report)

    dispatch = materialize_or_print_batch(
        planned_batch=planned_batch,
        dry_run=dry_run,
    )
    if dispatch is None:
        return
    print_batch_ready(dispatch=dispatch, sbatch_dir=planned_batch.sbatch_dir)
80
+
81
+
82
def handle_generate(args: argparse.Namespace) -> None:
    """Translate the parsed ``generate`` arguments into a ``render_generate`` call."""
    options = {
        "config_path": Path(args.config),
        "cli_overrides": args.set,
        "dry_run": args.dry_run,
        "project_root_override": args.project_root,
    }
    render_generate(**options)
89
+
90
+
91
def add_subparser(subparsers: argparse._SubParsersAction[argparse.ArgumentParser]) -> None:
    """Register the ``generate`` command: a required ``--config`` plus the shared options.

    ``handle_generate`` is stored as the ``handler`` default for dispatch.
    """
    command = subparsers.add_parser(
        "generate",
        help="Expand config into a new batch and render sbatch arrays",
    )
    command.add_argument("--config", required=True, help="Path to experiment config yaml")
    add_common_args(command)
    command.set_defaults(handler=handle_generate)
slurmforge/cli/init.py ADDED
@@ -0,0 +1,148 @@
1
+ """
2
+ `sforge init` — create a starter project scaffold.
3
+
4
+ Decision tree (two orthogonal choices):
5
+
6
+ TRAINING TYPE (how is your training code invoked?)
7
+ script → train.py with CLI args [most common]
8
+ command → complete shell command
9
+ registry → shared team model registry
10
+ adapter → interface bridge script
11
+
12
+ PROFILE (cluster complexity)
13
+ starter → single GPU, minimal config [default]
14
+ hpc → multi-GPU, sweep, eval, artifact sync
15
+
16
+ Examples
17
+ --------
18
+ sforge init # interactive wizard
19
+ sforge init script # script · starter profile
20
+ sforge init script --profile hpc # script · hpc profile
21
+ sforge init command
22
+ sforge init command --profile hpc
23
+ sforge init registry
24
+ sforge init registry --profile hpc
25
+ sforge init adapter
26
+ sforge init adapter --profile hpc
27
+ """
28
+ from __future__ import annotations
29
+
30
+ import argparse
31
+ from pathlib import Path
32
+
33
+ from ..starter_catalog import PROFILES, TEMPLATE_TYPES, get_starter_spec
34
+ from ..starter_projects import init_project
35
+ from .init_wizard import run_wizard
36
+
37
# Default value of every ``--out`` option registered in this module.
_DEFAULT_OUT: str = "./slurmforge_starter"

# Template type -> one-line text used as both ``help`` and ``description``
# of the matching TYPE subcommand.
_TYPE_DESCRIPTIONS: dict[str, str] = {
    "script": "Scaffold for a train.py-style script — slurmforge manages args and submission.",
    "command": "Scaffold that wraps a complete shell command in Slurm.",
    "registry": "Scaffold using a shared team model registry.",
    "adapter": "Scaffold with an interface-bridge adapter script (advanced).",
}
45
+
46
+
47
+ # ---------------------------------------------------------------------------
48
+ # Shared argument builder
49
+ # ---------------------------------------------------------------------------
50
+
51
def _add_common_args(parser: argparse.ArgumentParser) -> None:
    """Attach the ``--profile``, ``--out`` and ``--force`` options to *parser*.

    ``--profile`` is restricted to ``PROFILES`` and defaults to ``"starter"``;
    ``--out`` defaults to ``_DEFAULT_OUT``; ``--force`` is a boolean flag.
    """
    profile_help = (
        "Cluster complexity profile. "
        "'starter' = single GPU, minimal config (default). "
        "'hpc' = multi-GPU, sweep, eval, artifact sync."
    )
    out_help = "Destination directory for the project scaffold (default: %(default)s)"
    force_help = "Overwrite existing files in the destination directory"

    parser.add_argument(
        "--profile",
        default="starter",
        choices=PROFILES,
        metavar="PROFILE",
        help=profile_help,
    )
    parser.add_argument("--out", default=_DEFAULT_OUT, metavar="DIR", help=out_help)
    parser.add_argument("--force", action="store_true", help=force_help)
74
+
75
+
76
+ # ---------------------------------------------------------------------------
77
+ # Handlers
78
+ # ---------------------------------------------------------------------------
79
+
80
def _do_init(*, template_type: str, profile: str, out: str, force: bool) -> None:
    """Create the scaffold and print a summary with follow-up instructions.

    Args:
        template_type: Template type key, passed to ``get_starter_spec`` and
            ``init_project``.
        profile: Profile key, passed to ``get_starter_spec`` and
            ``init_project``.
        out: Destination directory as supplied by the caller. ``~`` is
            expanded and the path is resolved before anything is written.
        force: Forwarded unchanged to ``init_project``.
    """
    spec = get_starter_spec(template_type, profile)

    # Normalize the destination once and use the same path for writing and for
    # reporting, so the printed location cannot differ from the directory that
    # was actually populated (e.g. for an unexpanded ``--out=~/proj``).
    out_dir = Path(out).expanduser().resolve()
    written = init_project(template_type, profile, out_dir, force=force)
    config_file = out_dir / "experiment.yaml"

    print(f"[OK] Initialized '{template_type}' scaffold (profile: {profile}) in: {out_dir}")
    print(f"[INFO] {spec.post_init_guidance}")
    print()
    print(" Files created:")
    for path in written:
        print(f" {path}")
    print()
    print(" Next steps:")
    print(f" 1. Open {config_file}")
    print(" 2. Fill in every field marked with ~ (required — see STEP 1 comments)")
    print(f" 3. Run: sforge validate --config {config_file}")
    print(f" 4. Run: sforge generate --config {config_file}")
96
+
97
+
98
def handle_init_template(args: argparse.Namespace) -> None:
    """Handle ``sforge init <TYPE>`` by forwarding the parsed options to ``_do_init``.

    Reads ``template_type``, ``profile``, ``out`` and ``force`` from *args*.
    """
    option_names = ("template_type", "profile", "out", "force")
    init_kwargs = {name: getattr(args, name) for name in option_names}
    _do_init(**init_kwargs)
105
+
106
+
107
def handle_init_wizard(args: argparse.Namespace) -> None:
    """Handle ``sforge init`` without a TYPE subcommand by running the interactive wizard.

    The wizard's answers, together with ``args.force``, are passed on to
    ``_do_init``.
    """
    destination = args.out
    overwrite = args.force
    answers = run_wizard(out=destination, force=overwrite)
    template_type, profile, out = answers
    _do_init(template_type=template_type, profile=profile, out=out, force=overwrite)
111
+
112
+
113
+ # ---------------------------------------------------------------------------
114
+ # Parser registration
115
+ # ---------------------------------------------------------------------------
116
+
117
def add_subparser(subparsers: argparse._SubParsersAction) -> None:  # type: ignore[type-arg]
    """Register ``init`` plus one nested subcommand per entry in ``TEMPLATE_TYPES``.

    Without a TYPE subcommand the ``handler`` default is ``handle_init_wizard``;
    each TYPE subcommand replaces it with ``handle_init_template`` and pins
    ``template_type`` to its own name.
    """
    wizard_parser = subparsers.add_parser(
        "init",
        description=__doc__,
        help="Create a starter project scaffold (run 'sforge init' for interactive setup)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # Options for the wizard path; each TYPE subcommand registers its own
    # --out/--force, which take over when a TYPE is given.
    wizard_parser.add_argument(
        "--out",
        default=_DEFAULT_OUT,
        metavar="DIR",
        help="Output directory (wizard mode — overridden by TYPE subcommand flags)",
    )
    wizard_parser.add_argument("--force", action="store_true", help="Overwrite existing files")
    wizard_parser.set_defaults(handler=handle_init_wizard)

    # One subcommand per template type (script / command / registry / adapter).
    type_actions = wizard_parser.add_subparsers(dest="template_type")
    for type_name in TEMPLATE_TYPES:
        type_parser = type_actions.add_parser(
            type_name,
            help=_TYPE_DESCRIPTIONS.get(type_name, ""),
            description=_TYPE_DESCRIPTIONS.get(type_name, ""),
        )
        type_parser.set_defaults(template_type=type_name, handler=handle_init_template)
        _add_common_args(type_parser)
@@ -0,0 +1,80 @@
1
+ """
2
+ Interactive 2-question wizard for `sforge init` (no arguments).
3
+ Falls back gracefully when stdin is not a TTY (CI, piped input).
4
+ """
5
+ from __future__ import annotations
6
+
7
+ import sys
8
+
9
+
10
# (key, description) pairs for the "training type" question, in display order.
_TRAINING_TYPES: list[tuple[str, str]] = [
    ("script", "I have a train.py — slurmforge manages my args and submission"),
    ("command", "I have a complete launch command — just wrap it in Slurm"),
    ("registry", "My training code lives in a shared team model registry"),
    ("adapter", "I need an interface bridge between slurmforge and my training code"),
]

# (key, description) pairs for the "cluster profile" question, in display order.
_PROFILES: list[tuple[str, str]] = [
    ("starter", "Quick start — single GPU, minimal config (recommended for first run)"),
    ("hpc", "Full HPC — multi-GPU, sweep, eval, artifact sync"),
]
21
+
22
+
23
def _prompt_choice(
    prompt: str,
    options: list[tuple[str, str]],
) -> str:
    """Display numbered options and return the selected key.

    Args:
        prompt: Heading printed above the option list.
        options: ``(key, description)`` pairs, numbered from 1 in the order given.

    Returns:
        The ``key`` of the option whose number the user entered.

    Raises:
        SystemExit: With status 0 when input ends (EOF) or the user presses
            Ctrl-C while being prompted.
    """
    print(prompt)
    for i, (key, description) in enumerate(options, 1):
        print(f" {i}) {key:<12} {description}")
    print()
    while True:
        try:
            raw = input(" Enter number: ").strip()
        except (EOFError, KeyboardInterrupt):
            print()
            sys.exit(0)
        # isdecimal() rather than isdigit(): isdigit() also accepts characters
        # such as "²" that int() rejects, which would crash with ValueError
        # instead of re-prompting.
        if raw.isdecimal():
            idx = int(raw) - 1
            if 0 <= idx < len(options):
                chosen_key = options[idx][0]
                print(f" → {chosen_key}")
                print()
                return chosen_key
        print(f" Please enter a number between 1 and {len(options)}.")
46
+
47
+
48
def run_wizard(*, out: str, force: bool) -> tuple[str, str, str]:
    """
    Ask the two wizard questions and return (template_type, profile, out).

    ``out`` is returned unchanged; ``force`` is accepted but not used here.
    When stdin is not a TTY, a usage message is written to stderr and the
    process exits with status 1. Cancelling a prompt also ends in SystemExit
    (raised inside ``_prompt_choice``).
    """
    interactive = sys.stdin.isatty()
    if not interactive:
        usage_lines = (
            "[sforge init] No template type specified.",
            "Usage: sforge init <TYPE> [--profile starter|hpc] [--out DIR]",
            "",
            "Available types: script, command, registry, adapter",
            "Run 'sforge init --help' for full usage.",
        )
        print("\n".join(usage_lines), file=sys.stderr)
        sys.exit(1)

    # Banner, framed by one blank line above and below.
    banner = (
        "",
        " ┌──────────────────────────────────────────────────────┐",
        " │ sforge init · project setup wizard │",
        " └──────────────────────────────────────────────────────┘",
        "",
    )
    for banner_line in banner:
        print(banner_line)

    chosen_type = _prompt_choice(" How is your training code invoked?", _TRAINING_TYPES)
    chosen_profile = _prompt_choice(" Which cluster profile fits your setup?", _PROFILES)
    return chosen_type, chosen_profile, out
@@ -0,0 +1,102 @@
1
+ """``sforge replay`` -- regenerate a batch from persisted run snapshots."""
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import datetime
6
+ from pathlib import Path
7
+
8
+ from ..pipeline.compiler import BatchCompileError, ReplaySourceRequest, compile_source, iter_compile_report_lines
9
+ from ..pipeline.compiler.reports import report_total_runs, require_success
10
+ from .common import add_common_args, materialize_or_print_batch, print_batch_ready
11
+
12
+
13
def render_replay(
    *,
    source_run_dir: Path | None,
    source_snapshot_path: Path | None,
    source_batch_root: Path | None,
    run_ids: list[str],
    run_indices: list[int],
    cli_overrides: list[str],
    dry_run: bool,
    project_root_override: str | None,
) -> None:
    """Compile a replay batch from a persisted source and materialize or print it.

    Args:
        source_run_dir: Run directory to replay from, or ``None``.
        source_snapshot_path: Snapshot file to replay from, or ``None``.
        source_batch_root: Batch root to replay from, or ``None``.
        run_ids: Run-id selectors, passed to the request as a tuple.
        run_indices: Run-index selectors, passed to the request as a tuple.
        cli_overrides: Override strings, passed to the request as a tuple.
        dry_run: Forwarded to ``materialize_or_print_batch``.
        project_root_override: Project root as a string; converted to a
            ``Path`` unless it is ``None``.

    Raises:
        BatchCompileError: Presumably raised by ``require_success`` when the
            compile report is not successful; it is deliberately not caught here.
    """
    # Local-time timestamp with microseconds, used as the default batch name.
    default_batch_name = datetime.datetime.now().strftime("replay_%Y%m%d_%H%M%S_%f")
    request = ReplaySourceRequest(
        source_run_dir=source_run_dir,
        source_snapshot_path=source_snapshot_path,
        source_batch_root=source_batch_root,
        run_ids=tuple(run_ids),
        run_indices=tuple(run_indices),
        cli_overrides=tuple(cli_overrides),
        project_root=None if project_root_override is None else Path(project_root_override),
        default_batch_name=default_batch_name,
    )
    report = compile_source(request)

    # The report may lack a source summary (or carry an empty one); fall back to a placeholder.
    source_summary = getattr(report, "source_summary", "") or "<missing replay source>"
    print(f"[REPLAY] source={source_summary} selected_runs={report_total_runs(report)}")
    for line in iter_compile_report_lines(report):
        print(line)

    # The report lines are printed first so they are visible even when this raises.
    planned_batch = require_success(report)

    dispatch = materialize_or_print_batch(
        planned_batch=planned_batch,
        dry_run=dry_run,
    )
    if dispatch is None:
        # Nothing was dispatched (presumably the dry-run case), so there is nothing to announce.
        return
    print_batch_ready(dispatch=dispatch, sbatch_dir=planned_batch.sbatch_dir)
52
+
53
+
54
def handle_replay(args: argparse.Namespace) -> None:
    """Run ``sforge replay`` from parsed command-line arguments.

    The three source options are converted to ``Path`` objects (``None`` is
    kept as ``None``); everything else is forwarded to ``render_replay`` as is.
    """

    def as_optional_path(raw: str | None) -> Path | None:
        return None if raw is None else Path(raw)

    render_replay(
        source_run_dir=as_optional_path(args.source_run_dir),
        source_snapshot_path=as_optional_path(args.source_snapshot_path),
        source_batch_root=as_optional_path(args.source_batch_root),
        run_ids=args.run_id,
        run_indices=args.run_index,
        cli_overrides=args.set,
        dry_run=args.dry_run,
        project_root_override=args.project_root,
    )
65
+
66
+
67
def add_subparser(subparsers: argparse._SubParsersAction[argparse.ArgumentParser]) -> None:
    """Register the ``replay`` subcommand on *subparsers*.

    Exactly one of ``--from-run``, ``--from-snapshot`` or ``--from-batch`` is
    required. The run selectors are repeatable and are offered in hyphenated
    form (``--run-id``, ``--run-index``) to match the ``--from-*`` options;
    the original underscore spellings remain accepted as aliases and both
    forms store into the same ``run_id`` / ``run_index`` attributes.
    """
    replay_parser = subparsers.add_parser(
        "replay",
        help="Replay one or more persisted runs from a run dir, snapshot, or batch root",
    )
    source_group = replay_parser.add_mutually_exclusive_group(required=True)
    source_group.add_argument(
        "--from-run",
        dest="source_run_dir",
        help="Path to a persisted run directory containing meta/run_snapshot.json",
    )
    source_group.add_argument(
        "--from-snapshot",
        dest="source_snapshot_path",
        help="Path to a persisted run_snapshot.json file",
    )
    source_group.add_argument(
        "--from-batch",
        dest="source_batch_root",
        help="Path to an existing batch_root; replays all runs unless selectors are provided",
    )
    replay_parser.add_argument(
        "--run-id",
        "--run_id",  # legacy spelling, kept for backward compatibility
        dest="run_id",
        action="append",
        default=[],
        help="Select specific run_id values when replaying from --from-batch",
    )
    replay_parser.add_argument(
        "--run-index",
        "--run_index",  # legacy spelling, kept for backward compatibility
        dest="run_index",
        action="append",
        type=int,
        default=[],
        help="Select specific run_index values when replaying from --from-batch",
    )
    add_common_args(replay_parser)
    replay_parser.set_defaults(handler=handle_replay)
@@ -0,0 +1,76 @@
1
+ """``sforge rerun`` -- rebuild a retry batch from an existing batch's run records."""
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import datetime
6
+ from pathlib import Path
7
+
8
+ from ..pipeline.compiler import BatchCompileError, RetrySourceRequest, compile_source, iter_compile_report_lines
9
+ from ..pipeline.compiler.reports import report_total_runs, require_success
10
+ from .common import add_common_args, materialize_or_print_batch, print_batch_ready
11
+
12
+
13
def render_rerun(
    *,
    source_batch_root: Path,
    cli_overrides: list[str],
    dry_run: bool,
    project_root_override: str | None,
    status_query: str,
) -> None:
    """Compile a retry batch from an existing batch root and materialize or print it.

    Args:
        source_batch_root: Batch root whose run records are the retry source.
        cli_overrides: Override strings, passed to the request as a tuple.
        dry_run: Forwarded to ``materialize_or_print_batch``.
        project_root_override: Project root as a string; converted to a
            ``Path`` unless it is ``None``.
        status_query: Status filter forwarded to ``RetrySourceRequest``.

    Raises:
        BatchCompileError: Presumably raised by ``require_success`` when the
            compile report is not successful; it is deliberately not caught here.
    """
    # Local-time timestamp with microseconds, used as the default batch name.
    default_batch_name = datetime.datetime.now().strftime("retry_%Y%m%d_%H%M%S_%f")
    report = compile_source(
        RetrySourceRequest(
            source_batch_root=source_batch_root,
            status_query=status_query,
            cli_overrides=tuple(cli_overrides),
            project_root=None if project_root_override is None else Path(project_root_override),
            default_batch_name=default_batch_name,
        )
    )

    # The report may lack a source summary (or carry an empty one); fall back to the batch root.
    source_summary = getattr(report, "source_summary", "") or str(source_batch_root)
    print(f"[RETRY] source={source_summary} selected_runs={report_total_runs(report)}")
    for line in iter_compile_report_lines(report):
        print(line)

    # The report lines are printed first so they are visible even when this raises.
    planned_batch = require_success(report)

    dispatch = materialize_or_print_batch(
        planned_batch=planned_batch,
        dry_run=dry_run,
    )
    if dispatch is None:
        # Nothing was dispatched (presumably the dry-run case), so there is nothing to announce.
        return
    print_batch_ready(dispatch=dispatch, sbatch_dir=planned_batch.sbatch_dir, retry=True)
47
+
48
+
49
def handle_rerun(args: argparse.Namespace) -> None:
    """Run ``sforge rerun`` from parsed command-line arguments.

    The ``--from`` value is resolved to an absolute path before being handed
    to ``render_rerun``; the remaining options are forwarded unchanged.
    """
    batch_root = Path(args.source_batch_root).resolve()
    rerun_kwargs = {
        "source_batch_root": batch_root,
        "cli_overrides": args.set,
        "dry_run": args.dry_run,
        "project_root_override": args.project_root,
        "status_query": args.status,
    }
    render_rerun(**rerun_kwargs)
57
+
58
+
59
def add_subparser(subparsers: argparse._SubParsersAction[argparse.ArgumentParser]) -> None:
    """Register the ``rerun`` subcommand on *subparsers*.

    The subcommand takes a required ``--from`` batch root, a ``--status``
    filter defaulting to ``"failed"``, and whatever ``add_common_args`` adds;
    it dispatches to ``handle_rerun`` through the ``handler`` default.
    """
    from_help = (
        "Path to an existing batch_root; rebuild and resubmit a filtered retry batch from its run records"
    )
    status_help = (
        "Retry filter for existing batch runs: failed(non-success), success, "
        "pending, running, oom, preempted, node_failure, script_error, eval_failed, all"
    )

    parser = subparsers.add_parser("rerun", help="Rebuild a retry batch from an existing batch_root")
    parser.add_argument("--from", dest="source_batch_root", required=True, help=from_help)
    parser.add_argument("--status", default="failed", help=status_help)
    add_common_args(parser)
    parser.set_defaults(handler=handle_rerun)