slurmforge 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- slurmforge/__init__.py +7 -0
- slurmforge/cli/__init__.py +1 -0
- slurmforge/cli/common.py +95 -0
- slurmforge/cli/examples.py +50 -0
- slurmforge/cli/generate.py +102 -0
- slurmforge/cli/init.py +148 -0
- slurmforge/cli/init_wizard.py +80 -0
- slurmforge/cli/replay.py +102 -0
- slurmforge/cli/rerun.py +76 -0
- slurmforge/cli/status.py +85 -0
- slurmforge/cli/validate.py +108 -0
- slurmforge/errors.py +29 -0
- slurmforge/example_configs.py +70 -0
- slurmforge/examples/__init__.py +1 -0
- slurmforge/examples/adapter_hpc.yaml +67 -0
- slurmforge/examples/adapter_minimal.yaml +14 -0
- slurmforge/examples/adapter_starter.yaml +60 -0
- slurmforge/examples/command_hpc.yaml +63 -0
- slurmforge/examples/command_minimal.yaml +9 -0
- slurmforge/examples/command_starter.yaml +56 -0
- slurmforge/examples/model_registry.yaml +7 -0
- slurmforge/examples/registry_hpc.yaml +86 -0
- slurmforge/examples/registry_starter.yaml +64 -0
- slurmforge/examples/script_hpc.yaml +104 -0
- slurmforge/examples/script_starter.yaml +62 -0
- slurmforge/execution/__init__.py +1 -0
- slurmforge/execution/artifacts/__init__.py +23 -0
- slurmforge/execution/artifacts/api.py +18 -0
- slurmforge/execution/artifacts/cli.py +52 -0
- slurmforge/execution/artifacts/copier.py +40 -0
- slurmforge/execution/artifacts/discovery.py +61 -0
- slurmforge/execution/artifacts/manifest.py +24 -0
- slurmforge/execution/artifacts/sync.py +125 -0
- slurmforge/execution/run_plan/__init__.py +18 -0
- slurmforge/execution/run_plan/api.py +42 -0
- slurmforge/execution/run_plan/cli.py +32 -0
- slurmforge/execution/run_plan/helper_bins.py +25 -0
- slurmforge/execution/run_plan/loader.py +47 -0
- slurmforge/execution/run_plan/post_run.py +20 -0
- slurmforge/execution/run_plan/shell_runner.py +44 -0
- slurmforge/execution/write_attempt_result.py +59 -0
- slurmforge/execution/write_train_outputs.py +59 -0
- slurmforge/identity.py +19 -0
- slurmforge/launcher.py +36 -0
- slurmforge/model_support/__init__.py +23 -0
- slurmforge/model_support/argparse_introspect.py +129 -0
- slurmforge/model_support/catalog/__init__.py +27 -0
- slurmforge/model_support/catalog/api.py +24 -0
- slurmforge/model_support/catalog/canonicalize.py +24 -0
- slurmforge/model_support/catalog/codecs.py +24 -0
- slurmforge/model_support/catalog/merge.py +50 -0
- slurmforge/model_support/catalog/models.py +29 -0
- slurmforge/model_support/catalog/registry_loader.py +62 -0
- slurmforge/model_support/catalog/resolver.py +176 -0
- slurmforge/model_support/gpu_estimator.py +324 -0
- slurmforge/pipeline/__init__.py +29 -0
- slurmforge/pipeline/checkpoints/__init__.py +23 -0
- slurmforge/pipeline/checkpoints/api.py +21 -0
- slurmforge/pipeline/checkpoints/codec.py +22 -0
- slurmforge/pipeline/checkpoints/discovery.py +37 -0
- slurmforge/pipeline/checkpoints/models.py +26 -0
- slurmforge/pipeline/checkpoints/selection.py +46 -0
- slurmforge/pipeline/checkpoints/store.py +29 -0
- slurmforge/pipeline/compiler/__init__.py +23 -0
- slurmforge/pipeline/compiler/api.py +96 -0
- slurmforge/pipeline/compiler/base.py +83 -0
- slurmforge/pipeline/compiler/config_pass.py +77 -0
- slurmforge/pipeline/compiler/diagnostics.py +43 -0
- slurmforge/pipeline/compiler/engine.py +104 -0
- slurmforge/pipeline/compiler/flows/__init__.py +6 -0
- slurmforge/pipeline/compiler/flows/authoring/__init__.py +5 -0
- slurmforge/pipeline/compiler/flows/authoring/api.py +56 -0
- slurmforge/pipeline/compiler/flows/authoring/collect.py +35 -0
- slurmforge/pipeline/compiler/flows/authoring/context.py +32 -0
- slurmforge/pipeline/compiler/flows/authoring/identity.py +53 -0
- slurmforge/pipeline/compiler/flows/authoring/spec_builder.py +34 -0
- slurmforge/pipeline/compiler/flows/replay/__init__.py +5 -0
- slurmforge/pipeline/compiler/flows/replay/api.py +58 -0
- slurmforge/pipeline/compiler/flows/replay/collect.py +45 -0
- slurmforge/pipeline/compiler/flows/replay/context.py +90 -0
- slurmforge/pipeline/compiler/flows/replay/identity.py +47 -0
- slurmforge/pipeline/compiler/flows/replay/spec_builder.py +23 -0
- slurmforge/pipeline/compiler/planning_pass.py +71 -0
- slurmforge/pipeline/compiler/reporting.py +46 -0
- slurmforge/pipeline/compiler/reports/__init__.py +39 -0
- slurmforge/pipeline/compiler/reports/actions.py +23 -0
- slurmforge/pipeline/compiler/reports/builders.py +87 -0
- slurmforge/pipeline/compiler/reports/errors.py +19 -0
- slurmforge/pipeline/compiler/reports/models.py +74 -0
- slurmforge/pipeline/compiler/reports/summary.py +77 -0
- slurmforge/pipeline/compiler/reports/validator.py +14 -0
- slurmforge/pipeline/compiler/requests.py +86 -0
- slurmforge/pipeline/compiler/state.py +59 -0
- slurmforge/pipeline/config/__init__.py +13 -0
- slurmforge/pipeline/config/api.py +49 -0
- slurmforge/pipeline/config/assembly/__init__.py +45 -0
- slurmforge/pipeline/config/assembly/authoring/__init__.py +16 -0
- slurmforge/pipeline/config/assembly/authoring/builders.py +70 -0
- slurmforge/pipeline/config/assembly/authoring/expansion.py +66 -0
- slurmforge/pipeline/config/assembly/authoring/models.py +16 -0
- slurmforge/pipeline/config/assembly/authoring/shared.py +58 -0
- slurmforge/pipeline/config/assembly/authoring/validation.py +51 -0
- slurmforge/pipeline/config/assembly/batch_contract.py +20 -0
- slurmforge/pipeline/config/assembly/catalog.py +30 -0
- slurmforge/pipeline/config/assembly/eval.py +114 -0
- slurmforge/pipeline/config/assembly/experiment/__init__.py +9 -0
- slurmforge/pipeline/config/assembly/experiment/api.py +26 -0
- slurmforge/pipeline/config/assembly/experiment/assembler.py +64 -0
- slurmforge/pipeline/config/assembly/experiment/hints.py +14 -0
- slurmforge/pipeline/config/assembly/experiment/inputs.py +61 -0
- slurmforge/pipeline/config/assembly/experiment/sections.py +103 -0
- slurmforge/pipeline/config/assembly/output.py +28 -0
- slurmforge/pipeline/config/assembly/replay.py +31 -0
- slurmforge/pipeline/config/assembly/run/__init__.py +17 -0
- slurmforge/pipeline/config/assembly/run/adapter.py +56 -0
- slurmforge/pipeline/config/assembly/run/builder.py +67 -0
- slurmforge/pipeline/config/assembly/run/external_runtime.py +56 -0
- slurmforge/pipeline/config/assembly/run/model.py +47 -0
- slurmforge/pipeline/config/assembly/run/shared.py +29 -0
- slurmforge/pipeline/config/assembly/spec_builder.py +81 -0
- slurmforge/pipeline/config/codecs/__init__.py +25 -0
- slurmforge/pipeline/config/codecs/eval.py +82 -0
- slurmforge/pipeline/config/codecs/experiment.py +54 -0
- slurmforge/pipeline/config/codecs/model.py +20 -0
- slurmforge/pipeline/config/codecs/output.py +16 -0
- slurmforge/pipeline/config/codecs/run.py +48 -0
- slurmforge/pipeline/config/codecs/runtime.py +10 -0
- slurmforge/pipeline/config/constants.py +7 -0
- slurmforge/pipeline/config/mode_detection.py +36 -0
- slurmforge/pipeline/config/models/__init__.py +23 -0
- slurmforge/pipeline/config/models/eval.py +32 -0
- slurmforge/pipeline/config/models/experiment.py +91 -0
- slurmforge/pipeline/config/models/model.py +13 -0
- slurmforge/pipeline/config/models/output.py +10 -0
- slurmforge/pipeline/config/models/run.py +35 -0
- slurmforge/pipeline/config/models/runtime.py +16 -0
- slurmforge/pipeline/config/normalize/__init__.py +42 -0
- slurmforge/pipeline/config/normalize/artifacts.py +27 -0
- slurmforge/pipeline/config/normalize/cluster.py +33 -0
- slurmforge/pipeline/config/normalize/env.py +34 -0
- slurmforge/pipeline/config/normalize/launcher.py +50 -0
- slurmforge/pipeline/config/normalize/notify.py +37 -0
- slurmforge/pipeline/config/normalize/resources.py +58 -0
- slurmforge/pipeline/config/normalize/shared.py +19 -0
- slurmforge/pipeline/config/normalize/slurm_deps.py +72 -0
- slurmforge/pipeline/config/normalize/validation.py +40 -0
- slurmforge/pipeline/config/replay_payload.py +58 -0
- slurmforge/pipeline/config/runtime/__init__.py +53 -0
- slurmforge/pipeline/config/runtime/api.py +57 -0
- slurmforge/pipeline/config/runtime/codecs/__init__.py +20 -0
- slurmforge/pipeline/config/runtime/codecs/artifacts.py +14 -0
- slurmforge/pipeline/config/runtime/codecs/cluster.py +20 -0
- slurmforge/pipeline/config/runtime/codecs/env.py +14 -0
- slurmforge/pipeline/config/runtime/codecs/launcher.py +24 -0
- slurmforge/pipeline/config/runtime/codecs/notify.py +13 -0
- slurmforge/pipeline/config/runtime/codecs/resources.py +17 -0
- slurmforge/pipeline/config/runtime/codecs/validation.py +14 -0
- slurmforge/pipeline/config/runtime/defaults.py +30 -0
- slurmforge/pipeline/config/runtime/models/__init__.py +20 -0
- slurmforge/pipeline/config/runtime/models/artifacts.py +43 -0
- slurmforge/pipeline/config/runtime/models/cluster.py +21 -0
- slurmforge/pipeline/config/runtime/models/env.py +19 -0
- slurmforge/pipeline/config/runtime/models/launcher.py +24 -0
- slurmforge/pipeline/config/runtime/models/notify.py +10 -0
- slurmforge/pipeline/config/runtime/models/resources.py +14 -0
- slurmforge/pipeline/config/runtime/models/validation.py +11 -0
- slurmforge/pipeline/config/scalars.py +25 -0
- slurmforge/pipeline/config/utils.py +67 -0
- slurmforge/pipeline/config/validation/__init__.py +28 -0
- slurmforge/pipeline/config/validation/_helpers.py +59 -0
- slurmforge/pipeline/config/validation/advisory.py +94 -0
- slurmforge/pipeline/config/validation/api.py +34 -0
- slurmforge/pipeline/config/validation/authoring.py +41 -0
- slurmforge/pipeline/config/validation/completeness.py +170 -0
- slurmforge/pipeline/config/validation/correctness.py +114 -0
- slurmforge/pipeline/config/validation/definitions.py +234 -0
- slurmforge/pipeline/config/validation/messages.py +188 -0
- slurmforge/pipeline/config/validation/replay.py +32 -0
- slurmforge/pipeline/config/validation/sections.py +132 -0
- slurmforge/pipeline/config/validation/sweep_rules.py +72 -0
- slurmforge/pipeline/config/validation/traversal.py +34 -0
- slurmforge/pipeline/launch/__init__.py +15 -0
- slurmforge/pipeline/launch/cli_args.py +47 -0
- slurmforge/pipeline/launch/command_builder.py +43 -0
- slurmforge/pipeline/launch/strategies.py +69 -0
- slurmforge/pipeline/launch/types.py +16 -0
- slurmforge/pipeline/materialization/__init__.py +29 -0
- slurmforge/pipeline/materialization/api.py +87 -0
- slurmforge/pipeline/materialization/array_scripts.py +140 -0
- slurmforge/pipeline/materialization/blocks/__init__.py +1 -0
- slurmforge/pipeline/materialization/blocks/artifacts.py +67 -0
- slurmforge/pipeline/materialization/blocks/common.py +25 -0
- slurmforge/pipeline/materialization/blocks/env_setup.py +59 -0
- slurmforge/pipeline/materialization/blocks/eval.py +72 -0
- slurmforge/pipeline/materialization/blocks/finalize.py +22 -0
- slurmforge/pipeline/materialization/blocks/preamble.py +47 -0
- slurmforge/pipeline/materialization/blocks/preflight.py +68 -0
- slurmforge/pipeline/materialization/blocks/train.py +63 -0
- slurmforge/pipeline/materialization/blocks/train_outputs.py +48 -0
- slurmforge/pipeline/materialization/commit.py +16 -0
- slurmforge/pipeline/materialization/context.py +45 -0
- slurmforge/pipeline/materialization/grouping.py +110 -0
- slurmforge/pipeline/materialization/layout.py +49 -0
- slurmforge/pipeline/materialization/manifest_writer.py +57 -0
- slurmforge/pipeline/materialization/record_writer.py +136 -0
- slurmforge/pipeline/materialization/reporting.py +55 -0
- slurmforge/pipeline/materialization/run_assets.py +37 -0
- slurmforge/pipeline/materialization/shell_builder.py +33 -0
- slurmforge/pipeline/materialization/slurm_deps.py +17 -0
- slurmforge/pipeline/materialization/submit_writer.py +72 -0
- slurmforge/pipeline/planning/__init__.py +35 -0
- slurmforge/pipeline/planning/api.py +20 -0
- slurmforge/pipeline/planning/batch.py +62 -0
- slurmforge/pipeline/planning/batch_validator.py +42 -0
- slurmforge/pipeline/planning/codecs/__init__.py +32 -0
- slurmforge/pipeline/planning/codecs/diagnostics.py +23 -0
- slurmforge/pipeline/planning/codecs/resources.py +74 -0
- slurmforge/pipeline/planning/codecs/stages.py +99 -0
- slurmforge/pipeline/planning/contracts.py +58 -0
- slurmforge/pipeline/planning/enums.py +65 -0
- slurmforge/pipeline/planning/eval/__init__.py +5 -0
- slurmforge/pipeline/planning/eval/api.py +53 -0
- slurmforge/pipeline/planning/eval/command.py +59 -0
- slurmforge/pipeline/planning/eval/common.py +28 -0
- slurmforge/pipeline/planning/eval/launcher_merge.py +59 -0
- slurmforge/pipeline/planning/eval/script.py +79 -0
- slurmforge/pipeline/planning/external_command.py +34 -0
- slurmforge/pipeline/planning/fingerprint.py +32 -0
- slurmforge/pipeline/planning/identity.py +62 -0
- slurmforge/pipeline/planning/models/__init__.py +15 -0
- slurmforge/pipeline/planning/models/diagnostics.py +61 -0
- slurmforge/pipeline/planning/models/resources.py +98 -0
- slurmforge/pipeline/planning/models/stages.py +94 -0
- slurmforge/pipeline/planning/replay_builder.py +23 -0
- slurmforge/pipeline/planning/run/__init__.py +9 -0
- slurmforge/pipeline/planning/run/api.py +11 -0
- slurmforge/pipeline/planning/run/assembly.py +12 -0
- slurmforge/pipeline/planning/run/identity.py +38 -0
- slurmforge/pipeline/planning/run/plan_factory.py +78 -0
- slurmforge/pipeline/planning/run/planned_run_factory.py +52 -0
- slurmforge/pipeline/planning/run/stages.py +47 -0
- slurmforge/pipeline/planning/snapshot_builder.py +33 -0
- slurmforge/pipeline/planning/train/__init__.py +23 -0
- slurmforge/pipeline/planning/train/allocation.py +51 -0
- slurmforge/pipeline/planning/train/api.py +55 -0
- slurmforge/pipeline/planning/train/context.py +55 -0
- slurmforge/pipeline/planning/train/model_resolution.py +94 -0
- slurmforge/pipeline/planning/train/strategies/__init__.py +3 -0
- slurmforge/pipeline/planning/train/strategies/adapter.py +61 -0
- slurmforge/pipeline/planning/train/strategies/base.py +14 -0
- slurmforge/pipeline/planning/train/strategies/command.py +79 -0
- slurmforge/pipeline/planning/train/strategies/model_cli.py +26 -0
- slurmforge/pipeline/planning/train/strategies/scripted.py +87 -0
- slurmforge/pipeline/planning/train/topology.py +134 -0
- slurmforge/pipeline/planning/validation/__init__.py +11 -0
- slurmforge/pipeline/planning/validation/api.py +25 -0
- slurmforge/pipeline/planning/validation/common.py +28 -0
- slurmforge/pipeline/planning/validation/errors.py +15 -0
- slurmforge/pipeline/planning/validation/formatter.py +9 -0
- slurmforge/pipeline/planning/validation/passes/__init__.py +3 -0
- slurmforge/pipeline/planning/validation/passes/cli_args.py +53 -0
- slurmforge/pipeline/planning/validation/passes/resources.py +151 -0
- slurmforge/pipeline/planning/validation/passes/summary.py +21 -0
- slurmforge/pipeline/planning/validation/passes/topology.py +115 -0
- slurmforge/pipeline/planning/validation/policies.py +50 -0
- slurmforge/pipeline/planning/validator.py +9 -0
- slurmforge/pipeline/records/__init__.py +3 -0
- slurmforge/pipeline/records/api.py +57 -0
- slurmforge/pipeline/records/batch_io.py +42 -0
- slurmforge/pipeline/records/batch_paths.py +83 -0
- slurmforge/pipeline/records/codecs/__init__.py +41 -0
- slurmforge/pipeline/records/codecs/array_assignment.py +31 -0
- slurmforge/pipeline/records/codecs/metadata.py +27 -0
- slurmforge/pipeline/records/codecs/run_plan.py +107 -0
- slurmforge/pipeline/records/codecs/run_snapshot.py +42 -0
- slurmforge/pipeline/records/io_utils.py +20 -0
- slurmforge/pipeline/records/models/__init__.py +23 -0
- slurmforge/pipeline/records/models/array_assignment.py +19 -0
- slurmforge/pipeline/records/models/dispatch.py +30 -0
- slurmforge/pipeline/records/models/metadata.py +11 -0
- slurmforge/pipeline/records/models/run_plan.py +113 -0
- slurmforge/pipeline/records/models/run_snapshot.py +22 -0
- slurmforge/pipeline/records/replay_spec/__init__.py +14 -0
- slurmforge/pipeline/records/replay_spec/builders.py +31 -0
- slurmforge/pipeline/records/replay_spec/codecs.py +47 -0
- slurmforge/pipeline/records/replay_spec/model.py +17 -0
- slurmforge/pipeline/records/snapshot_io.py +19 -0
- slurmforge/pipeline/sources/__init__.py +28 -0
- slurmforge/pipeline/sources/api.py +18 -0
- slurmforge/pipeline/sources/authoring/__init__.py +10 -0
- slurmforge/pipeline/sources/authoring/collector.py +129 -0
- slurmforge/pipeline/sources/authoring/loader.py +36 -0
- slurmforge/pipeline/sources/authoring/models.py +20 -0
- slurmforge/pipeline/sources/failures.py +49 -0
- slurmforge/pipeline/sources/inference.py +31 -0
- slurmforge/pipeline/sources/models.py +115 -0
- slurmforge/pipeline/sources/replay/__init__.py +21 -0
- slurmforge/pipeline/sources/replay/checkpoint.py +27 -0
- slurmforge/pipeline/sources/replay/collector.py +63 -0
- slurmforge/pipeline/sources/replay/loaders.py +130 -0
- slurmforge/pipeline/sources/replay/models.py +30 -0
- slurmforge/pipeline/sources/replay/overrides.py +21 -0
- slurmforge/pipeline/sources/replay/relocation.py +91 -0
- slurmforge/pipeline/sources/replay/resume_patch.py +34 -0
- slurmforge/pipeline/sources/replay/retry_refs.py +39 -0
- slurmforge/pipeline/sources/replay/retry_selection.py +82 -0
- slurmforge/pipeline/sources/replay/selectors.py +57 -0
- slurmforge/pipeline/sources/replay/variants/__init__.py +3 -0
- slurmforge/pipeline/sources/replay/variants/batch.py +76 -0
- slurmforge/pipeline/sources/replay/variants/retry.py +100 -0
- slurmforge/pipeline/sources/replay/variants/run.py +57 -0
- slurmforge/pipeline/sources/replay/variants/snapshot.py +43 -0
- slurmforge/pipeline/status/__init__.py +42 -0
- slurmforge/pipeline/status/api.py +12 -0
- slurmforge/pipeline/status/builders.py +46 -0
- slurmforge/pipeline/status/classifier/__init__.py +12 -0
- slurmforge/pipeline/status/classifier/discovery.py +61 -0
- slurmforge/pipeline/status/classifier/patterns.py +28 -0
- slurmforge/pipeline/status/classifier/reader.py +31 -0
- slurmforge/pipeline/status/classifier/rules.py +70 -0
- slurmforge/pipeline/status/codecs/__init__.py +15 -0
- slurmforge/pipeline/status/codecs/api.py +11 -0
- slurmforge/pipeline/status/codecs/attempt_result.py +77 -0
- slurmforge/pipeline/status/codecs/execution_status.py +88 -0
- slurmforge/pipeline/status/codecs/path_fields.py +59 -0
- slurmforge/pipeline/status/lifecycle.py +144 -0
- slurmforge/pipeline/status/models.py +57 -0
- slurmforge/pipeline/status/paths.py +31 -0
- slurmforge/pipeline/status/reconcile.py +193 -0
- slurmforge/pipeline/status/slurm.py +107 -0
- slurmforge/pipeline/status/store.py +62 -0
- slurmforge/pipeline/train_outputs/__init__.py +21 -0
- slurmforge/pipeline/train_outputs/api.py +17 -0
- slurmforge/pipeline/train_outputs/cache.py +63 -0
- slurmforge/pipeline/train_outputs/codec.py +44 -0
- slurmforge/pipeline/train_outputs/contract.py +97 -0
- slurmforge/pipeline/train_outputs/discovery.py +71 -0
- slurmforge/pipeline/train_outputs/env_writer.py +48 -0
- slurmforge/pipeline/train_outputs/models.py +20 -0
- slurmforge/pipeline/train_outputs/paths.py +11 -0
- slurmforge/pipeline/train_outputs/selection.py +64 -0
- slurmforge/pipeline/utils/__init__.py +6 -0
- slurmforge/pipeline/utils/merge.py +14 -0
- slurmforge/pipeline/utils/schema.py +12 -0
- slurmforge/resource_io.py +26 -0
- slurmforge/starter_catalog.py +203 -0
- slurmforge/starter_projects.py +69 -0
- slurmforge/starter_templates/README.md.tmpl +42 -0
- slurmforge/starter_templates/__init__.py +1 -0
- slurmforge/starter_templates/adapter_train.py.tmpl +49 -0
- slurmforge/starter_templates/command_train.py.tmpl +46 -0
- slurmforge/starter_templates/eval.py.tmpl +45 -0
- slurmforge/starter_templates/hpc_train.py.tmpl +73 -0
- slurmforge/starter_templates/model_cli_train.py.tmpl +63 -0
- slurmforge/sweep/__init__.py +33 -0
- slurmforge/sweep/api.py +23 -0
- slurmforge/sweep/expansion.py +87 -0
- slurmforge/sweep/materialize.py +37 -0
- slurmforge/sweep/models.py +25 -0
- slurmforge/sweep/overrides.py +24 -0
- slurmforge/sweep/validation.py +171 -0
- slurmforge/templates/sbatch_array_group.sh.j2 +42 -0
- slurmforge/templates/sbatch_notify.sh.j2 +8 -0
- slurmforge/templating.py +16 -0
- slurmforge/text_safety.py +22 -0
- slurmforge-0.1.0.dist-info/METADATA +716 -0
- slurmforge-0.1.0.dist-info/RECORD +371 -0
- slurmforge-0.1.0.dist-info/WHEEL +5 -0
- slurmforge-0.1.0.dist-info/entry_points.txt +6 -0
- slurmforge-0.1.0.dist-info/licenses/LICENSE +21 -0
- slurmforge-0.1.0.dist-info/top_level.txt +1 -0
slurmforge/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""CLI subcommands for slurmforge."""
|
slurmforge/cli/common.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""Shared CLI argument builders and batch materialization helpers."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import yaml
|
|
8
|
+
|
|
9
|
+
from ..errors import ConfigContractError
|
|
10
|
+
from ..identity import __version__, regenerate_after_upgrade_note
|
|
11
|
+
from ..pipeline.materialization import MaterializationResult, materialize_batch, print_dry_run
|
|
12
|
+
from ..pipeline.planning import PlannedBatch
|
|
13
|
+
from ..sweep import deep_set, parse_override
|
|
14
|
+
from ..templating import build_template_env
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def load_raw_cfg(config_path: Path) -> dict:
    """Read ``config_path`` as UTF-8 YAML and return the parsed mapping.

    No CLI overrides are applied; the result is exactly what the file declares.

    Raises:
        ConfigContractError: If the parsed document is not a ``dict``.
    """
    parsed = yaml.safe_load(config_path.read_text(encoding="utf-8"))
    if isinstance(parsed, dict):
        return parsed
    raise ConfigContractError(f"Config must be a YAML mapping: {config_path}")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def load_effective_cfg(config_path: Path, cli_overrides: list[str]) -> dict:
    """
    Return the effective config: the raw YAML with every CLI override applied.

    The effective config is the user's declared intent (file contents merged
    with any ``--set`` overrides). It is the single view that all pre-compile
    checks (completeness, correctness, advisory) should inspect.

    The compiler is still handed ``(config_path, cli_overrides)`` separately,
    because it tracks override provenance for diagnostics and replay.

    Overrides are applied in the order given, mutating the freshly loaded
    mapping in place before it is returned.
    """
    effective = load_raw_cfg(config_path)
    for raw_override in cli_overrides:
        dotted_key, parsed_value = parse_override(raw_override)
        deep_set(effective, dotted_key, parsed_value)
    return effective
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def add_config_override_args(parser: argparse.ArgumentParser) -> None:
    """Register the config-override options on ``parser``.

    Options added:
        ``--set``: repeatable; each occurrence is appended to a list
            (an empty list when the flag is never given).
        ``--project_root``: optional single value, ``None`` when omitted.
    """
    option_table = (
        (
            "--set",
            {
                "action": "append",
                "default": [],
                "help": "Override config by dot-path, e.g. --set run.args.lr=0.004",
            },
        ),
        (
            "--project_root",
            {
                "default": None,
                "help": "Override project root used to resolve relative paths (default: config file directory)",
            },
        ),
    )
    for flag, settings in option_table:
        parser.add_argument(flag, **settings)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def add_dry_run_arg(parser: argparse.ArgumentParser) -> None:
    """Register the boolean ``--dry_run`` switch on ``parser`` (off by default)."""
    dry_run_settings = {
        "action": "store_true",
        "help": "Expand runs and print commands without writing files",
    }
    parser.add_argument("--dry_run", **dry_run_settings)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def add_common_args(parser: argparse.ArgumentParser) -> None:
    """Register the shared CLI options: config overrides first, then ``--dry_run``."""
    for register in (add_config_override_args, add_dry_run_arg):
        register(parser)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def materialize_or_print_batch(
    *,
    planned_batch: PlannedBatch,
    dry_run: bool,
) -> MaterializationResult | None:
    """Materialize ``planned_batch``, or only print it when ``dry_run`` is set.

    Returns:
        The result of ``materialize_batch`` for a real run, or ``None`` after
        a dry run (in which case each planned run's ``plan`` is handed to
        ``print_dry_run`` and nothing is materialized).
    """
    if not dry_run:
        # Real run: build the template environment and render the batch.
        return materialize_batch(
            planned_batch=planned_batch,
            env=build_template_env(),
        )

    print_dry_run(entry.plan for entry in planned_batch.planned_runs)
    return None
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def print_batch_ready(*, dispatch: MaterializationResult, sbatch_dir: Path, retry: bool = False) -> None:
    """Print the post-generation summary for a materialized batch.

    Args:
        dispatch: Materialization result; its ``array_groups_meta``,
            ``submit_script`` and ``manifest_path`` are reported.
        sbatch_dir: Directory shown as the location of the sbatch files.
        retry: When true, the first line is worded for a retry batch.
    """
    headline = (
        f"[OK] Generated retry batch with {len(dispatch.array_groups_meta)} array sbatch file(s) in: {sbatch_dir}"
        if retry
        else f"[OK] Generated {len(dispatch.array_groups_meta)} array sbatch file(s) in: {sbatch_dir}"
    )
    print(headline)
    print(f"[OK] Generated by: slurmforge {__version__}")
    print(f"[OK] Submit all via: {dispatch.submit_script}")
    print(f"[OK] Batch manifest: {dispatch.manifest_path}")
    print(f"[NOTE] {regenerate_after_upgrade_note()}")
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""``sforge examples`` -- list, show, or export shipped YAML reference examples."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from ..example_configs import export_example, list_example_catalog, read_example_text
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def handle_examples_list(_args: argparse.Namespace) -> None:
    """Print one line per catalog entry returned by ``list_example_catalog``.

    Entries with a description are printed as the name left-aligned in a
    26-character column followed by the description; entries without one are
    printed as the bare name. The parsed arguments are unused.
    """
    for example_name, summary in list_example_catalog():
        print(f"{example_name:<26} {summary}" if summary else example_name)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def handle_examples_show(args: argparse.Namespace) -> None:
    """Print the text of the example named by ``args.name`` without adding a trailing newline."""
    example_text = read_example_text(args.name)
    print(example_text, end="")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def handle_examples_export(args: argparse.Namespace) -> None:
    """Export the example ``args.name`` to ``args.out`` and report what ``export_example`` returned.

    ``args.force`` is forwarded to ``export_example`` as its ``force`` keyword.
    """
    destination = Path(args.out)
    exported = export_example(args.name, destination, force=args.force)
    print(f"[OK] Wrote example config: {exported}")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def add_subparser(subparsers: argparse._SubParsersAction[argparse.ArgumentParser]) -> None:
    """Register the ``examples`` command and its ``list`` / ``show`` / ``export`` sub-commands.

    Each sub-command stores its handler function under the ``handler``
    default so the caller can dispatch on the parsed namespace.
    """
    name_help = "Example name, with or without .yaml suffix"

    root = subparsers.add_parser(
        "examples",
        help="List, show, or export shipped raw YAML reference examples",
    )
    actions = root.add_subparsers(dest="examples_command")
    # A sub-command must always be given after ``examples``.
    actions.required = True

    # examples list
    lister = actions.add_parser("list", help="List shipped raw YAML reference examples")
    lister.set_defaults(handler=handle_examples_list)

    # examples show <name>
    shower = actions.add_parser("show", help="Print one shipped raw YAML reference example")
    shower.add_argument("name", help=name_help)
    shower.set_defaults(handler=handle_examples_show)

    # examples export <name> --out PATH [--force]
    exporter = actions.add_parser("export", help="Copy one shipped raw YAML reference example to a file")
    exporter.add_argument("name", help=name_help)
    exporter.add_argument("--out", required=True, help="Destination path for the exported raw YAML example")
    exporter.add_argument(
        "--force",
        action="store_true",
        help="Overwrite the destination file if it already exists",
    )
    exporter.set_defaults(handler=handle_examples_export)
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""``sforge generate`` -- expand an experiment config into sbatch arrays."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import datetime
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from ..pipeline.compiler import AuthoringSourceRequest, BatchCompileError, compile_source, iter_compile_report_lines
|
|
9
|
+
from ..pipeline.compiler.reports import require_success
|
|
10
|
+
from ..pipeline.config.validation.advisory import check_advisory
|
|
11
|
+
from ..pipeline.config.validation.completeness import assert_complete
|
|
12
|
+
from ..pipeline.config.validation.correctness import check_correctness
|
|
13
|
+
from ..pipeline.config.validation.messages import format_advisory_report, format_correctness_report
|
|
14
|
+
from .common import (
|
|
15
|
+
add_common_args,
|
|
16
|
+
load_effective_cfg,
|
|
17
|
+
materialize_or_print_batch,
|
|
18
|
+
print_batch_ready,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def render_generate(
    *,
    config_path: Path,
    cli_overrides: list[str],
    dry_run: bool,
    project_root_override: str | None,
) -> None:
    """Validate a config, compile it into a planned batch, and materialize it.

    Steps, in order:
        1. Load the effective config (YAML plus ``--set`` overrides).
        2. Completeness gate via ``assert_complete``.
        3. Correctness gate: any reported error is printed and the process
           exits with status 1.
        4. Advisory check: warnings are printed and generation continues.
        5. Compile the source, print the compile report, then materialize
           the batch (or only print it when ``dry_run`` is set).

    Args:
        config_path: Path to the experiment config; resolved before use.
        cli_overrides: Raw ``--set`` override strings.
        dry_run: When true, nothing is materialized and no summary is printed.
        project_root_override: Optional project root; when ``None`` the
            config file's directory is used for the pre-compile checks.

    Raises:
        SystemExit: With code 1 when the correctness check reports errors.
        BatchCompileError: Propagated unchanged from ``require_success``.
    """
    resolved_config_path = config_path.resolve()
    project_root = (
        Path(project_root_override).resolve()
        if project_root_override is not None
        else resolved_config_path.parent
    )

    effective_cfg = load_effective_cfg(resolved_config_path, cli_overrides)

    # Level 0: completeness gate (hard - null sentinels block generation).
    assert_complete(effective_cfg, config_path=resolved_config_path, project_root=project_root)

    # Level 1: correctness gate (hard - format/logic errors block generation).
    correctness_errors = check_correctness(effective_cfg, config_path=resolved_config_path)
    if correctness_errors:
        print(format_correctness_report(
            correctness_errors,
            config_path=resolved_config_path,
            force_flag_available=False,
        ))
        raise SystemExit(1)

    # Level 2: advisory (soft - warnings printed, generation continues).
    advisory_warnings = check_advisory(effective_cfg, config_path=resolved_config_path)
    if advisory_warnings:
        print(format_advisory_report(advisory_warnings, config_path=resolved_config_path))
        print()

    # Compiler pipeline. The request carries the config path and the raw
    # overrides separately, and the project root exactly as the user gave it
    # (unresolved), or None when no override was supplied.
    report = compile_source(
        AuthoringSourceRequest(
            config_path=resolved_config_path,
            cli_overrides=tuple(cli_overrides),
            project_root=None if project_root_override is None else Path(project_root_override),
            default_batch_name=datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f"),
        ),
    )
    for line in iter_compile_report_lines(report):
        print(line)

    # The report is printed before this call, so diagnostics are visible even
    # when compilation failed. The original wrapped this call in a try/except
    # that only re-raised BatchCompileError; the exception now propagates
    # directly, which is the same observable behavior.
    planned_batch = require_success(report)

    dispatch = materialize_or_print_batch(
        planned_batch=planned_batch,
        dry_run=dry_run,
    )
    if dispatch is None:
        # Dry run: nothing was written, so there is no summary to print.
        return
    print_batch_ready(dispatch=dispatch, sbatch_dir=planned_batch.sbatch_dir)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def handle_generate(args: argparse.Namespace) -> None:
    """Translate the parsed ``generate`` arguments into a ``render_generate`` call."""
    call_kwargs = {
        "config_path": Path(args.config),
        "cli_overrides": args.set,
        "dry_run": args.dry_run,
        "project_root_override": args.project_root,
    }
    render_generate(**call_kwargs)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def add_subparser(subparsers: argparse._SubParsersAction[argparse.ArgumentParser]) -> None:
    """Register the ``generate`` command.

    The command requires ``--config``, accepts the shared options added by
    ``add_common_args``, and dispatches to ``handle_generate``.
    """
    command = subparsers.add_parser(
        "generate",
        help="Expand config into a new batch and render sbatch arrays",
    )
    command.add_argument("--config", required=True, help="Path to experiment config yaml")
    add_common_args(command)
    command.set_defaults(handler=handle_generate)
|
slurmforge/cli/init.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
"""
|
|
2
|
+
`sforge init` — create a starter project scaffold.
|
|
3
|
+
|
|
4
|
+
Decision tree (two orthogonal choices):
|
|
5
|
+
|
|
6
|
+
TRAINING TYPE (how is your training code invoked?)
|
|
7
|
+
script → train.py with CLI args [most common]
|
|
8
|
+
command → complete shell command
|
|
9
|
+
registry → shared team model registry
|
|
10
|
+
adapter → interface bridge script
|
|
11
|
+
|
|
12
|
+
PROFILE (cluster complexity)
|
|
13
|
+
starter → single GPU, minimal config [default]
|
|
14
|
+
hpc → multi-GPU, sweep, eval, artifact sync
|
|
15
|
+
|
|
16
|
+
Examples
|
|
17
|
+
--------
|
|
18
|
+
sforge init # interactive wizard
|
|
19
|
+
sforge init script # script · starter profile
|
|
20
|
+
sforge init script --profile hpc # script · hpc profile
|
|
21
|
+
sforge init command
|
|
22
|
+
sforge init command --profile hpc
|
|
23
|
+
sforge init registry
|
|
24
|
+
sforge init registry --profile hpc
|
|
25
|
+
sforge init adapter
|
|
26
|
+
sforge init adapter --profile hpc
|
|
27
|
+
"""
|
|
28
|
+
from __future__ import annotations
|
|
29
|
+
|
|
30
|
+
import argparse
|
|
31
|
+
from pathlib import Path
|
|
32
|
+
|
|
33
|
+
from ..starter_catalog import PROFILES, TEMPLATE_TYPES, get_starter_spec
|
|
34
|
+
from ..starter_projects import init_project
|
|
35
|
+
from .init_wizard import run_wizard
|
|
36
|
+
|
|
37
|
+
# Default value for the ``--out`` options declared in this module.
_DEFAULT_OUT = "./slurmforge_starter"

# One-sentence summary per template type, keyed by the type name.
_TYPE_DESCRIPTIONS = dict(
    script="Scaffold for a train.py-style script — slurmforge manages args and submission.",
    command="Scaffold that wraps a complete shell command in Slurm.",
    registry="Scaffold using a shared team model registry.",
    adapter="Scaffold with an interface-bridge adapter script (advanced).",
)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# ---------------------------------------------------------------------------
|
|
48
|
+
# Shared argument builder
|
|
49
|
+
# ---------------------------------------------------------------------------
|
|
50
|
+
|
|
51
|
+
def _add_common_args(parser: argparse.ArgumentParser) -> None:
    """Attach the ``--profile``, ``--out`` and ``--force`` options to *parser*."""
    profile_help = (
        "Cluster complexity profile. "
        "'starter' = single GPU, minimal config (default). "
        "'hpc' = multi-GPU, sweep, eval, artifact sync."
    )
    # Declared as a table so the three options read side by side; they are
    # registered in the order listed.
    option_table = (
        (
            "--profile",
            {
                "default": "starter",
                "choices": PROFILES,
                "metavar": "PROFILE",
                "help": profile_help,
            },
        ),
        (
            "--out",
            {
                "default": _DEFAULT_OUT,
                "metavar": "DIR",
                "help": "Destination directory for the project scaffold (default: %(default)s)",
            },
        ),
        (
            "--force",
            {
                "action": "store_true",
                "help": "Overwrite existing files in the destination directory",
            },
        ),
    )
    for flag, settings in option_table:
        parser.add_argument(flag, **settings)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# ---------------------------------------------------------------------------
|
|
77
|
+
# Handlers
|
|
78
|
+
# ---------------------------------------------------------------------------
|
|
79
|
+
|
|
80
|
+
def _do_init(*, template_type: str, profile: str, out: str, force: bool) -> None:
    """Create the scaffold and print a summary with follow-up instructions.

    Looks up the starter spec for ``(template_type, profile)``, calls
    ``init_project`` with ``Path(out)`` and *force*, then prints the resolved
    destination, the spec's ``post_init_guidance``, every path returned by
    ``init_project`` and a numbered list of next steps.
    """
    starter_spec = get_starter_spec(template_type, profile)
    created_paths = init_project(template_type, profile, Path(out), force=force)

    # Only the printed report uses the expanded, absolute form of the path.
    out_dir = Path(out).expanduser().resolve()
    experiment_file = out_dir / 'experiment.yaml'

    print(f"[OK] Initialized '{template_type}' scaffold (profile: {profile}) in: {out_dir}")
    print(f"[INFO] {starter_spec.post_init_guidance}")
    print()

    print(" Files created:")
    for created in created_paths:
        print(f" {created}")
    print()

    print(" Next steps:")
    print(f" 1. Open {experiment_file}")
    print(" 2. Fill in every field marked with ~ (required — see STEP 1 comments)")
    print(f" 3. Run: sforge validate --config {experiment_file}")
    print(f" 4. Run: sforge generate --config {experiment_file}")
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def handle_init_template(args: argparse.Namespace) -> None:
    """Handler for ``sforge init <TYPE>``: scaffold directly from parsed arguments."""
    settings = {
        "template_type": args.template_type,
        "profile": args.profile,
        "out": args.out,
        "force": args.force,
    }
    _do_init(**settings)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def handle_init_wizard(args: argparse.Namespace) -> None:
    """Handler used when ``sforge init`` is run without a TYPE subcommand.

    Asks ``run_wizard`` for the template type, profile and output directory,
    then scaffolds the project with those answers and the parsed ``--force``
    flag.
    """
    answers = run_wizard(out=args.out, force=args.force)
    chosen_type, chosen_profile, destination = answers
    _do_init(
        template_type=chosen_type,
        profile=chosen_profile,
        out=destination,
        force=args.force,
    )
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
# ---------------------------------------------------------------------------
|
|
114
|
+
# Parser registration
|
|
115
|
+
# ---------------------------------------------------------------------------
|
|
116
|
+
|
|
117
|
+
def add_subparser(subparsers: argparse._SubParsersAction) -> None:  # type: ignore[type-arg]
    """Register ``init`` plus one nested subcommand per entry in ``TEMPLATE_TYPES``.

    The ``init`` parser itself carries ``--out`` and ``--force`` and defaults
    to the wizard handler. Each nested TYPE parser sets ``template_type`` and
    the template handler as defaults and receives the shared options from
    ``_add_common_args``.
    """
    parser = subparsers.add_parser(
        "init",
        help="Create a starter project scaffold (run 'sforge init' for interactive setup)",
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # Options for the wizard path; the TYPE subparsers declare their own
    # --out/--force through _add_common_args.
    parser.add_argument(
        "--out",
        metavar="DIR",
        default=_DEFAULT_OUT,
        help="Output directory (wizard mode — overridden by TYPE subcommand flags)",
    )
    parser.add_argument("--force", action="store_true", help="Overwrite existing files")
    parser.set_defaults(handler=handle_init_wizard)

    # Nested subcommands: one per template type.
    type_subparsers = parser.add_subparsers(dest="template_type")
    for type_name in TEMPLATE_TYPES:
        type_parser = type_subparsers.add_parser(
            type_name,
            help=_TYPE_DESCRIPTIONS.get(type_name, ""),
            description=_TYPE_DESCRIPTIONS.get(type_name, ""),
        )
        type_parser.set_defaults(template_type=type_name, handler=handle_init_template)
        _add_common_args(type_parser)
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Interactive 2-question wizard for `sforge init` (no arguments).
|
|
3
|
+
Falls back gracefully when stdin is not a TTY (CI, piped input).
|
|
4
|
+
"""
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import sys
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# (key, description) pairs offered by the first wizard question, in display order.
_TRAINING_TYPES: list[tuple[str, str]] = list(
    {
        "script": "I have a train.py — slurmforge manages my args and submission",
        "command": "I have a complete launch command — just wrap it in Slurm",
        "registry": "My training code lives in a shared team model registry",
        "adapter": "I need an interface bridge between slurmforge and my training code",
    }.items()
)

# (key, description) pairs offered by the second wizard question, in display order.
_PROFILES: list[tuple[str, str]] = list(
    {
        "starter": "Quick start — single GPU, minimal config (recommended for first run)",
        "hpc": "Full HPC — multi-GPU, sweep, eval, artifact sync",
    }.items()
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _prompt_choice(
    prompt: str,
    options: list[tuple[str, str]],
) -> str:
    """Display numbered options and return the key of the one the user selects.

    Args:
        prompt: Heading printed above the option list.
        options: ``(key, description)`` pairs, shown numbered from 1.

    Returns:
        The ``key`` of the selected option.

    Raises:
        SystemExit: With status 0 when input ends (EOF) or the user presses
            Ctrl-C while being prompted.

    Any answer that is not a whole number in ``1..len(options)`` prints a
    reminder and asks again.
    """
    print(prompt)
    for number, (key, description) in enumerate(options, 1):
        print(f" {number}) {key:<12} {description}")
    print()
    while True:
        try:
            raw = input(" Enter number: ").strip()
        except (EOFError, KeyboardInterrupt):
            print()
            sys.exit(0)
        # str.isdigit() also accepts characters such as "²" that int() cannot
        # parse, which used to crash the wizard; str.isdecimal() only accepts
        # characters int() understands.
        if raw.isdecimal():
            try:
                idx = int(raw) - 1
            except ValueError:
                # Extremely long digit strings can exceed int()'s parse limit;
                # treat them as out of range rather than crashing.
                idx = -1
            if 0 <= idx < len(options):
                chosen_key = options[idx][0]
                print(f" → {chosen_key}")
                print()
                return chosen_key
        print(f" Please enter a number between 1 and {len(options)}.")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def run_wizard(*, out: str, force: bool) -> tuple[str, str, str]:
    """
    Ask the two wizard questions and return ``(template_type, profile, out)``.

    When stdin is not a TTY no questions are asked: a usage hint is written to
    stderr and the process exits with status 1. *out* is returned unchanged as
    the third tuple element; *force* is accepted but not used here.
    """
    if not sys.stdin.isatty():
        usage_hint = "\n".join(
            [
                "[sforge init] No template type specified.",
                "Usage: sforge init <TYPE> [--profile starter|hpc] [--out DIR]",
                "",
                "Available types: script, command, registry, adapter",
                "Run 'sforge init --help' for full usage.",
            ]
        )
        print(usage_hint, file=sys.stderr)
        sys.exit(1)

    # Banner, framed by a blank line above and below.
    print()
    for banner_row in (
        " ┌──────────────────────────────────────────────────────┐",
        " │ sforge init · project setup wizard │",
        " └──────────────────────────────────────────────────────┘",
    ):
        print(banner_row)
    print()

    template_type = _prompt_choice(" How is your training code invoked?", _TRAINING_TYPES)
    profile = _prompt_choice(" Which cluster profile fits your setup?", _PROFILES)
    return template_type, profile, out
|
slurmforge/cli/replay.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""``sforge replay`` -- regenerate a batch from persisted run snapshots."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import datetime
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from ..pipeline.compiler import BatchCompileError, ReplaySourceRequest, compile_source, iter_compile_report_lines
|
|
9
|
+
from ..pipeline.compiler.reports import report_total_runs, require_success
|
|
10
|
+
from .common import add_common_args, materialize_or_print_batch, print_batch_ready
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def render_replay(
    *,
    source_run_dir: Path | None,
    source_snapshot_path: Path | None,
    source_batch_root: Path | None,
    run_ids: list[str],
    run_indices: list[int],
    cli_overrides: list[str],
    dry_run: bool,
    project_root_override: str | None,
) -> None:
    """Compile a replay batch from a persisted source and materialize or print it.

    Builds a ``ReplaySourceRequest`` from the arguments, compiles it, prints a
    ``[REPLAY]`` summary line followed by every compile-report line, and then
    passes the planned batch to ``materialize_or_print_batch``.

    Args:
        source_run_dir: Forwarded to the request as ``source_run_dir``.
        source_snapshot_path: Forwarded to the request as ``source_snapshot_path``.
        source_batch_root: Forwarded to the request as ``source_batch_root``.
        run_ids: Run-id selectors; passed to the request as a tuple.
        run_indices: Run-index selectors; passed to the request as a tuple.
        cli_overrides: Override strings; passed to the request as a tuple.
        dry_run: Forwarded to ``materialize_or_print_batch``.
        project_root_override: Converted to ``Path`` unless it is ``None``.

    Raises:
        BatchCompileError: Propagated unchanged from ``require_success``
            (presumably when the compile report is not successful — confirm
            in the reports module).
    """
    # Timestamped default name, down to microseconds.
    default_batch_name = datetime.datetime.now().strftime("replay_%Y%m%d_%H%M%S_%f")
    request = ReplaySourceRequest(
        source_run_dir=source_run_dir,
        source_snapshot_path=source_snapshot_path,
        source_batch_root=source_batch_root,
        run_ids=tuple(run_ids),
        run_indices=tuple(run_indices),
        cli_overrides=tuple(cli_overrides),
        project_root=None if project_root_override is None else Path(project_root_override),
        default_batch_name=default_batch_name,
    )
    report = compile_source(request)
    source_summary = getattr(report, "source_summary", "") or "<missing replay source>"
    print(f"[REPLAY] source={source_summary} selected_runs={report_total_runs(report)}")
    for line in iter_compile_report_lines(report):
        print(line)

    # No try/except here: a handler that only re-raises adds nothing, so
    # BatchCompileError simply propagates to the caller.
    planned_batch = require_success(report)

    dispatch = materialize_or_print_batch(
        planned_batch=planned_batch,
        dry_run=dry_run,
    )
    if dispatch is None:
        # Nothing was dispatched, so there is no "batch ready" message to print.
        return
    print_batch_ready(dispatch=dispatch, sbatch_dir=planned_batch.sbatch_dir)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def handle_replay(args: argparse.Namespace) -> None:
    """Entry point for ``sforge replay``: adapt parsed arguments for ``render_replay``.

    The three source options are converted to ``Path`` when given and left as
    ``None`` otherwise; every other value is forwarded untouched.
    """

    def as_optional_path(value: str | None) -> Path | None:
        # Keep "not given" as None rather than building a Path from it.
        if value is None:
            return None
        return Path(value)

    render_replay(
        source_run_dir=as_optional_path(args.source_run_dir),
        source_snapshot_path=as_optional_path(args.source_snapshot_path),
        source_batch_root=as_optional_path(args.source_batch_root),
        run_ids=args.run_id,
        run_indices=args.run_index,
        cli_overrides=args.set,
        dry_run=args.dry_run,
        project_root_override=args.project_root,
    )
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def add_subparser(subparsers: argparse._SubParsersAction[argparse.ArgumentParser]) -> None:
    """Register the ``replay`` subcommand on *subparsers*.

    Exactly one of ``--from-run``, ``--from-snapshot`` or ``--from-batch`` must
    be supplied. ``--run_id`` and ``--run_index`` may be repeated, and the
    options from ``add_common_args`` are added as well.
    """
    parser = subparsers.add_parser(
        "replay",
        help="Replay one or more persisted runs from a run dir, snapshot, or batch root",
    )

    # The replay source: required and mutually exclusive.
    source_options = (
        (
            "--from-run",
            "source_run_dir",
            "Path to a persisted run directory containing meta/run_snapshot.json",
        ),
        (
            "--from-snapshot",
            "source_snapshot_path",
            "Path to a persisted run_snapshot.json file",
        ),
        (
            "--from-batch",
            "source_batch_root",
            "Path to an existing batch_root; replays all runs unless selectors are provided",
        ),
    )
    exclusive_sources = parser.add_mutually_exclusive_group(required=True)
    for flag, dest, help_text in source_options:
        exclusive_sources.add_argument(flag, dest=dest, help=help_text)

    # Repeatable run selectors.
    parser.add_argument(
        "--run_id",
        default=[],
        action="append",
        help="Select specific run_id values when replaying from --from-batch",
    )
    parser.add_argument(
        "--run_index",
        default=[],
        type=int,
        action="append",
        help="Select specific run_index values when replaying from --from-batch",
    )

    add_common_args(parser)
    parser.set_defaults(handler=handle_replay)
|
slurmforge/cli/rerun.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""``sforge rerun`` -- rebuild a retry batch from an existing batch's run records."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import datetime
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from ..pipeline.compiler import BatchCompileError, RetrySourceRequest, compile_source, iter_compile_report_lines
|
|
9
|
+
from ..pipeline.compiler.reports import report_total_runs, require_success
|
|
10
|
+
from .common import add_common_args, materialize_or_print_batch, print_batch_ready
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def render_rerun(
    *,
    source_batch_root: Path,
    cli_overrides: list[str],
    dry_run: bool,
    project_root_override: str | None,
    status_query: str,
) -> None:
    """Compile a retry batch from an existing batch root and materialize or print it.

    Builds a ``RetrySourceRequest``, compiles it, prints a ``[RETRY]`` summary
    line followed by every compile-report line, and then passes the planned
    batch to ``materialize_or_print_batch``.

    Args:
        source_batch_root: Forwarded to the request; also used as the summary
            text when the report has no non-empty ``source_summary``.
        cli_overrides: Override strings; passed to the request as a tuple.
        dry_run: Forwarded to ``materialize_or_print_batch``.
        project_root_override: Converted to ``Path`` unless it is ``None``.
        status_query: Forwarded to the request as ``status_query``.

    Raises:
        BatchCompileError: Propagated unchanged from ``require_success``
            (presumably when the compile report is not successful — confirm
            in the reports module).
    """
    # Timestamped default name, down to microseconds.
    default_batch_name = datetime.datetime.now().strftime("retry_%Y%m%d_%H%M%S_%f")
    report = compile_source(
        RetrySourceRequest(
            source_batch_root=source_batch_root,
            status_query=status_query,
            cli_overrides=tuple(cli_overrides),
            project_root=None if project_root_override is None else Path(project_root_override),
            default_batch_name=default_batch_name,
        )
    )
    source_summary = getattr(report, "source_summary", "") or str(source_batch_root)
    print(f"[RETRY] source={source_summary} selected_runs={report_total_runs(report)}")
    for line in iter_compile_report_lines(report):
        print(line)

    # No try/except here: a handler that only re-raises adds nothing, so
    # BatchCompileError simply propagates to the caller.
    planned_batch = require_success(report)

    dispatch = materialize_or_print_batch(
        planned_batch=planned_batch,
        dry_run=dry_run,
    )
    if dispatch is None:
        # Nothing was dispatched, so there is no "batch ready" message to print.
        return
    print_batch_ready(dispatch=dispatch, sbatch_dir=planned_batch.sbatch_dir, retry=True)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def handle_rerun(args: argparse.Namespace) -> None:
    """Entry point for ``sforge rerun``: adapt parsed arguments for ``render_rerun``.

    The ``--from`` value is turned into a resolved ``Path``; the remaining
    values are forwarded as parsed.
    """
    batch_root = Path(args.source_batch_root).resolve()
    render_rerun(
        source_batch_root=batch_root,
        cli_overrides=args.set,
        dry_run=args.dry_run,
        project_root_override=args.project_root,
        status_query=args.status,
    )
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def add_subparser(subparsers: argparse._SubParsersAction[argparse.ArgumentParser]) -> None:
    """Register the ``rerun`` subcommand on *subparsers*.

    ``--from`` is mandatory and stored as ``source_batch_root``; ``--status``
    defaults to ``"failed"``. The options from ``add_common_args`` are added
    as well, and ``handle_rerun`` is installed as the handler.
    """
    status_help = (
        "Retry filter for existing batch runs: failed(non-success), success, "
        "pending, running, oom, preempted, node_failure, script_error, eval_failed, all"
    )
    parser = subparsers.add_parser(
        "rerun",
        help="Rebuild a retry batch from an existing batch_root",
    )
    parser.add_argument(
        "--from",
        required=True,
        dest="source_batch_root",
        help="Path to an existing batch_root; rebuild and resubmit a filtered retry batch from its run records",
    )
    parser.add_argument("--status", default="failed", help=status_help)
    add_common_args(parser)
    parser.set_defaults(handler=handle_rerun)
|