srunx 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- srunx-0.1.0/.coverage +0 -0
- srunx-0.1.0/.gitignore +12 -0
- srunx-0.1.0/.pre-commit-config.yaml +43 -0
- srunx-0.1.0/.python-version +1 -0
- srunx-0.1.0/CLAUDE.md +139 -0
- srunx-0.1.0/PKG-INFO +405 -0
- srunx-0.1.0/README.md +392 -0
- srunx-0.1.0/coverage.xml +803 -0
- srunx-0.1.0/examples/workflow.yaml +21 -0
- srunx-0.1.0/htmlcov/.gitignore +2 -0
- srunx-0.1.0/htmlcov/class_index.html +251 -0
- srunx-0.1.0/htmlcov/coverage_html_cb_6fb7b396.js +733 -0
- srunx-0.1.0/htmlcov/favicon_32_cb_58284776.png +0 -0
- srunx-0.1.0/htmlcov/function_index.html +619 -0
- srunx-0.1.0/htmlcov/index.html +160 -0
- srunx-0.1.0/htmlcov/keybd_closed_cb_ce680311.png +0 -0
- srunx-0.1.0/htmlcov/status.json +1 -0
- srunx-0.1.0/htmlcov/style_cb_81f8c14c.css +337 -0
- srunx-0.1.0/htmlcov/z_0a7988b152b0e760_client_py.html +363 -0
- srunx-0.1.0/htmlcov/z_0a7988b152b0e760_logging_py.html +213 -0
- srunx-0.1.0/htmlcov/z_0a7988b152b0e760_models_py.html +335 -0
- srunx-0.1.0/htmlcov/z_0a7988b152b0e760_utils_py.html +163 -0
- srunx-0.1.0/htmlcov/z_92b02f3d01ed93ff_main_py.html +690 -0
- srunx-0.1.0/htmlcov/z_92b02f3d01ed93ff_workflow_py.html +293 -0
- srunx-0.1.0/htmlcov/z_ac7289840e8f55c2_runner_py.html +296 -0
- srunx-0.1.0/htmlcov/z_ac7289840e8f55c2_tasks_py.html +179 -0
- srunx-0.1.0/pyproject.toml +82 -0
- srunx-0.1.0/src/srunx/__init__.py +47 -0
- srunx-0.1.0/src/srunx/cli/__init__.py +5 -0
- srunx-0.1.0/src/srunx/cli/main.py +593 -0
- srunx-0.1.0/src/srunx/cli/workflow.py +196 -0
- srunx-0.1.0/src/srunx/client.py +266 -0
- srunx-0.1.0/src/srunx/logging.py +116 -0
- srunx-0.1.0/src/srunx/models.py +239 -0
- srunx-0.1.0/src/srunx/py.typed +0 -0
- srunx-0.1.0/src/srunx/templates/advanced.slurm.jinja +35 -0
- srunx-0.1.0/src/srunx/templates/base.slurm.jinja +26 -0
- srunx-0.1.0/src/srunx/utils.py +66 -0
- srunx-0.1.0/src/srunx/workflows/__init__.py +6 -0
- srunx-0.1.0/src/srunx/workflows/runner.py +200 -0
- srunx-0.1.0/src/srunx/workflows/tasks.py +83 -0
- srunx-0.1.0/tests/__init__.py +1 -0
- srunx-0.1.0/tests/test_cli_main.py +612 -0
- srunx-0.1.0/tests/test_cli_workflow.py +645 -0
- srunx-0.1.0/tests/test_client.py +468 -0
- srunx-0.1.0/tests/test_models.py +417 -0
- srunx-0.1.0/tests/workflows/__init__.py +1 -0
- srunx-0.1.0/tests/workflows/test_runner.py +558 -0
- srunx-0.1.0/tests/workflows/test_tasks.py +408 -0
- srunx-0.1.0/uv.lock +2985 -0
srunx-0.1.0/.coverage
ADDED
|
Binary file
|
srunx-0.1.0/.gitignore
ADDED
|
srunx-0.1.0/.pre-commit-config.yaml
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
# Ruff (linter + formatter)
|
|
3
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
4
|
+
rev: v0.11.13
|
|
5
|
+
hooks:
|
|
6
|
+
- id: ruff
|
|
7
|
+
args: [--fix]
|
|
8
|
+
require_serial: true
|
|
9
|
+
- id: ruff-format
|
|
10
|
+
require_serial: true
|
|
11
|
+
|
|
12
|
+
# Pre-commit default hooks
|
|
13
|
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
|
14
|
+
rev: v5.0.0
|
|
15
|
+
hooks:
|
|
16
|
+
- id: check-added-large-files
|
|
17
|
+
- id: check-json
|
|
18
|
+
- id: check-toml
|
|
19
|
+
- id: check-xml
|
|
20
|
+
- id: check-yaml
|
|
21
|
+
- id: debug-statements
|
|
22
|
+
- id: detect-aws-credentials
|
|
23
|
+
args: [--allow-missing-credentials]
|
|
24
|
+
- id: detect-private-key
|
|
25
|
+
|
|
26
|
+
# uv pip-compile
|
|
27
|
+
- repo: https://github.com/astral-sh/uv-pre-commit
|
|
28
|
+
rev: 0.7.13
|
|
29
|
+
hooks:
|
|
30
|
+
- id: pip-compile
|
|
31
|
+
name: pip-compile requirements.in
|
|
32
|
+
args: [requirements.in, -o, requirements.txt]
|
|
33
|
+
- id: pip-compile
|
|
34
|
+
name: pip-compile requirements-dev.in
|
|
35
|
+
args: [requirements-dev.in, -o, requirements-dev.txt]
|
|
36
|
+
files: ^requirements-dev\.(in|txt)$
|
|
37
|
+
|
|
38
|
+
# MyPy
|
|
39
|
+
# - repo: https://github.com/pre-commit/mirrors-mypy
|
|
40
|
+
# rev: v1.10.0
|
|
41
|
+
# hooks:
|
|
42
|
+
# - id: mypy
|
|
43
|
+
# args: [--config-file=pyproject.toml]
|
|
srunx-0.1.0/.python-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.11
|
srunx-0.1.0/CLAUDE.md
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## Development Commands
|
|
6
|
+
|
|
7
|
+
### Package Management
|
|
8
|
+
- `uv sync` - Install dependencies
|
|
9
|
+
- `uv add <package>` - Add new dependency
|
|
10
|
+
- `uv run <command>` - Run commands in virtual environment
|
|
11
|
+
|
|
12
|
+
### CLI Usage
|
|
13
|
+
- `uv run srunx submit <command>` - Submit SLURM job
|
|
14
|
+
- `uv run srunx status <job_id>` - Check job status
|
|
15
|
+
- `uv run srunx list` - List jobs
|
|
16
|
+
- `uv run srunx cancel <job_id>` - Cancel job
|
|
17
|
+
- `uv run srunx flow run <yaml_file>` - Execute workflow from YAML
|
|
18
|
+
- `uv run srunx flow validate <yaml_file>` - Validate workflow YAML
|
|
19
|
+
|
|
20
|
+
### Testing
|
|
21
|
+
- `uv run pytest` - Run all tests
|
|
22
|
+
- `uv run pytest --cov=srunx` - Run tests with coverage
|
|
23
|
+
- `uv run pytest tests/test_models.py` - Run specific test file
|
|
24
|
+
- `uv run pytest -v` - Run tests with verbose output
|
|
25
|
+
|
|
26
|
+
### Direct Usage Examples
|
|
27
|
+
- `uv run srunx submit python train.py --name ml_job --gpus-per-node 1`
|
|
28
|
+
- `uv run srunx submit python process.py --conda ml_env --nodes 2`
|
|
29
|
+
- `uv run srunx flow run workflow.yaml`
|
|
30
|
+
|
|
31
|
+
## Architecture Overview
|
|
32
|
+
|
|
33
|
+
### New Modular Structure
|
|
34
|
+
```
|
|
35
|
+
src/srunx/
|
|
36
|
+
├── models.py # Data models and validation
|
|
37
|
+
├── client.py # SLURM client for job operations
|
|
38
|
+
├── workflows/ # Workflow management
|
|
39
|
+
│ ├── runner.py # Workflow execution engine
|
|
40
|
+
│ └── tasks.py # Prefect task definitions
|
|
41
|
+
├── cli/ # Command-line interfaces
|
|
42
|
+
│ ├── main.py # Main CLI commands
|
|
43
|
+
│ └── workflow.py # Workflow CLI
|
|
44
|
+
└── templates/ # SLURM script templates
|
|
45
|
+
├── base.slurm.jinja
|
|
46
|
+
└── advanced.slurm.jinja
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Core Components
|
|
50
|
+
|
|
51
|
+
#### Models (`models.py`)
|
|
52
|
+
- **Job**: Complete job configuration with resources and environment
|
|
53
|
+
- **JobResource**: Resource allocation (nodes, GPUs, memory, time)
|
|
54
|
+
- **JobEnvironment**: Environment setup (conda, venv, sqsh)
|
|
55
|
+
- **JobStatus**: Job status enumeration
|
|
56
|
+
- **Workflow/WorkflowTask**: Workflow definitions with dependencies
|
|
57
|
+
- **render_job_script()**: Template rendering function
|
|
58
|
+
|
|
59
|
+
#### Client (`client.py`)
|
|
60
|
+
- **Slurm**: Main interface for SLURM operations
|
|
61
|
+
- `submit_job()`: Submit jobs with full configuration
|
|
62
|
+
- `retrieve_job()`: Query job status
|
|
63
|
+
- `cancel_job()`: Cancel running jobs
|
|
64
|
+
- `list_jobs()`: List user jobs
|
|
65
|
+
- `wait_for_completion()`: Block until job completes
|
|
66
|
+
|
|
67
|
+
#### Workflows (`workflows/`)
|
|
68
|
+
- **WorkflowRunner**: YAML workflow execution with Prefect
|
|
69
|
+
- **Prefect Tasks**:
|
|
70
|
+
- `submit_and_monitor_job()`: Complete job lifecycle
|
|
71
|
+
- `submit_job_async()`: Async job submission
|
|
72
|
+
- `wait_for_job()`: Wait for completion
|
|
73
|
+
|
|
74
|
+
#### CLI (`cli/`)
|
|
75
|
+
- **Main CLI**: Job management commands (submit, status, list, cancel)
|
|
76
|
+
- **Workflow CLI**: YAML workflow execution with validation
|
|
77
|
+
|
|
78
|
+
### Template System
|
|
79
|
+
- Enhanced Jinja2 templates with conditional resource allocation
|
|
80
|
+
- `base.slurm.jinja`: Simple job template
|
|
81
|
+
- `advanced.slurm.jinja`: Full-featured template with all options
|
|
82
|
+
- Automatic environment setup integration
|
|
83
|
+
|
|
84
|
+
### Workflow Definition
|
|
85
|
+
Enhanced YAML workflow format:
|
|
86
|
+
```yaml
|
|
87
|
+
name: ml_pipeline
|
|
88
|
+
tasks:
|
|
89
|
+
- name: preprocess
|
|
90
|
+
command: ["python", "preprocess.py"]
|
|
91
|
+
nodes: 1
|
|
92
|
+
|
|
93
|
+
- name: train
|
|
94
|
+
command: ["python", "train.py"]
|
|
95
|
+
depends_on: [preprocess]
|
|
96
|
+
gpus_per_node: 1
|
|
97
|
+
conda: ml_env
|
|
98
|
+
memory_per_node: "32GB"
|
|
99
|
+
time_limit: "4:00:00"
|
|
100
|
+
|
|
101
|
+
- name: evaluate
|
|
102
|
+
command: ["python", "evaluate.py"]
|
|
103
|
+
depends_on: [train]
|
|
104
|
+
async: true
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
### Key Improvements
|
|
108
|
+
- **Unified Job Model**: Single `Job` class with comprehensive configuration
|
|
109
|
+
- **Modular Architecture**: Clear separation of concerns
|
|
110
|
+
- **Enhanced CLI**: Subcommands with rich options
|
|
111
|
+
- **Better Error Handling**: Comprehensive validation and error messages
|
|
112
|
+
- **Resource Management**: Full SLURM resource specification
|
|
113
|
+
- **Workflow Validation**: Dependency checking and cycle detection
|
|
114
|
+
|
|
115
|
+
## Dependencies
|
|
116
|
+
- **Jinja2**: Template rendering
|
|
117
|
+
- **Pydantic**: Data validation and serialization
|
|
118
|
+
- **Prefect**: Workflow orchestration
|
|
119
|
+
- **Loguru**: Structured logging
|
|
120
|
+
- **PyYAML**: YAML parsing
|
|
121
|
+
|
|
122
|
+
## Code Quality and Linting
|
|
123
|
+
|
|
124
|
+
### Quality Checks
|
|
125
|
+
- `uv run mypy .` - Type checking with mypy
|
|
126
|
+
- `uv run ruff check .` - Code linting
|
|
127
|
+
- `uv run ruff format .` - Code formatting
|
|
128
|
+
|
|
129
|
+
### Pre-commit Quality Checks
|
|
130
|
+
Always run these before committing:
|
|
131
|
+
```bash
|
|
132
|
+
uv run pytest && uv run mypy . && uv run ruff check .
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
# important-instruction-reminders
|
|
136
|
+
Do what has been asked; nothing more, nothing less.
|
|
137
|
+
NEVER create files unless they're absolutely necessary for achieving your goal.
|
|
138
|
+
ALWAYS prefer editing an existing file to creating a new one.
|
|
139
|
+
NEVER proactively create documentation files (*.md) or README files. Only create documentation files if explicitly requested by the User.
|
srunx-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,405 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: srunx
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A modern Python library for SLURM workload manager integration with workflow orchestration capabilities.
|
|
5
|
+
Author-email: ksterx <kostonerx@gmail.com>
|
|
6
|
+
Requires-Python: >=3.11
|
|
7
|
+
Requires-Dist: jinja2>=3.1.6
|
|
8
|
+
Requires-Dist: loguru>=0.7.3
|
|
9
|
+
Requires-Dist: prefect>=3.4.6
|
|
10
|
+
Requires-Dist: pydantic>=2.11.5
|
|
11
|
+
Requires-Dist: pyyaml>=6.0.2
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
# srunx
|
|
15
|
+
|
|
16
|
+
[Python 3.11+](https://www.python.org/downloads/)
|
|
17
|
+
[Type checked with mypy](https://mypy.readthedocs.io/)
|
|
18
|
+
[Linted and formatted with Ruff](https://github.com/astral-sh/ruff)
|
|
19
|
+
|
|
20
|
+
A modern Python library for SLURM workload manager integration with workflow orchestration capabilities.
|
|
21
|
+
|
|
22
|
+
## Features
|
|
23
|
+
|
|
24
|
+
- 🚀 **Simple Job Submission**: Easy-to-use API for submitting SLURM jobs
|
|
25
|
+
- ⚙️ **Flexible Configuration**: Support for various environments (conda, venv, sqsh)
|
|
26
|
+
- 📋 **Job Management**: Submit, monitor, cancel, and list jobs
|
|
27
|
+
- 🧩 **Workflow Orchestration**: YAML-based workflow definitions with Prefect integration
|
|
28
|
+
- 📝 **Template System**: Customizable Jinja2 templates for SLURM scripts
|
|
29
|
+
- 🛡️ **Type Safe**: Full type hints and mypy compatibility
|
|
30
|
+
- 🖥️ **CLI Tools**: Command-line interfaces for both job management and workflows
|
|
31
|
+
|
|
32
|
+
## Installation
|
|
33
|
+
|
|
34
|
+
### Using uv (Recommended)
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
uv add srunx
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
### Using pip
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
pip install srunx
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### Development Installation
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
git clone https://github.com/your-username/srunx.git
|
|
50
|
+
cd srunx
|
|
51
|
+
uv sync --dev
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Quick Start
|
|
55
|
+
|
|
56
|
+
### Basic Job Submission
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from srunx import Job, JobResource, JobEnvironment, Slurm
|
|
60
|
+
|
|
61
|
+
# Create a job configuration
|
|
62
|
+
job = Job(
|
|
63
|
+
name="my_training_job",
|
|
64
|
+
command=["python", "train.py", "--epochs", "100"],
|
|
65
|
+
resources=JobResource(
|
|
66
|
+
nodes=1,
|
|
67
|
+
gpus_per_node=1,
|
|
68
|
+
memory_per_node="32GB",
|
|
69
|
+
time_limit="4:00:00"
|
|
70
|
+
),
|
|
71
|
+
environment=JobEnvironment(conda="ml_env")
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
# Submit the job
|
|
75
|
+
client = Slurm()
|
|
76
|
+
job = client.run(job)
|
|
77
|
+
print(f"Submitted job {job.job_id}")
|
|
78
|
+
|
|
79
|
+
# Monitor job status
|
|
80
|
+
job = client.retrieve(job.job_id)
|
|
81
|
+
print(f"Job status: {job.status}")
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
### Command Line Usage
|
|
85
|
+
|
|
86
|
+
#### Submit a Job
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
# Basic job submission
|
|
90
|
+
srunx submit python train.py --name ml_job
|
|
91
|
+
|
|
92
|
+
# With resource specifications
|
|
93
|
+
srunx submit python train.py \
|
|
94
|
+
--name gpu_job \
|
|
95
|
+
--gpus-per-node 2 \
|
|
96
|
+
--memory 64GB \
|
|
97
|
+
--time 8:00:00
|
|
98
|
+
|
|
99
|
+
# With environment setup
|
|
100
|
+
srunx submit python train.py \
|
|
101
|
+
--conda ml_env \
|
|
102
|
+
--module cuda/11.8 \
|
|
103
|
+
--module gcc/9.3.0
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
#### Job Management
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
# Check job status
|
|
110
|
+
srunx status 12345
|
|
111
|
+
|
|
112
|
+
# List all jobs
|
|
113
|
+
srunx list
|
|
114
|
+
|
|
115
|
+
# Cancel a job
|
|
116
|
+
srunx cancel 12345
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
### Workflow Orchestration
|
|
120
|
+
|
|
121
|
+
Create a workflow YAML file:
|
|
122
|
+
|
|
123
|
+
```yaml
|
|
124
|
+
# workflow.yaml
|
|
125
|
+
name: ml_pipeline
|
|
126
|
+
tasks:
|
|
127
|
+
- name: preprocess
|
|
128
|
+
command: ["python", "preprocess.py"]
|
|
129
|
+
nodes: 1
|
|
130
|
+
memory_per_node: "16GB"
|
|
131
|
+
|
|
132
|
+
- name: train
|
|
133
|
+
command: ["python", "train.py"]
|
|
134
|
+
depends_on: [preprocess]
|
|
135
|
+
nodes: 1
|
|
136
|
+
gpus_per_node: 2
|
|
137
|
+
memory_per_node: "32GB"
|
|
138
|
+
time_limit: "8:00:00"
|
|
139
|
+
conda: ml_env
|
|
140
|
+
|
|
141
|
+
- name: evaluate
|
|
142
|
+
command: ["python", "evaluate.py"]
|
|
143
|
+
depends_on: [train]
|
|
144
|
+
nodes: 1
|
|
145
|
+
|
|
146
|
+
- name: notify
|
|
147
|
+
command: ["python", "notify.py"]
|
|
148
|
+
depends_on: [train, evaluate]
|
|
149
|
+
async: true
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
Execute the workflow:
|
|
153
|
+
|
|
154
|
+
```bash
|
|
155
|
+
# Run workflow
|
|
156
|
+
srunx flow run workflow.yaml
|
|
157
|
+
|
|
158
|
+
# Validate workflow without execution
|
|
159
|
+
srunx flow validate workflow.yaml
|
|
160
|
+
|
|
161
|
+
# Show execution plan
|
|
162
|
+
srunx flow run workflow.yaml --dry-run
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
## Advanced Usage
|
|
166
|
+
|
|
167
|
+
### Custom Templates
|
|
168
|
+
|
|
169
|
+
Create a custom SLURM template:
|
|
170
|
+
|
|
171
|
+
```bash
|
|
172
|
+
#!/bin/bash
|
|
173
|
+
#SBATCH --job-name={{ job_name }}
|
|
174
|
+
#SBATCH --nodes={{ nodes }}
|
|
175
|
+
{% if gpus_per_node > 0 -%}
|
|
176
|
+
#SBATCH --gpus-per-node={{ gpus_per_node }}
|
|
177
|
+
{% endif -%}
|
|
178
|
+
#SBATCH --time={{ time_limit }}
|
|
179
|
+
#SBATCH --output={{ log_dir }}/%x_%j.out
|
|
180
|
+
|
|
181
|
+
{{ environment_setup }}
|
|
182
|
+
|
|
183
|
+
srun {{ command }}
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
Use it with your job:
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
job = client.run(job, template_path="custom_template.slurm.jinja")
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
### Environment Configuration
|
|
193
|
+
|
|
194
|
+
#### Conda Environment
|
|
195
|
+
|
|
196
|
+
```python
|
|
197
|
+
environment = JobEnvironment(
|
|
198
|
+
conda="my_env",
|
|
199
|
+
env_vars={"CUDA_VISIBLE_DEVICES": "0,1"}
|
|
200
|
+
)
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
#### SquashFS Images
|
|
204
|
+
|
|
205
|
+
```python
|
|
206
|
+
environment = JobEnvironment(
|
|
207
|
+
sqsh="/path/to/container.sqsh",
|
|
208
|
+
env_vars={"OMP_NUM_THREADS": "8"}
|
|
209
|
+
)
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
### Programmatic Workflow Execution
|
|
213
|
+
|
|
214
|
+
```python
|
|
215
|
+
from srunx.workflows import WorkflowRunner
|
|
216
|
+
|
|
217
|
+
runner = WorkflowRunner()
|
|
218
|
+
workflow = runner.load_from_yaml("workflow.yaml")
|
|
219
|
+
results = runner.execute_workflow(workflow)
|
|
220
|
+
|
|
221
|
+
print("Job IDs:")
|
|
222
|
+
for task_name, job_id in results.items():
|
|
223
|
+
print(f" {task_name}: {job_id}")
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
### Async Job Submission
|
|
227
|
+
|
|
228
|
+
```python
|
|
229
|
+
# Submit job without waiting
|
|
230
|
+
job = client.run(job)
|
|
231
|
+
|
|
232
|
+
# Later, wait for completion
|
|
233
|
+
completed_job = client.monitor(job, poll_interval=30)
|
|
234
|
+
print(f"Job completed with status: {completed_job.status}")
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
## API Reference
|
|
238
|
+
|
|
239
|
+
### Core Classes
|
|
240
|
+
|
|
241
|
+
#### `Job`
|
|
242
|
+
Main job configuration class with resources and environment settings.
|
|
243
|
+
|
|
244
|
+
#### `JobResource`
|
|
245
|
+
Resource allocation specification (nodes, GPUs, memory, time).
|
|
246
|
+
|
|
247
|
+
#### `JobEnvironment`
|
|
248
|
+
Environment setup (conda, venv, sqsh, environment variables).
|
|
249
|
+
|
|
250
|
+
#### `Slurm`
|
|
251
|
+
Main interface for SLURM operations (submit, status, cancel, list).
|
|
252
|
+
|
|
253
|
+
#### `WorkflowRunner`
|
|
254
|
+
Workflow execution engine with YAML support.
|
|
255
|
+
|
|
256
|
+
### CLI Commands
|
|
257
|
+
|
|
258
|
+
#### Main CLI (`srunx`)
|
|
259
|
+
- `submit` - Submit SLURM jobs
|
|
260
|
+
- `status` - Check job status
|
|
261
|
+
- `list` - List jobs
|
|
262
|
+
- `cancel` - Cancel jobs
|
|
263
|
+
|
|
264
|
+
#### Workflow CLI (`srunx flow`)
|
|
265
|
+
- Execute YAML-defined workflows
|
|
266
|
+
- Validate workflow files
|
|
267
|
+
- Show execution plans
|
|
268
|
+
|
|
269
|
+
## Configuration
|
|
270
|
+
|
|
271
|
+
### Environment Variables
|
|
272
|
+
|
|
273
|
+
- `SLURM_LOG_DIR`: Default directory for SLURM logs (default: `logs`)
|
|
274
|
+
|
|
275
|
+
### Template Locations
|
|
276
|
+
|
|
277
|
+
srunx includes built-in templates:
|
|
278
|
+
- `base.slurm.jinja`: Basic job template
|
|
279
|
+
- `advanced.slurm.jinja`: Full-featured template with all options
|
|
280
|
+
|
|
281
|
+
## Development
|
|
282
|
+
|
|
283
|
+
### Setup Development Environment
|
|
284
|
+
|
|
285
|
+
```bash
|
|
286
|
+
git clone https://github.com/your-username/srunx.git
|
|
287
|
+
cd srunx
|
|
288
|
+
uv sync --dev
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
### Run Tests
|
|
292
|
+
|
|
293
|
+
```bash
|
|
294
|
+
uv run pytest
|
|
295
|
+
```
|
|
296
|
+
|
|
297
|
+
### Type Checking
|
|
298
|
+
|
|
299
|
+
```bash
|
|
300
|
+
uv run mypy .
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
### Code Formatting
|
|
304
|
+
|
|
305
|
+
```bash
|
|
306
|
+
uv run ruff check .
|
|
307
|
+
uv run ruff format .
|
|
308
|
+
```
|
|
309
|
+
|
|
310
|
+
## Examples
|
|
311
|
+
|
|
312
|
+
### Machine Learning Pipeline
|
|
313
|
+
|
|
314
|
+
```python
|
|
315
|
+
# Complete ML pipeline example
|
|
316
|
+
from srunx import Job, JobResource, JobEnvironment, Slurm
|
|
317
|
+
|
|
318
|
+
def create_ml_job(script: str, **kwargs) -> Job:
|
|
319
|
+
return Job(
|
|
320
|
+
name=f"ml_{script.replace('.py', '')}",
|
|
321
|
+
command=["python", script] + [f"--{k}={v}" for k, v in kwargs.items()],
|
|
322
|
+
resources=JobResource(
|
|
323
|
+
nodes=1,
|
|
324
|
+
gpus_per_node=1,
|
|
325
|
+
memory_per_node="32GB",
|
|
326
|
+
time_limit="4:00:00"
|
|
327
|
+
),
|
|
328
|
+
environment=JobEnvironment(conda="pytorch")
|
|
329
|
+
)
|
|
330
|
+
|
|
331
|
+
client = Slurm()
|
|
332
|
+
|
|
333
|
+
# Submit preprocessing job
|
|
334
|
+
prep_job = create_ml_job("preprocess.py", data_path="/data", output_path="/processed")
|
|
335
|
+
prep_job = client.run(prep_job)
|
|
336
|
+
|
|
337
|
+
# Wait for preprocessing to complete
|
|
338
|
+
client.monitor(prep_job)
|
|
339
|
+
|
|
340
|
+
# Submit training job
|
|
341
|
+
train_job = create_ml_job("train.py", data_path="/processed", model_path="/models")
|
|
342
|
+
train_job = client.run(train_job)
|
|
343
|
+
|
|
344
|
+
print(f"Training job {train_job.job_id} submitted")
|
|
345
|
+
```
|
|
346
|
+
|
|
347
|
+
### Distributed Computing
|
|
348
|
+
|
|
349
|
+
```python
|
|
350
|
+
# Multi-node distributed job
|
|
351
|
+
distributed_job = Job(
|
|
352
|
+
name="distributed_training",
|
|
353
|
+
command=[
|
|
354
|
+
"mpirun", "-np", "16",
|
|
355
|
+
"python", "distributed_train.py"
|
|
356
|
+
],
|
|
357
|
+
resources=JobResource(
|
|
358
|
+
nodes=4,
|
|
359
|
+
ntasks_per_node=4,
|
|
360
|
+
cpus_per_task=8,
|
|
361
|
+
gpus_per_node=2,
|
|
362
|
+
memory_per_node="128GB",
|
|
363
|
+
time_limit="12:00:00"
|
|
364
|
+
),
|
|
365
|
+
environment=JobEnvironment(
|
|
366
|
+
conda="distributed_ml"
|
|
367
|
+
)
|
|
368
|
+
)
|
|
369
|
+
|
|
370
|
+
job = client.run(distributed_job)
|
|
371
|
+
```
|
|
372
|
+
|
|
373
|
+
## Contributing
|
|
374
|
+
|
|
375
|
+
We welcome contributions! Please see our [Contributing Guide](CONTRIBUTING.md) for details.
|
|
376
|
+
|
|
377
|
+
### Development Workflow
|
|
378
|
+
|
|
379
|
+
1. Fork the repository
|
|
380
|
+
2. Create a feature branch
|
|
381
|
+
3. Make your changes
|
|
382
|
+
4. Add tests
|
|
383
|
+
5. Run type checking and tests
|
|
384
|
+
6. Submit a pull request
|
|
385
|
+
|
|
386
|
+
## License
|
|
387
|
+
|
|
388
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
389
|
+
|
|
390
|
+
## Changelog
|
|
391
|
+
|
|
392
|
+
See [CHANGELOG.md](CHANGELOG.md) for release history.
|
|
393
|
+
|
|
394
|
+
## Support
|
|
395
|
+
|
|
396
|
+
- 📖 **Documentation**: [docs.example.com/srunx](https://docs.example.com/srunx)
|
|
397
|
+
- 🐞 **Issues**: [GitHub Issues](https://github.com/your-username/srunx/issues)
|
|
398
|
+
- 💬 **Discussions**: [GitHub Discussions](https://github.com/your-username/srunx/discussions)
|
|
399
|
+
|
|
400
|
+
## Acknowledgments
|
|
401
|
+
|
|
402
|
+
- Built with [Pydantic](https://pydantic.dev/) for data validation
|
|
403
|
+
- Workflow orchestration powered by [Prefect](https://www.prefect.io/)
|
|
404
|
+
- Template rendering with [Jinja2](https://jinja.palletsprojects.com/)
|
|
405
|
+
- Package management with [uv](https://github.com/astral-sh/uv)
|