jazari 0.1.0__tar.gz → 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {jazari-0.1.0/src/jazari.egg-info → jazari-0.1.1}/PKG-INFO +1 -1
- {jazari-0.1.0 → jazari-0.1.1}/pyproject.toml +2 -2
- jazari-0.1.1/src/jazari/commands/init.py +37 -0
- jazari-0.1.1/src/jazari/commands/kill.py +32 -0
- jazari-0.1.1/src/jazari/commands/logs.py +42 -0
- jazari-0.1.1/src/jazari/commands/run.py +94 -0
- jazari-0.1.1/src/jazari/commands/status.py +60 -0
- {jazari-0.1.0 → jazari-0.1.1}/src/jazari/config_generator.py +2 -11
- jazari-0.1.1/src/jazari/main.py +24 -0
- jazari-0.1.1/src/jazari/utils.py +30 -0
- {jazari-0.1.0 → jazari-0.1.1/src/jazari.egg-info}/PKG-INFO +1 -1
- {jazari-0.1.0 → jazari-0.1.1}/src/jazari.egg-info/SOURCES.txt +7 -1
- jazari-0.1.0/src/jazari/main.py +0 -329
- {jazari-0.1.0 → jazari-0.1.1}/LICENSE +0 -0
- {jazari-0.1.0 → jazari-0.1.1}/README.md +0 -0
- {jazari-0.1.0 → jazari-0.1.1}/setup.cfg +0 -0
- {jazari-0.1.0 → jazari-0.1.1}/src/jazari/__init__.py +0 -0
- {jazari-0.1.0 → jazari-0.1.1}/src/jazari/slurm_generator.py +0 -0
- {jazari-0.1.0 → jazari-0.1.1}/src/jazari.egg-info/dependency_links.txt +0 -0
- {jazari-0.1.0 → jazari-0.1.1}/src/jazari.egg-info/entry_points.txt +0 -0
- {jazari-0.1.0 → jazari-0.1.1}/src/jazari.egg-info/requires.txt +0 -0
- {jazari-0.1.0 → jazari-0.1.1}/src/jazari.egg-info/top_level.txt +0 -0
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "jazari"
|
|
7
|
-
version = "0.1.0"
|
|
7
|
+
version = "0.1.1"
|
|
8
8
|
description = "The orchestration layer for modern Slurm clusters."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -45,4 +45,4 @@ dependencies = [
|
|
|
45
45
|
]
|
|
46
46
|
|
|
47
47
|
[project.scripts]
|
|
48
|
-
jazari = "jazari.main:app"
|
|
48
|
+
jazari = "jazari.main:app"
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from rich.console import Console
|
|
2
|
+
from rich.prompt import Prompt, Confirm
|
|
3
|
+
from jazari.config_generator import load_config, make_config_dir, CONFIG_FILE
|
|
4
|
+
|
|
5
|
+
console = Console()
|
|
6
|
+
|
|
7
|
+
def init_command():
    """
    Initialize Jazari config for this cluster.
    Interactively asks for defaults and saves them to the config file.
    """
    # Values already on disk seed the prompts, so re-running init keeps
    # whatever the user does not retype.
    current_config = load_config() or {}

    console.print('[bold blue]Welcome to Jazari setup.[/bold blue]')
    console.print(f"This will create a configuration file at [dim]{CONFIG_FILE}[/dim]\n")

    # 1. Ask for Slurm Account
    account = Prompt.ask(
        "Enter your default Slurm account (e.g., def-user)",
        default=current_config.get("account", ""),
    )

    # 2. Ask for default time limit
    time_limit = Prompt.ask(
        "Enter default time limit (D-HH:MM)",
        default=current_config.get("time", "01:00:00"),
    )

    # 3. Ask for W&B tracking default
    # Coerced to bool so a hand-edited, non-boolean value cannot reach Confirm.
    track_wandb = Confirm.ask(
        "Enable W&B tracking by default?",
        default=bool(current_config.get("track_wandb", False)),
    )

    # Start from the existing config: keys this wizard does not prompt for
    # (the run command also reads "nodes", "gpus" and "cpus") must survive
    # the save instead of being wiped by it.
    new_config = {
        **current_config,
        "account": account,
        "time": time_limit,
        "track_wandb": track_wandb,
    }

    make_config_dir(new_config)

    console.print("\n[bold green]✅ Configuration saved![/bold green]")
    console.print("You can now run jobs without specifying these flags.")
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import typer, shutil, sys, subprocess
|
|
2
|
+
from rich.console import Console
|
|
3
|
+
from rich.prompt import Confirm
|
|
4
|
+
from typing import Optional
|
|
5
|
+
from jazari.utils import get_current_user
|
|
6
|
+
|
|
7
|
+
console = Console()
|
|
8
|
+
|
|
9
|
+
def kill_command(
    job_id: Optional[str] = typer.Argument(None, help = 'Job ID to cancel.'),
    cancel_all: bool = typer.Option(False, '--all', help = 'Cancel ALL active jobs.')
):
    """Cancel a specific job, or all jobs of the current user."""
    if not shutil.which("scancel"):
        console.print("[bold red]Error:[/bold red] 'scancel' command not found.")
        console.print("Are you running this on a Slurm login node?")
        sys.exit(1)

    if cancel_all:
        # --all takes precedence over a job ID and always asks first.
        user = get_current_user()
        if not Confirm.ask(f"[bold red]⚠️ DANGER:[/bold red] Cancel ALL jobs for '{user}'?"):
            sys.exit(0)
        scancel_args = ["-u", user]
        success_message = "[bold green]💥 All jobs cancelled.[/bold green]"
    elif job_id:
        scancel_args = [job_id]
        success_message = f"[bold green]💥 Signal sent to cancel job {job_id}.[/bold green]"
    else:
        # Neither a job ID nor --all: this is a usage error, so report it
        # with a non-zero status instead of exiting as if it had succeeded.
        console.print("[yellow]Usage: jazari kill <JOB_ID> or jazari kill --all[/yellow]")
        sys.exit(1)

    try:
        # scancel's own stderr is not captured, so its diagnostic reaches
        # the terminal directly; only the summary line is added here.
        subprocess.run(["scancel", *scancel_args], check=True)
    except subprocess.CalledProcessError as e:
        console.print(f"[bold red]Error:[/bold red] scancel exited with status {e.returncode}.")
        sys.exit(e.returncode or 1)

    console.print(success_message)
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import typer, sys, os, subprocess
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Optional
|
|
4
|
+
from rich.console import Console
|
|
5
|
+
|
|
6
|
+
console = Console()
|
|
7
|
+
|
|
8
|
+
def logs_command(
    job_id: Optional[str] = typer.Argument(None, help="Job ID to view."),
    tail: bool = typer.Option(False, "--tail", "-f", help="Follow output live."),
    error: bool = typer.Option(False, "--error", "-e", help="View .err log instead of .out")
):
    """View or follow job logs."""
    log_dir = Path("logs")
    if not log_dir.exists():
        console.print("[red]Error: 'logs/' directory not found.[/red]")
        sys.exit(1)

    suffix = ".err" if error else ".out"

    if job_id:
        # Prefer files ending in "-<job_id><suffix>": a bare "*<job_id>"
        # pattern would also pick up longer IDs that merely end with the
        # same digits (asking for 45 must not open the log of job 12345).
        # NOTE(review): assumes the "<name>-<job_id>" naming hinted at by the
        # run command's "View logs" message — confirm against slurm_generator.
        candidates = list(log_dir.glob(f"*-{job_id}{suffix}"))
        if not candidates:
            # Fall back to the loose pattern for logs named differently.
            candidates = list(log_dir.glob(f"*{job_id}{suffix}"))
        if not candidates:
            console.print(f"[red]No log found for Job {job_id}[/red]")
            sys.exit(1)
    else:
        candidates = list(log_dir.glob(f"*{suffix}"))
        if not candidates:
            console.print(f"[yellow]No {suffix} files found.[/yellow]")
            return

    # With several candidates the most recently modified one wins, instead
    # of whatever the directory listing happened to return first.
    target_file = max(candidates, key=os.path.getmtime)

    console.print(f"[dim]Viewing: {target_file}[/dim]")
    cmd = ["tail", "-f", str(target_file)] if tail else ["cat", str(target_file)]

    try:
        subprocess.run(cmd, check=True)
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to leave follow mode.
        console.print("\n[dim]Stopped.[/dim]")
    except subprocess.CalledProcessError as e:
        console.print(f"[red]Error: '{cmd[0]}' exited with status {e.returncode}.[/red]")
        sys.exit(e.returncode or 1)
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import os, typer, sys, shlex, tempfile, subprocess
|
|
2
|
+
from typing import Optional
|
|
3
|
+
from rich.console import Console
|
|
4
|
+
from rich.syntax import Syntax
|
|
5
|
+
from jazari.slurm_generator import generate_sbatch_script
|
|
6
|
+
from jazari.config_generator import load_config, get_config_default
|
|
7
|
+
from jazari.utils import get_wandb_api_key
|
|
8
|
+
|
|
9
|
+
console = Console()
|
|
10
|
+
CONFIG = load_config()
|
|
11
|
+
|
|
12
|
+
def run_command(
    # NOTE(review): `list[str]` is evaluated when the function is defined and
    # needs Python 3.9+, while pyproject declares requires-python >=3.8 —
    # confirm the supported floor.
    command: list[str] = typer.Argument(..., help="The command to run (e.g., python train.py). Use '--' before it."),
    nodes: int = typer.Option(get_config_default("nodes", CONFIG, 1), '--nodes', '-N', help='Number of nodes.'),
    gpus: int = typer.Option(get_config_default("gpus", CONFIG, 1), '--gpus', '-G', help='GPUs per node.'),
    cpus: int = typer.Option(get_config_default("cpus", CONFIG, 1), '--cpus', '-n', help='CPUs per task/GPU'),  # short flag is -n (was -c)
    time: str = typer.Option(get_config_default("time", CONFIG, '01:00:00'), '--time', '-t', help='Time limit (D-HH:MM)'),
    name: str = typer.Option('jazari_run', '--name', '-j', help='Job name'),  # short flag is -j so -n can mean CPUs
    account: Optional[str] = typer.Option(get_config_default("account", CONFIG, None), "--account", "-A", help="Slurm account to charge."),
    track_wandb: bool = typer.Option(get_config_default("track_wandb", CONFIG, False), "--track-wandb", help="Auto-configure W&B."),
    push_to_hub: Optional[str] = typer.Option(None, "--push-to-hub", help="Hugging Face repo ID to upload model to."),
    pull_data: Optional[str] = typer.Option(None, '--pull-data', help='Huggingface dataset ID to download.'),
    dry_run: bool = typer.Option(False, '--dry-run', help='Print sbatch script without submitting.'),
):
    '''
    Launch a distributed training job.
    '''
    import re  # local import: only needed to parse sbatch's reply

    if not command:
        console.print("[bold red]Error:[/bold red] You must provide a command to run.")
        sys.exit(1)

    # The logs command looks in ./logs, so make sure it exists up front.
    os.makedirs("logs", exist_ok=True)

    # shlex.join quotes each argument so the command survives being pasted
    # into the generated shell script.
    full_command_str = shlex.join(command)

    # --- Weights & Biases ---
    wandb_key = None
    if track_wandb:
        console.print("[dim]W&B tracking enabled.[/dim]")
        wandb_key = get_wandb_api_key()
        if not wandb_key:
            # Not fatal: the job is still generated, just without a key.
            console.print("[bold yellow]Warning:[/bold yellow] Could not find W&B API key locally.")
            console.print("Please run [green]wandb login[/green] on this machine first.")
    # -----------------

    console.print(f'[bold]🐘 Generating Slurm script for: {name}[/bold]')

    sbatch_content = generate_sbatch_script(
        nodes = nodes,
        gpus_per_node = gpus,
        cpus_per_task = cpus,
        time_limit = time,
        job_name = name,
        account_name = account,
        wandb_api_key = wandb_key,
        hf_repo_id = push_to_hub,
        hf_dataset_id = pull_data,
        user_command = full_command_str
    )

    if dry_run:
        console.print('\n[yellow]--- DRY RUN: Generated #SBATCH Script ---[/yellow]')
        console.print(Syntax(sbatch_content, 'bash', theme = 'monokai', line_numbers = True))
        console.print('[yellow]-----------------------------------------------[/yellow]')
        return

    console.print("[dim]Submitting to Slurm scheduler...[/dim]")

    # delete=False: the file must outlive the `with` so sbatch can read it;
    # the `finally` below removes it on every path.
    with tempfile.NamedTemporaryFile(mode='w+', suffix=".sh", delete=False) as temp_file:
        temp_script_path = temp_file.name
        temp_file.write(sbatch_content)

    try:
        result = subprocess.run(
            ["sbatch", temp_script_path],
            check = True,
            capture_output = True,
            text = True
        )
    except subprocess.CalledProcessError as e:
        console.print("[bold red]❌ Failed to submit job.[/bold red]")
        console.print(f"Sbatch error: {e.stderr}")
        # A failed submission must not look like success to calling scripts.
        sys.exit(1)
    except FileNotFoundError:
        console.print("\n[bold red]❌ Error:[/bold red] 'sbatch' command not found.")
        console.print("Are you running this on a Slurm login node?")
        sys.exit(1)
    finally:
        if os.path.exists(temp_script_path):
            os.unlink(temp_script_path)

    # Pull the ID from "Submitted batch job <id>" rather than taking the last
    # word, which is wrong when sbatch appends text after the ID (for example
    # "... on cluster <name>") and fails outright on empty output.
    submitted = re.search(r"Submitted batch job (\S+)", result.stdout)
    tokens = result.stdout.split()
    if submitted:
        job_id = submitted.group(1)
    elif tokens:
        job_id = tokens[-1]
    else:
        job_id = None

    if job_id is None:
        console.print("[bold green]✅ Job submitted successfully![/bold green] (ID: [bold]unknown[/bold])")
        return

    console.print(f"[bold green]✅ Job submitted successfully![/bold green] (ID: [bold]{job_id}[/bold])")
    console.print(f"[dim]View logs: cat logs/{name}-{job_id}.out[/dim]")
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
import shutil, sys, subprocess
|
|
2
|
+
from rich.console import Console
|
|
3
|
+
from rich.table import Table
|
|
4
|
+
from rich import box
|
|
5
|
+
from jazari.utils import get_current_user # We will move helper functions to utils.py later
|
|
6
|
+
|
|
7
|
+
console = Console()
|
|
8
|
+
|
|
9
|
+
def status_command():
    """View the status of active Slurm jobs for the current user."""
    # Local import: escape() keeps scheduler-provided text from being parsed
    # as Rich markup.
    from rich.markup import escape

    if not shutil.which("squeue"):
        console.print("[bold red]Error:[/bold red] 'squeue' command not found.")
        console.print("Are you running this on a Slurm login node?")
        sys.exit(1)

    user = get_current_user()

    # -u: filter by user, -h: no header, -o: '|'-delimited custom format.
    # %i=ID, %j=Name, %T=State, %M=Time Used, %D=Nodes, %R=Reason/Nodelist
    squeue_cmd = ["squeue", "-u", user, "-h", "-o", "%i|%j|%T|%M|%D|%R"]

    try:
        result = subprocess.run(squeue_cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        console.print(f"[bold red]Error running squeue:[/bold red] {e.stderr}")
        sys.exit(1)

    output_lines = [line for line in result.stdout.splitlines() if line.strip()]

    if not output_lines:
        console.print(f"\n[dim]No active jobs found for user: [bold]{user}[/bold][/dim]\n")
        return

    table = Table(
        title = f"🐘 Cluster Activity for user: [bold green]{user}[/bold green]",
        box = box.ROUNDED,
        header_style = "bold blue",
        expand = True
    )
    table.add_column("Job ID", style="cyan", no_wrap=True)
    table.add_column("Name")
    table.add_column("State")
    table.add_column("Time Used", justify="right")
    table.add_column("Nodes", justify="right")
    table.add_column("Nodelist / Reason", style="dim")

    # Colour per state; anything that is neither running nor pending is red.
    state_styles = {"RUNNING": "bold green", "PENDING": "yellow"}

    for line in output_lines:
        try:
            # The job name is user-chosen and may itself contain '|': peel
            # the ID off the front and the four fixed fields off the back so
            # whatever remains in the middle is the name.
            job_id, remainder = line.split('|', 1)
            name, state, time_used, nodes, reason = remainder.rsplit('|', 4)
        except ValueError:
            # Fewer fields than requested: not a row we can render.
            continue

        style = state_styles.get(state, "red")
        state_formatted = f"[{style}]{escape(state)}[/{style}]"

        # Escape free-form fields: square brackets in a job name or node
        # list must be shown literally, not interpreted as style tags.
        table.add_row(
            escape(job_id),
            escape(name),
            state_formatted,
            escape(time_used),
            escape(nodes),
            escape(reason),
        )

    console.print(table)
|
|
@@ -1,16 +1,12 @@
|
|
|
1
1
|
import os, yaml
|
|
2
2
|
from pathlib import Path
|
|
3
3
|
from rich.console import Console
|
|
4
|
-
|
|
5
|
-
# defining the path to the config file: ~/.jazari/config.yaml
|
|
4
|
+
|
|
6
5
|
CONFIG_DIR = os.path.join(Path.home(), '.jazari')
|
|
7
6
|
CONFIG_FILE = os.path.join(CONFIG_DIR, 'config.yaml')
|
|
8
7
|
console = Console()
|
|
9
8
|
|
|
10
9
|
def load_config():
|
|
11
|
-
'''
|
|
12
|
-
Loading config file if exists, otherwise returns empty dict.
|
|
13
|
-
'''
|
|
14
10
|
if os.path.exists(CONFIG_FILE):
|
|
15
11
|
try:
|
|
16
12
|
with open(CONFIG_FILE, 'r') as f:
|
|
@@ -21,16 +17,11 @@ def load_config():
|
|
|
21
17
|
return {}
|
|
22
18
|
|
|
23
19
|
def get_config_default(key, CONFIG, default_value):
    '''
    Helper to get a default value from config or fall back to a hardcoded one.

    key: name of the setting to look up.
    CONFIG: the loaded configuration mapping; anything that is not a dict
        (for example None) is treated as "no configuration".
    default_value: returned when the key is absent or CONFIG is unusable.
    '''
    # This helper is evaluated while command signatures are being built, so
    # an unusable config must degrade to the defaults rather than raise.
    if not isinstance(CONFIG, dict):
        return default_value
    return CONFIG.get(key, default_value)
|
|
28
21
|
|
|
29
22
|
def make_config_dir(new_config):
    """Write *new_config* as block-style YAML to CONFIG_FILE, creating CONFIG_DIR if needed."""
    # Ensure the directory (and any missing parents) exists before writing.
    os.makedirs(CONFIG_DIR, exist_ok=True)

    # Opening with 'w' replaces any previous file content.
    with open(CONFIG_FILE, 'w') as config_handle:
        yaml.dump(new_config, config_handle, default_flow_style=False)
|
|
36
27
|
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import typer
|
|
2
|
+
from jazari.commands.status import status_command
|
|
3
|
+
from jazari.commands.kill import kill_command
|
|
4
|
+
from jazari.commands.logs import logs_command
|
|
5
|
+
from jazari.commands.init import init_command
|
|
6
|
+
from jazari.commands.run import run_command
|
|
7
|
+
|
|
8
|
+
# Build the Typer application object that the console-script entry point
# ("jazari.main:app") refers to.
app = typer.Typer(
    name="jazari",
    help="🐘 The orchestration layer for modern Slurm clusters.",
    add_completion=False,
    no_args_is_help=True,
)

# Attach each command function under its CLI name, in the same order as the
# original explicit registrations.
for _command_name, _handler in (
    ("status", status_command),
    ("kill", kill_command),
    ("logs", logs_command),
    ("init", init_command),
    ("run", run_command),
):
    app.command(name=_command_name)(_handler)

if __name__ == "__main__":
    app()
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import os, getpass, netrc
|
|
2
|
+
from typing import Optional
|
|
3
|
+
from rich.console import Console
|
|
4
|
+
|
|
5
|
+
console = Console()
|
|
6
|
+
|
|
7
|
+
def get_current_user() -> str:
    """Returns the current username, robustly on Linux environments.

    Uses the USER environment variable when it holds a non-empty value and
    otherwise defers to getpass.getuser().
    """
    # `or` rather than a KeyError check: a USER that is set but empty is no
    # more usable than a missing one and must not be returned as the name.
    return os.environ.get('USER') or getpass.getuser()
|
|
13
|
+
|
|
14
|
+
def get_wandb_api_key() -> Optional[str]:
    """
    Attempts to read the W&B API key from the local ~/.netrc file.

    Returns the password field of the "api.wandb.ai" entry, or None when the
    file is missing, has no such entry, or cannot be read (a warning is
    printed in that last case).
    """
    # Local import: escape() keeps arbitrary text (exception messages, the
    # netrc login) from being parsed as Rich markup.
    from rich.markup import escape

    try:
        login_info = netrc.netrc().authenticators("api.wandb.ai")
    except FileNotFoundError:
        # No netrc file at all: simply not logged in, nothing to warn about.
        return None
    except Exception as e:
        # The closing tag must match the opening one; the previous "[/dim]"
        # did not match "[dim bold red]", so printing this warning raised a
        # markup error instead. "[/]" closes whichever tag is open.
        console.print(f"[dim bold red]Warning: Could not read W&B credentials: {escape(str(e))}[/]")
        return None

    if not login_info:
        return None

    username, _, api_key = login_info
    console.print(f"[dim]Found W&B API key for user: {escape(str(username))}[/dim]")
    return api_key
|
|
@@ -5,9 +5,15 @@ src/jazari/__init__.py
|
|
|
5
5
|
src/jazari/config_generator.py
|
|
6
6
|
src/jazari/main.py
|
|
7
7
|
src/jazari/slurm_generator.py
|
|
8
|
+
src/jazari/utils.py
|
|
8
9
|
src/jazari.egg-info/PKG-INFO
|
|
9
10
|
src/jazari.egg-info/SOURCES.txt
|
|
10
11
|
src/jazari.egg-info/dependency_links.txt
|
|
11
12
|
src/jazari.egg-info/entry_points.txt
|
|
12
13
|
src/jazari.egg-info/requires.txt
|
|
13
|
-
src/jazari.egg-info/top_level.txt
|
|
14
|
+
src/jazari.egg-info/top_level.txt
|
|
15
|
+
src/jazari/commands/init.py
|
|
16
|
+
src/jazari/commands/kill.py
|
|
17
|
+
src/jazari/commands/logs.py
|
|
18
|
+
src/jazari/commands/run.py
|
|
19
|
+
src/jazari/commands/status.py
|
jazari-0.1.0/src/jazari/main.py
DELETED
|
@@ -1,329 +0,0 @@
|
|
|
1
|
-
import typer, sys, os, tempfile, subprocess, netrc, shutil, getpass
|
|
2
|
-
|
|
3
|
-
from typing import Optional
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
|
|
6
|
-
from rich.console import Console
|
|
7
|
-
from rich.syntax import Syntax
|
|
8
|
-
from rich.prompt import Prompt, Confirm
|
|
9
|
-
from rich.table import Table
|
|
10
|
-
from rich import box
|
|
11
|
-
|
|
12
|
-
from jazari.slurm_generator import generate_sbatch_script
|
|
13
|
-
from jazari.config_generator import load_config, get_config_default, make_config_dir, CONFIG_FILE
|
|
14
|
-
|
|
15
|
-
# Initializing Rich console for pretty printing
|
|
16
|
-
console = Console()
|
|
17
|
-
|
|
18
|
-
# Load config at start.
|
|
19
|
-
CONFIG = load_config()
|
|
20
|
-
|
|
21
|
-
# --- HELPER FUNCTIONS ---
|
|
22
|
-
|
|
23
|
-
def get_current_user() -> str:
|
|
24
|
-
"""Returns the current username, robustly on Linux environments."""
|
|
25
|
-
try:
|
|
26
|
-
return os.environ['USER']
|
|
27
|
-
except KeyError:
|
|
28
|
-
return getpass.getuser()
|
|
29
|
-
|
|
30
|
-
def get_wandb_api_key() -> Optional[str]:
|
|
31
|
-
"""
|
|
32
|
-
Attempts to read the W&B API key from the local ~/.netrc file.
|
|
33
|
-
"""
|
|
34
|
-
try:
|
|
35
|
-
login_info = netrc.netrc().authenticators("api.wandb.ai")
|
|
36
|
-
if login_info:
|
|
37
|
-
username, _, api_key = login_info
|
|
38
|
-
console.print(f"[dim]Found W&B API key for user: {username}[/dim]")
|
|
39
|
-
return api_key
|
|
40
|
-
else:
|
|
41
|
-
return None
|
|
42
|
-
except FileNotFoundError:
|
|
43
|
-
return None
|
|
44
|
-
except Exception as e:
|
|
45
|
-
console.print(f"[dim bold red]Warning: Could not read W&B credentials: {e}[/dim]")
|
|
46
|
-
return None
|
|
47
|
-
|
|
48
|
-
# --- COMMAND FUNCTIONS (No decorators here) ---
|
|
49
|
-
|
|
50
|
-
def status_command():
|
|
51
|
-
"""
|
|
52
|
-
View the status of active Slurm jobs for the current user.
|
|
53
|
-
"""
|
|
54
|
-
# 1. Check if squeue exists available
|
|
55
|
-
if not shutil.which("squeue"):
|
|
56
|
-
console.print("[bold red]Error:[/bold red] 'squeue' command not found.")
|
|
57
|
-
console.print("Are you running this on a Slurm login node?")
|
|
58
|
-
sys.exit(1)
|
|
59
|
-
|
|
60
|
-
user = get_current_user()
|
|
61
|
-
|
|
62
|
-
# 2. Define the squeue command
|
|
63
|
-
# -u: Filter by user
|
|
64
|
-
# -h: No header (we'll make our own)
|
|
65
|
-
# -o: Custom format with '|' delimiter.
|
|
66
|
-
# %i=ID, %j=Name, %T=State, %M=Time Used, %D=Nodes, %R=Reason/Nodelist
|
|
67
|
-
squeue_cmd = ["squeue", "-u", user, "-h", "-o", "%i|%j|%T|%M|%D|%R"]
|
|
68
|
-
|
|
69
|
-
try:
|
|
70
|
-
# 3. Run the command and capture output
|
|
71
|
-
result = subprocess.run(squeue_cmd, check=True, capture_output=True, text=True)
|
|
72
|
-
output_lines = result.stdout.strip().split('\n')
|
|
73
|
-
|
|
74
|
-
# Remove empty lines if squeue returned nothing
|
|
75
|
-
output_lines = [line for line in output_lines if line.strip()]
|
|
76
|
-
|
|
77
|
-
if not output_lines:
|
|
78
|
-
console.print(f"\n[dim]No active jobs found for user: [bold]{user}[/bold][/dim]\n")
|
|
79
|
-
return
|
|
80
|
-
|
|
81
|
-
# 4. Build the Rich Table
|
|
82
|
-
table = Table(
|
|
83
|
-
title=f"🐘 Cluster Activity for user: [bold green]{user}[/bold green]",
|
|
84
|
-
box=box.ROUNDED,
|
|
85
|
-
header_style="bold blue",
|
|
86
|
-
expand=True
|
|
87
|
-
)
|
|
88
|
-
table.add_column("Job ID", style="cyan", no_wrap=True)
|
|
89
|
-
table.add_column("Name")
|
|
90
|
-
table.add_column("State")
|
|
91
|
-
table.add_column("Time Used", justify="right")
|
|
92
|
-
table.add_column("Nodes", justify="right")
|
|
93
|
-
table.add_column("Nodelist / Reason", style="dim")
|
|
94
|
-
|
|
95
|
-
# 5. Parse output and add rows
|
|
96
|
-
for line in output_lines:
|
|
97
|
-
try:
|
|
98
|
-
job_id, name, state, time_used, nodes, reason = line.split('|')
|
|
99
|
-
|
|
100
|
-
# Color-code the state
|
|
101
|
-
if state == "RUNNING":
|
|
102
|
-
state_formatted = f"[bold green]{state}[/bold green]"
|
|
103
|
-
elif state == "PENDING":
|
|
104
|
-
state_formatted = f"[yellow]{state}[/yellow]"
|
|
105
|
-
else:
|
|
106
|
-
state_formatted = f"[red]{state}[/red]"
|
|
107
|
-
|
|
108
|
-
table.add_row(job_id, name, state_formatted, time_used, nodes, reason)
|
|
109
|
-
except ValueError:
|
|
110
|
-
# Handle rare cases where splitting fails
|
|
111
|
-
continue
|
|
112
|
-
|
|
113
|
-
console.print(table)
|
|
114
|
-
|
|
115
|
-
except subprocess.CalledProcessError as e:
|
|
116
|
-
console.print(f"[bold red]Error running squeue:[/bold red] {e.stderr}")
|
|
117
|
-
|
|
118
|
-
def kill_command(
|
|
119
|
-
job_id: Optional[str] = typer.Argument(None, help = 'Job ID to cancel.'),
|
|
120
|
-
cancel_all: bool = typer.Option(False, '--all', help = 'Cancel ALL active jobs.')
|
|
121
|
-
):
|
|
122
|
-
# Cancel a specific job or all jobs
|
|
123
|
-
if not shutil.which("scancel"):
|
|
124
|
-
console.print("[bold red]Error:[/bold red] 'scancel' command not found.")
|
|
125
|
-
console.print("Are you running this on a Slurm login node?")
|
|
126
|
-
sys.exit(1)
|
|
127
|
-
|
|
128
|
-
user = get_current_user()
|
|
129
|
-
|
|
130
|
-
if cancel_all:
|
|
131
|
-
if not Confirm.ask(f"[bold red]⚠️ DANGER:[/bold red] Cancel ALL jobs for '{user}'?"):
|
|
132
|
-
sys.exit(0)
|
|
133
|
-
subprocess.run(["scancel", "-u", user], check=True)
|
|
134
|
-
console.print(f"[bold green]💥 All jobs cancelled.[/bold green]")
|
|
135
|
-
return
|
|
136
|
-
|
|
137
|
-
if job_id:
|
|
138
|
-
subprocess.run(["scancel", job_id], check=True)
|
|
139
|
-
console.print(f"[bold green]💥 Signal sent to cancel job {job_id}.[/bold green]")
|
|
140
|
-
return
|
|
141
|
-
|
|
142
|
-
console.print("[yellow]Usage: jazari kill <JOB_ID> or jazari kill --all[/yellow]")
|
|
143
|
-
|
|
144
|
-
def logs_command(
|
|
145
|
-
job_id: Optional[str] = typer.Argument(None, help="Job ID to view."),
|
|
146
|
-
tail: bool = typer.Option(False, "--tail", "-f", help="Follow output live."),
|
|
147
|
-
error: bool = typer.Option(False, "--error", "-e", help="View .err log instead of .out")
|
|
148
|
-
):
|
|
149
|
-
"""View or follow job logs."""
|
|
150
|
-
log_dir = Path("logs")
|
|
151
|
-
if not log_dir.exists():
|
|
152
|
-
console.print("[red]Error: 'logs/' directory not found.[/red]")
|
|
153
|
-
sys.exit(1)
|
|
154
|
-
|
|
155
|
-
suffix = ".err" if error else ".out"
|
|
156
|
-
target_file = None
|
|
157
|
-
|
|
158
|
-
if job_id:
|
|
159
|
-
matching = list(log_dir.glob(f"*{job_id}{suffix}"))
|
|
160
|
-
if not matching:
|
|
161
|
-
console.print(f"[red]No log found for Job {job_id}[/red]")
|
|
162
|
-
sys.exit(1)
|
|
163
|
-
target_file = matching[0]
|
|
164
|
-
else:
|
|
165
|
-
# Find latest
|
|
166
|
-
files = list(log_dir.glob(f"*{suffix}"))
|
|
167
|
-
if not files:
|
|
168
|
-
console.print(f"[yellow]No {suffix} files found.[/yellow]")
|
|
169
|
-
return
|
|
170
|
-
files.sort(key=os.path.getmtime)
|
|
171
|
-
target_file = files[-1]
|
|
172
|
-
|
|
173
|
-
console.print(f"[dim]Viewing: {target_file}[/dim]")
|
|
174
|
-
cmd = ["tail", "-f", str(target_file)] if tail else ["cat", str(target_file)]
|
|
175
|
-
|
|
176
|
-
try:
|
|
177
|
-
subprocess.run(cmd, check=True)
|
|
178
|
-
except KeyboardInterrupt:
|
|
179
|
-
console.print("\n[dim]Stopped.[/dim]")
|
|
180
|
-
|
|
181
|
-
def init_command():
|
|
182
|
-
"""
|
|
183
|
-
Initialize Jazari config for this cluster.
|
|
184
|
-
Interactively asks the user for defaults and saves them to ~/.jazari/config.yaml.
|
|
185
|
-
"""
|
|
186
|
-
console.print('[bold blue]Welcome to Jazari setup.[/bold blue]')
|
|
187
|
-
console.print(f"This will create a configuration file at [dim]{CONFIG_FILE}[/dim]\n")
|
|
188
|
-
|
|
189
|
-
# 1. Ask for Slurm Account
|
|
190
|
-
# We use the current config value as the default prompt if it exists.
|
|
191
|
-
default_account = CONFIG.get("account", "")
|
|
192
|
-
account = Prompt.ask(
|
|
193
|
-
"Enter your default Slurm account (e.g., def-user)",
|
|
194
|
-
default=default_account
|
|
195
|
-
)
|
|
196
|
-
|
|
197
|
-
# 2. Ask for default time limit
|
|
198
|
-
default_time = CONFIG.get("time", "01:00:00")
|
|
199
|
-
time_limit = Prompt.ask(
|
|
200
|
-
"Enter default time limit (D-HH:MM)",
|
|
201
|
-
default=default_time
|
|
202
|
-
)
|
|
203
|
-
|
|
204
|
-
# 3. Ask for W&B tracking default
|
|
205
|
-
default_wandb = CONFIG.get("track_wandb", False)
|
|
206
|
-
track_wandb = Confirm.ask(
|
|
207
|
-
"Enable W&B tracking by default?",
|
|
208
|
-
default=default_wandb
|
|
209
|
-
)
|
|
210
|
-
|
|
211
|
-
# Prepare the config dictionary
|
|
212
|
-
new_config = {
|
|
213
|
-
"account": account,
|
|
214
|
-
"time": time_limit,
|
|
215
|
-
"track_wandb": track_wandb
|
|
216
|
-
}
|
|
217
|
-
|
|
218
|
-
make_config_dir(new_config)
|
|
219
|
-
|
|
220
|
-
console.print(f"\n[bold green]✅ Configuration saved![/bold green]")
|
|
221
|
-
console.print("You can now run jobs without specifying these flags.")
|
|
222
|
-
|
|
223
|
-
def run_command(
|
|
224
|
-
# The default values for these arguments are now pulled dynamically from the
|
|
225
|
-
# CONFIG dictionary using the get_config_default helper.
|
|
226
|
-
command: list[str] = typer.Argument(..., help="The command to run (e.g., python train.py --batch 64). Use '--' before it if it has flags."),
|
|
227
|
-
nodes: int = typer.Option(get_config_default("nodes", CONFIG, 1), '--nodes', '-N', help='Number of nodes.'),
|
|
228
|
-
gpus: int = typer.Option(get_config_default("gpus", CONFIG, 1), '--gpus', '-G', help='GPUs per node.'),
|
|
229
|
-
cpus: int = typer.Option(get_config_default("cpus", CONFIG, 1), '--cpus', '-c', help='CPUs per task/GPU'),
|
|
230
|
-
time: str = typer.Option(get_config_default("time", CONFIG, '01:00:00'), '--time', '-t', help='Time limit (D-HH:MM)'),
|
|
231
|
-
name: str = typer.Option('jazari_run', '--name', '-n', help='Job name'),
|
|
232
|
-
account: Optional[str] = typer.Option(get_config_default("account", CONFIG, None), "--account", "-A", help="Slurm account to charge."),
|
|
233
|
-
track_wandb: bool = typer.Option(get_config_default("track_wandb", CONFIG, False), "--track-wandb", help="Auto-configure W&B."),
|
|
234
|
-
push_to_hub: Optional[str] = typer.Option(None, "--push-to-hub", help="Hugging Face repo ID to upload model to (e.g. 'my-org/my-model)."),
|
|
235
|
-
pull_data: Optional[str] = typer.Option(None, '--pull-data', help='Huggingface dataset ID to download.'),
|
|
236
|
-
dry_run: bool = typer.Option(False, '--dry-run', help='Print sbatch script without submitting.'),
|
|
237
|
-
):
|
|
238
|
-
'''
|
|
239
|
-
Launch a distributed training job.
|
|
240
|
-
'''
|
|
241
|
-
if not command:
|
|
242
|
-
console.print("[bold red]Error:[/bold red] You must provide a command to run.")
|
|
243
|
-
sys.exit(1)
|
|
244
|
-
|
|
245
|
-
log_dir = "logs"
|
|
246
|
-
if not os.path.exists(log_dir):
|
|
247
|
-
os.makedirs(log_dir, exist_ok=True)
|
|
248
|
-
|
|
249
|
-
full_command_str = ' '.join(command)
|
|
250
|
-
|
|
251
|
-
# --- Weights & Biases ---
|
|
252
|
-
wandb_key = None
|
|
253
|
-
if track_wandb:
|
|
254
|
-
console.print("[dim]W&B tracking enabled.[/dim]")
|
|
255
|
-
wandb_key = get_wandb_api_key()
|
|
256
|
-
if not wandb_key:
|
|
257
|
-
console.print("[bold yellow]Warning:[/bold yellow] Could not find W&B API key locally.")
|
|
258
|
-
console.print("Please run [green]wandb login[/green] on this machine first.")
|
|
259
|
-
# -----------------
|
|
260
|
-
|
|
261
|
-
console.print(f'[bold]🐘 Generating Slurm script for: {name}[/bold]')
|
|
262
|
-
|
|
263
|
-
sbatch_content = generate_sbatch_script(
|
|
264
|
-
nodes = nodes,
|
|
265
|
-
gpus_per_node = gpus,
|
|
266
|
-
cpus_per_task = cpus,
|
|
267
|
-
time_limit = time,
|
|
268
|
-
job_name = name,
|
|
269
|
-
account_name = account,
|
|
270
|
-
wandb_api_key = wandb_key,
|
|
271
|
-
hf_repo_id = push_to_hub,
|
|
272
|
-
hf_dataset_id = pull_data,
|
|
273
|
-
user_command = full_command_str
|
|
274
|
-
)
|
|
275
|
-
|
|
276
|
-
if dry_run:
|
|
277
|
-
console.print('\n[yellow]--- DRY RUN: Generated #SBATCH Script ---[/yellow]')
|
|
278
|
-
syntax = Syntax(sbatch_content, 'bash', theme = 'monokai', line_numbers = True)
|
|
279
|
-
console.print(syntax)
|
|
280
|
-
console.print('[yellow]-----------------------------------------------[/yellow]')
|
|
281
|
-
else:
|
|
282
|
-
console.print("[dim]Submitting to Slurm scheduler...[/dim]")
|
|
283
|
-
|
|
284
|
-
# --- Submission ---
|
|
285
|
-
with tempfile.NamedTemporaryFile(mode='w+', suffix=".sh", delete=False) as temp_file:
|
|
286
|
-
temp_script_path = temp_file.name
|
|
287
|
-
temp_file.write(sbatch_content)
|
|
288
|
-
|
|
289
|
-
try:
|
|
290
|
-
result = subprocess.run(
|
|
291
|
-
["sbatch", temp_script_path],
|
|
292
|
-
check=True,
|
|
293
|
-
capture_output=True,
|
|
294
|
-
text=True
|
|
295
|
-
)
|
|
296
|
-
# Extract the job ID from the output "Submitted batch job 12345"
|
|
297
|
-
job_id = result.stdout.strip().split()[-1]
|
|
298
|
-
console.print(f"[bold green]✅ Job submitted successfully![/bold green] (ID: [bold]{job_id}[/bold])")
|
|
299
|
-
console.print(f"[dim]View logs: cat logs/{name}-{job_id}.out[/dim]")
|
|
300
|
-
|
|
301
|
-
except subprocess.CalledProcessError as e:
|
|
302
|
-
console.print("[bold red]❌ Failed to submit job.[/bold red]")
|
|
303
|
-
console.print(f"Sbatch error: {e.stderr}")
|
|
304
|
-
except FileNotFoundError:
|
|
305
|
-
console.print("\n[bold red]❌ Error:[/bold red] 'sbatch' command not found.")
|
|
306
|
-
console.print("Are you running this on a Slurm login node?")
|
|
307
|
-
finally:
|
|
308
|
-
if os.path.exists(temp_script_path):
|
|
309
|
-
os.unlink(temp_script_path)
|
|
310
|
-
|
|
311
|
-
# --- APP DEFINITION AND COMMAND REGISTRATION ---
|
|
312
|
-
|
|
313
|
-
# Initializing the Typer app
|
|
314
|
-
app = typer.Typer(
|
|
315
|
-
name="jazari",
|
|
316
|
-
help="🐘 The orchestration layer for modern Slurm clusters.",
|
|
317
|
-
add_completion=False,
|
|
318
|
-
no_args_is_help=True
|
|
319
|
-
)
|
|
320
|
-
|
|
321
|
-
# Explicitly register the commands
|
|
322
|
-
app.command(name="status")(status_command)
|
|
323
|
-
app.command(name="kill")(kill_command)
|
|
324
|
-
app.command(name="logs")(logs_command)
|
|
325
|
-
app.command(name="init")(init_command)
|
|
326
|
-
app.command(name="run")(run_command)
|
|
327
|
-
|
|
328
|
-
if __name__ == "__main__":
|
|
329
|
-
app()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|