jazari 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,42 @@
1
+ import os
2
+ from rich.console import Console
3
+ from rich.prompt import Prompt, Confirm
4
+ from jazari.config_generator import load_config, make_config_dir, CONFIG_FILE
5
+
6
+ console = Console()
7
+
8
+ def init_command():
9
+ """
10
+ Initialize Jazari config for this cluster.
11
+ """
12
+ current_config = load_config()
13
+ config_dir = os.path.expanduser('~/.jazari')
14
+ if not os.path.exists(config_dir):
15
+ os.makedirs(config_dir)
16
+ os.chmod(config_dir, 0o700)
17
+
18
+ console.print('[bold blue]Welcome to Jazari setup.[/bold blue]')
19
+ console.print(f"This will create a configuration file at [dim]{CONFIG_FILE}[/dim]\n")
20
+
21
+ # 1. Ask for Slurm Account
22
+ default_account = current_config.get("account", "")
23
+ account = Prompt.ask("Enter your default Slurm account (e.g., def-user)",default=default_account)
24
+
25
+ # 2. Ask for default time limit
26
+ default_time = current_config.get("time", "01:00:00")
27
+ time_limit = Prompt.ask("Enter default time limit (D-HH:MM)",default=default_time)
28
+
29
+ # 3. Ask for W&B tracking default
30
+ default_wandb = current_config.get("track_wandb", False)
31
+ track_wandb = Confirm.ask("Enable W&B tracking by default?",default=default_wandb)
32
+
33
+ new_config = {
34
+ "account": account,
35
+ "time": time_limit,
36
+ "track_wandb": track_wandb
37
+ }
38
+
39
+ make_config_dir(new_config)
40
+
41
+ console.print(f"\n[bold green]✅ Configuration saved![/bold green]")
42
+ console.print("You can now run jobs without specifying these flags.")
@@ -0,0 +1,32 @@
1
+ import typer, shutil, sys, subprocess
2
+ from rich.console import Console
3
+ from rich.prompt import Confirm
4
+ from typing import Optional
5
+ from jazari.utils import get_current_user
6
+
7
+ console = Console()
8
+
9
+ def kill_command(
10
+ job_id: Optional[str] = typer.Argument(None, help = 'Job ID to cancel.'),
11
+ cancel_all: bool = typer.Option(False, '--all', help = 'Cancel ALL active jobs.')
12
+ ):
13
+ if not shutil.which("scancel"):
14
+ console.print("[bold red]Error:[/bold red] 'scancel' command not found.")
15
+ console.print("Are you running this on a Slurm login node?")
16
+ sys.exit(1)
17
+
18
+ user = get_current_user()
19
+
20
+ if cancel_all:
21
+ if not Confirm.ask(f"[bold red]⚠️ DANGER:[/bold red] Cancel ALL jobs for '{user}'?"):
22
+ sys.exit(0)
23
+ subprocess.run(["scancel", "-u", user], check=True)
24
+ console.print(f"[bold green]💥 All jobs cancelled.[/bold green]")
25
+ return
26
+
27
+ if job_id:
28
+ subprocess.run(["scancel", job_id], check=True)
29
+ console.print(f"[bold green]💥 Signal sent to cancel job {job_id}.[/bold green]")
30
+ return
31
+
32
+ console.print("[yellow]Usage: jazari kill <JOB_ID> or jazari kill --all[/yellow]")
@@ -0,0 +1,42 @@
1
+ import typer, sys, os, subprocess
2
+ from pathlib import Path
3
+ from typing import Optional
4
+ from rich.console import Console
5
+
6
+ console = Console()
7
+
8
+ def logs_command(
9
+ job_id: Optional[str] = typer.Argument(None, help="Job ID to view."),
10
+ tail: bool = typer.Option(False, "--tail", "-f", help="Follow output live."),
11
+ error: bool = typer.Option(False, "--error", "-e", help="View .err log instead of .out")
12
+ ):
13
+ """View or follow job logs."""
14
+ log_dir = Path("logs")
15
+ if not log_dir.exists():
16
+ console.print("[red]Error: 'logs/' directory not found.[/red]")
17
+ sys.exit(1)
18
+
19
+ suffix = ".err" if error else ".out"
20
+ target_file = None
21
+
22
+ if job_id:
23
+ matching = list(log_dir.glob(f"*{job_id}{suffix}"))
24
+ if not matching:
25
+ console.print(f"[red]No log found for Job {job_id}[/red]")
26
+ sys.exit(1)
27
+ target_file = matching[0]
28
+ else:
29
+ files = list(log_dir.glob(f"*{suffix}"))
30
+ if not files:
31
+ console.print(f"[yellow]No {suffix} files found.[/yellow]")
32
+ return
33
+ files.sort(key=os.path.getmtime)
34
+ target_file = files[-1]
35
+
36
+ console.print(f"[dim]Viewing: {target_file}[/dim]")
37
+ cmd = ["tail", "-f", str(target_file)] if tail else ["cat", str(target_file)]
38
+
39
+ try:
40
+ subprocess.run(cmd, check=True)
41
+ except KeyboardInterrupt:
42
+ console.print("\n[dim]Stopped.[/dim]")
jazari/commands/run.py ADDED
@@ -0,0 +1,98 @@
1
+ import os, typer, sys, shlex, tempfile, subprocess
2
+ from typing import Optional
3
+ from rich.console import Console
4
+ from rich.syntax import Syntax
5
+ from jazari.slurm_generator import generate_sbatch_script
6
+ from jazari.config_generator import load_config, get_config_default
7
+ from jazari.utils import get_wandb_api_key
8
+
9
+ console = Console()
10
+ CONFIG = load_config()
11
+
12
+ def run_command(
13
+ command: list[str] = typer.Argument(..., help="The command to run (e.g., python train.py). Use '--' before it."),
14
+ nodes: int = typer.Option(get_config_default("nodes", CONFIG, 1), '--nodes', '-N', help='Number of nodes.'),
15
+ gpus: int = typer.Option(get_config_default("gpus", CONFIG, 1), '--gpus', '-G', help='GPUs per node.'),
16
+ cpus: int = typer.Option(get_config_default("cpus", CONFIG, 1), '--cpus', '-n', help='CPUs per task/GPU'), # Changed -c to -n per our earlier discussion
17
+ time: str = typer.Option(get_config_default("time", CONFIG, '01:00:00'), '--time', '-t', help='Time limit (D-HH:MM)'),
18
+ name: str = typer.Option('jazari_run', '--name', '-j', help='Job name'), # Changed -n to -j to allow CPU shortcut
19
+ account: Optional[str] = typer.Option(get_config_default("account", CONFIG, None), "--account", "-A", help="Slurm account to charge."),
20
+ track_wandb: bool = typer.Option(get_config_default("track_wandb", CONFIG, False), "--track-wandb", help="Auto-configure W&B."),
21
+ push_to_hub: Optional[str] = typer.Option(None, "--push-to-hub", help="Hugging Face repo ID to upload model to."),
22
+ pull_data: Optional[str] = typer.Option(None, '--pull-data', help='Huggingface dataset ID to download.'),
23
+ dry_run: bool = typer.Option(False, '--dry-run', help='Print sbatch script without submitting.'),
24
+ ):
25
+ '''
26
+ Launch a distributed training job.
27
+ '''
28
+ if not command:
29
+ console.print("[bold red]Error:[/bold red] You must provide a command to run.")
30
+ sys.exit(1)
31
+
32
+ log_dir = "logs"
33
+ if not os.path.exists(log_dir):
34
+ os.makedirs(log_dir, exist_ok=True)
35
+
36
+ full_command_str = shlex.join(command)
37
+
38
+ # --- Weights & Biases ---
39
+ wandb_key = None
40
+ if track_wandb:
41
+ console.print("[dim]W&B tracking enabled.[/dim]")
42
+ wandb_key = get_wandb_api_key()
43
+ if not wandb_key:
44
+ console.print("[bold yellow]Warning:[/bold yellow] Could not find W&B API key locally.")
45
+ console.print("Please run [green]wandb login[/green] on this machine first.")
46
+ # -----------------
47
+
48
+ console.print(f'[bold]🐘 Generating Slurm script for: {name}[/bold]')
49
+
50
+ sbatch_content = generate_sbatch_script(
51
+ nodes = nodes,
52
+ gpus_per_node = gpus,
53
+ cpus_per_task = cpus,
54
+ time_limit = time,
55
+ job_name = name,
56
+ account_name = account,
57
+ hf_repo_id = push_to_hub,
58
+ hf_dataset_id = pull_data,
59
+ user_command = full_command_str
60
+ )
61
+
62
+ if dry_run:
63
+ console.print('\n[yellow]--- DRY RUN: Generated #SBATCH Script ---[/yellow]')
64
+ syntax = Syntax(sbatch_content, 'bash', theme = 'monokai', line_numbers = True)
65
+ console.print(syntax)
66
+ console.print('[yellow]-----------------------------------------------[/yellow]')
67
+ else:
68
+ console.print("[dim]Submitting to Slurm scheduler...[/dim]")
69
+
70
+ submit_env = os.environ.copy()
71
+ if wandb_key:
72
+ submit_env['WANDB_API_KEY'] = wandb_key
73
+
74
+ with tempfile.NamedTemporaryFile(mode='w+', suffix=".sh", delete=False) as temp_file:
75
+ temp_script_path = temp_file.name
76
+ temp_file.write(sbatch_content)
77
+
78
+ try:
79
+ result = subprocess.run(
80
+ ["sbatch", temp_script_path],
81
+ check = True,
82
+ capture_output = True,
83
+ text = True,
84
+ env = submit_env
85
+ )
86
+ job_id = result.stdout.strip().split()[-1]
87
+ console.print(f"[bold green]✅ Job submitted successfully![/bold green] (ID: [bold]{job_id}[/bold])")
88
+ console.print(f"[dim]View logs: cat logs/{name}-{job_id}.out[/dim]")
89
+
90
+ except subprocess.CalledProcessError as e:
91
+ console.print("[bold red]❌ Failed to submit job.[/bold red]")
92
+ console.print(f"Sbatch error: {e.stderr}")
93
+ except FileNotFoundError:
94
+ console.print("\n[bold red]❌ Error:[/bold red] 'sbatch' command not found.")
95
+ console.print("Are you running this on a Slurm login node?")
96
+ finally:
97
+ if os.path.exists(temp_script_path):
98
+ os.unlink(temp_script_path)
@@ -0,0 +1,60 @@
1
+ import shutil, sys, subprocess
2
+ from rich.console import Console
3
+ from rich.table import Table
4
+ from rich import box
5
+ from jazari.utils import get_current_user # We will move helper functions to utils.py later
6
+
7
+ console = Console()
8
+
9
+ def status_command():
10
+ if not shutil.which("squeue"):
11
+ console.print("[bold red]Error:[/bold red] 'squeue' command not found.")
12
+ console.print("Are you running this on a Slurm login node?")
13
+ sys.exit(1)
14
+
15
+ user = get_current_user()
16
+
17
+ squeue_cmd = ["squeue", "-u", user, "-h", "-o", "%i|%j|%T|%M|%D|%R"]
18
+
19
+ try:
20
+ result = subprocess.run(squeue_cmd, check=True, capture_output=True, text=True)
21
+ output_lines = result.stdout.strip().split('\n')
22
+ output_lines = [line for line in output_lines if line.strip()]
23
+
24
+ if not output_lines:
25
+ console.print(f"\n[dim]No active jobs found for user: [bold]{user}[/bold][/dim]\n")
26
+ return
27
+
28
+ table = Table(
29
+ title = f"🐘 Cluster Activity for user: [bold green]{user}[/bold green]",
30
+ box = box.ROUNDED,
31
+ header_style = "bold blue",
32
+ expand = True
33
+ )
34
+ table.add_column("Job ID", style="cyan", no_wrap=True)
35
+ table.add_column("Name")
36
+ table.add_column("State")
37
+ table.add_column("Time Used", justify="right")
38
+ table.add_column("Nodes", justify="right")
39
+ table.add_column("Nodelist / Reason", style="dim")
40
+
41
+ for line in output_lines:
42
+ try:
43
+ job_id, name, state, time_used, nodes, reason = line.split('|')
44
+
45
+ # Color-code the state
46
+ if state == "RUNNING":
47
+ state_formatted = f"[bold green]{state}[/bold green]"
48
+ elif state == "PENDING":
49
+ state_formatted = f"[yellow]{state}[/yellow]"
50
+ else:
51
+ state_formatted = f"[red]{state}[/red]"
52
+
53
+ table.add_row(job_id, name, state_formatted, time_used, nodes, reason)
54
+ except ValueError:
55
+ continue
56
+
57
+ console.print(table)
58
+
59
+ except subprocess.CalledProcessError as e:
60
+ console.print(f"[bold red]Error running squeue:[/bold red] {e.stderr}")
@@ -1,16 +1,12 @@
1
1
  import os, yaml
2
2
  from pathlib import Path
3
3
  from rich.console import Console
4
- # Config helpers
5
- # defining the path to the config file: ~/.jazari/config.yaml
4
+
6
5
  CONFIG_DIR = os.path.join(Path.home(), '.jazari')
7
6
  CONFIG_FILE = os.path.join(CONFIG_DIR, 'config.yaml')
8
7
  console = Console()
9
8
 
10
9
  def load_config():
11
- '''
12
- Loading config file if exists, otherwise returns empty dict.
13
- '''
14
10
  if os.path.exists(CONFIG_FILE):
15
11
  try:
16
12
  with open(CONFIG_FILE, 'r') as f:
@@ -21,16 +17,11 @@ def load_config():
21
17
  return {}
22
18
 
23
19
  def get_config_default(key, CONFIG, default_value):
24
- '''
25
- Helper to get a default value from config or fall back to a hardcoded one.
26
- '''
27
20
  return CONFIG.get(key, default_value)
28
21
 
29
22
  def make_config_dir(new_config):
30
- # Creating directory if doesn't exist
31
23
  Path(CONFIG_DIR).mkdir(parents = True, exist_ok=True)
32
-
33
- # Writing config to the yaml file
24
+
34
25
  with open(CONFIG_FILE, 'w') as f:
35
26
  yaml.dump(new_config, f, default_flow_style=False)
36
27
 
jazari/main.py CHANGED
@@ -1,324 +1,21 @@
1
- import typer, sys, os, tempfile, subprocess, netrc, shutil, getpass
1
+ import typer, os
2
+ from jazari.commands.status import status_command
3
+ from jazari.commands.kill import kill_command
4
+ from jazari.commands.logs import logs_command
5
+ from jazari.commands.init import init_command
6
+ from jazari.commands.run import run_command
2
7
 
3
- from typing import Optional
4
- from pathlib import Path
5
-
6
- from rich.console import Console
7
- from rich.syntax import Syntax
8
- from rich.prompt import Prompt, Confirm
9
- from rich.table import Table
10
- from rich import box
11
-
12
- from jazari.slurm_generator import generate_sbatch_script
13
- from jazari.config_generator import load_config, get_config_default, make_config_dir, CONFIG_FILE
14
-
15
- # Initializing Rich console for pretty printing
16
- console = Console()
17
-
18
- # Load config at start.
19
- CONFIG = load_config()
20
-
21
- # --- HELPER FUNCTIONS ---
22
-
23
- def get_current_user() -> str:
24
- """Returns the current username, robustly on Linux environments."""
25
- try:
26
- return os.environ['USER']
27
- except KeyError:
28
- return getpass.getuser()
29
-
30
- def get_wandb_api_key() -> Optional[str]:
31
- """
32
- Attempts to read the W&B API key from the local ~/.netrc file.
33
- """
34
- try:
35
- login_info = netrc.netrc().authenticators("api.wandb.ai")
36
- if login_info:
37
- username, _, api_key = login_info
38
- console.print(f"[dim]Found W&B API key for user: {username}[/dim]")
39
- return api_key
40
- else:
41
- return None
42
- except FileNotFoundError:
43
- return None
44
- except Exception as e:
45
- console.print(f"[dim bold red]Warning: Could not read W&B credentials: {e}[/dim]")
46
- return None
47
-
48
- # --- COMMAND FUNCTIONS (No decorators here) ---
49
-
50
- def status_command():
51
- """
52
- View the status of active Slurm jobs for the current user.
53
- """
54
- # 1. Check if squeue exists available
55
- if not shutil.which("squeue"):
56
- console.print("[bold red]Error:[/bold red] 'squeue' command not found.")
57
- console.print("Are you running this on a Slurm login node?")
58
- sys.exit(1)
59
-
60
- user = get_current_user()
61
-
62
- # 2. Define the squeue command
63
- # -u: Filter by user
64
- # -h: No header (we'll make our own)
65
- # -o: Custom format with '|' delimiter.
66
- # %i=ID, %j=Name, %T=State, %M=Time Used, %D=Nodes, %R=Reason/Nodelist
67
- squeue_cmd = ["squeue", "-u", user, "-h", "-o", "%i|%j|%T|%M|%D|%R"]
68
-
69
- try:
70
- # 3. Run the command and capture output
71
- result = subprocess.run(squeue_cmd, check=True, capture_output=True, text=True)
72
- output_lines = result.stdout.strip().split('\n')
73
-
74
- # Remove empty lines if squeue returned nothing
75
- output_lines = [line for line in output_lines if line.strip()]
76
-
77
- if not output_lines:
78
- console.print(f"\n[dim]No active jobs found for user: [bold]{user}[/bold][/dim]\n")
79
- return
80
-
81
- # 4. Build the Rich Table
82
- table = Table(
83
- title=f"🐘 Cluster Activity for user: [bold green]{user}[/bold green]",
84
- box=box.ROUNDED,
85
- header_style="bold blue",
86
- expand=True
87
- )
88
- table.add_column("Job ID", style="cyan", no_wrap=True)
89
- table.add_column("Name")
90
- table.add_column("State")
91
- table.add_column("Time Used", justify="right")
92
- table.add_column("Nodes", justify="right")
93
- table.add_column("Nodelist / Reason", style="dim")
94
-
95
- # 5. Parse output and add rows
96
- for line in output_lines:
97
- try:
98
- job_id, name, state, time_used, nodes, reason = line.split('|')
99
-
100
- # Color-code the state
101
- if state == "RUNNING":
102
- state_formatted = f"[bold green]{state}[/bold green]"
103
- elif state == "PENDING":
104
- state_formatted = f"[yellow]{state}[/yellow]"
105
- else:
106
- state_formatted = f"[red]{state}[/red]"
107
-
108
- table.add_row(job_id, name, state_formatted, time_used, nodes, reason)
109
- except ValueError:
110
- # Handle rare cases where splitting fails
111
- continue
112
-
113
- console.print(table)
114
-
115
- except subprocess.CalledProcessError as e:
116
- console.print(f"[bold red]Error running squeue:[/bold red] {e.stderr}")
117
-
118
- def kill_command(
119
- job_id: Optional[str] = typer.Argument(None, help = 'Job ID to cancel.'),
120
- cancel_all: bool = typer.Option(False, '--all', help = 'Cancel ALL active jobs.')
121
- ):
122
- # Cancel a specific job or all jobs
123
- if not shutil.which("scancel"):
124
- console.print("[bold red]Error:[/bold red] 'scancel' command not found.")
125
- console.print("Are you running this on a Slurm login node?")
126
- sys.exit(1)
127
-
128
- user = get_current_user()
129
-
130
- if cancel_all:
131
- if not Confirm.ask(f"[bold red]⚠️ DANGER:[/bold red] Cancel ALL jobs for '{user}'?"):
132
- sys.exit(0)
133
- subprocess.run(["scancel", "-u", user], check=True)
134
- console.print(f"[bold green]💥 All jobs cancelled.[/bold green]")
135
- return
136
-
137
- if job_id:
138
- subprocess.run(["scancel", job_id], check=True)
139
- console.print(f"[bold green]💥 Signal sent to cancel job {job_id}.[/bold green]")
140
- return
141
-
142
- console.print("[yellow]Usage: jazari kill <JOB_ID> or jazari kill --all[/yellow]")
143
-
144
- def logs_command(
145
- job_id: Optional[str] = typer.Argument(None, help="Job ID to view."),
146
- tail: bool = typer.Option(False, "--tail", "-f", help="Follow output live."),
147
- error: bool = typer.Option(False, "--error", "-e", help="View .err log instead of .out")
148
- ):
149
- """View or follow job logs."""
150
- log_dir = Path("logs")
151
- if not log_dir.exists():
152
- console.print("[red]Error: 'logs/' directory not found.[/red]")
153
- sys.exit(1)
154
-
155
- suffix = ".err" if error else ".out"
156
- target_file = None
157
-
158
- if job_id:
159
- matching = list(log_dir.glob(f"*{job_id}{suffix}"))
160
- if not matching:
161
- console.print(f"[red]No log found for Job {job_id}[/red]")
162
- sys.exit(1)
163
- target_file = matching[0]
164
- else:
165
- # Find latest
166
- files = list(log_dir.glob(f"*{suffix}"))
167
- if not files:
168
- console.print(f"[yellow]No {suffix} files found.[/yellow]")
169
- return
170
- files.sort(key=os.path.getmtime)
171
- target_file = files[-1]
172
-
173
- console.print(f"[dim]Viewing: {target_file}[/dim]")
174
- cmd = ["tail", "-f", str(target_file)] if tail else ["cat", str(target_file)]
175
-
176
- try:
177
- subprocess.run(cmd, check=True)
178
- except KeyboardInterrupt:
179
- console.print("\n[dim]Stopped.[/dim]")
180
-
181
- def init_command():
182
- """
183
- Initialize Jazari config for this cluster.
184
- Interactively asks the user for defaults and saves them to ~/.jazari/config.yaml.
185
- """
186
- console.print('[bold blue]Welcome to Jazari setup.[/bold blue]')
187
- console.print(f"This will create a configuration file at [dim]{CONFIG_FILE}[/dim]\n")
188
-
189
- # 1. Ask for Slurm Account
190
- # We use the current config value as the default prompt if it exists.
191
- default_account = CONFIG.get("account", "")
192
- account = Prompt.ask(
193
- "Enter your default Slurm account (e.g., def-user)",
194
- default=default_account
195
- )
196
-
197
- # 2. Ask for default time limit
198
- default_time = CONFIG.get("time", "01:00:00")
199
- time_limit = Prompt.ask(
200
- "Enter default time limit (D-HH:MM)",
201
- default=default_time
202
- )
203
-
204
- # 3. Ask for W&B tracking default
205
- default_wandb = CONFIG.get("track_wandb", False)
206
- track_wandb = Confirm.ask(
207
- "Enable W&B tracking by default?",
208
- default=default_wandb
209
- )
210
-
211
- # Prepare the config dictionary
212
- new_config = {
213
- "account": account,
214
- "time": time_limit,
215
- "track_wandb": track_wandb
216
- }
217
-
218
- make_config_dir(new_config)
219
-
220
- console.print(f"\n[bold green]✅ Configuration saved![/bold green]")
221
- console.print("You can now run jobs without specifying these flags.")
222
-
223
- def run_command(
224
- # The default values for these arguments are now pulled dynamically from the
225
- # CONFIG dictionary using the get_config_default helper.
226
- command: list[str] = typer.Argument(..., help="The command to run (e.g., python train.py --batch 64). Use '--' before it if it has flags."),
227
- nodes: int = typer.Option(get_config_default("nodes", CONFIG, 1), '--nodes', '-N', help='Number of nodes.'),
228
- gpus: int = typer.Option(get_config_default("gpus", CONFIG, 1), '--gpus', '-G', help='GPUs per node.'),
229
- cpus: int = typer.Option(get_config_default("cpus", CONFIG, 1), '--cpus', '-c', help='CPUs per task/GPU'),
230
- time: str = typer.Option(get_config_default("time", CONFIG, '01:00:00'), '--time', '-t', help='Time limit (D-HH:MM)'),
231
- name: str = typer.Option('jazari_run', '--name', '-n', help='Job name'),
232
- account: Optional[str] = typer.Option(get_config_default("account", CONFIG, None), "--account", "-A", help="Slurm account to charge."),
233
- track_wandb: bool = typer.Option(get_config_default("track_wandb", CONFIG, False), "--track-wandb", help="Auto-configure W&B."),
234
- push_to_hub: Optional[str] = typer.Option(None, "--push-to-hub", help="Hugging Face repo ID to upload model to (e.g. 'my-org/my-model)."),
235
- pull_data: Optional[str] = typer.Option(None, '--pull-data', help='Huggingface dataset ID to download.'),
236
- dry_run: bool = typer.Option(False, '--dry-run', help='Print sbatch script without submitting.'),
237
- ):
238
- '''
239
- Launch a distributed training job.
240
- '''
241
- if not command:
242
- console.print("[bold red]Error:[/bold red] You must provide a command to run.")
243
- sys.exit(1)
244
-
245
- log_dir = "logs"
246
- if not os.path.exists(log_dir):
247
- os.makedirs(log_dir, exist_ok=True)
248
-
249
- full_command_str = ' '.join(command)
250
-
251
- # --- Weights & Biases ---
252
- wandb_key = None
253
- if track_wandb:
254
- console.print("[dim]W&B tracking enabled.[/dim]")
255
- wandb_key = get_wandb_api_key()
256
- if not wandb_key:
257
- console.print("[bold yellow]Warning:[/bold yellow] Could not find W&B API key locally.")
258
- console.print("Please run [green]wandb login[/green] on this machine first.")
259
- # -----------------
260
-
261
- console.print(f'[bold]🐘 Generating Slurm script for: {name}[/bold]')
262
-
263
- sbatch_content = generate_sbatch_script(
264
- nodes = nodes,
265
- gpus_per_node = gpus,
266
- cpus_per_task = cpus,
267
- time_limit = time,
268
- job_name = name,
269
- account_name = account,
270
- wandb_api_key = wandb_key,
271
- hf_repo_id = push_to_hub,
272
- hf_dataset_id = pull_data,
273
- user_command = full_command_str
274
- )
275
-
276
- if dry_run:
277
- console.print('\n[yellow]--- DRY RUN: Generated #SBATCH Script ---[/yellow]')
278
- syntax = Syntax(sbatch_content, 'bash', theme = 'monokai', line_numbers = True)
279
- console.print(syntax)
280
- console.print('[yellow]-----------------------------------------------[/yellow]')
281
- else:
282
- console.print("[dim]Submitting to Slurm scheduler...[/dim]")
283
-
284
- # --- Submission ---
285
- with tempfile.NamedTemporaryFile(mode='w+', suffix=".sh", delete=False) as temp_file:
286
- temp_script_path = temp_file.name
287
- temp_file.write(sbatch_content)
288
-
289
- try:
290
- result = subprocess.run(
291
- ["sbatch", temp_script_path],
292
- check=True,
293
- capture_output=True,
294
- text=True
295
- )
296
- # Extract the job ID from the output "Submitted batch job 12345"
297
- job_id = result.stdout.strip().split()[-1]
298
- console.print(f"[bold green]✅ Job submitted successfully![/bold green] (ID: [bold]{job_id}[/bold])")
299
- console.print(f"[dim]View logs: cat logs/{name}-{job_id}.out[/dim]")
300
-
301
- except subprocess.CalledProcessError as e:
302
- console.print("[bold red]❌ Failed to submit job.[/bold red]")
303
- console.print(f"Sbatch error: {e.stderr}")
304
- except FileNotFoundError:
305
- console.print("\n[bold red]❌ Error:[/bold red] 'sbatch' command not found.")
306
- console.print("Are you running this on a Slurm login node?")
307
- finally:
308
- if os.path.exists(temp_script_path):
309
- os.unlink(temp_script_path)
310
-
311
- # --- APP DEFINITION AND COMMAND REGISTRATION ---
8
+ os.umask(0o077)
312
9
 
313
10
  # Initializing the Typer app
314
11
  app = typer.Typer(
315
- name="jazari",
316
- help="🐘 The orchestration layer for modern Slurm clusters.",
317
- add_completion=False,
318
- no_args_is_help=True
12
+ name = "jazari",
13
+ help = "🐘 The orchestration layer for modern Slurm clusters.",
14
+ add_completion = False,
15
+ no_args_is_help = True
319
16
  )
320
17
 
321
- # Explicitly register the commands
18
+ # Register the commands
322
19
  app.command(name="status")(status_command)
323
20
  app.command(name="kill")(kill_command)
324
21
  app.command(name="logs")(logs_command)
jazari/slurm_generator.py CHANGED
@@ -14,7 +14,6 @@ def generate_sbatch_script(
14
14
  time_limit: str,
15
15
  job_name: str,
16
16
  account_name: Optional[str],
17
- wandb_api_key: Optional[str],
18
17
  hf_repo_id: Optional[str],
19
18
  hf_dataset_id: Optional[str],
20
19
  user_command: str
@@ -34,7 +33,6 @@ def generate_sbatch_script(
34
33
  time_limit = time_limit,
35
34
  job_name = job_name,
36
35
  account_name = account_name,
37
- wandb_api_key = wandb_api_key,
38
36
  hf_repo_id = hf_repo_id,
39
37
  hf_dataset_id = hf_dataset_id,
40
38
  user_command_script = user_command
jazari/utils.py ADDED
@@ -0,0 +1,30 @@
1
+ import os, getpass, netrc
2
+ from typing import Optional
3
+ from rich.console import Console
4
+
5
+ console = Console()
6
+
7
+ def get_current_user() -> str:
8
+ """Returns the current username, robustly on Linux environments."""
9
+ try:
10
+ return os.environ['USER']
11
+ except KeyError:
12
+ return getpass.getuser()
13
+
14
+ def get_wandb_api_key() -> Optional[str]:
15
+ """
16
+ Attempts to read the W&B API key from the local ~/.netrc file.
17
+ """
18
+ try:
19
+ login_info = netrc.netrc().authenticators("api.wandb.ai")
20
+ if login_info:
21
+ username, _, api_key = login_info
22
+ console.print(f"[dim]Found W&B API key for user: {username}[/dim]")
23
+ return api_key
24
+ else:
25
+ return None
26
+ except FileNotFoundError:
27
+ return None
28
+ except Exception as e:
29
+ console.print(f"[dim bold red]Warning: Could not read W&B credentials: {e}[/dim]")
30
+ return None
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: jazari
3
- Version: 0.1.0
3
+ Version: 0.1.2
4
4
  Summary: The orchestration layer for modern Slurm clusters.
5
5
  Author-email: Levent Ozbek <levent@jazari.run>
6
6
  License: MIT
@@ -0,0 +1,16 @@
1
+ jazari/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ jazari/config_generator.py,sha256=ZLT48mCkSxoA5poURDhy4L7solmq5ZVXnBMkYDhpyDw,940
3
+ jazari/main.py,sha256=prINKNdGeqh8GZzBgBoeC1d1pk2JIzYJL0OiDNckHC4,745
4
+ jazari/slurm_generator.py,sha256=1powpTR6YbkUMb-eWacz2LweZRQIstB0G7Pv1N8lBtY,1401
5
+ jazari/utils.py,sha256=kMgO01TSGUnQqwnpGznvX99L_pEl86rX602tvpSnZZA,911
6
+ jazari/commands/init.py,sha256=0tpPE2Vzk76bKosTSsUOBq74LzgXVnT28ZaftWZPKXI,1488
7
+ jazari/commands/kill.py,sha256=UsK4lGYTh5SrnaK87avFXgTyXrjf3ExiUZT6jkvA76I,1191
8
+ jazari/commands/logs.py,sha256=RJEnmQ1CAzBj47w_TGtFYgB4WhgIKwlGP5kD3lPNnDI,1408
9
+ jazari/commands/run.py,sha256=VWrX8-p5B4T18ZgSonZlZ3Uown8Nt-B5CKLGwiIf98o,4749
10
+ jazari/commands/status.py,sha256=JODRwFkXkSHqqmZNZNDO93ZWHI2Kcrdrf7dbdOh3DnY,2252
11
+ jazari-0.1.2.dist-info/licenses/LICENSE,sha256=OvKjjZV2nvR_v2QrhWR4TMgvtDmZYfZLFSt3bk1q6QA,1068
12
+ jazari-0.1.2.dist-info/METADATA,sha256=bqAxIfy2pFP-OzA_kVRxS__3CCAa-4PmLNWeZgXvWx8,5826
13
+ jazari-0.1.2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
14
+ jazari-0.1.2.dist-info/entry_points.txt,sha256=9vYDaHn1_jBw_cMNC9ZKl4vRk3n2aDQ5dHL0cM8tEfE,43
15
+ jazari-0.1.2.dist-info/top_level.txt,sha256=NY_Ke0lfKudUMc5w4rycAxzHKSgTUkhYqcmdFMYrya4,7
16
+ jazari-0.1.2.dist-info/RECORD,,
@@ -1,10 +0,0 @@
1
- jazari/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- jazari/config_generator.py,sha256=BBg10poQLHIGy95frLdivNF4xhml6xeuWL7BWketm8g,1270
3
- jazari/main.py,sha256=Xb2MSMDyCWm1QQobPK28u-NKeKmWWwa_HzjFMuyXWRQ,12636
4
- jazari/slurm_generator.py,sha256=m5F76NqzMIwO4lL7MnyMWtuJTdj5Qv5qIN-P93CVKzM,1485
5
- jazari-0.1.0.dist-info/licenses/LICENSE,sha256=OvKjjZV2nvR_v2QrhWR4TMgvtDmZYfZLFSt3bk1q6QA,1068
6
- jazari-0.1.0.dist-info/METADATA,sha256=DXUGSaVwiaeCJhGUaZvT8FoDpyAL4LmAhM59Q1QQVqs,5826
7
- jazari-0.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
8
- jazari-0.1.0.dist-info/entry_points.txt,sha256=9vYDaHn1_jBw_cMNC9ZKl4vRk3n2aDQ5dHL0cM8tEfE,43
9
- jazari-0.1.0.dist-info/top_level.txt,sha256=NY_Ke0lfKudUMc5w4rycAxzHKSgTUkhYqcmdFMYrya4,7
10
- jazari-0.1.0.dist-info/RECORD,,
File without changes