nshtrainer 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,12 +0,0 @@
- from dataclasses import dataclass
- from pathlib import Path
-
-
- @dataclass(frozen=True)
- class SubmitOutput:
-     command_parts: list[str]
-     script_path: Path
-
-     @property
-     def command(self) -> str:
-         return " ".join(self.command_parts)
@@ -1,109 +0,0 @@
- from collections.abc import Iterable, Mapping, Sequence
- from pathlib import Path
-
-
- def _create_launcher_script_file(
-     script_path: Path,
-     original_command: str | Iterable[str],
-     environment: Mapping[str, str],
-     setup_commands: Sequence[str],
-     chmod: bool = True,
-     prepend_command_with_exec: bool = True,
-     # ^ If True, the original command will be prepended with 'exec' to replace the shell process
-     # with the command. This is useful for ensuring that the command is the only process in the
-     # process tree (e.g., for better signal handling).
-     command_prefix: str | None = None,
- ):
-     """
-     Creates a helper bash script for running the given function.
-
-     The core idea: The helper script is essentially one additional layer of indirection
-     that allows us to encapsulate the environment setup and the actual function call
-     in a single bash script (that does not require a properly set up Python environment).
-
-     In effect, this allows us to, for example:
-     - Easily run the function in the correct environment
-       (without having to deal with shell hooks)
-       using `conda run -n myenv bash /path/to/helper.sh`.
-     - Easily run the function in a Singularity container
-       using `singularity exec my_container.sif bash /path/to/helper.sh`.
-     """
-     with script_path.open("w") as f:
-         f.write("#!/bin/bash\n\n")
-         f.write("set -e\n\n")
-
-         if environment:
-             for key, value in environment.items():
-                 f.write(f"export {key}={value}\n")
-             f.write("\n")
-
-         if setup_commands:
-             for setup_command in setup_commands:
-                 f.write(f"{setup_command}\n")
-             f.write("\n")
-
-         if not isinstance(original_command, str):
-             original_command = " ".join(original_command)
-
-         if command_prefix:
-             original_command = f"{command_prefix} {original_command}"
-
-         if prepend_command_with_exec:
-             original_command = f"exec {original_command}"
-         f.write(f"{original_command}\n")
-
-     if chmod:
-         # Make the script executable
-         script_path.chmod(0o755)
-
-
- def write_helper_script(
-     base_dir: Path,
-     command: str | Iterable[str],
-     environment: Mapping[str, str],
-     setup_commands: Sequence[str],
-     chmod: bool = True,
-     prepend_command_with_exec: bool = True,
-     command_prefix: str | None = None,
-     file_name: str = "helper.sh",
- ):
-     """
-     Creates a helper bash script for running the given function.
-
-     The core idea: The helper script is essentially one additional layer of indirection
-     that allows us to encapsulate the environment setup and the actual function call
-     in a single bash script (that does not require a properly set up Python environment).
-
-     In effect, this allows us to, for example:
-     - Easily run the function in the correct environment
-       (without having to deal with shell hooks)
-       using `conda run -n myenv bash /path/to/helper.sh`.
-     - Easily run the function in a Singularity container
-       using `singularity exec my_container.sif bash /path/to/helper.sh`.
-     """
-
-     out_path = base_dir / file_name
-     _create_launcher_script_file(
-         out_path,
-         command,
-         environment,
-         setup_commands,
-         chmod,
-         prepend_command_with_exec,
-         command_prefix,
-     )
-     return out_path
-
-
- DEFAULT_TEMPLATE = "bash {script}"
-
-
- def helper_script_to_command(script: Path, template: str | None) -> str:
-     if not template:
-         template = DEFAULT_TEMPLATE
-
-     # Make sure the template has '{script}' in it
-     if "{script}" not in template:
-         raise ValueError(f"Template must contain '{{script}}'. Got: {template!r}")
-
-     return template.format(script=str(script.absolute()))
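
To make the indirection described in the docstrings above concrete, here is a usage sketch of the two removed public helpers; the base directory, command, environment, and setup commands are all hypothetical:

from pathlib import Path

helper = write_helper_script(
    Path("/tmp/job"),                         # hypothetical base_dir
    ["python", "train.py", "--lr", "1e-3"],   # hypothetical command
    {"OMP_NUM_THREADS": "4"},                 # environment
    ["module load cuda"],                     # hypothetical setup_commands
)
# helper now points to /tmp/job/helper.sh, which looks roughly like:
#   #!/bin/bash
#   set -e
#   export OMP_NUM_THREADS=4
#   module load cuda
#   exec python train.py --lr 1e-3

# Wrap the script in a launch template, e.g. to run it inside a container:
print(helper_script_to_command(helper, "singularity exec my_container.sif bash {script}"))
# -> singularity exec my_container.sif bash /tmp/job/helper.sh
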
@@ -1,467 +0,0 @@
- import copy
- import logging
- import os
- import signal
- from collections.abc import Callable, Mapping, Sequence
- from datetime import timedelta
- from pathlib import Path
- from typing import Any, Literal, cast
-
- from deepmerge import always_merger
- from typing_extensions import TypeAlias, TypedDict, TypeVarTuple, Unpack
-
- from ._output import SubmitOutput
- from ._script import helper_script_to_command, write_helper_script
-
- log = logging.getLogger(__name__)
-
-
- TArgs = TypeVarTuple("TArgs")
-
- _Path: TypeAlias = str | Path | os.PathLike
-
-
- class LSFJobKwargs(TypedDict, total=False):
-     name: str
-     """
-     The name of the job.
-
-     This corresponds to the "-J" option in bsub.
-     """
-
-     queue: str | Sequence[str]
-     """
-     The queue to submit the job to.
-
-     This corresponds to the "-q" option in bsub. If not specified, the default queue will be used.
-     """
-
-     output_file: _Path
-     """
-     The file to write the job output to.
-
-     This corresponds to the "-o" option in bsub. If not specified, the output will be written to the default output file.
-     """
-
-     error_file: _Path
-     """
-     The file to write the job errors to.
-
-     This corresponds to the "-e" option in bsub. If not specified, the errors will be written to the default error file.
-     """
-
-     walltime: timedelta
-     """
-     The maximum walltime for the job.
-
-     This corresponds to the "-W" option in bsub. The format is "HH:MM" or "MM". If not specified, the default walltime will be used.
-     """
-
-     memory_mb: int
-     """
-     The maximum memory for the job in MB.
-
-     This corresponds to the "-M" option in bsub. If not specified, the default memory limit will be used.
-     """
-
-     cpu_limit: int
-     """
-     The CPU time limit for the job in minutes.
-
-     This corresponds to the "-c" option in bsub. If not specified, the default CPU limit will be used.
-     """
-
-     rerunnable: bool
-     """
-     Whether the job should be rerunnable.
-
-     This corresponds to the "-r" option in bsub. If set to True, the job will be rerun if it fails due to a system failure.
-     """
-
-     dependency_conditions: Sequence[str]
-     """
-     The dependency conditions for the job.
-
-     This corresponds to the "-w" option in bsub. Each condition is a string that specifies the dependency condition.
-     Multiple conditions can be specified, and they will be combined using logical AND.
-     """
-
-     email: str
-     """
-     The email address to send notifications to.
-
-     This corresponds to the "-u" option in bsub. If specified, job notifications will be sent to this email address.
-     """
-
-     notify_begin: bool
-     """
-     Whether to send a notification when the job begins.
-
-     This corresponds to the "-B" option in bsub. If set to True, a notification will be sent when the job begins.
-     """
-
-     notify_end: bool
-     """
-     Whether to send a notification when the job ends.
-
-     This corresponds to the "-N" option in bsub. If set to True, a notification will be sent when the job ends.
-     """
-
-     setup_commands: Sequence[str]
-     """
-     The setup commands to run before the job.
-
-     These commands will be executed prior to everything else in the job script.
-     """
-
-     environment: Mapping[str, str]
-     """
-     The environment variables to set for the job.
-
-     These variables will be set prior to executing any commands in the job script.
-     """
-
-     project: str
-     """
-     The project to charge the job to.
-
-     This corresponds to the "-P" option in bsub. If specified, the job will be charged to this project.
-     """
-
-     nodes: int
-     """
-     The number of nodes to use for the job.
-
-     This corresponds to the "-nnodes" option in bsub. The default is 1 node.
-     """
-
-     rs_per_node: int
-     """
-     The number of resource sets per node. This is sent to the `jsrun` command.
-     """
-
-     cpus_per_rs: int | Literal["ALL_CPUS"]
-     """
-     The number of CPUs per resource set. This is sent to the `jsrun` command.
-     """
-
-     gpus_per_rs: int | Literal["ALL_GPUS"]
-     """
-     The number of GPUs per resource set. This is sent to the `jsrun` command.
-     """
-
-     tasks_per_rs: int
-     """
-     The number of tasks per resource set. This is sent to the `jsrun` command.
-     """
-
-     alloc_flags: str
-     """
-     The allocation flags for the job.
-
-     This corresponds to the "-alloc_flags" option in bsub. If specified, the job will be allocated using these flags.
-     """
-
-     command_prefix: str
-     """
-     A command to prefix the job command with.
-
-     This is used to add commands like `jsrun` to the job command.
-     """
-
-     command_template: str
-     """
-     The template for the command that executes the helper script.
-
-     Default: `"bash {script}"`, where `{script}` is replaced with the helper script path.
-     """
-
-     signal: signal.Signals
-     """
-     The signal to send to the job as the "warning action".
-
-     This corresponds to the "-wa" option in bsub.
-     """
-
-     signal_time: timedelta
-     """
-     The time (before the job ends) at which to send the signal.
-
-     This corresponds to the "-wt" option in bsub.
-     """
-
-     # Our own custom options
-     summit: bool
-     """
-     Whether the job is being submitted to Summit.
-
-     If set to True, the job will be submitted to Summit and the default Summit options will be used.
-     """
-
-
- DEFAULT_KWARGS: LSFJobKwargs = {
-     "name": "ll",
-     # "nodes": 1,
-     # "rs_per_node": 1,
-     # "walltime": timedelta(hours=2),
-     "summit": False,
-     "signal": signal.SIGURG,
-     "signal_time": timedelta(minutes=5),
- }
-
-
- def _update_kwargs_jsrun(kwargs: LSFJobKwargs, base_dir: Path) -> LSFJobKwargs:
-     kwargs = copy.deepcopy(kwargs)
-
-     # Update the command_prefix to add jsrun:
-     command_parts: list[str] = ["jsrun"]
-
-     # Add the worker logs
-     command_parts.extend(["--stdio_mode", "individual"])
-     command_parts.extend(
-         ["--stdio_stdout", str(base_dir / "logs" / "worker_out.%h.%j.%t.%p")]
-     )
-     command_parts.extend(
-         ["--stdio_stderr", str(base_dir / "logs" / "worker_err.%h.%j.%t.%p")]
-     )
-
-     if (rs_per_node := kwargs.get("rs_per_node")) is not None:
-         # Add the total number of resource sets requested across all nodes in the job
-         total_num_rs = rs_per_node * kwargs.get("nodes", 1)
-         command_parts.append(f"-n{total_num_rs}")
-
-         # Add the number of resource sets requested on each node
-         command_parts.append(f"-r{rs_per_node}")
-
-     # Add the number of CPUs per resource set
-     if (cpus_per_rs := kwargs.get("cpus_per_rs")) is not None:
-         command_parts.append(f"-c{cpus_per_rs}")
-
-     # Add the number of GPUs per resource set
-     if (gpus_per_rs := kwargs.get("gpus_per_rs")) is not None:
-         command_parts.append(f"-g{gpus_per_rs}")
-
-     # Add the number of tasks per resource set
-     if (tasks_per_rs := kwargs.get("tasks_per_rs")) is not None:
-         command_parts.append(f"-a{tasks_per_rs}")
-
-     # Add -brs. This binds the resource sets to the CPUs.
-     command_parts.append("-brs")
-
-     # If there is already a command prefix, combine them.
-     if (existing_command_prefix := kwargs.get("command_prefix")) is not None:
-         command_parts.extend(existing_command_prefix.split())
-
-     # Add the command prefix to the kwargs.
-     kwargs["command_prefix"] = " ".join(command_parts)
-
-     return kwargs
-
-
- SUMMIT_DEFAULTS: LSFJobKwargs = {
-     "environment": {"JSM_NAMESPACE_LOCAL_RANK": "0"},
-     "rs_per_node": 6,
-     "cpus_per_rs": 7,
-     "gpus_per_rs": 1,
-     "tasks_per_rs": 1,
- }
-
-
- def _append_job_index_to_path(path: Path) -> Path:
-     # If job array, append the job index to the output file.
-     # E.g., if `output_file` is "output_%J.out", we want "output_%J_%I.out"
-     stem = path.stem
-     suffix = path.suffix
-     new_stem = f"{stem}_%I"
-     new_path = path.with_name(new_stem + suffix)
-     return new_path
-
-
- def _write_batch_script_to_file(
-     path: Path,
-     kwargs: LSFJobKwargs,
-     command: str,
-     job_array_n_jobs: int | None = None,
- ):
-     logs_base = path.parent / "logs"
-     logs_base.mkdir(exist_ok=True)
-
-     # Default log paths; %J is the LSF job ID. For job arrays, the job index
-     # (%I) is appended below by _append_job_index_to_path.
-     if kwargs.get("output_file") is None:
-         kwargs["output_file"] = logs_base / "output_%J.out"
-
-     if kwargs.get("error_file") is None:
-         kwargs["error_file"] = logs_base / "error_%J.err"
-
-     with path.open("w") as f:
-         f.write("#!/bin/bash\n")
-
-         if (name := kwargs.get("name")) is not None:
-             if job_array_n_jobs is not None:
-                 name += "[1-" + str(job_array_n_jobs) + "]"
-             f.write(f"#BSUB -J {name}\n")
-
-         if (project := kwargs.get("project")) is not None:
-             f.write(f"#BSUB -P {project}\n")
-
-         if (walltime := kwargs.get("walltime")) is not None:
-             # Convert the walltime to the format expected by LSF:
-             # -W [hour:]minute[/host_name | /host_model]
-             # E.g., 72 hours -> 72:00
-             total_minutes = walltime.total_seconds() // 60
-             hours = int(total_minutes // 60)
-             minutes = int(total_minutes % 60)
-             walltime = f"{hours:02d}:{minutes:02d}"
-             f.write(f"#BSUB -W {walltime}\n")
-
-         if (nodes := kwargs.get("nodes")) is not None:
-             f.write(f"#BSUB -nnodes {nodes}\n")
-
-         if (output_file := kwargs.get("output_file")) is not None:
-             output_file = Path(output_file).absolute()
-             if job_array_n_jobs is not None:
-                 output_file = _append_job_index_to_path(output_file)
-             output_file = str(output_file)
-             f.write(f"#BSUB -o {output_file}\n")
-
-         if (error_file := kwargs.get("error_file")) is not None:
-             error_file = Path(error_file).absolute()
-             if job_array_n_jobs is not None:
-                 error_file = _append_job_index_to_path(error_file)
-             error_file = str(error_file)
-             f.write(f"#BSUB -e {error_file}\n")
-
-         if (queue := kwargs.get("queue")) is not None:
-             if not isinstance(queue, str) and isinstance(queue, Sequence):
-                 assert len(queue) == 1, "Only one queue can be specified"
-                 queue = queue[0]
-             f.write(f"#BSUB -q {queue}\n")
-
-         if (memory_mb := kwargs.get("memory_mb")) is not None:
-             f.write(f"#BSUB -M {memory_mb}\n")
-
-         if (cpu_limit := kwargs.get("cpu_limit")) is not None:
-             f.write(f"#BSUB -c {cpu_limit}\n")
-
-         if (rerunnable := kwargs.get("rerunnable")) is not None:
-             f.write(f"#BSUB -r {'y' if rerunnable else 'n'}\n")
-
-         for dependency_condition in kwargs.get("dependency_conditions", []):
-             f.write(f"#BSUB -w {dependency_condition}\n")
-
-         if (email := kwargs.get("email")) is not None:
-             f.write(f"#BSUB -u {email}\n")
-
-         if (notify_begin := kwargs.get("notify_begin")) is not None:
-             f.write(f"#BSUB -B {'y' if notify_begin else 'n'}\n")
-
-         if (notify_end := kwargs.get("notify_end")) is not None:
-             f.write(f"#BSUB -N {'y' if notify_end else 'n'}\n")
-
-         if (alloc_flags := kwargs.get("alloc_flags")) is not None:
-             f.write(f"#BSUB -alloc_flags {alloc_flags}\n")
-
-         if (signal := kwargs.get("signal")) is not None:
-             # Convert the signal.Signals enum to a string
-             signal = signal.name
-             # Remove the "SIG" prefix
-             signal = signal[len("SIG") :]
-             f.write(f"#BSUB -wa {signal}\n")
-
-         if (signal_time := kwargs.get("signal_time")) is not None:
-             # Convert the timedelta to "[hour:]minute"
-             total_seconds = signal_time.total_seconds()
-             hours = int(total_seconds // 3600)
-             minutes = int((total_seconds % 3600) // 60)
-
-             signal_time = str(minutes)
-             if hours > 0:
-                 signal_time = f"{hours}:{signal_time}"
-
-             f.write(f"#BSUB -wt {signal_time}\n")
-
-         f.write("\n")
-
-         if (command_prefix := kwargs.get("command_prefix")) is not None:
-             command = " ".join(
-                 x_stripped
-                 for x in (command_prefix, command)
-                 if (x_stripped := x.strip())
-             )
-         f.write(f"{command}\n")
-
-     return path
-
-
- def _update_kwargs(kwargs_in: LSFJobKwargs, base_dir: Path) -> LSFJobKwargs:
-     # Update the kwargs with the default values
-     global DEFAULT_KWARGS
-     kwargs = copy.deepcopy(DEFAULT_KWARGS)
-
-     # If the job is being submitted to Summit, update the kwargs with the Summit defaults
-     if kwargs_in.get("summit"):
-         global SUMMIT_DEFAULTS
-         kwargs = cast(LSFJobKwargs, always_merger.merge(kwargs, SUMMIT_DEFAULTS))
-
-     # Update the kwargs with the provided values
-     kwargs = cast(LSFJobKwargs, always_merger.merge(kwargs, kwargs_in))
-     del kwargs_in
-
-     kwargs = _update_kwargs_jsrun(kwargs, base_dir)
-     return kwargs
-
-
- def to_array_batch_script(
-     dest: Path,
-     callable: Callable[[Unpack[TArgs]], Any],
-     args_list: Sequence[tuple[Unpack[TArgs]]],
-     /,
-     job_index_variable: str = "LSB_JOBINDEX",
-     print_environment_info: bool = False,
-     python_command_prefix: str | None = None,
-     **kwargs: Unpack[LSFJobKwargs],
- ) -> SubmitOutput:
-     """
-     Create the batch script for the job array.
-     """
-
-     from ...picklerunner import serialize_many
-
-     kwargs = _update_kwargs(kwargs, dest)
-
-     # One job in the array per argument tuple
-     num_jobs = len(args_list)
-
-     destdir = dest / "fns"
-     destdir.mkdir(exist_ok=True)
-
-     additional_command_parts: list[str] = []
-
-     serialized_command = serialize_many(
-         destdir,
-         callable,
-         [(args, {}) for args in args_list],
-         start_idx=1,  # LSF job indices are 1-based
-         additional_command_parts=additional_command_parts,
-     )
-     helper_path = write_helper_script(
-         destdir,
-         serialized_command.to_bash_command(
-             job_index_variable, print_environment_info=print_environment_info
-         ),
-         kwargs.get("environment", {}),
-         kwargs.get("setup_commands", []),
-         command_prefix=python_command_prefix,
-     )
-     command = helper_script_to_command(helper_path, kwargs.get("command_template"))
-
-     script_path = _write_batch_script_to_file(
-         dest / "launch.sh",
-         kwargs,
-         command,
-         job_array_n_jobs=num_jobs,
-     )
-     script_path = script_path.resolve().absolute()
-     return SubmitOutput(
-         command_parts=["bsub", str(script_path)],
-         script_path=script_path,
-     )
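
The removed `to_array_batch_script` was the public entry point of this module: it serializes one invocation of `callable` per argument tuple (via the package's `picklerunner` module), wraps the invocations in the helper script from the file above, writes an LSF batch script with the `#BSUB` directives shown here, and returns a `SubmitOutput` whose command submits the array with `bsub`. A hedged usage sketch; the function, arguments, destination directory, and LSF project are all hypothetical, and actually running it requires the removed `picklerunner` module:

from datetime import timedelta
from pathlib import Path

def train(lr: float, seed: int) -> None:  # hypothetical user function
    ...

out = to_array_batch_script(
    Path("/tmp/sweep"),            # hypothetical dest; receives fns/ and launch.sh
    train,
    [(1e-3, 0), (1e-4, 1)],        # one array job per argument tuple
    project="ABC123",              # hypothetical LSF project
    walltime=timedelta(hours=2),
    summit=True,                   # pull in the Summit jsrun defaults
)
print(out.command)  # e.g. "bsub /tmp/sweep/launch.sh"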