nshtrainer 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nshtrainer/__init__.py +64 -0
- nshtrainer/_experimental/__init__.py +2 -0
- nshtrainer/_experimental/flops/__init__.py +48 -0
- nshtrainer/_experimental/flops/flop_counter.py +787 -0
- nshtrainer/_experimental/flops/module_tracker.py +140 -0
- nshtrainer/_snoop.py +216 -0
- nshtrainer/_submit/print_environment_info.py +31 -0
- nshtrainer/_submit/session/_output.py +12 -0
- nshtrainer/_submit/session/_script.py +109 -0
- nshtrainer/_submit/session/lsf.py +467 -0
- nshtrainer/_submit/session/slurm.py +573 -0
- nshtrainer/_submit/session/unified.py +350 -0
- nshtrainer/actsave/__init__.py +7 -0
- nshtrainer/actsave/_callback.py +75 -0
- nshtrainer/actsave/_loader.py +144 -0
- nshtrainer/actsave/_saver.py +337 -0
- nshtrainer/callbacks/__init__.py +35 -0
- nshtrainer/callbacks/_throughput_monitor_callback.py +549 -0
- nshtrainer/callbacks/base.py +113 -0
- nshtrainer/callbacks/early_stopping.py +112 -0
- nshtrainer/callbacks/ema.py +383 -0
- nshtrainer/callbacks/finite_checks.py +75 -0
- nshtrainer/callbacks/gradient_skipping.py +103 -0
- nshtrainer/callbacks/interval.py +322 -0
- nshtrainer/callbacks/latest_epoch_checkpoint.py +45 -0
- nshtrainer/callbacks/log_epoch.py +35 -0
- nshtrainer/callbacks/norm_logging.py +187 -0
- nshtrainer/callbacks/on_exception_checkpoint.py +44 -0
- nshtrainer/callbacks/print_table.py +90 -0
- nshtrainer/callbacks/throughput_monitor.py +56 -0
- nshtrainer/callbacks/timer.py +157 -0
- nshtrainer/callbacks/wandb_watch.py +103 -0
- nshtrainer/config.py +289 -0
- nshtrainer/data/__init__.py +4 -0
- nshtrainer/data/balanced_batch_sampler.py +132 -0
- nshtrainer/data/transform.py +67 -0
- nshtrainer/lr_scheduler/__init__.py +18 -0
- nshtrainer/lr_scheduler/_base.py +101 -0
- nshtrainer/lr_scheduler/linear_warmup_cosine.py +138 -0
- nshtrainer/lr_scheduler/reduce_lr_on_plateau.py +73 -0
- nshtrainer/model/__init__.py +44 -0
- nshtrainer/model/base.py +641 -0
- nshtrainer/model/config.py +2064 -0
- nshtrainer/model/modules/callback.py +157 -0
- nshtrainer/model/modules/debug.py +42 -0
- nshtrainer/model/modules/distributed.py +70 -0
- nshtrainer/model/modules/logger.py +170 -0
- nshtrainer/model/modules/profiler.py +24 -0
- nshtrainer/model/modules/rlp_sanity_checks.py +202 -0
- nshtrainer/model/modules/shared_parameters.py +72 -0
- nshtrainer/nn/__init__.py +19 -0
- nshtrainer/nn/mlp.py +106 -0
- nshtrainer/nn/module_dict.py +66 -0
- nshtrainer/nn/module_list.py +50 -0
- nshtrainer/nn/nonlinearity.py +157 -0
- nshtrainer/optimizer.py +62 -0
- nshtrainer/runner.py +21 -0
- nshtrainer/scripts/check_env.py +41 -0
- nshtrainer/scripts/find_packages.py +51 -0
- nshtrainer/trainer/__init__.py +1 -0
- nshtrainer/trainer/signal_connector.py +208 -0
- nshtrainer/trainer/trainer.py +340 -0
- nshtrainer/typecheck.py +144 -0
- nshtrainer/util/environment.py +119 -0
- nshtrainer/util/seed.py +11 -0
- nshtrainer/util/singleton.py +89 -0
- nshtrainer/util/slurm.py +49 -0
- nshtrainer/util/typed.py +2 -0
- nshtrainer/util/typing_utils.py +19 -0
- nshtrainer-0.1.0.dist-info/METADATA +18 -0
- nshtrainer-0.1.0.dist-info/RECORD +72 -0
- nshtrainer-0.1.0.dist-info/WHEEL +4 -0
nshtrainer/_submit/session/slurm.py

@@ -0,0 +1,573 @@

```python
import copy
import logging
import math
import os
import signal
from collections.abc import Callable, Mapping, Sequence
from datetime import timedelta
from pathlib import Path
from typing import Any, Literal, cast

from deepmerge import always_merger
from typing_extensions import TypeAlias, TypedDict, TypeVarTuple, Unpack

from ._output import SubmitOutput
from ._script import helper_script_to_command, write_helper_script

log = logging.getLogger(__name__)

TArgs = TypeVarTuple("TArgs")

_Path: TypeAlias = str | Path | os.PathLike
MailType: TypeAlias = Literal[
    "NONE",
    "BEGIN",
    "END",
    "FAIL",
    "REQUEUE",
    "ALL",
    "INVALID_DEPEND",
    "STAGE_OUT",
    "TIME_LIMIT",
    "TIME_LIMIT_90",
    "TIME_LIMIT_80",
    "TIME_LIMIT_50",
    "ARRAY_TASKS",
]


class SlurmJobKwargs(TypedDict, total=False):
    name: str
    """
    The name of the job.

    This corresponds to the "-J" option in sbatch.
    """

    account: str
    """
    The account to charge resources used by this job to.

    This corresponds to the "-A" option in sbatch.
    """

    partition: str | Sequence[str]
    """
    The partition to submit the job to.

    This corresponds to the "-p" option in sbatch. If not specified, the default partition will be used.
    Multiple partitions can be specified, and they will be combined using logical OR.
    """

    qos: str
    """
    The quality of service to submit the job to.

    This corresponds to the "--qos" option in sbatch.
    """

    output_file: _Path
    """
    The file to write the job output to.

    This corresponds to the "-o" option in sbatch. If not specified, the output will be written to the default output file.
    """

    error_file: _Path
    """
    The file to write the job errors to.

    This corresponds to the "-e" option in sbatch. If not specified, the errors will be written to the default error file.
    """

    time: timedelta | Literal[0]
    """
    The maximum time for the job.

    This corresponds to the "-t" option in sbatch. A value of 0 means no time limit.
    """

    memory_mb: int
    """
    The maximum memory for the job in MB.

    This corresponds to the "--mem" option in sbatch. If not specified, the default memory limit will be used.
    """

    memory_per_cpu_mb: int
    """
    The minimum memory required per usable allocated CPU.

    This corresponds to the "--mem-per-cpu" option in sbatch. If not specified, the default memory limit will be used.
    """

    memory_per_gpu_mb: int
    """
    The minimum memory required per allocated GPU.

    This corresponds to the "--mem-per-gpu" option in sbatch. If not specified, the default memory limit will be used.
    """

    cpus_per_task: int
    """
    Advise the Slurm controller that ensuing job steps will require _ncpus_ number of processors per task.

    This corresponds to the "-c" option in sbatch.
    """

    nodes: int
    """
    The number of nodes to use for the job.

    This corresponds to the "-N" option in sbatch. The default is 1 node.
    """

    ntasks: int
    """
    The number of tasks to use for the job.

    This corresponds to the "-n" option in sbatch. The default is 1 task.
    """

    ntasks_per_node: int
    """
    The number of tasks for each node.

    This corresponds to the "--ntasks-per-node" option in sbatch.
    """

    constraint: str | Sequence[str]
    """
    Nodes can have features assigned to them by the Slurm administrator. Users can specify which of these features are required by their job using the constraint option.

    This corresponds to the "-C" option in sbatch.
    """

    gres: str | Sequence[str]
    """
    Specifies a comma-delimited list of generic consumable resources.

    This corresponds to the "--gres" option in sbatch.
    """

    gpus: int | str
    """
    Specify the total number of GPUs required for the job. An optional GPU type specification can be supplied.

    This corresponds to the "-G" option in sbatch.
    """

    gpus_per_node: int | str
    """
    Specify the number of GPUs required for the job on each node included in the job's resource allocation. An optional GPU type specification can be supplied.

    This corresponds to the "--gpus-per-node" option in sbatch.
    """

    gpus_per_task: int
    """
    Specify the number of GPUs required for the job on each task to be spawned in the job's resource allocation. An optional GPU type specification can be supplied.

    This corresponds to the "--gpus-per-task" option in sbatch.
    """

    mail_user: str
    """
    User to receive email notification of state changes as defined by mail_type.

    This corresponds to the "--mail-user" option in sbatch.
    """

    mail_type: MailType | Sequence[MailType]
    """
    Notify user by email when certain event types occur.

    This corresponds to the "--mail-type" option in sbatch.
    """

    dependency: str
    """
    Defer the start of this job until the specified dependencies have been satisfied.

    This corresponds to the "-d" option in sbatch.
    """

    exclusive: bool
    """
    The job allocation can not share nodes with other running jobs.

    This corresponds to the "--exclusive" option in sbatch.
    """

    signal: signal.Signals
    """
    The signal to send to the job when the job is being terminated.

    This corresponds to the "--signal" option in sbatch.
    """

    signal_delay: timedelta
    """
    The delay before sending the signal to the job.

    This corresponds to the "--signal ...@[delay]" option in sbatch.
    """

    open_mode: str
    """
    The open mode for the output and error files.

    This corresponds to the "--open-mode" option in sbatch.

    Valid values are "append" and "truncate".
    """

    requeue: bool
    """
    Requeues the job if it's pre-empted.

    This corresponds to the "--requeue" option in sbatch.
    """

    setup_commands: Sequence[str]
    """
    The setup commands to run before the job.

    These commands will be executed prior to everything else in the job script.
    """

    environment: Mapping[str, str]
    """
    The environment variables to set for the job.

    These variables will be set prior to executing any commands in the job script.
    """

    command_prefix: str
    """
    A command to prefix the job command with.

    This is used to add commands like `srun` to the job command.
    """

    command_template: str
    """
    The template for the command to execute the helper script.

    Default: `bash {/path/to/helper.sh}`.
    """

    srun_flags: str | Sequence[str]
    """
    The flags to pass to the `srun` command.
    """
```
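Since `SlurmJobKwargs` is declared with `total=False`, every field is optional and callers only specify what they need. A minimal sketch of a typical request (the account and partition names are placeholders, not values from this package):

```python
from datetime import timedelta

from nshtrainer._submit.session.slurm import SlurmJobKwargs

# Hypothetical resource request; unspecified options fall back to the
# module's DEFAULT_KWARGS and then to the cluster's own sbatch defaults.
job_kwargs: SlurmJobKwargs = {
    "name": "train-sweep",
    "account": "my-account",           # placeholder account
    "partition": ["gpu", "gpu-long"],  # rendered as --partition=gpu,gpu-long
    "time": timedelta(hours=12),
    "nodes": 1,
    "ntasks_per_node": 4,
    "gpus_per_task": 1,
    "mail_type": ["END", "FAIL"],
}
```

The module's defaults and its GPU-resource normalization helpers follow.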
```python
DEFAULT_KWARGS: SlurmJobKwargs = {
    "name": "ll",
    # "nodes": 1,
    # "time": timedelta(hours=2),
    "signal": signal.SIGURG,
    "signal_delay": timedelta(seconds=90),
    "open_mode": "append",
    # "requeue": True,
}


def _determine_gres(kwargs: SlurmJobKwargs) -> Sequence[str] | None:
    """
    There are many different ways to specify GPU resources, but some are buggy.

    This function normalizes all other ways to specify GPU resources to the `gres` option.
    """

    # If `--gres` is set, just return it
    if (gres := kwargs.get("gres")) is not None:
        if isinstance(gres, str):
            gres = [gres]
        return gres

    # We only support `--gpus` if `--nodes` is set to 1
    if (gpus := kwargs.get("gpus")) is not None:
        if kwargs.get("nodes") != 1:
            raise ValueError("Cannot specify `gpus` unless `nodes` is set to 1.")
        # Both the count-only form (e.g., 4) and the typed form (e.g., "a100:4")
        # normalize to a single "gpu:..." gres entry.
        return [f"gpu:{gpus}"]

    # `--gpus-per-task` is only supported if `--ntasks-per-node` is set (or can be inferred).
    if (gpus_per_task := kwargs.get("gpus_per_task")) is not None:
        if (ntasks_per_node := _determine_ntasks_per_node(kwargs)) is None:
            raise ValueError(
                "Cannot specify `gpus_per_task` without `ntasks_per_node`."
            )

        gpus_per_node = ntasks_per_node * gpus_per_task
        return [f"gpu:{gpus_per_node}"]

    # `--gpus-per-node` has no restrictions
    if (gpus_per_node := kwargs.get("gpus_per_node")) is not None:
        return [f"gpu:{gpus_per_node}"]

    return None


def _determine_ntasks_per_node(kwargs: SlurmJobKwargs) -> int | None:
    # If `--ntasks-per-node` is set, just return it
    if (ntasks_per_node := kwargs.get("ntasks_per_node")) is not None:
        return ntasks_per_node

    # If `--ntasks` is set, we can infer `--ntasks-per-node`
    if (ntasks := kwargs.get("ntasks")) is not None:
        if (nodes := kwargs.get("nodes")) is None:
            raise ValueError("Cannot infer `ntasks_per_node` without `nodes`.")

        # The tasks must divide evenly across the nodes
        if ntasks % nodes != 0:
            raise ValueError(
                "The number of tasks must be evenly divisible by the number of nodes."
            )

        return ntasks // nodes

    return None
```
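A quick sketch of how the two helpers above compose, assuming the module is imported from the installed wheel (both functions are private, so this is illustration rather than public API):

```python
from nshtrainer._submit.session.slurm import _determine_gres

# An explicit gres specification is passed through as-is.
assert _determine_gres({"gres": "gpu:a100:8"}) == ["gpu:a100:8"]

# `gpus_per_task` is folded into a per-node count:
# ntasks_per_node (4) * gpus_per_task (1) -> "gpu:4".
assert _determine_gres(
    {"nodes": 2, "ntasks_per_node": 4, "gpus_per_task": 1}
) == ["gpu:4"]

# `ntasks_per_node` may itself be inferred: 8 tasks over 2 nodes is 4 per node.
assert _determine_gres(
    {"nodes": 2, "ntasks": 8, "gpus_per_task": 1}
) == ["gpu:4"]
```

Next, the batch-script writer renders these options into `#SBATCH` directives.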
```python
def _write_batch_script_to_file(
    path: Path,
    kwargs: SlurmJobKwargs,
    command: str,
    job_array_n_jobs: int | None = None,
):
    with path.open("w") as f:
        f.write("#!/bin/bash\n")

        if kwargs.get("requeue"):
            f.write("#SBATCH --requeue\n")

        if job_array_n_jobs is not None:
            f.write(f"#SBATCH --array=1-{job_array_n_jobs}\n")

        if (name := kwargs.get("name")) is not None:
            f.write(f"#SBATCH -J {name}\n")

        if (account := kwargs.get("account")) is not None:
            f.write(f"#SBATCH --account={account}\n")

        if (time := kwargs.get("time")) is not None:
            # A time limit of zero requests that no time limit be imposed.
            # Acceptable time formats include "minutes", "minutes:seconds",
            # "hours:minutes:seconds", "days-hours", "days-hours:minutes"
            # and "days-hours:minutes:seconds".
            if time == 0:
                time_str = "0"
            else:
                total_seconds = time.total_seconds()
                hours, remainder = divmod(total_seconds, 3600)
                minutes, seconds = divmod(remainder, 60)
                if hours > 24:
                    days, hours = divmod(hours, 24)
                    time_str = f"{int(days)}-{int(hours):02d}:{int(minutes):02d}"
                else:
                    time_str = f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}"
            f.write(f"#SBATCH --time={time_str}\n")

        if (nodes := kwargs.get("nodes")) is not None:
            f.write(f"#SBATCH --nodes={nodes}\n")

        if (ntasks := kwargs.get("ntasks")) is not None:
            f.write(f"#SBATCH --ntasks={ntasks}\n")

        if (ntasks_per_node := kwargs.get("ntasks_per_node")) is not None:
            f.write(f"#SBATCH --ntasks-per-node={ntasks_per_node}\n")

        if (output_file := kwargs.get("output_file")) is not None:
            output_file = str(Path(output_file).absolute())
            f.write(f"#SBATCH --output={output_file}\n")

        if (error_file := kwargs.get("error_file")) is not None:
            error_file = str(Path(error_file).absolute())
            f.write(f"#SBATCH --error={error_file}\n")

        if (partition := kwargs.get("partition")) is not None:
            if isinstance(partition, str):
                partition = [partition]
            f.write(f"#SBATCH --partition={','.join(partition)}\n")

        if (qos := kwargs.get("qos")) is not None:
            f.write(f"#SBATCH --qos={qos}\n")

        if (memory_mb := kwargs.get("memory_mb")) is not None:
            f.write(f"#SBATCH --mem={memory_mb}\n")

        if (memory_per_cpu_mb := kwargs.get("memory_per_cpu_mb")) is not None:
            f.write(f"#SBATCH --mem-per-cpu={memory_per_cpu_mb}\n")

        if (memory_per_gpu_mb := kwargs.get("memory_per_gpu_mb")) is not None:
            f.write(f"#SBATCH --mem-per-gpu={memory_per_gpu_mb}\n")

        if (cpus_per_task := kwargs.get("cpus_per_task")) is not None:
            f.write(f"#SBATCH --cpus-per-task={cpus_per_task}\n")

        if gres := _determine_gres(kwargs):
            f.write(f"#SBATCH --gres={','.join(gres)}\n")

        if (mail_user := kwargs.get("mail_user")) is not None:
            f.write(f"#SBATCH --mail-user={mail_user}\n")

        if (mail_type := kwargs.get("mail_type")) is not None:
            if isinstance(mail_type, str):
                mail_type = [mail_type]
            f.write(f"#SBATCH --mail-type={','.join(mail_type)}\n")

        if (dependency := kwargs.get("dependency")) is not None:
            f.write(f"#SBATCH --dependency={dependency}\n")

        if kwargs.get("exclusive"):
            f.write("#SBATCH --exclusive\n")

        if (open_mode := kwargs.get("open_mode")) is not None:
            f.write(f"#SBATCH --open-mode={open_mode}\n")

        if (constraint := kwargs.get("constraint")) is not None:
            if isinstance(constraint, str):
                constraint = [constraint]
            f.write(f"#SBATCH --constraint={','.join(constraint)}\n")

        # Named `job_signal` to avoid shadowing the `signal` module.
        if (job_signal := kwargs.get("signal")) is not None:
            signal_str = job_signal.name
            if (signal_delay := kwargs.get("signal_delay")) is not None:
                signal_str += f"@{math.ceil(signal_delay.total_seconds())}"
            f.write(f"#SBATCH --signal={signal_str}\n")

        f.write("\n")

        if (command_prefix := kwargs.get("command_prefix")) is not None:
            command = " ".join(
                x_stripped
                for x in (command_prefix, command)
                if (x_stripped := x.strip())
            )
        f.write(f"{command}\n")

    return path
```
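As a sketch of the output, a small invocation like the following (the file name and the job command are hypothetical) produces a header along these lines:

```python
from datetime import timedelta
from pathlib import Path

from nshtrainer._submit.session.slurm import _write_batch_script_to_file

_write_batch_script_to_file(
    Path("launch.sh"),
    {"name": "demo", "time": timedelta(hours=2, minutes=30), "nodes": 1},
    "bash helper.sh",  # placeholder job command
    job_array_n_jobs=4,
)
# launch.sh now begins:
#   #!/bin/bash
#   #SBATCH --array=1-4
#   #SBATCH -J demo
#   #SBATCH --time=02:30:00
#   #SBATCH --nodes=1
```

The merging step below combines caller kwargs with the defaults and builds the `srun` prefix that wraps the job command.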
```python
def _update_kwargs(kwargs_in: SlurmJobKwargs, base_path: Path):
    # Update the kwargs with the default values
    kwargs = copy.deepcopy(DEFAULT_KWARGS)

    # Merge the kwargs
    kwargs = cast(SlurmJobKwargs, always_merger.merge(kwargs, kwargs_in))
    del kwargs_in

    # If out/err files are not specified, set them
    logs_base = base_path.parent / "logs"
    logs_base.mkdir(exist_ok=True)

    if kwargs.get("output_file") is None:
        kwargs["output_file"] = logs_base / "output_%j_%a.out"

    if kwargs.get("error_file") is None:
        kwargs["error_file"] = logs_base / "error_%j_%a.err"

    # Update the command_prefix to add srun:
    command_parts: list[str] = ["srun"]
    if (srun_flags := kwargs.get("srun_flags")) is not None:
        if isinstance(srun_flags, str):
            srun_flags = [srun_flags]
        command_parts.extend(srun_flags)

    # Add ntasks/cpus/gpus
    if (ntasks := kwargs.get("ntasks")) is not None:
        command_parts.append(f"--ntasks={ntasks}")

    if (ntasks_per_node := kwargs.get("ntasks_per_node")) is not None:
        command_parts.append(f"--ntasks-per-node={ntasks_per_node}")

    if (cpus_per_task := kwargs.get("cpus_per_task")) is not None:
        command_parts.append(f"--cpus-per-task={cpus_per_task}")

    if gres := _determine_gres(kwargs):
        command_parts.append(f"--gres={','.join(gres)}")

    command_parts.append("--unbuffered")

    # Add the task id to the output filenames
    if (f := kwargs.get("output_file")) is not None:
        f = Path(f).absolute()
        command_parts.extend(
            ["--output", str(f.with_name(f"{f.stem}-%t{f.suffix}"))]
        )
    if (f := kwargs.get("error_file")) is not None:
        f = Path(f).absolute()
        command_parts.extend(
            ["--error", str(f.with_name(f"{f.stem}-%t{f.suffix}"))]
        )

    # If there is already a command prefix, combine them.
    if (existing_command_prefix := kwargs.get("command_prefix")) is not None:
        command_parts.extend(existing_command_prefix.split())
    # Add the command prefix to the kwargs.
    kwargs["command_prefix"] = " ".join(command_parts)

    return kwargs
```
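A hedged sketch of the merge result (the `run/` directory is hypothetical and must already exist; the function itself only creates the `logs/` subdirectory):

```python
from pathlib import Path

from nshtrainer._submit.session.slurm import _update_kwargs

Path("run").mkdir(exist_ok=True)  # parent of the hypothetical submit path
kwargs = _update_kwargs(
    {"ntasks_per_node": 2, "cpus_per_task": 8}, Path("run/launch.sh")
)
print(kwargs["command_prefix"])
# Roughly: srun --ntasks-per-node=2 --cpus-per-task=8 --unbuffered
#   --output .../run/logs/output_%j_%a-%t.out --error .../run/logs/error_%j_%a-%t.err
```

Finally, the public entry point ties the pickled callables, the helper script, and the batch script together.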
```python
def to_array_batch_script(
    dest: Path,
    callable: Callable[[Unpack[TArgs]], Any],
    args_list: Sequence[tuple[Unpack[TArgs]]],
    /,
    job_index_variable: str = "SLURM_ARRAY_TASK_ID",
    print_environment_info: bool = False,
    python_command_prefix: str | None = None,
    **kwargs: Unpack[SlurmJobKwargs],
) -> SubmitOutput:
    """
    Create the batch script for the job.
    """

    from ...picklerunner import serialize_many

    kwargs = _update_kwargs(kwargs, dest)

    # Convert the command/callable to a string for the command
    num_jobs = len(args_list)

    destdir = dest / "fns"
    destdir.mkdir(exist_ok=True)

    serialized_command = serialize_many(
        destdir,
        callable,
        [(args, {}) for args in args_list],
        start_idx=1,  # Slurm job array indices are 1-based
    )
    helper_path = write_helper_script(
        destdir,
        serialized_command.to_bash_command(
            job_index_variable, print_environment_info=print_environment_info
        ),
        kwargs.get("environment", {}),
        kwargs.get("setup_commands", []),
        command_prefix=python_command_prefix,
    )
    command = helper_script_to_command(helper_path, kwargs.get("command_template"))

    script_path = _write_batch_script_to_file(
        dest / "launch.sh",
        kwargs,
        command,
        job_array_n_jobs=num_jobs,
    )
    script_path = script_path.resolve().absolute()
    return SubmitOutput(
        command_parts=["sbatch", f"{script_path}"],
        script_path=script_path,
    )
```
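Putting it together, an end-to-end usage sketch (it relies on `nshtrainer.picklerunner.serialize_many` at call time via the lazy import above; the sweep function and argument tuples are illustrative):

```python
from pathlib import Path

from nshtrainer._submit.session.slurm import to_array_batch_script

def run_trial(seed: int, lr: float) -> None:
    print(f"running seed={seed} lr={lr}")

dest = Path("submit")
dest.mkdir(exist_ok=True)

out = to_array_batch_script(
    dest,
    run_trial,
    [(0, 1e-3), (1, 1e-3), (0, 3e-4), (1, 3e-4)],  # one array task per tuple
    name="sweep",
    nodes=1,
    gpus=1,
)
# out.command_parts == ["sbatch", "<abs>/submit/launch.sh"]; on a cluster:
#   subprocess.run(out.command_parts, check=True)
```

The returned `SubmitOutput` carries the `sbatch` invocation rather than running it, so submission stays a separate, explicit step.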