PyPI - nntool - Versions diffs - 2.0.0rc0__py3-none-any.whl - Mend

nntool 2.0.0rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

nntool/__init__.py +7 -0
nntool/build_backend.py +24 -0
nntool/experiment/__init__.py +13 -0
nntool/experiment/config.py +108 -0
nntool/experiment/utils.py +63 -0
nntool/slurm/__init__.py +21 -0
nntool/slurm/accelerator/__init__.py +0 -0
nntool/slurm/accelerator/utils.py +37 -0
nntool/slurm/config.py +208 -0
nntool/slurm/core/__init__.py +4 -0
nntool/slurm/core/_slurm.py +546 -0
nntool/slurm/core/_slurm_context.py +47 -0
nntool/slurm/function.py +209 -0
nntool/slurm/parser/__init__.py +6 -0
nntool/slurm/parser/parse.py +22 -0
nntool/slurm/task.py +300 -0
nntool/slurm/wrap.py +148 -0
nntool/utils/__init__.py +12 -0
nntool/version.py +11 -0
nntool/wandb/__init__.py +7 -0
nntool/wandb/config.py +116 -0
nntool-2.0.0rc0.dist-info/METADATA +12 -0
nntool-2.0.0rc0.dist-info/RECORD +25 -0
nntool-2.0.0rc0.dist-info/WHEEL +5 -0
nntool-2.0.0rc0.dist-info/top_level.txt +1 -0

nntool/__init__.py ADDED Viewed

@@ -0,0 +1,7 @@
+from .version import VERSION, VERSION_SHORT
+# Names re-exported as the public API of the top-level package.
+__all__ = [
+    "VERSION",
+    "VERSION_SHORT",
+]
+# Conventional package version attribute, aliased to VERSION from .version.
+__version__ = VERSION

nntool/build_backend.py ADDED Viewed

@@ -0,0 +1,24 @@
+import os
+from setuptools import build_meta as _orig
+from setuptools.build_meta import *
+def add_cythonpackage(requires: list):
+    new_list = list(requires)
+    new_list.append("buildkit/cythonpackage")
+    return new_list
+def get_requires_for_build_wheel(config_settings=None):
+    if not os.getenv("NNTOOL_PYTHON_BUILD"):
+        return add_cythonpackage(_orig.get_requires_for_build_wheel(config_settings))
+    else:
+        return _orig.get_requires_for_build_wheel(config_settings)
+def get_requires_for_build_sdist(config_settings=None):
+    if not os.getenv("NNTOOL_PYTHON_BUILD"):
+        return add_cythonpackage(_orig.get_requires_for_build_sdist(config_settings))
+    else:
+        return _orig.get_requires_for_build_sdist(config_settings)

nntool/experiment/__init__.py ADDED Viewed

@@ -0,0 +1,13 @@
+from .config import BaseExperimentConfig
+from .utils import (
+    get_current_time,
+    get_output_path,
+    read_toml_file,
+)
+# Public API of the experiment subpackage: the base config class from .config
+# and the helper functions from .utils.
+__all__ = [
+    "BaseExperimentConfig",
+    "get_current_time",
+    "get_output_path",
+    "read_toml_file",
+]

nntool/experiment/config.py ADDED Viewed

@@ -0,0 +1,108 @@
+import os
+from typing import Any, Dict
+from pathlib import Path
+from dataclasses import dataclass
+from .utils import get_output_path, read_toml_file
+@dataclass
+class BaseExperimentConfig:
+    """
+    Configuration class for setting up an experiment.
+    :param config_name: The name of the configuration.
+    :param output_folder: The folder path where the outputs will be saved.
+    :param experiment_name_key: Key for experiment name in the environment variable, default is 'EXP_NAME'.
+    :param env_toml_path: Path to the `env.toml` file, default is 'env.toml'.
+    :param append_date_to_path: If True, the current date and time will be appended to the output path, default is True.
+    :param existing_output_path_ok: If True, the existing output path is ok to be reused, default is False.
+    """
+    # config name
+    config_name: str
+    # the output folder for the outputs
+    output_folder: str
+    # key for experiment name in the environment variable
+    experiment_name_key: str = "EXP_NAME"
+    # the path to the env.toml file
+    env_toml_path: str = "env.toml"
+    # append date time to the output path
+    append_date_to_path: bool = True
+    # exisiting output path is ok
+    existing_output_path_ok: bool = False
+    def __post_init__(self):
+        # annotations
+        self.experiment_name: str
+        self.project_path: str
+        self.output_path: str
+        self.current_time: str
+        self.env_toml: Dict[str, Any] = self.__prepare_env_toml_dict()
+        self.experiment_name = self.__prepare_experiment_name()
+        self.project_path, self.output_path, self.current_time = self.__prepare_experiment_paths()
+        # custom post update for the derived class
+        self.set_up_stateful_fields()
+    def __prepare_env_toml_dict(self):
+        env_toml_path = Path(self.env_toml_path)
+        if not env_toml_path.exists():
+            raise FileNotFoundError(f"{env_toml_path} does not exist")
+        config = read_toml_file(env_toml_path)
+        return config
+    def __prepare_experiment_name(self):
+        return os.environ.get(self.experiment_name_key, "default")
+    def __prepare_experiment_paths(self):
+        project_path = self.env_toml["project"]["path"]
+        output_path, current_time = get_output_path(
+            output_path=os.path.join(self.output_folder, self.config_name, self.experiment_name),
+            append_date=self.append_date_to_path,
+            cache_into_env=False,
+        )
+        output_path = f"{project_path}/{output_path}"
+        return project_path, output_path, current_time
+    def get_output_path(self) -> str:
+        """Return the output path prepared for the experiment.
+        :return: output path for the experiment
+        """
+        return self.output_path
+    def get_current_time(self) -> str:
+        """Return the current time for the experiment.
+        :return: current time for the experiment
+        """
+        return self.current_time
+    def set_up_stateful_fields(self):
+        """
+        Post configuration steps for stateful fields such as `output_path` in the derived class.
+        This method should be overridden in the derived class.
+        """
+        pass
+    def start(self):
+        """
+        Start the experimen. This will
+        - cache `NNTOOL_OUTPUT_PATH` and `NNTOOL_OUTPUT_PATH_DATE` into environment variables, which means the later launched processes would inherit these variables.
+        - create the output path if it does not exist.
+        """
+        os.environ["NNTOOL_OUTPUT_PATH"] = self.get_output_path()
+        os.environ["NNTOOL_OUTPUT_PATH_DATE"] = self.get_current_time()
+        # create the output path
+        output_path = Path(self.get_output_path())
+        output_path.mkdir(parents=True, exist_ok=self.existing_output_path_ok)

nntool/experiment/utils.py ADDED Viewed

@@ -0,0 +1,63 @@
+import os
+import datetime
+import tomli
+def get_current_time() -> str:
+    """get current time in this format: MMDDYYYY/HHMMSS
+    :return: time in the format MMDDYYYY/HHMMSS
+    """
+    # Get the current time
+    current_time = datetime.datetime.now()
+    # Format the time (MDY/HMS)
+    formatted_time = current_time.strftime("%m%d%Y/%H%M%S")
+    return formatted_time
+def read_toml_file(file_path: str) -> dict:
+    """Read a toml file and return the content as a dictionary
+    :param file_path: path to the toml file
+    :return: content of the toml file as a dictionary
+    """
+    with open(file_path, "rb") as f:
+        content = tomli.load(f)
+    return content
+def get_output_path(
+    output_path: str = "./",
+    append_date: bool = True,
+    cache_into_env: bool = True,
+) -> tuple[str, str]:
+    """Get output path based on environment variable OUTPUT_PATH and NNTOOL_OUTPUT_PATH.
+    The output path is appended with the current time if append_date is True (e.g. /OUTPUT_PATH/xxx/MMDDYYYY/HHMMSS).
+    :param append_date: append a children folder with the date time, defaults to True
+    :param cache_into_env: whether cache the newly created path into env, defaults to True
+    :return: (output path, current time)
+    """
+    if "OUTPUT_PATH" in os.environ:
+        output_path = os.environ["OUTPUT_PATH"]
+        current_time = "" if not append_date else get_current_time()
+    elif "NNTOOL_OUTPUT_PATH" in os.environ:
+        # reuse the NNTOOL_OUTPUT_PATH if it is set
+        output_path = os.environ["NNTOOL_OUTPUT_PATH"]
+        current_time = "" if not append_date else os.environ["NNTOOL_OUTPUT_PATH_DATE"]
+    else:
+        current_time = get_current_time()
+        if append_date:
+            output_path = os.path.join(output_path, current_time)
+        print(
+            f"OUTPUT_PATH is not found in environment variables. NNTOOL_OUTPUT_PATH is set using path: {output_path}"
+        )
+        if cache_into_env:
+            os.environ["NNTOOL_OUTPUT_PATH"] = output_path
+            os.environ["NNTOOL_OUTPUT_PATH_DATE"] = current_time
+    return output_path, current_time

nntool/slurm/__init__.py ADDED Viewed

@@ -0,0 +1,21 @@
+from .config import SlurmConfig, SlurmArgs
+from .wrap import (
+    slurm_function,
+    slurm_fn,
+    slurm_launcher,
+)
+from .function import SlurmFunction
+from .task import Task, DistributedTaskConfig, PyTorchDistributedTask
+# Public API of the slurm subpackage: the config classes, the wrapper
+# functions from .wrap, SlurmFunction and the task classes.
+__all__ = [
+    "SlurmConfig",
+    "SlurmArgs",
+    "SlurmFunction",
+    "slurm_fn",
+    "slurm_function",
+    "slurm_launcher",
+    "Task",
+    "DistributedTaskConfig",
+    "PyTorchDistributedTask",
+]

nntool/slurm/accelerator/__init__.py ADDED Viewed

File without changes

nntool/slurm/accelerator/utils.py ADDED Viewed

@@ -0,0 +1,37 @@
+import subprocess
+def nvidia_smi_gpu_memory_stats() -> dict:
+    """
+    Parse the nvidia-smi output and extract the memory used stats.
+    """
+    out_dict = {}
+    try:
+        sp = subprocess.Popen(
+            ["nvidia-smi", "--query-gpu=index,memory.used", "--format=csv,noheader"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            close_fds=True,
+        )
+        out_str = sp.communicate()
+        out_list = out_str[0].decode("utf-8").split("\n")
+        out_dict = {}
+        for item in out_list:
+            if " MiB" in item:
+                gpu_idx, mem_used = item.split(",")
+                gpu_key = f"gpu_{gpu_idx}_mem_used_gb"
+                out_dict[gpu_key] = int(mem_used.strip().split(" ")[0]) / 1024
+    except FileNotFoundError:
+        raise Exception("Failed to find the 'nvidia-smi' executable for printing GPU stats")
+    except subprocess.CalledProcessError as e:
+        raise Exception(f"nvidia-smi returned non zero error code: {e.returncode}")
+    return out_dict
+def nvidia_smi_gpu_memory_stats_str() -> str:
+    """
+    Parse the nvidia-smi output and extract the memory used stats.
+    """
+    stats = nvidia_smi_gpu_memory_stats()
+    return ", ".join([f"{k}: {v:.4f}" for k, v in stats.items()])

nntool/slurm/config.py ADDED Viewed

@@ -0,0 +1,208 @@
+import os
+import sys
+from dataclasses import dataclass, field, replace
+from typing import List, Literal, Dict, Optional
+@dataclass
+class SlurmConfig:
+    """
+    Configuration class for SLURM job submission and execution.
+    Args:
+        mode (Literal["run", "debug", "local", "slurm"]): Running mode for the job. Options include:
+            "run" (default, directly run the function), "debug" (run debugging which will involve pdb if it reachs a breakpoint), "local" (run the job locally by subprocess, without gpu allocations and CUDA_VISIBLE_DEVICES cannot be set), or "slurm" (run the job on a SLURM cluster).
+        job_name (str): The name of the SLURM job. Default is 'Job'.
+        partition (str): The name of the SLURM partition to use. Default is ''.
+        output_parent_path (str): The parent directory path for saving the slurm folder. Default is './'.
+        output_folder (str): The folder name where SLURM output files will be stored. Default is 'slurm'.
+        node_list (str): A string specifying the nodes to use. Leave blank to use all available nodes. Default is an empty string.
+        node_list_exclude (str): A string specifying the nodes to exclude. Leave blank to use all nodes in the node list. Default is an empty string.
+        num_of_node (int): The number of nodes to request. Default is 1.
+        tasks_per_node (int): The number of tasks to run per node. Default is 1.
+        gpus_per_task (int): The number of GPUs to request per task. Default is 0.
+        cpus_per_task (int): The number of CPUs to request per task. Default is 1.
+        gpus_per_node (int): The number of GPUs to request per node. If this is set, `gpus_per_task` will be ignored. Default is None.
+        mem (str): The amount of memory (GB) to request. Leave blank to use the default memory configuration of the node. Default is an empty string.
+        timeout_min (int): The time limit for the job in minutes. Default is `sys.maxsize` for effectively no limit.
+        stderr_to_stdout (bool): Whether to redirect stderr to stdout. Default is False.
+        setup (List[str]): A list of environment variable setup commands. Default is an empty list.
+        pack_code (bool): Whether to pack the codebase before submission. Default is False.
+        use_packed_code (bool): Whether to use the packed code for execution. Default is False.
+        code_root (str): The root directory of the codebase, which will be used by the code packing. Default is the current directory (``.``).
+        code_file_suffixes (List[str]): A list of file extensions for code files to be included when packing. Default includes ``.py``, ``.sh``, ``.yaml``, and ``.toml``.
+        exclude_code_folders (List[str]): A list of folder names relative to `code_root` that will be excluded from packing. Default excludes 'wandb', 'outputs', and 'datasets'.
+        use_distributed_env (bool): Whether to use a distributed environment for the job. Default is False.
+        distributed_env_task (Literal["torch"]): The type of distributed environment task to use. Currently, only "torch" is supported. Default is "torch".
+        processes_per_task (int): The number of processes to run per task. This value is not used by SLURM but is relevant for correctly set up distributed environments. Default is 1.
+        distributed_launch_command (str): The command to launch distributed environment setup, using environment variables like ``{num_processes}``, ``{num_machines}``, ``{machine_rank}``, ``{main_process_ip}``, ``{main_process_port}``. Default is an empty string.
+        extra_params_kwargs (Dict[str, str]): Additional parameters for the SLURM job as a dictionary of key-value pairs. Default is an empty dictionary.
+        extra_submit_kwargs (Dict[str, str]): Additional submit parameters for the SLURM job as a dictionary of key-value pairs. Default is an empty dictionary.
+        extra_task_kwargs (Dict[str, str]): Additional task parameters for the SLURM job as a dictionary of key-value pairs. Default is an empty dictionary.
+    """
+    # running mode
+    mode: Literal["run", "debug", "local", "slurm"] = "run"
+    # slurm job name
+    job_name: str = "Job"
+    # slurm partition name
+    partition: str = ""
+    # slurm output parent path
+    output_parent_path: str = "./"
+    # slurm output folder name
+    output_folder: str = "slurm"
+    # node list string (leave blank to use all nodes)
+    node_list: str = ""
+    # node list string to be excluded (leave blank to use all nodes in the node list)
+    node_list_exclude: str = ""
+    # number of nodes to request
+    num_of_node: int = 1
+    # tasks per node
+    tasks_per_node: int = 1
+    # number of gpus per task to request
+    gpus_per_task: int = 0
+    # number of cpus per task to request
+    cpus_per_task: int = 1
+    # number of gpus per node to request (if this is set, gpus_per_task will be ignored)
+    gpus_per_node: Optional[int] = None
+    # memory (GB) to request (leave black to use default memory configurations in the node)
+    mem: str = ""
+    # time out min
+    timeout_min: int = sys.maxsize
+    # whether to redirect stderr to stdout
+    stderr_to_stdout: bool = False
+    # environment variables setup command
+    setup: List[str] = field(default_factory=list)
+    # whether to pack code
+    pack_code: bool = False
+    # use packed code to run
+    use_packed_code: bool = False
+    # code root
+    code_root: str = "."
+    # code file extensions
+    code_file_suffixes: list[str] = field(default_factory=lambda: [".py", ".sh", ".yaml", ".toml"])
+    # exclude folders (relative to the code root)
+    exclude_code_folders: list[str] = field(
+        default_factory=lambda: ["wandb", "outputs", "datasets"]
+    )
+    # whether to use distributed environment
+    use_distributed_env: bool = False
+    # distributed enviroment task
+    distributed_env_task: Literal["torch"] = "torch"
+    # processes per task (this value is not used by slurm, but in the distributed environment)
+    processes_per_task: int = 1
+    # distributed launch command (this will be called after the distributed enviroment is set up)
+    # the following environment variables are available:
+    #   num_processes: int
+    #   num_machines: int
+    #   machine_rank: int
+    #   main_process_ip: str
+    #   main_process_port: int
+    # use braces to access the environment variables, e.g. {num_processes}
+    distributed_launch_command: str = ""
+    # extra slurm job parameters
+    extra_params_kwargs: Dict[str, str] = field(default_factory=dict)
+    # extra slurm submit parameters
+    extra_submit_kwargs: Dict[str, str] = field(default_factory=dict)
+    # extra slurm task parameters
+    extra_task_kwargs: Dict[str, str] = field(default_factory=dict)
+    def _configuration_check(self):
+        # check partition
+        if self.partition == "":
+            raise ValueError("partition must be set")
+        # check distributed enviroment task
+        if self.use_distributed_env and self.distributed_launch_command == "":
+            raise ValueError(
+                "distributed_launch_command must be set when use_distributed_env is True"
+            )
+    def __post_init__(self):
+        # check configuration
+        self._configuration_check()
+        # normalize the output folder
+        output_folder_suffix = ""
+        if self.mode != "slurm":
+            output_folder_suffix = f"_{self.mode}"
+        if self.output_folder.endswith("slurm"):
+            self.output_folder = f"{self.output_folder}{output_folder_suffix}"
+        else:
+            self.output_folder = os.path.join(self.output_folder, f"slurm{output_folder_suffix}")
+        # output path
+        self.output_path: str = os.path.join(self.output_parent_path, self.output_folder)
+    def set_output_path(self, output_parent_path: str) -> "SlurmConfig":
+        """Set output path and date for the slurm job.
+        Args:
+            output_parent_path (str): The parent path for the output.
+        Returns:
+            SlurmConfig: The updated SlurmConfig instance.
+        """
+        new_config = replace(
+            self,
+            output_parent_path=output_parent_path,
+        )
+        return new_config
+SlurmArgs = SlurmConfig

nntool/slurm/core/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from ._slurm import SlurmFunction as SlurmBackend
+# SlurmFunction from ._slurm is exposed under the name SlurmBackend.
+__all__ = ["SlurmBackend"]