konfai-1.1.7-py3-none-any.whl → konfai-1.1.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: the registry has flagged this version of konfai as potentially problematic.
- konfai/__init__.py +59 -14
- konfai/data/augmentation.py +457 -286
- konfai/data/data_manager.py +509 -290
- konfai/data/patching.py +300 -183
- konfai/data/transform.py +384 -277
- konfai/evaluator.py +309 -68
- konfai/main.py +71 -22
- konfai/metric/measure.py +341 -222
- konfai/metric/schedulers.py +24 -13
- konfai/models/classification/convNeXt.py +187 -81
- konfai/models/classification/resnet.py +272 -58
- konfai/models/generation/cStyleGan.py +233 -59
- konfai/models/generation/ddpm.py +348 -121
- konfai/models/generation/diffusionGan.py +757 -358
- konfai/models/generation/gan.py +177 -53
- konfai/models/generation/vae.py +140 -40
- konfai/models/registration/registration.py +135 -52
- konfai/models/representation/representation.py +57 -23
- konfai/models/segmentation/NestedUNet.py +339 -68
- konfai/models/segmentation/UNet.py +140 -30
- konfai/network/blocks.py +331 -187
- konfai/network/network.py +781 -423
- konfai/predictor.py +645 -240
- konfai/trainer.py +527 -216
- konfai/utils/ITK.py +191 -106
- konfai/utils/config.py +152 -95
- konfai/utils/dataset.py +326 -455
- konfai/utils/utils.py +495 -249
- {konfai-1.1.7.dist-info → konfai-1.1.9.dist-info}/METADATA +1 -3
- konfai-1.1.9.dist-info/RECORD +38 -0
- konfai/utils/registration.py +0 -199
- konfai-1.1.7.dist-info/RECORD +0 -39
- {konfai-1.1.7.dist-info → konfai-1.1.9.dist-info}/WHEEL +0 -0
- {konfai-1.1.7.dist-info → konfai-1.1.9.dist-info}/entry_points.txt +0 -0
- {konfai-1.1.7.dist-info → konfai-1.1.9.dist-info}/licenses/LICENSE +0 -0
- {konfai-1.1.7.dist-info → konfai-1.1.9.dist-info}/top_level.txt +0 -0
konfai/evaluator.py
CHANGED
@@ -1,69 +1,182 @@
+import builtins
+import importlib
+import json
 import os
-
+import shutil
+from typing import Any
+
+import numpy as np
 import torch
 import tqdm
-
-
-import
-import builtins
-import importlib
-from konfai import EVALUATIONS_DIRECTORY, PREDICTIONS_DIRECTORY, KONFAI_ROOT, CONFIG_FILE
-from konfai.utils.config import config
-from konfai.utils.utils import _getModule, DistributedObject, synchronize_data, EvaluatorError
+from torch.utils.data import DataLoader
+
+from konfai import config_file, evaluations_directory, konfai_root
 from konfai.data.data_manager import DataMetric
+from konfai.utils.config import config
+from konfai.utils.utils import DistributedObject, EvaluatorError, get_module, synchronize_data
+
 
-class CriterionsAttr
+class CriterionsAttr:
+    """
+    Container for additional metadata or configuration attributes related to a loss criterion.
+
+    This class is currently empty but acts as a placeholder for future extension.
+    It is passed along with each loss function to allow parameterization or inspection of its behavior.
+
+    Use cases may include:
+    - Weighting of individual loss terms
+    - Conditional activation
+    - Logging preferences
+    """
 
     @config()
     def __init__(self) -> None:
-        pass
+        pass
+
+
+class CriterionsLoader:
+    """
+    Loader for multiple criterion modules to be applied between a model output and one or more targets.
 
-
+    Each loss module (e.g., Dice, CrossEntropy, NCC) is dynamically loaded using its fully-qualified
+    classpath and is associated with a `CriterionsAttr` configuration object.
+
+    Args:
+        criterions_loader (dict): A mapping from module classpaths (as strings) to `CriterionsAttr` instances.
+            The module path is parsed and instantiated via `get_module`.
+
+    """
 
     @config()
-    def __init__(
-        self
+    def __init__(
+        self,
+        criterions_loader: dict[str, CriterionsAttr] = {"default:torch:nn:CrossEntropyLoss:Dice:NCC": CriterionsAttr()},
+    ) -> None:
+        self.criterions_loader = criterions_loader
 
-    def
+    def get_criterions(self, output_group: str, target_group: str) -> dict[torch.nn.Module, CriterionsAttr]:
         criterions = {}
-        for module_classpath,
-            module, name =
-            criterions[
+        for module_classpath, criterions_attr in self.criterions_loader.items():
+            module, name = get_module(module_classpath, "konfai.metric.measure")
+            criterions[
+                config(
+                    f"{konfai_root()}.metrics.{output_group}.targets_criterions.{target_group}"
+                    f".criterions_loader.{module_classpath}"
+                )(getattr(importlib.import_module(module), name))(config=None)
+            ] = criterions_attr
         return criterions
 
-
+
+class TargetCriterionsLoader:
+    """
+    Loader class for handling multiple target groups with associated criterion configurations.
+
+    This class allows defining a set of criterion loaders (e.g., Dice, BCE, MSE) for each
+    target group to be used during evaluation or training. Each target group corresponds
+    to one or more loss functions, all linked to a specific model output.
+
+    Args:
+        targets_criterions (dict[str, CriterionsLoader]): Dictionary mapping each target group name
+            to a `CriterionsLoader` instance that defines its associated loss functions.
+    """
 
     @config()
-    def __init__(
-        self
-
-
-
-
-
-
+    def __init__(
+        self,
+        targets_criterions: dict[str, CriterionsLoader] = {"default": CriterionsLoader()},
+    ) -> None:
+        self.targets_criterions = targets_criterions
+
+    def get_targets_criterions(self, output_group: str) -> dict[str, dict[torch.nn.Module, CriterionsAttr]]:
+        """
+        Retrieve the criterion modules and their attributes for a specific output group.
+
+        This function prepares the loss functions to be applied for a given model output,
+        grouped by their target group.
+
+        Args:
+            output_group (str): Name of the model output group (e.g., "output_segmentation").
+
+        Returns:
+            dict[str, dict[nn.Module, CriterionsAttr]]: A nested dictionary where the first key is the
+                target group name, and the value is a dictionary mapping each loss module to its attributes.
+        """
+        targets_criterions = {}
+        for target_group, criterions_loader in self.targets_criterions.items():
+            targets_criterions[target_group] = criterions_loader.get_criterions(output_group, target_group)
+        return targets_criterions
+
+
+class Statistics:
+    """
+    Utility class to accumulate, structure, and write evaluation metric results.
+
+    This class is used to:
+    - Collect metrics for each dataset sample.
+    - Compute aggregate statistics (mean, std, percentiles, etc.).
+    - Export all results in a structured JSON format, including both per-case and aggregate values.
 
-
+    Args:
+        filename (str): Path to the output JSON file that will store the final results.
+    """
 
     def __init__(self, filename: str) -> None:
         self.measures: dict[str, dict[str, float]] = {}
         self.filename = filename
 
     def add(self, values: dict[str, float], name_dataset: str) -> None:
+        """
+        Add a set of metric values for a given dataset case.
+
+        Args:
+            values (dict): Dictionary of metric names and their values.
+            name_dataset (str): Identifier (e.g., case name) for the sample.
+        """
         for name, value in values.items():
             if name_dataset not in self.measures:
                 self.measures[name_dataset] = {}
             self.measures[name_dataset][name] = value
-
-    @staticmethod
-    def
-
-
-
+
+    @staticmethod
+    def get_statistic(values: list[float]) -> dict[str, float]:
+        """
+        Compute statistical aggregates for a list of metric values.
+
+        Args:
+            values (list of float): Values to summarize.
+
+        Returns:
+            dict[str, float]: A dictionary containing:
+                - max, min, std
+                - 25th, 50th, and 75th percentiles
+                - mean and count
+        """
+        return {
+            "max": np.max(values),
+            "min": np.min(values),
+            "std": np.std(values),
+            "25pc": np.percentile(values, 25),
+            "50pc": np.percentile(values, 50),
+            "75pc": np.percentile(values, 75),
+            "mean": np.mean(values),
+            "count": len(values),
+        }
+
+    def write(self, outputs: list[dict[str, dict[str, Any]]]) -> None:
+        """
+        Write the collected and aggregated statistics to the configured output file.
+
+        The output JSON structure contains:
+        - `case`: All individual metrics per sample.
+        - `aggregates`: Global statistics computed over all cases.
+
+        Args:
+            outputs (list): List of metric dictionaries to merge and serialize.
+        """
         measures = {}
         for output in outputs:
             measures.update(output)
-        result = {}
+        result: dict[str, dict[str, dict[str, Any]]] = {}
        result["case"] = {}
         for name, v in measures.items():
             for metric_name, value in v.items():
@@ -73,92 +186,220 @@ class Statistics():
 
         result["aggregates"] = {}
         tmp: dict[str, list[float]] = {}
-        for
-            for metric_name,
+        for _, v in measures.items():
+            for metric_name, _ in v.items():
                 if metric_name not in tmp:
                     tmp[metric_name] = []
                 tmp[metric_name].append(v[metric_name])
         for metric_name, values in tmp.items():
-            result["aggregates"][metric_name] = Statistics.
+            result["aggregates"][metric_name] = Statistics.get_statistic(values)
 
         with open(self.filename, "w") as f:
             f.write(json.dumps(result, indent=4))
-
+
+
 class Evaluator(DistributedObject):
+    """
+    Distributed evaluation engine for computing metrics on model predictions.
+
+    This class handles the evaluation of predicted outputs using predefined metric loaders.
+    It supports multi-output and multi-target configurations, computes aggregated statistics
+    across training and validation datasets, and synchronizes results across processes.
+
+    Evaluation results are stored in JSON format and optionally displayed during iteration.
+
+    Args:
+        train_name (str): Unique name of the evaluation run, used for logging and output folders.
+        metrics (dict[str, TargetCriterionsLoader]): Dictionary mapping output groups to loaders of target metrics.
+        dataset (DataMetric): Dataset provider configured for evaluation mode.
+
+    Attributes:
+        statistics_train (Statistics): Object used to store training evaluation metrics.
+        statistics_validation (Statistics): Object used to store validation evaluation metrics.
+        dataloader (list[DataLoader]): DataLoaders for training and validation sets.
+        metric_path (str): Path to the evaluation output directory.
+        metrics (dict): Instantiated metrics organized by output and target groups.
+    """
 
     @config("Evaluator")
-    def __init__(
+    def __init__(
+        self,
+        train_name: str = "default:TRAIN_01",
+        metrics: dict[str, TargetCriterionsLoader] = {"default": TargetCriterionsLoader()},
+        dataset: DataMetric = DataMetric(),
+    ) -> None:
         if os.environ["KONFAI_CONFIG_MODE"] != "Done":
             exit(0)
         super().__init__(train_name)
-        self.metric_path =
+        self.metric_path = evaluations_directory() + self.name + "/"
         self.metricsLoader = metrics
         self.dataset = dataset
-        self.metrics = {k: v.
-        self.statistics_train = Statistics(self.metric_path+"Metric_TRAIN.json")
-        self.statistics_validation = Statistics(self.metric_path+"Metric_VALIDATION.json")
+        self.metrics = {k: v.get_targets_criterions(k) for k, v in self.metricsLoader.items()}
+        self.statistics_train = Statistics(self.metric_path + "Metric_TRAIN.json")
+        self.statistics_validation = Statistics(self.metric_path + "Metric_VALIDATION.json")
 
-    def update(self, data_dict: dict[str, tuple[torch.Tensor, str]], statistics
+    def update(self, data_dict: dict[str, tuple[torch.Tensor, str]], statistics: Statistics) -> dict[str, float]:
+        """
+        Compute metrics for a batch and update running statistics.
+
+        Args:
+            data_dict (dict): Dictionary where keys are output/target group names and values are
+                tuples of (tensor, sample name).
+            statistics (Statistics): The statistics object to update (train or validation).
+
+        Returns:
+            dict[str, float]: Dictionary of computed metric values with keys in the format
+                'output_group:target_group:MetricName'.
+        """
         result = {}
         for output_group in self.metrics:
             for target_group in self.metrics[output_group]:
-                targets = [
+                targets = [
+                    (data_dict[group][0].to(0) if torch.cuda.is_available() else data_dict[group][0])
+                    for group in target_group.split(";")
+                    if group in data_dict
+                ]
                 name = data_dict[output_group][1][0]
                 for metric in self.metrics[output_group][target_group]:
-                    result["{}:{}:{
+                    result[f"{output_group}:{target_group}:{metric.__class__.__name__}"] = metric(
+                        (data_dict[output_group][0].to(0) if torch.cuda.is_available() else data_dict[output_group][0]),
+                        *targets,
+                    ).item()
         statistics.add(result, name)
         return result
-
+
     def setup(self, world_size: int):
+        """
+        Prepare the evaluator for distributed metric computation.
+
+        This method performs the following steps:
+        - Checks whether previous evaluation results exist and optionally overwrites them.
+        - Creates the output directory and copies the current configuration file for reproducibility.
+        - Loads the evaluation dataset according to the world size.
+        - Validates that all specified output and target groups used in metric definitions
+          are present in the dataset group configuration.
+
+        Args:
+            world_size (int): Number of processes in the distributed evaluation setup.
+
+        Raises:
+            EvaluatorError: If any metric output or target group is missing in the dataset's group mapping.
+        """
         if os.path.exists(self.metric_path):
             if os.environ["KONFAI_OVERWRITE"] != "True":
-                accept = builtins.input(
+                accept = builtins.input(
+                    f"The metric {self.name} already exists ! Do you want to overwrite it (yes,no) : "
+                )
                 if accept != "yes":
                     return
-
+
             if os.path.exists(self.metric_path):
-                shutil.rmtree(self.metric_path)
+                shutil.rmtree(self.metric_path)
 
         if not os.path.exists(self.metric_path):
             os.makedirs(self.metric_path)
-        metric_namefile_src =
-        shutil.copyfile(
+        metric_namefile_src = config_file().replace(".yml", "")
+        shutil.copyfile(
+            metric_namefile_src + ".yml",
+            f"{self.metric_path}{metric_namefile_src}.yml",
+        )
 
-        self.dataloader = self.dataset.
+        self.dataloader = self.dataset.get_data(world_size)
 
-
+        groups_dest = [group for groups in self.dataset.groups_src.values() for group in groups]
 
-        missing_outputs = set(self.metrics.keys()) - set(
+        missing_outputs = set(self.metrics.keys()) - set(groups_dest)
         if missing_outputs:
             raise EvaluatorError(
-                f"The following metric output groups are missing from '
-                f"Available groups: {sorted(
+                f"The following metric output groups are missing from 'groups_dest': {sorted(missing_outputs)}. ",
+                f"Available groups: {sorted(groups_dest)}",
             )
 
         target_groups = []
         for i in {target for targets in self.metrics.values() for target in targets}:
             for u in i.split(";"):
                 target_groups.append(u)
-        missing_targets = set(target_groups) - set(
+        missing_targets = set(target_groups) - set(groups_dest)
         if missing_targets:
             raise EvaluatorError(
-                f"The following metric target groups are missing from '
-                f"Available groups: {sorted(
+                f"The following metric target groups are missing from 'groups_dest': {sorted(missing_targets)}. ",
+                f"Available groups: {sorted(groups_dest)}",
            )
 
     def run_process(self, world_size: int, global_rank: int, gpu: int, dataloaders: list[DataLoader]):
-
-
+        """
+        Execute the distributed evaluation loop over the training and validation datasets.
+
+        This method iterates through the provided DataLoaders (train and optionally validation),
+        updates the metric statistics using the configured `metrics` dictionary, and synchronizes
+        the results across all processes. On the global rank 0, the metrics are saved as JSON files.
+
+        Metrics are displayed in real-time using `tqdm` progress bars, showing a summary of the
+        current batch's computed values.
+
+        Args:
+            world_size (int): Total number of distributed processes.
+            global_rank (int): Global rank of the current process (used for writing results).
+            gpu (int): Local GPU ID used for synchronization.
+            dataloaders (list[DataLoader]): A list containing one or two DataLoaders:
+                - `dataloaders[0]` is used for training evaluation.
+                - `dataloaders[1]` (optional) is used for validation evaluation.
+
+        Notes:
+            - Only the main process (`global_rank == 0`) writes final results to disk.
+        """
+
+        def description(measure):
+            return (
+                f"Metric TRAIN : {' | '.join(f'{k}: {v:.4f}' for k, v in measure.items())}"
+                if measure is not None
+                else "Metric TRAIN : "
+            )
+
+        with tqdm.tqdm(
+            iterable=enumerate(dataloaders[0]),
+            leave=True,
+            desc=description(None),
+            total=len(dataloaders[0]),
+            ncols=0,
+        ) as batch_iter:
             for _, data_dict in batch_iter:
-                batch_iter.set_description(
+                batch_iter.set_description(
+                    description(
+                        self.update(
+                            {k: (v[0], v[4]) for k, v in data_dict.items()},
+                            self.statistics_train,
+                        )
+                    )
+                )
         outputs = synchronize_data(world_size, gpu, self.statistics_train.measures)
         if global_rank == 0:
             self.statistics_train.write(outputs)
         if len(dataloaders) == 2:
-
-
+
+            def description(measure):
+                return (
+                    f"Metric VALIDATION : {' | '.join(f'{k}: {v:.2f}' for k, v in measure.items())}"
+                    if measure is not None
+                    else "Metric VALIDATION : "
+                )
+
+            with tqdm.tqdm(
+                iterable=enumerate(dataloaders[1]),
+                leave=True,
+                desc=description(None),
+                total=len(dataloaders[1]),
+                ncols=0,
+            ) as batch_iter:
                 for _, data_dict in batch_iter:
-                    batch_iter.set_description(
+                    batch_iter.set_description(
+                        description(
+                            self.update(
+                                {k: (v[0], v[4]) for k, v in data_dict.items()},
+                                self.statistics_validation,
+                            )
+                        )
+                    )
             outputs = synchronize_data(world_size, gpu, self.statistics_validation.measures)
             if global_rank == 0:
-                self.statistics_validation.write(outputs)
+                self.statistics_validation.write(outputs)
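The rewritten `Statistics` class above is usable on its own. Below is a minimal sketch, not part of the diff, of how `add()` and `write()` fit together; the metric names and values are invented for illustration, and the key format mirrors the `output_group:target_group:MetricName` convention used by `Evaluator.update()`:

    from konfai.evaluator import Statistics

    stats = Statistics("Metric_TRAIN.json")

    # One add() call per evaluated case; each key identifies one metric.
    stats.add({"out:mask:Dice": 0.91, "out:mask:NCC": 0.87}, "case_001")
    stats.add({"out:mask:Dice": 0.84, "out:mask:NCC": 0.79}, "case_002")

    # write() merges a list of per-process measure dicts; in the distributed
    # run that list comes from synchronize_data(), but a single-process list
    # containing one dict works the same way.
    stats.write([stats.measures])
    # Metric_TRAIN.json now holds {"case": {...}} plus {"aggregates": {...}}
    # with max/min/std/percentiles/mean/count per metric.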
konfai/main.py
CHANGED
@@ -1,53 +1,102 @@
 import argparse
 import os
-
+import sys
+
 import torch.multiprocessing as mp
-from
-
+from torch.cuda import device_count
+
+from konfai import konfai_nb_cores
+from konfai.utils.utils import Log, TensorBoard, setup
 
-import sys
 sys.path.insert(0, os.getcwd())
 
+
 def main():
+    """
+    Entry point for launching KonfAI training locally.
+
+    - Parses arguments (if any) via a setup parser.
+    - Initializes distributed environment based on available CUDA devices or CPU cores.
+    - Launches training via `mp.spawn`.
+    - Manages logging and TensorBoard context.
+
+    KeyboardInterrupt is caught to allow clean manual termination.
+    """
     parser = argparse.ArgumentParser(description="KonfAI", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     try:
-        with setup(parser) as
-            with Log(
+        with setup(parser) as distributed_object:
+            with Log(distributed_object.name, 0):
                 world_size = device_count()
                 if world_size == 0:
-                    world_size = int(
-
-                with TensorBoard(
-                    mp.spawn(
+                    world_size = int(konfai_nb_cores())
+                distributed_object.setup(world_size)
+                with TensorBoard(distributed_object.name):
+                    mp.spawn(distributed_object, nprocs=world_size)
     except KeyboardInterrupt:
         print("\n[KonfAI] Manual interruption (Ctrl+C)")
 
+
 def cluster():
+    """
+    Entry point for launching KonfAI on a cluster using Submitit.
+
+    - Parses cluster-specific arguments: job name, nodes, memory, time limit, etc.
+    - Sets up distributed environment based on number of nodes and GPUs.
+    - Configures Submitit executor with job specs.
+    - Submits the job to SLURM (or another Submitit-compatible backend).
+
+    Environment variables:
+        KONFAI_OVERWRITE: Set to force overwrite of previous training runs.
+        KONFAI_CLUSTER: Mark this as a cluster job (used downstream).
+
+    Raises:
+        KeyboardInterrupt: On manual interruption.
+        Exception: Any submission-related error is printed and causes exit.
+    """
     parser = argparse.ArgumentParser(description="KonfAI", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 
     # Cluster manager arguments
-    cluster_args = parser.add_argument_group(
-    cluster_args.add_argument(
-    cluster_args.add_argument(
-    cluster_args.add_argument(
-    cluster_args.add_argument(
-
+    cluster_args = parser.add_argument_group("Cluster manager arguments")
+    cluster_args.add_argument("--name", type=str, help="Task name", required=True)
+    cluster_args.add_argument("--num-nodes", "--num_nodes", default=1, type=int, help="Number of nodes")
+    cluster_args.add_argument("--memory", type=int, default=16, help="Amount of memory per node")
+    cluster_args.add_argument(
+        "--time-limit",
+        "--time_limit",
+        type=int,
+        default=1440,
+        help="Job time limit in minute",
+    )
+    cluster_args.add_argument(
+        "--resubmit",
+        action="store_true",
+        help="Automatically resubmit job just before timout",
+    )
     try:
-        with setup(parser) as
+        with setup(parser) as distributed_object:
             args = parser.parse_args()
             config = vars(args)
             os.environ["KONFAI_OVERWRITE"] = "True"
             os.environ["KONFAI_CLUSTER"] = "True"
 
             n_gpu = len(config["gpu"].split(","))
-
+            distributed_object.setup(n_gpu * int(config["num_nodes"]))
             import submitit
+
             executor = submitit.AutoExecutor(folder="./Cluster/")
-            executor.update_parameters(
-
-
+            executor.update_parameters(
+                name=config["name"],
+                mem_gb=config["memory"],
+                gpus_per_node=n_gpu,
+                tasks_per_node=n_gpu // distributed_object.size,
+                cpus_per_task=config["num_workers"],
+                nodes=config["num_nodes"],
+                timeout_min=config["time_limit"],
+            )
+            with TensorBoard(distributed_object.name):
+                executor.submit(distributed_object)
     except KeyboardInterrupt:
         print("\n[KonfAI] Manual interruption (Ctrl+C)")
     except Exception as e:
         print(e)
-        exit(1)
+        exit(1)