kostyl-toolkit 0.1.35__tar.gz → 0.1.36__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/PKG-INFO +1 -1
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/kostyl/ml/configs/hyperparams.py +21 -5
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/kostyl/ml/configs/training_settings.py +17 -6
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/kostyl/ml/lightning/callbacks/checkpoint.py +8 -8
- kostyl_toolkit-0.1.36/kostyl/ml/lightning/utils.py +58 -0
- kostyl_toolkit-0.1.36/kostyl/ml/registry_uploader.py +126 -0
- kostyl_toolkit-0.1.36/kostyl/ml/schedulers/__init__.py +18 -0
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/kostyl/ml/schedulers/base.py +9 -7
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/kostyl/ml/schedulers/cosine.py +53 -24
- kostyl_toolkit-0.1.36/kostyl/ml/schedulers/cosine_with_plateu.py +277 -0
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/kostyl/ml/schedulers/linear.py +36 -11
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/kostyl/utils/logging.py +1 -1
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/pyproject.toml +2 -2
- kostyl_toolkit-0.1.35/kostyl/ml/lightning/training_utils.py +0 -241
- kostyl_toolkit-0.1.35/kostyl/ml/registry_uploader.py +0 -99
- kostyl_toolkit-0.1.35/kostyl/ml/schedulers/__init__.py +0 -6
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/README.md +0 -0
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/kostyl/__init__.py +0 -0
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/kostyl/ml/__init__.py +0 -0
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/kostyl/ml/clearml/__init__.py +0 -0
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/kostyl/ml/clearml/dataset_utils.py +0 -0
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/kostyl/ml/clearml/logging_utils.py +0 -0
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/kostyl/ml/clearml/pulling_utils.py +0 -0
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/kostyl/ml/configs/__init__.py +0 -0
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/kostyl/ml/configs/base_model.py +0 -0
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/kostyl/ml/data_processing_utils.py +0 -0
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/kostyl/ml/dist_utils.py +0 -0
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/kostyl/ml/lightning/__init__.py +0 -0
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/kostyl/ml/lightning/callbacks/__init__.py +0 -0
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/kostyl/ml/lightning/callbacks/early_stopping.py +0 -0
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/kostyl/ml/lightning/extensions/__init__.py +0 -0
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/kostyl/ml/lightning/extensions/custom_module.py +0 -0
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/kostyl/ml/lightning/extensions/pretrained_model.py +0 -0
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/kostyl/ml/lightning/loggers/__init__.py +0 -0
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/kostyl/ml/lightning/loggers/tb_logger.py +0 -0
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/kostyl/ml/metrics_formatting.py +0 -0
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/kostyl/ml/params_groups.py +0 -0
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/kostyl/ml/schedulers/composite.py +0 -0
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/kostyl/utils/__init__.py +0 -0
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/kostyl/utils/dict_manipulations.py +0 -0
- {kostyl_toolkit-0.1.35 → kostyl_toolkit-0.1.36}/kostyl/utils/fs.py +0 -0

kostyl/ml/configs/hyperparams.py
@@ -1,3 +1,5 @@
+from typing import Literal
+
 from pydantic import BaseModel
 from pydantic import Field
 from pydantic import model_validator
@@ -8,11 +10,25 @@ from kostyl.utils.logging import setup_logger
 logger = setup_logger(fmt="only_message")
 
 
-class
-    """
+class AdamConfig(BaseModel):
+    """AdamW optimizer hyperparameters configuration."""
+
+    type: Literal["AdamW"] = "AdamW"
+    betas: tuple[float, float] = (0.9, 0.999)
+    is_adamw: bool = True
+
+
+class AdamWithPrecisionConfig(BaseModel):
+    """Adam optimizer with low-precision hyperparameters configuration."""
+
+    type: Literal["Adam8bit", "Adam4bit", "AdamFp8"]
+    betas: tuple[float, float] = (0.9, 0.999)
+    block_size: int
+    bf16_stochastic_round: bool = False
+    is_adamw: bool = True
+
 
-
-    adamw_beta2: float = 0.999
+Optimizer = AdamConfig | AdamWithPrecisionConfig
 
 
 class Lr(BaseModel):
@@ -73,6 +89,6 @@ class HyperparamsConfig(BaseModel):
     """Model training hyperparameters configuration."""
 
     grad_clip_val: float | None = Field(default=None, gt=0, validate_default=False)
-    optimizer: Optimizer
+    optimizer: Optimizer
     lr: Lr
     weight_decay: WeightDecay
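The optimizer section now uses a tagged union keyed on the `type` literal instead of a single flat config. A minimal sketch of how Pydantic v2 resolves such a union; the model definitions are copied from the hunk above, while the `TypeAdapter` usage and the sample payload are illustrative and not taken from the package:

```python
from typing import Literal

from pydantic import BaseModel, TypeAdapter


class AdamConfig(BaseModel):
    type: Literal["AdamW"] = "AdamW"
    betas: tuple[float, float] = (0.9, 0.999)
    is_adamw: bool = True


class AdamWithPrecisionConfig(BaseModel):
    type: Literal["Adam8bit", "Adam4bit", "AdamFp8"]
    betas: tuple[float, float] = (0.9, 0.999)
    block_size: int
    bf16_stochastic_round: bool = False
    is_adamw: bool = True


Optimizer = AdamConfig | AdamWithPrecisionConfig

# The `type` literal lets Pydantic pick the right variant from plain dicts,
# e.g. when HyperparamsConfig is loaded from YAML/JSON.
opt = TypeAdapter(Optimizer).validate_python({"type": "Adam8bit", "block_size": 256})
assert isinstance(opt, AdamWithPrecisionConfig)

opt = TypeAdapter(Optimizer).validate_python({"type": "AdamW"})
assert isinstance(opt, AdamConfig)
```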

kostyl/ml/configs/training_settings.py
@@ -25,21 +25,31 @@ PRECISION = Literal[
     "16",
     "bf16",
 ]
+DTYPE = Literal["float32", "float16", "bfloat16", "float64"]
+
+
+class SingleDeviceStrategyConfig(BaseModel):
+    """Single device strategy configuration."""
+
+    type: Literal["single_device"]
 
 
 class FSDP1StrategyConfig(BaseModel):
     """Fully Sharded Data Parallel (FSDP) strategy configuration."""
 
     type: Literal["fsdp1"]
-    param_dtype:
-    reduce_dtype:
-    buffer_dtype:
+    param_dtype: DTYPE | None
+    reduce_dtype: DTYPE | None
+    buffer_dtype: DTYPE | None
 
 
-class
-    """
+class FSDP2StrategyConfig(BaseModel):
+    """Fully Sharded Data Parallel (FSDP) strategy configuration."""
 
-    type: Literal["
+    type: Literal["fsdp2"]
+    param_dtype: DTYPE | None
+    reduce_dtype: DTYPE | None
+    buffer_dtype: DTYPE | None
 
 
 class DDPStrategyConfig(BaseModel):
@@ -82,6 +92,7 @@ class CheckpointConfig(BaseModel):
     monitor: str = "val_loss"
     mode: str = "min"
     filename: str = "{epoch:02d}-{val_loss:.2f}"
+    save_weights_only: bool = True
 
 
 class DataConfig(BaseModel):
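The FSDP strategy configs now carry their mixed-precision dtypes as plain strings via the new `DTYPE` literal. A sketch of how such strings could be mapped onto a torch FSDP `MixedPrecision` policy; the `to_torch_dtype` helper and the concrete dtype choices are assumptions, only the field names and the `DTYPE` values come from the diff:

```python
from typing import Literal

import torch
from torch.distributed.fsdp import MixedPrecision

DTYPE = Literal["float32", "float16", "bfloat16", "float64"]


def to_torch_dtype(name: DTYPE | None) -> torch.dtype | None:
    # "bfloat16" -> torch.bfloat16, etc.; None falls through to FSDP's defaults.
    return getattr(torch, name) if name is not None else None


# e.g. an FSDP1StrategyConfig with param_dtype="bfloat16", reduce_dtype="float32",
# buffer_dtype=None would translate to:
policy = MixedPrecision(
    param_dtype=to_torch_dtype("bfloat16"),
    reduce_dtype=to_torch_dtype("float32"),
    buffer_dtype=to_torch_dtype(None),
)
```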

kostyl/ml/lightning/callbacks/checkpoint.py
@@ -299,9 +299,9 @@ class ModelCheckpointWithRegistryUploader(ModelCheckpoint):
 def setup_checkpoint_callback(
     dirpath: Path,
     ckpt_cfg: CheckpointConfig,
-    save_weights_only: bool = True,
     registry_uploader_callback: RegistryUploaderCallback | None = None,
     uploading_strategy: Literal["only-best", "every-checkpoint"] | None = None,
+    remove_folder_if_exists: bool = True,
 ) -> ModelCheckpointWithRegistryUploader | ModelCheckpoint:
     """
     Create and configure a checkpoint callback for model saving.
@@ -313,14 +313,13 @@ def setup_checkpoint_callback(
     Args:
         dirpath: Path to the directory for saving checkpoints.
         ckpt_cfg: Checkpoint configuration (filename, monitor, mode, save_top_k).
-        save_weights_only: If True, only model weights are saved without optimizer and lr-scheduler state.
-            Defaults to True.
         registry_uploader_callback: Optional callback for uploading checkpoints to a remote registry.
            Must be specified together with uploading_strategy.
         uploading_strategy: Checkpoint upload mode:
            - "only-best": only the best checkpoint is uploaded
            - "every-checkpoint": every saved checkpoint is uploaded
            Must be specified together with registry_uploader_callback.
+        remove_folder_if_exists: If True, removes existing checkpoint directory before creating a new one.
 
     Returns:
         ModelCheckpointWithRegistryUploader if registry_uploader_callback is provided,
@@ -331,7 +330,7 @@ def setup_checkpoint_callback(
 
     Note:
         If the dirpath directory already exists, it will be removed and recreated
-        (only on the main process in distributed training).
+        (only on the main process in distributed training) if remove_folder_if_exists is True.
 
     """
     if (registry_uploader_callback is None) != (uploading_strategy is None):
@@ -342,8 +341,9 @@ def setup_checkpoint_callback(
     if dirpath.exists():
         if is_main_process():
             logger.warning(f"Checkpoint directory {dirpath} already exists.")
-
-
+            if remove_folder_if_exists:
+                rmtree(dirpath)
+                logger.warning(f"Removed existing checkpoint directory {dirpath}.")
     else:
         logger.info(f"Creating checkpoint directory {dirpath}.")
         dirpath.mkdir(parents=True, exist_ok=True)
@@ -356,7 +356,7 @@ def setup_checkpoint_callback(
             monitor=ckpt_cfg.monitor,
             mode=ckpt_cfg.mode,
             verbose=True,
-            save_weights_only=save_weights_only,
+            save_weights_only=ckpt_cfg.save_weights_only,
             registry_uploader_callback=registry_uploader_callback,
             uploading_mode=uploading_strategy,
         )
@@ -368,6 +368,6 @@ def setup_checkpoint_callback(
             monitor=ckpt_cfg.monitor,
             mode=ckpt_cfg.mode,
             verbose=True,
-            save_weights_only=save_weights_only,
+            save_weights_only=ckpt_cfg.save_weights_only,
         )
     return checkpoint_callback
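Taken together, the call site stops receiving `save_weights_only` directly and reads it from the checkpoint config instead, while gaining a flag for clearing a pre-existing checkpoint directory. A usage sketch; the import paths are inferred from the file layout in this diff, and any `CheckpointConfig` fields beyond those visible above are assumed to have defaults:

```python
from pathlib import Path

from kostyl.ml.configs.training_settings import CheckpointConfig
from kostyl.ml.lightning.callbacks.checkpoint import setup_checkpoint_callback

ckpt_cfg = CheckpointConfig(
    monitor="val_loss",
    mode="min",
    filename="{epoch:02d}-{val_loss:.2f}",
    save_weights_only=True,  # 0.1.36: configured here instead of as a function argument
)

callback = setup_checkpoint_callback(
    dirpath=Path("checkpoints/run-001"),
    ckpt_cfg=ckpt_cfg,
    remove_folder_if_exists=True,  # 0.1.36: wipe an existing directory on the main process
)
```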

kostyl/ml/lightning/utils.py (new file)
@@ -0,0 +1,58 @@
+from typing import cast
+
+import lightning as L
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+from kostyl.ml.configs import DDPStrategyConfig
+from kostyl.ml.configs import FSDP1StrategyConfig
+from kostyl.ml.configs import SingleDeviceStrategyConfig
+from kostyl.utils.logging import setup_logger
+
+
+TRAINING_STRATEGIES = (
+    FSDP1StrategyConfig | DDPStrategyConfig | SingleDeviceStrategyConfig
+)
+
+logger = setup_logger(add_rank=True)
+
+
+def estimate_total_steps(
+    trainer: L.Trainer, dp_process_group: ProcessGroup | None = None
+) -> int:
+    """
+    Estimates the total number of training steps with respect to data parallelism and gradient accumulation.
+
+    Args:
+        trainer: The PyTorch Lightning Trainer instance.
+        dp_process_group: The data parallel process group. If None, the world process group will be used.
+
+    """
+    if dist.is_initialized():
+        world_size = dist.get_world_size(dp_process_group)
+    else:
+        world_size = 1
+
+    datamodule = trainer.datamodule  # type: ignore
+    if datamodule is None:
+        raise ValueError("Trainer must have a datamodule to estimate total steps.")
+    datamodule = cast(L.LightningDataModule, datamodule)
+
+    logger.info("Loading `train_dataloader` to estimate number of stepping batches.")
+    datamodule.setup("fit")
+
+    dataloader_len = len(datamodule.train_dataloader())
+    steps_per_epoch = dataloader_len // trainer.accumulate_grad_batches // world_size
+
+    if trainer.max_epochs is None:
+        raise ValueError("Trainer must have `max_epochs` set to estimate total steps.")
+    total_steps = steps_per_epoch * trainer.max_epochs
+
+    logger.info(
+        f"Total steps: {total_steps} (per-epoch: {steps_per_epoch}) "
+        f"-> Dataloader len: {dataloader_len} "
+        f"-> Accumulate grad batches: {trainer.accumulate_grad_batches} "
+        f"-> Epochs: {trainer.max_epochs} "
+        f"-> DataParallel size: {world_size}"
+    )
+    return total_steps
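A sketch of how this helper would typically be called from a LightningModule to size a step-based LR schedule; the module, datamodule, and optimizer choices below are illustrative, and a datamodule must be attached to the trainer for the estimate to work:

```python
import lightning as L
import torch

from kostyl.ml.lightning.utils import estimate_total_steps  # path per the file list above


class LitModel(L.LightningModule):
    def __init__(self) -> None:
        super().__init__()
        self.layer = torch.nn.Linear(8, 1)

    def training_step(self, batch, batch_idx):
        return self.layer(batch).mean()

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=3e-4)
        # Accounts for gradient accumulation and the data-parallel world size.
        total_steps = estimate_total_steps(self.trainer)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=total_steps)
        return {
            "optimizer": optimizer,
            "lr_scheduler": {"scheduler": scheduler, "interval": "step"},
        }
```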

kostyl/ml/registry_uploader.py (new file)
@@ -0,0 +1,126 @@
+from abc import ABC
+from abc import abstractmethod
+from collections.abc import Callable
+from functools import partial
+from pathlib import Path
+from typing import override
+
+from clearml import OutputModel
+
+from kostyl.utils.logging import setup_logger
+
+
+logger = setup_logger()
+
+
+class RegistryUploaderCallback(ABC):
+    """Abstract Lightning callback responsible for tracking and uploading the best-performing model checkpoint."""
+
+    @abstractmethod
+    def upload_checkpoint(self, path: str | Path) -> None:
+        """Upload the checkpoint located at the given path to the configured registry backend."""
+        raise NotImplementedError
+
+
+class ClearMLRegistryUploaderCallback(RegistryUploaderCallback):
+    """PyTorch Lightning callback to upload the best model checkpoint to ClearML."""
+
+    def __init__(
+        self,
+        model_name: str,
+        config_dict: dict[str, str] | None = None,
+        label_enumeration: dict[str, int] | None = None,
+        tags: list[str] | None = None,
+        comment: str | None = None,
+        framework: str | None = None,
+        base_model_id: str | None = None,
+        new_model_per_upload: bool = True,
+        verbose: bool = True,
+    ) -> None:
+        """
+        Initializes the ClearMLRegistryUploaderCallback.
+
+        Args:
+            model_name: The name for the newly created model.
+            label_enumeration: The label enumeration dictionary of string (label) to integer (value) pairs.
+            config_dict: Optional configuration dictionary to associate with the model.
+            tags: A list of strings which are tags for the model.
+            comment: A comment / description for the model.
+            framework: The framework of the model (e.g., "PyTorch", "TensorFlow").
+            base_model_id: Optional ClearML model ID to use as a base for the new model
+            new_model_per_upload: Whether to create a new ClearML model
+                for every upload or update weights of the same model. When updating weights,
+                the last uploaded checkpoint will be replaced (and deleted).
+            verbose: Whether to log messages during upload.
+
+        """
+        super().__init__()
+        if base_model_id is not None and new_model_per_upload:
+            raise ValueError(
+                "Cannot set base_model_id when new_model_per_upload is True."
+            )
+
+        self.verbose = verbose
+        self.new_model_per_upload = new_model_per_upload
+        self.best_model_path: str = ""
+        self.config_dict = config_dict
+        self._output_model: OutputModel | None = None
+        self._last_uploaded_model_path: str = ""
+        self._upload_callback: Callable | None = None
+
+        self._validate_tags(tags)
+        self.model_fabric = partial(
+            OutputModel,
+            name=model_name,
+            label_enumeration=label_enumeration,
+            tags=tags,
+            comment=comment,
+            framework=framework,
+            base_model_id=base_model_id,
+        )
+        return
+
+    @staticmethod
+    def _validate_tags(tags: list[str] | None) -> None:
+        if tags is None:
+            return
+        if "LightningCheckpoint" not in tags:
+            tags.append("LightningCheckpoint")
+        return None
+
+    @property
+    def output_model_(self) -> OutputModel:
+        """Returns the OutputModel instance based on `new_model_per_upload` setting."""
+        if self.new_model_per_upload:
+            model = self.model_fabric()
+            self._output_model = self.model_fabric()
+        else:
+            if self._output_model is None:
+                self._output_model = self.model_fabric()
+            model = self._output_model
+        return model
+
+    @override
+    def upload_checkpoint(
+        self,
+        path: str | Path,
+    ) -> None:
+        if isinstance(path, Path):
+            path = str(path)
+        if path == self._last_uploaded_model_path:
+            if self.verbose:
+                logger.info("Model unchanged since last upload")
+            return
+
+        if self.verbose:
+            logger.info(f"Uploading model from {path}")
+
+        self.output_model_.update_weights(
+            path,
+            auto_delete_file=False,
+            async_enable=False,
+        )
+        self.output_model_.update_design(config_dict=self.config_dict)
+
+        self._last_uploaded_model_path = path
+        return
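A usage sketch for the rewritten uploader; it assumes ClearML credentials and a task are already set up in the environment, and the checkpoint path is illustrative:

```python
from kostyl.ml.registry_uploader import ClearMLRegistryUploaderCallback

uploader = ClearMLRegistryUploaderCallback(
    model_name="my-model",
    tags=["baseline"],            # "LightningCheckpoint" is appended automatically
    new_model_per_upload=False,   # keep one ClearML OutputModel and replace its weights
    config_dict={"precision": "bf16"},
)

# Uploads the checkpoint; a second call with the same path is skipped as unchanged.
uploader.upload_checkpoint("checkpoints/run-001/epoch=02-val_loss=0.15.ckpt")
uploader.upload_checkpoint("checkpoints/run-001/epoch=02-val_loss=0.15.ckpt")
```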

kostyl/ml/schedulers/__init__.py (new file)
@@ -0,0 +1,18 @@
+from .composite import CompositeScheduler
+from .cosine import CosineParamScheduler
+from .cosine import CosineScheduler
+from .cosine_with_plateu import CosineWithPlateauParamScheduler
+from .cosine_with_plateu import CosineWithPlateuScheduler
+from .linear import LinearParamScheduler
+from .linear import LinearScheduler
+
+
+__all__ = [
+    "CompositeScheduler",
+    "CosineParamScheduler",
+    "CosineScheduler",
+    "CosineWithPlateauParamScheduler",
+    "CosineWithPlateuScheduler",
+    "LinearParamScheduler",
+    "LinearScheduler",
+]

kostyl/ml/schedulers/base.py
@@ -6,18 +6,20 @@ from typing import Any
 class BaseScheduler(ABC):
     """Base class for learning rate schedulers."""
 
+    @abstractmethod
     def state_dict(self) -> dict[str, Any]:
         """Get the state as a state dictionary."""
-
-            key: value
-            for key, value in self.__dict__.items()
-            if key not in ["optimizer", "scheduler_values"]
-        }
+        raise NotImplementedError
 
+    @abstractmethod
     def load_state_dict(self, state_dict: dict[str, Any]) -> None:
         """Load the state from a state dictionary."""
-
-
+        raise NotImplementedError
+
+    @abstractmethod
+    def _verify(self) -> None:
+        """Verify the scheduler configuration."""
+        raise NotImplementedError
 
     def __getstate__(self) -> dict[str, Any]:
         """Get the state for pickling."""
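With `state_dict`, `load_state_dict`, and `_verify` now abstract, each concrete scheduler supplies its own persistence and validation logic. A minimal sketch of a conforming subclass, assuming no further abstract members beyond those visible in this hunk; the `ConstantScheduler` is illustrative and not part of the package:

```python
from typing import Any, override

from kostyl.ml.schedulers.base import BaseScheduler  # module path per the file list


class ConstantScheduler(BaseScheduler):
    """Toy scheduler that always returns the same value."""

    def __init__(self, value: float, num_iters: int) -> None:
        self.value = value
        self.num_iters = num_iters
        self._verify()

    @override
    def state_dict(self) -> dict[str, Any]:
        return {"value": self.value, "num_iters": self.num_iters}

    @override
    def load_state_dict(self, state_dict: dict[str, Any]) -> None:
        self.__dict__.update(state_dict)

    @override
    def _verify(self) -> None:
        if self.num_iters <= 0:
            raise ValueError("num_iters must be positive.")
```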

kostyl/ml/schedulers/cosine.py
@@ -2,7 +2,6 @@ from typing import Any
 from typing import override
 
 import numpy as np
-import numpy.typing as npt
 import torch
 
 from .base import BaseScheduler
@@ -29,18 +28,24 @@ class _CosineSchedulerCore(BaseScheduler):
         if freeze_ratio is not None:
             if not (0 < freeze_ratio < 1):
                 raise ValueError(f"Freeze ratio must be in (0, 1), got {freeze_ratio}.")
+        pre_annealing_ratio = (warmup_ratio if warmup_ratio is not None else 0) + (
+            freeze_ratio if freeze_ratio is not None else 0
+        )
+        if pre_annealing_ratio > 1:
+            raise ValueError(
+                "The sum of warmup_ratio and freeze_ratio must <= 1, got "
+                f"{pre_annealing_ratio}."
+            )
 
         self.param_name = param_name
         self.num_iters = num_iters
         self.base_value = base_value
         self.final_value = final_value
-
         self.warmup_ratio = warmup_ratio
         self.warmup_value = warmup_value
-
         self.freeze_ratio = freeze_ratio
 
-        self.
+        self.scheduled_values: np.ndarray = np.array([], dtype=np.float64)
         self.current_value_ = self.base_value
         return
 
@@ -63,31 +68,29 @@ class _CosineSchedulerCore(BaseScheduler):
             warmup_iters = 0
             warmup_schedule = np.array([], dtype=np.float64)
 
+        # Create cosine annealing schedule
         cosine_annealing_iters = self.num_iters - warmup_iters - freeze_iters
-        if cosine_annealing_iters
-
-
-
-
-
-
-        )
+        if cosine_annealing_iters > 0:
+            iters = np.arange(cosine_annealing_iters)
+            cosine_annealing_schedule = self.final_value + 0.5 * (
+                self.base_value - self.final_value
+            ) * (1 + np.cos(np.pi * iters / len(iters)))
+        else:
+            cosine_annealing_schedule = np.array([], dtype=np.float64)
 
         # Concatenate all parts of the schedule
-        self.
-            (freeze_schedule, warmup_schedule,
+        self.scheduled_values = np.concatenate(
+            (freeze_schedule, warmup_schedule, cosine_annealing_schedule)
         )
-
-        if len(self.scheduler_values) != self.num_iters:
-            raise ValueError(
-                f"Scheduler length ({len(self.scheduler_values)}) does not match num_iters ({self.num_iters})."
-            )
+        self._verify()
         return
 
     @override
-    def
-
-
+    def _verify(self) -> None:
+        if len(self.scheduled_values) != self.num_iters:
+            raise ValueError(
+                f"Scheduler length ({len(self.scheduled_values)}) does not match num_iters ({self.num_iters})."
+            )
         return
 
     @override
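For reference, the piecewise schedule assembled above is: an optional freeze segment, an optional warmup segment, then cosine annealing from `base_value` down to `final_value` over the remaining iterations, i.e. value(t) = final + 0.5 * (base - final) * (1 + cos(pi * t / T)). A standalone sketch of that assembly with made-up numbers; the freeze and warmup segment shapes are assumptions, only the cosine formula and the concatenation/length check come from the hunk:

```python
import numpy as np

num_iters, base_value, final_value = 100, 1e-3, 1e-5
freeze_iters, warmup_iters = 10, 20

freeze_schedule = np.zeros(freeze_iters)                      # assumed: hold at 0 while frozen
warmup_schedule = np.linspace(0.0, base_value, warmup_iters)  # assumed: linear ramp to base_value

cosine_annealing_iters = num_iters - warmup_iters - freeze_iters
iters = np.arange(cosine_annealing_iters)
cosine_annealing_schedule = final_value + 0.5 * (base_value - final_value) * (
    1 + np.cos(np.pi * iters / len(iters))
)

scheduled_values = np.concatenate(
    (freeze_schedule, warmup_schedule, cosine_annealing_schedule)
)
assert len(scheduled_values) == num_iters  # the same invariant the new _verify() enforces
```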
@@ -95,13 +98,13 @@ class _CosineSchedulerCore(BaseScheduler):
         raise NotImplementedError
 
     def _get_value(self, it: int) -> float:
-        if len(self.
+        if len(self.scheduled_values) == 0:
             self._create_scheduler()
 
         if it >= self.num_iters:
             value: float = self.final_value
         else:
-            value: float = self.
+            value: float = self.scheduled_values[it]
         self.current_value_ = value
         return value
 
@@ -163,6 +166,21 @@ class CosineScheduler(_CosineSchedulerCore):
         self.param_group_field = param_group_field
         return
 
+    @override
+    def load_state_dict(self, state_dict: dict[str, Any]) -> None:
+        self.__dict__.update(state_dict)
+        self.scheduled_values = np.array([], dtype=np.float64)
+        return
+
+    @override
+    def state_dict(self) -> dict[str, Any]:
+        state = {
+            k: v
+            for k, v in self.__dict__.items()
+            if k not in ["scheduled_values", "optimizer"]
+        }
+        return state
+
     @override
     def step(self, it: int) -> None:
         value = self._get_value(it)
@@ -209,3 +227,14 @@ class CosineParamScheduler(_CosineSchedulerCore):
         """
         value = self._get_value(it)
         return value
+
+    @override
+    def load_state_dict(self, state_dict: dict[str, Any]) -> None:
+        self.__dict__.update(state_dict)
+        self.scheduled_values = np.array([], dtype=np.float64)
+        return
+
+    @override
+    def state_dict(self) -> dict[str, Any]:
+        state = {k: v for k, v in self.__dict__.items() if k != "scheduled_values"}
+        return state