fkat-0.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fkat/__init__.py +147 -0
- fkat/data/__init__.py +15 -0
- fkat/data/data_module.py +198 -0
- fkat/data/datasets/__init__.py +19 -0
- fkat/data/datasets/dict.py +78 -0
- fkat/data/datasets/json.py +176 -0
- fkat/data/datasets/map.py +90 -0
- fkat/data/datasets/parquet.py +242 -0
- fkat/data/datasets/sized.py +31 -0
- fkat/data/dict.py +42 -0
- fkat/data/samplers/__init__.py +9 -0
- fkat/data/samplers/dict.py +38 -0
- fkat/data/samplers/sized.py +16 -0
- fkat/data/samplers/strategies.py +68 -0
- fkat/data/sharded.py +718 -0
- fkat/data/shm.py +364 -0
- fkat/predict.py +32 -0
- fkat/py.typed +0 -0
- fkat/pytorch/__init__.py +3 -0
- fkat/pytorch/actions/__init__.py +11 -0
- fkat/pytorch/actions/aws/__init__.py +3 -0
- fkat/pytorch/actions/aws/batch.py +29 -0
- fkat/pytorch/actions/aws/ec2.py +61 -0
- fkat/pytorch/callbacks/__init__.py +2 -0
- fkat/pytorch/callbacks/cuda/__init__.py +16 -0
- fkat/pytorch/callbacks/cuda/cache.py +115 -0
- fkat/pytorch/callbacks/cuda/memory.py +200 -0
- fkat/pytorch/callbacks/cuda/nsys.py +199 -0
- fkat/pytorch/callbacks/cuda/nvtx.py +288 -0
- fkat/pytorch/callbacks/cuda/xid.py +173 -0
- fkat/pytorch/callbacks/debugging/__init__.py +9 -0
- fkat/pytorch/callbacks/debugging/introspection.py +569 -0
- fkat/pytorch/callbacks/debugging/optimizer.py +45 -0
- fkat/pytorch/callbacks/gc.py +146 -0
- fkat/pytorch/callbacks/loggers.py +211 -0
- fkat/pytorch/callbacks/logging/__init__.py +12 -0
- fkat/pytorch/callbacks/logging/heartbeat.py +76 -0
- fkat/pytorch/callbacks/logging/throughput.py +253 -0
- fkat/pytorch/callbacks/logging/validation_metrics.py +94 -0
- fkat/pytorch/callbacks/monitoring/__init__.py +14 -0
- fkat/pytorch/callbacks/monitoring/crash.py +162 -0
- fkat/pytorch/callbacks/monitoring/dp.py +130 -0
- fkat/pytorch/callbacks/monitoring/hardware_stats.py +135 -0
- fkat/pytorch/callbacks/monitoring/shutdown.py +170 -0
- fkat/pytorch/callbacks/profiling/__init__.py +13 -0
- fkat/pytorch/callbacks/profiling/flops.py +574 -0
- fkat/pytorch/callbacks/profiling/memray.py +212 -0
- fkat/pytorch/callbacks/profiling/torch.py +197 -0
- fkat/pytorch/callbacks/profiling/viztracer.py +197 -0
- fkat/pytorch/loggers.py +284 -0
- fkat/pytorch/schedule/__init__.py +27 -0
- fkat/pytorch/schedule/base.py +308 -0
- fkat/pytorch/schedule/mlflow.py +143 -0
- fkat/pytorch/utilities.py +49 -0
- fkat/test.py +31 -0
- fkat/train.py +32 -0
- fkat/utils/__init__.py +28 -0
- fkat/utils/aws/__init__.py +3 -0
- fkat/utils/aws/imds.py +137 -0
- fkat/utils/boto3.py +24 -0
- fkat/utils/config.py +194 -0
- fkat/utils/cuda/__init__.py +3 -0
- fkat/utils/cuda/preflight/__init__.py +3 -0
- fkat/utils/cuda/preflight/health_check/aws_instance_config.py +82 -0
- fkat/utils/cuda/preflight/health_check/constants.py +23 -0
- fkat/utils/cuda/preflight/health_check/ddb_client.py +82 -0
- fkat/utils/cuda/preflight/health_check/gpu_connection_test.py +104 -0
- fkat/utils/cuda/preflight/health_check/gpu_stress_test.py +122 -0
- fkat/utils/cuda/preflight/health_check/helpers.py +297 -0
- fkat/utils/cuda/preflight/health_check/logger.py +205 -0
- fkat/utils/cuda/preflight/health_check/timer.py +31 -0
- fkat/utils/cuda/preflight/run.py +560 -0
- fkat/utils/cuda/xid.py +48 -0
- fkat/utils/logging.py +28 -0
- fkat/utils/mlflow.py +33 -0
- fkat/utils/pandas.py +25 -0
- fkat/utils/pdb.py +84 -0
- fkat/utils/pool.py +81 -0
- fkat/utils/profiler.py +18 -0
- fkat/utils/pyarrow.py +21 -0
- fkat/utils/rng.py +27 -0
- fkat/utils/shm.py +184 -0
- fkat/validate.py +31 -0
- fkat-0.1.2.dist-info/METADATA +134 -0
- fkat-0.1.2.dist-info/RECORD +88 -0
- fkat-0.1.2.dist-info/WHEEL +4 -0
- fkat-0.1.2.dist-info/licenses/LICENSE +175 -0
- fkat-0.1.2.dist-info/licenses/NOTICE +1 -0
fkat/pytorch/callbacks/cuda/memory.py
@@ -0,0 +1,200 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
import logging
import os
import pickle
import tempfile
from datetime import datetime, timezone
from typing import Any
from typing_extensions import override

import lightning as L
import torch
from torch.cuda import memory

from fkat.pytorch.schedule import (
    Schedule,
    Never,
)
from fkat.pytorch.loggers import LightningLogger
from fkat.pytorch.callbacks.loggers import CallbackLogger
from fkat.utils import safe_timestamp

logger: logging.Logger = logging.getLogger(__name__)


def _artifact_path(root_dir: str, rank: int, file_type: str, ext: str) -> tuple[str, str]:
    base_dir = os.path.join(root_dir, "torch.cuda.memory")
    timestamp = safe_timestamp()
    file_path = os.path.join(base_dir, f"rank{rank}/{file_type}/rank{rank}_{timestamp}.{ext}")
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    return base_dir, file_path


def _reset_recording(kwargs: dict[str, Any]) -> None:
    if torch.cuda.is_available():
        memory._record_memory_history(enabled=None)
        # Limit the ring buffer size; otherwise it can grow to ~100 GB and trigger CPU OOM.
        kwargs.setdefault("max_entries", 1000000)
        memory._record_memory_history(**kwargs)


def _detect_tensor_cycles(cb_logger: CallbackLogger, rank: int) -> None:
    from torch.utils.viz import _cycles

    def is_cuda_tensor(obj: Any) -> bool:
        try:
            return (
                isinstance(obj, torch.Tensor)
                and obj.device.type == "cuda"
                and not isinstance(obj, torch._subclasses.FakeTensor)
            )
        except:  # noqa: E722
            return False

    _cycles.is_cuda_tensor = is_cuda_tensor  # type: ignore[invalid-assignment]

    def observer(garbage: Any) -> None:
        if garbage:
            if not any(_cycles.is_cuda_tensor(obj) for obj in garbage):
                logger.debug("No CUDA Tensors found in garbage")
                return
            logger.warning("Reference cycle includes a CUDA Tensor")
            with tempfile.TemporaryDirectory() as temp_dir:
                base_dir, html_path = _artifact_path(temp_dir, rank, "cycles", "html")
                logger.debug(f"Saving tensor cycles to {html_path}")
                with open(html_path, "wb") as f:
                    f.write(_cycles.to_html(_cycles.create_graph(garbage)))
                cb_logger.log_artifact(base_dir)

    _cycles.observe_garbage(observer)


class MemoryObserver(L.Callback):
    """This callback registers an observer to dump and log the CUDA memory snapshot.

    Args:
        oom (bool): whether to dump a memory snapshot on an Out-of-Memory (OOM) event. Defaults to ``True``
        flamegraph (bool): whether to save the memory snapshot in flamegraph format. Defaults to ``True``
        reset_memory_history (bool): whether to reset memory history after a snapshot. Defaults to ``False``
        snapshot_pickle (bool): whether to dump the memory snapshot in pickle format. Defaults to ``False``
        tensor_cycles (bool): whether to detect and dump graphs with cycles containing tensors in the garbage.
            Defaults to ``False``.
        schedule (Optional[Schedule]): Controls when logging occurs besides the OOM event. Defaults to :class:`Never`
        **kwargs (Any): Arbitrary keyword arguments passed as is to ``memory._record_memory_history``.
    """

    def __init__(
        self,
        flamegraph: bool = True,
        reset_memory_history: bool = False,
        snapshot_pickle: bool = False,
        tensor_cycles: bool = False,
        schedule: Schedule | None = None,
        oom: bool = True,
        **kwargs: Any,
    ) -> None:
        self.flamegraph = flamegraph
        self.reset_memory_history = reset_memory_history
        self.snapshot_pickle = snapshot_pickle
        self.tensor_cycles = tensor_cycles
        self.schedule = schedule or Never()
        self.oom = oom
        self.kwargs = kwargs
        self._cb_logger: LightningLogger | None = None
        _reset_recording(kwargs)

    @override
    def setup(self, trainer: "L.Trainer", pl_module: "L.LightningModule", stage: str) -> None:
        if not torch.cuda.is_available():
            logger.warning("No CUDA device is available")
            return
        self._cb_logger = CallbackLogger(trainer)
        if self.tensor_cycles:
            _detect_tensor_cycles(self._cb_logger, trainer.global_rank)
        if self.oom:
            if hasattr(torch._C, "_cuda_attach_out_of_memory_observer"):

                def oom_observer_func(device: Any, alloc: Any, device_alloc: Any, device_free: Any) -> None:
                    logger.warning("OOM observer triggered")
                    return self.dump_memory_snapshot(trainer.global_rank)

                torch._C._cuda_attach_out_of_memory_observer(oom_observer_func)
                logger.info("OOM observer registered successfully")
            else:
                logger.warning(
                    f"Failed to register OOM observer because torch._C._cuda_attach_out_of_memory_observer "
                    f"is missing in torch=={torch.__version__}"
                )

    def maybe_dump_memory_snapshot(
        self, trainer: "L.Trainer", stage: str | None = None, batch_idx: int | None = None
    ) -> None:
        if not torch.cuda.is_available():
            return
        if self.schedule.check(stage="train", batch_idx=batch_idx, step=trainer.global_step, trainer=trainer):
            self.dump_memory_snapshot(trainer.global_rank)

    def dump_memory_snapshot(self, rank: int) -> None:
        if not hasattr(memory, "_snapshot"):
            logger.warning(
                f"Failed to capture memory snapshot because memory._snapshot is missing in torch=={torch.__version__}"
            )
            return
        now = datetime.now(timezone.utc).isoformat()
        logger.debug(f"Capturing memory snapshot on rank {rank} at {now}")
        snapshot = memory._snapshot()
        if self.reset_memory_history:
            _reset_recording(self.kwargs)
        with tempfile.TemporaryDirectory() as temp_dir:
            base_dir: str | None = None
            if self.snapshot_pickle:
                base_dir, snapshot_path = _artifact_path(temp_dir, rank, "snapshot", "pickle")
                logger.debug(f"Saving memory snapshot to {snapshot_path}")
                with open(snapshot_path, "wb") as f:
                    pickle.dump(snapshot, f)
            if self.flamegraph:
                if hasattr(torch.cuda, "_memory_viz"):
                    flamegraph = torch.cuda._memory_viz.memory(snapshot)
                    base_dir, flamegraph_path = _artifact_path(temp_dir, rank, "flamegraph", "svg")
                    logger.debug(f"Saving memory flamegraph to {flamegraph_path}")
                    with open(flamegraph_path, "w") as f:
                        print(flamegraph, file=f)
                else:
                    logger.warning(
                        f"Failed to create flamegraph because torch.cuda._memory_viz "
                        f"is missing in torch=={torch.__version__}"
                    )
            if base_dir is not None:
                logger.debug(f"Logging memory snapshot files with {self._cb_logger}")
                assert self._cb_logger
                self._cb_logger.log_artifact(base_dir)
        logger.debug("Finished capturing memory snapshot")

    @override
    def on_train_batch_start(
        self,
        trainer: "L.Trainer",
        pl_module: "L.LightningModule",
        batch: Any,
        batch_idx: int,
    ) -> None:
        self.maybe_dump_memory_snapshot(trainer, stage="train", batch_idx=batch_idx)

    @override
    def on_test_batch_start(
        self, trainer: "L.Trainer", pl_module: "L.LightningModule", batch: Any, batch_idx: int, dataloader_idx: int = 0
    ) -> None:
        self.maybe_dump_memory_snapshot(trainer, stage="test", batch_idx=batch_idx)

    @override
    def on_validation_batch_start(
        self, trainer: "L.Trainer", pl_module: "L.LightningModule", batch: Any, batch_idx: int, dataloader_idx: int = 0
    ) -> None:
        self.maybe_dump_memory_snapshot(trainer, stage="validation", batch_idx=batch_idx)

    @override
    def on_predict_batch_start(
        self, trainer: "L.Trainer", pl_module: "L.LightningModule", batch: Any, batch_idx: int, dataloader_idx: int = 0
    ) -> None:
        self.maybe_dump_memory_snapshot(trainer, stage="predict", batch_idx=batch_idx)
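For orientation, a minimal usage sketch of the MemoryObserver callback above. This is not part of the package; it assumes the class is imported from its module path and that the default Never schedule is acceptable, so snapshots are produced only by the OOM observer.

# Hypothetical usage sketch: attach MemoryObserver to a Lightning Trainer so CUDA
# memory snapshots are dumped and logged as artifacts when an OOM event fires.
import lightning as L

from fkat.pytorch.callbacks.cuda.memory import MemoryObserver

trainer = L.Trainer(
    callbacks=[
        MemoryObserver(
            flamegraph=True,       # save the snapshot as an SVG flamegraph
            snapshot_pickle=True,  # also keep the raw snapshot pickle
            tensor_cycles=False,   # opt-in reference-cycle detection
            # schedule=...         # a Schedule would add periodic snapshots besides OOM
        )
    ],
)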
fkat/pytorch/callbacks/cuda/nsys.py
@@ -0,0 +1,199 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
import os
import sys
import gzip
import shutil
import tempfile
import atexit
import signal
from typing import Any, TYPE_CHECKING
from typing_extensions import override
from collections.abc import Sequence

import torch
import lightning as L

if TYPE_CHECKING:
    from lightning.pytorch.utilities.types import STEP_OUTPUT

from fkat.pytorch.schedule import (
    Schedule,
    Never,
)
from fkat.pytorch.utilities import get_rank
from fkat.pytorch.loggers import LightningLogger
from fkat.pytorch.callbacks.loggers import CallbackLogger


def exec_with_nsys(kwargs: dict[str, str]) -> None:
    """Replace the current process with nsys profiling of the specified script."""
    # only capture between explicit API calls to start/stop profiling
    kwargs["capture-range"] = "cudaProfilerApi"
    kwargs["capture-range-end"] = "stop"

    script_path, args = sys.argv[0], sys.argv[1:]
    nsys_cmd = ["nsys", "profile", *[f"--{k}={v}" for k, v in kwargs.items()], "python", script_path] + args

    # add current working dir for module resolution
    os.environ["PYTHONPATH"] = os.path.join(
        os.getcwd(), *([os.environ["PYTHONPATH"]] if "PYTHONPATH" in os.environ else [])
    )

    # replace current process with nsys
    os.execvp("nsys", nsys_cmd)


class Nsys(L.Callback):
    def __init__(
        self,
        ranks: Sequence[int] | None = None,
        output_path_prefix: str | None = None,
        schedule: Schedule | None = None,
        compress: bool = True,
        record_shapes: bool = False,
        **kwargs: Any,
    ) -> None:
        """
        [Nsys](https://docs.nvidia.com/nsight-systems/UserGuide/index.html) PyTorch Lightning callback.
        This :class:`L.Callback` continuously traces the training process and publishes a report
        that helps examine the duration of individual calls over time.

        Args:
            ranks (Optional[Sequence[int]]): Only trace the provided ranks; defaults to all ranks
            output_path_prefix (Optional[str]): output path prefix for generated reports;
                use it to persist these files locally. Defaults to a temporary location that is cleaned up as soon as possible
            schedule (Optional[Schedule]): Controls when tracing occurs during training.
                Defaults to :class:`Never` - no tracing
            compress (bool): Whether to compress the report.
                Defaults to ``True``
            record_shapes (bool): Whether to include tensor shapes in the report.
                Defaults to ``False``
            **kwargs (Any): Arbitrary keyword arguments passed as is to Nsys.
        """
        self.rank = get_rank()
        self.schedule = schedule or Never()
        self.output_path_prefix = output_path_prefix
        self.compress = compress
        self.record_shapes = record_shapes
        self._enabled = False

        if ranks is None or self.rank in ranks:
            # break infinite recursion
            self.output_file = os.environ.pop("NSYS_OUTPUT", None)
            if self.output_file is None:
                output_file = os.path.join(self.output_path_prefix or tempfile.mkdtemp(), f"rank{self.rank}.nsys-rep")
                os.environ["NSYS_OUTPUT"] = kwargs["output"] = output_file
                exec_with_nsys(kwargs)
            self._maybe_trace()
        self._cb_logger: LightningLogger | None = None
        self.stage: str | None = None

        signal.signal(signal.SIGTERM, self._terminate)  # terminate signal
        signal.signal(signal.SIGINT, self._terminate)  # keyboard interrupt
        atexit.register(self._terminate)

    @override
    def setup(self, trainer: "L.Trainer", pl_module: "L.LightningModule", stage: str) -> None:
        self._cb_logger = CallbackLogger(trainer)
        self.stage = stage
        self._maybe_trace(stage=stage)

    def _maybe_trace(
        self, trainer: "L.Trainer | None" = None, stage: str | None = None, batch_idx: int | None = None
    ) -> None:
        should_run = self.schedule.check(
            stage=stage, batch_idx=batch_idx, step=trainer.global_step if trainer else None, trainer=trainer
        )
        if should_run:
            self._start()
        else:
            self._stop()

    def _start(self) -> None:
        if self._enabled:
            return
        self._enabled = True
        torch.cuda.cudart().cudaProfilerStart()
        torch.autograd.profiler.emit_nvtx(record_shapes=self.record_shapes).__enter__()

    def _stop(self) -> None:
        if not self._enabled:
            return
        torch.cuda.cudart().cudaProfilerStop()
        torch.autograd.profiler.emit_nvtx().__exit__(None, None, None)
        self._enabled = False

    @override
    def on_train_batch_end(
        self,
        trainer: "L.Trainer",
        pl_module: "L.LightningModule",
        outputs: Any,
        batch: Any,
        batch_idx: int,
    ) -> None:
        self._maybe_trace(trainer, "train", batch_idx + 1)

    @override
    def on_validation_batch_end(
        self,
        trainer: "L.Trainer",
        pl_module: "L.LightningModule",
        outputs: "STEP_OUTPUT",
        batch: Any,
        batch_idx: int,
        dataloader_idx: int = 0,
    ) -> None:
        self._maybe_trace(trainer, "validation", batch_idx + 1)

    @override
    def on_predict_batch_end(
        self,
        trainer: "L.Trainer",
        pl_module: "L.LightningModule",
        outputs: Any,
        batch: Any,
        batch_idx: int,
        dataloader_idx: int = 0,
    ) -> None:
        self._maybe_trace(trainer, "predict", batch_idx + 1)

    @override
    def on_test_batch_end(
        self,
        trainer: "L.Trainer",
        pl_module: "L.LightningModule",
        outputs: "STEP_OUTPUT",
        batch: Any,
        batch_idx: int,
        dataloader_idx: int = 0,
    ) -> None:
        self._maybe_trace(trainer, "test", batch_idx + 1)

    def _publish(self) -> None:
        self._stop()
        assert self.output_file
        os.makedirs(os.path.dirname(self.output_file), exist_ok=True)
        if self.compress:
            with open(self.output_file, "rb") as f_in:
                output_file = self.output_file + ".gz"
                with gzip.open(output_file, "wb") as f_out:
                    shutil.copyfileobj(f_in, f_out)
            shutil.rmtree(self.output_file, ignore_errors=True)
        assert self._cb_logger
        self._cb_logger.log_artifact(output_file, "nsys")
        if not self.output_path_prefix:
            shutil.rmtree(output_file, ignore_errors=True)

    @override
    def teardown(self, trainer: "L.Trainer", pl_module: "L.LightningModule", stage: str) -> None:
        self._terminate()

    @override
    def on_exception(self, trainer: "L.Trainer", pl_module: "L.LightningModule", exception: BaseException) -> None:
        self._terminate()

    def _terminate(self, *_: Any) -> None:
        if self.stage:
            self._publish()
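A hedged usage sketch for the Nsys callback above, not part of the package: on a selected rank the constructor re-execs the process under `nsys profile` with capture gated on the CUDA profiler API, and any extra keyword arguments are forwarded to nsys as `--key=value` flags (per `exec_with_nsys`). The schedule is left at its Never default here, so no ranges are actually captured unless a real Schedule is supplied.

# Hypothetical usage sketch: profile rank 0 with Nsight Systems and keep the report locally.
import lightning as L

from fkat.pytorch.callbacks.cuda.nsys import Nsys

nsys = Nsys(
    ranks=[0],                       # only re-exec rank 0 under `nsys profile`
    output_path_prefix="/tmp/nsys",  # persist rank0.nsys-rep(.gz) instead of using a temp dir
    compress=True,                   # gzip the report before logging it as an artifact
    # schedule=...                   # a Schedule is required for any capture range to be recorded
)
trainer = L.Trainer(callbacks=[nsys])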
fkat/pytorch/callbacks/cuda/nvtx.py
@@ -0,0 +1,288 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
from enum import Enum
from typing import Any, TYPE_CHECKING
from typing_extensions import override
import inspect

import lightning as L
import torch

if TYPE_CHECKING:
    from lightning.pytorch.utilities.types import STEP_OUTPUT

try:
    import nvtx
except ImportError:
    from torch.cuda import nvtx

_mark = nvtx.mark


def _conditional_mark(message: str, *args: Any, **kwargs: Any) -> Any:
    sig = inspect.signature(_mark)
    filtered_kwargs = {}

    if "domain" in kwargs and "color" not in kwargs:
        kwargs["color"] = DOMAIN_COLORS[kwargs["domain"]]

    for param in ["color", "domain"]:
        if param in sig.parameters and param in kwargs:
            filtered_kwargs[param] = kwargs[param]
    return _mark(message, **filtered_kwargs)


nvtx.mark = _conditional_mark  # type: ignore[invalid-assignment]


class Domain(str, Enum):
    INIT = "init"
    TRAIN = "train"
    VALIDATION = "validation"
    TEST = "test"
    PREDICT = "predict"
    TUNE = "tune"
    ERROR = "error"
    CHECKPOINT = "checkpoint"

    @staticmethod
    def from_stage(s: str) -> "Domain":
        if s == "fit" or s == "train":
            return Domain.TRAIN
        if s == "validation":
            return Domain.VALIDATION
        if s == "test":
            return Domain.TEST
        if s == "predict":
            return Domain.PREDICT
        if s == "tune":
            return Domain.TUNE
        raise NotImplementedError(f"Unsupported stage: {s}")


DOMAIN_COLORS = {
    Domain.INIT: "white",
    Domain.TUNE: "pink",
    Domain.TRAIN: "green",
    Domain.VALIDATION: "blue",
    Domain.TEST: "purple",
    Domain.PREDICT: "yellow",
    Domain.ERROR: "red",
    Domain.CHECKPOINT: "orange",
}


class Nvtx(L.Callback):
    def __init__(self) -> None:
        nvtx.mark("__init__()", domain=Domain.INIT)  # type: ignore[unknown-argument]

    @override
    def setup(self, trainer: "L.Trainer", pl_module: "L.LightningModule", stage: str) -> None:
        domain = Domain.from_stage(stage)
        nvtx.mark(f"setup(stage={stage})", domain=domain)  # type: ignore[unknown-argument]

    @override
    def teardown(self, trainer: "L.Trainer", pl_module: "L.LightningModule", stage: str) -> None:
        domain = Domain.from_stage(stage)
        nvtx.mark(f"teardown(stage={stage})", domain=domain)  # type: ignore[unknown-argument]

    @override
    def on_train_start(self, trainer: "L.Trainer", pl_module: "L.LightningModule") -> None:
        nvtx.mark("on_train_start()", domain=Domain.TRAIN)  # type: ignore[unknown-argument]

    @override
    def on_train_epoch_start(self, trainer: "L.Trainer", pl_module: "L.LightningModule") -> None:
        nvtx.mark("on_train_epoch_start()", domain=Domain.TRAIN)  # type: ignore[unknown-argument]

    @override
    def on_train_batch_start(
        self, trainer: "L.Trainer", pl_module: "L.LightningModule", batch: Any, batch_idx: int
    ) -> None:
        nvtx.mark(
            f"on_train_batch_start(batch_idx={batch_idx})",
            domain=Domain.TRAIN,  # type: ignore[unknown-argument]
        )

    @override
    def on_before_zero_grad(
        self, trainer: "L.Trainer", pl_module: "L.LightningModule", optimizer: "torch.optim.Optimizer"
    ) -> None:
        nvtx.mark("on_before_zero_grad()", domain=Domain.TRAIN)  # type: ignore[unknown-argument]

    @override
    def on_before_backward(self, trainer: "L.Trainer", pl_module: "L.LightningModule", loss: "torch.Tensor") -> None:
        nvtx.mark("on_before_backward()", domain=Domain.TRAIN)  # type: ignore[unknown-argument]

    @override
    def on_after_backward(self, trainer: "L.Trainer", pl_module: "L.LightningModule") -> None:
        nvtx.mark("on_after_backward()", domain=Domain.TRAIN)  # type: ignore[unknown-argument]

    @override
    def on_before_optimizer_step(
        self, trainer: "L.Trainer", pl_module: "L.LightningModule", optimizer: "torch.optim.Optimizer"
    ) -> None:
        nvtx.mark("on_before_optimizer_step()", domain=Domain.TRAIN)  # type: ignore[unknown-argument]

    @override
    def on_train_batch_end(
        self,
        trainer: "L.Trainer",
        pl_module: "L.LightningModule",
        outputs: "STEP_OUTPUT",
        batch: Any,
        batch_idx: int,
        dataloader_idx: int = 0,
    ) -> None:
        nvtx.mark(f"on_train_batch_end(batch_idx={batch_idx})", domain=Domain.TRAIN)  # type: ignore[unknown-argument]

    @override
    def on_train_epoch_end(self, trainer: "L.Trainer", pl_module: "L.LightningModule") -> None:
        nvtx.mark("on_train_epoch_end()", domain=Domain.TRAIN)  # type: ignore[unknown-argument]

    @override
    def on_train_end(self, trainer: "L.Trainer", pl_module: "L.LightningModule") -> None:
        nvtx.mark("on_train_end()", domain=Domain.TRAIN)  # type: ignore[unknown-argument]

    @override
    def on_sanity_check_start(self, trainer: "L.Trainer", pl_module: "L.LightningModule") -> None:
        nvtx.mark("on_sanity_check_start()", domain=Domain.VALIDATION)  # type: ignore[unknown-argument]

    @override
    def on_sanity_check_end(self, trainer: "L.Trainer", pl_module: "L.LightningModule") -> None:
        nvtx.mark("on_sanity_check_end()", domain=Domain.VALIDATION)  # type: ignore[unknown-argument]

    @override
    def on_validation_start(self, trainer: "L.Trainer", pl_module: "L.LightningModule") -> None:
        nvtx.mark("on_validation_start()", domain=Domain.VALIDATION)  # type: ignore[unknown-argument]

    @override
    def on_validation_epoch_start(self, trainer: "L.Trainer", pl_module: "L.LightningModule") -> None:
        nvtx.mark("on_validation_epoch_start()", domain=Domain.VALIDATION)  # type: ignore[unknown-argument]

    @override
    def on_validation_batch_start(
        self, trainer: "L.Trainer", pl_module: "L.LightningModule", batch: Any, batch_idx: int, dataloader_idx: int = 0
    ) -> None:
        nvtx.mark(
            f"on_validation_batch_start(batch_idx={batch_idx})",
            domain=Domain.VALIDATION,  # type: ignore[unknown-argument]
        )

    @override
    def on_validation_batch_end(
        self,
        trainer: "L.Trainer",
        pl_module: "L.LightningModule",
        outputs: "STEP_OUTPUT",
        batch: Any,
        batch_idx: int,
        dataloader_idx: int = 0,
    ) -> None:
        nvtx.mark(
            f"on_validation_batch_end(batch_idx={batch_idx})",
            domain=Domain.VALIDATION,  # type: ignore[unknown-argument]
        )

    @override
    def on_validation_epoch_end(self, trainer: "L.Trainer", pl_module: "L.LightningModule") -> None:
        nvtx.mark("on_validation_epoch_end()", domain=Domain.VALIDATION)  # type: ignore[unknown-argument]

    @override
    def on_validation_end(self, trainer: "L.Trainer", pl_module: "L.LightningModule") -> None:
        nvtx.mark("on_validation_end()", domain=Domain.VALIDATION)  # type: ignore[unknown-argument]

    @override
    def on_test_start(self, trainer: "L.Trainer", pl_module: "L.LightningModule") -> None:
        nvtx.mark("on_test_start()", domain=Domain.TEST)  # type: ignore[unknown-argument]

    @override
    def on_test_epoch_start(self, trainer: "L.Trainer", pl_module: "L.LightningModule") -> None:
        nvtx.mark("on_test_epoch_start()", domain=Domain.TEST)  # type: ignore[unknown-argument]

    @override
    def on_test_batch_start(
        self, trainer: "L.Trainer", pl_module: "L.LightningModule", batch: Any, batch_idx: int, dataloader_idx: int = 0
    ) -> None:
        nvtx.mark(f"on_test_batch_start(batch_idx={batch_idx})", domain=Domain.TEST)  # type: ignore[unknown-argument]

    @override
    def on_test_batch_end(
        self,
        trainer: "L.Trainer",
        pl_module: "L.LightningModule",
        outputs: "STEP_OUTPUT",
        batch: Any,
        batch_idx: int,
        dataloader_idx: int = 0,
    ) -> None:
        nvtx.mark(f"on_test_batch_end(batch_idx={batch_idx})", domain=Domain.TEST)  # type: ignore[unknown-argument]

    @override
    def on_test_epoch_end(self, trainer: "L.Trainer", pl_module: "L.LightningModule") -> None:
        nvtx.mark("on_test_epoch_end()", domain=Domain.TEST)  # type: ignore[unknown-argument]

    @override
    def on_test_end(self, trainer: "L.Trainer", pl_module: "L.LightningModule") -> None:
        nvtx.mark("on_test_end()", domain=Domain.TEST)  # type: ignore[unknown-argument]

    @override
    def on_predict_start(self, trainer: "L.Trainer", pl_module: "L.LightningModule") -> None:
        nvtx.mark("on_predict_start()", domain=Domain.PREDICT)  # type: ignore[unknown-argument]

    @override
    def on_predict_epoch_start(self, trainer: "L.Trainer", pl_module: "L.LightningModule") -> None:
        nvtx.mark("on_predict_epoch_start()", domain=Domain.PREDICT)  # type: ignore[unknown-argument]

    @override
    def on_predict_batch_start(
        self, trainer: "L.Trainer", pl_module: "L.LightningModule", batch: Any, batch_idx: int, dataloader_idx: int = 0
    ) -> None:
        nvtx.mark(
            f"on_predict_batch_start(batch_idx={batch_idx})",
            domain=Domain.PREDICT,  # type: ignore[unknown-argument]
        )

    @override
    def on_predict_batch_end(
        self,
        trainer: "L.Trainer",
        pl_module: "L.LightningModule",
        outputs: Any,
        batch: Any,
        batch_idx: int,
        dataloader_idx: int = 0,
    ) -> None:
        nvtx.mark(
            f"on_predict_batch_end(batch_idx={batch_idx})",
            domain=Domain.PREDICT,  # type: ignore[unknown-argument]
        )

    @override
    def on_predict_epoch_end(self, trainer: "L.Trainer", pl_module: "L.LightningModule") -> None:
        nvtx.mark("on_predict_epoch_end()", domain=Domain.PREDICT)  # type: ignore[unknown-argument]

    @override
    def on_predict_end(self, trainer: "L.Trainer", pl_module: "L.LightningModule") -> None:
        nvtx.mark("on_predict_end()", domain=Domain.PREDICT)  # type: ignore[unknown-argument]

    @override
    def state_dict(self) -> dict[str, Any]:
        nvtx.mark("state_dict()", domain=Domain.CHECKPOINT)  # type: ignore[unknown-argument]
        return {}

    @override
    def load_state_dict(self, state_dict: dict[str, Any]) -> None:
        nvtx.mark("load_state_dict()", domain=Domain.CHECKPOINT)  # type: ignore[unknown-argument]

    @override
    def on_save_checkpoint(
        self, trainer: "L.Trainer", pl_module: "L.LightningModule", checkpoint: dict[str, Any]
    ) -> None:
        nvtx.mark("on_save_checkpoint()", domain=Domain.CHECKPOINT)  # type: ignore[unknown-argument]

    @override
    def on_load_checkpoint(
        self, trainer: "L.Trainer", pl_module: "L.LightningModule", checkpoint: dict[str, Any]
    ) -> None:
        nvtx.mark("on_load_checkpoint()", domain=Domain.CHECKPOINT)  # type: ignore[unknown-argument]

    @override
    def on_exception(self, trainer: "L.Trainer", pl_module: "L.LightningModule", exception: BaseException) -> None:
        nvtx.mark(f"on_exception({type(exception)})", domain=Domain.ERROR)  # type: ignore[unknown-argument]
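A hedged usage sketch for the Nvtx callback above, not part of the package: it is an assumption about intended use that Nvtx is paired with the Nsys callback so that the per-hook `nvtx.mark` calls (colored per Domain, e.g. train=green, validation=blue) appear as markers on the Nsight Systems timeline; each callback also works independently.

# Hypothetical usage sketch: emit NVTX marks for every Lightning hook while tracing
# with Nsight Systems, so batch/epoch boundaries and checkpoint events show up as
# colored markers in the captured timeline.
import lightning as L

from fkat.pytorch.callbacks.cuda.nsys import Nsys
from fkat.pytorch.callbacks.cuda.nvtx import Nvtx

trainer = L.Trainer(
    callbacks=[
        Nvtx(),           # domain-colored marks for init/train/validation/test/predict/checkpoint/error
        Nsys(ranks=[0]),  # without a Schedule (Never by default) no capture ranges are recorded
    ],
)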