d9d 0.1.0-py3-none-any.whl → 0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- d9d/kernel/swiglu/op.py +17 -4
- d9d/loop/component/__init__.py +5 -2
- d9d/loop/component/model_stage_factory.py +14 -54
- d9d/loop/component/{loss_computer.py → pipeline_result_processing.py} +73 -10
- d9d/loop/component/{train_task_operator.py → task_operator.py} +97 -57
- d9d/loop/config/config.py +1 -1
- d9d/loop/control/task.py +2 -2
- d9d/loop/run/__init__.py +3 -0
- d9d/loop/run/inference.py +256 -0
- d9d/loop/run/train.py +2 -4
- d9d/loop/state.py +4 -1
- d9d/pipelining/api/__init__.py +3 -0
- d9d/pipelining/api/types.py +28 -0
- d9d/pipelining/factory/factory.py +68 -29
- d9d/pipelining/infra/schedule/component/runtime/__init__.py +3 -1
- d9d/pipelining/infra/schedule/component/runtime/action.py +7 -10
- d9d/pipelining/infra/schedule/component/runtime/{loss.py → callback.py} +33 -10
- d9d/pipelining/infra/schedule/component/runtime/executor.py +10 -8
- d9d/pipelining/infra/schedule/component/runtime/offline.py +70 -0
- {d9d-0.1.0.dist-info → d9d-0.2.0.dist-info}/METADATA +22 -1
- {d9d-0.1.0.dist-info → d9d-0.2.0.dist-info}/RECORD +23 -19
- {d9d-0.1.0.dist-info → d9d-0.2.0.dist-info}/WHEEL +1 -1
- d9d-0.2.0.dist-info/licenses/LICENSE +201 -0
d9d/kernel/swiglu/op.py
CHANGED
@@ -3,6 +3,15 @@ import triton
 import triton.language as tl
 
 
+def _size_bucket(n_elements: int) -> int:
+    # different auto-tuning for small and asymptotically large kernels
+    # perhaps we could extend this in future?
+    if n_elements < 8192:
+        return 0
+    else:
+        return 1
+
+
 @triton.autotune(
     configs=[
         triton.Config({"BLOCK_SIZE": 1024}, num_warps=4),
@@ -11,7 +20,7 @@ import triton.language as tl
         triton.Config({"BLOCK_SIZE": 4096}, num_warps=8),
         triton.Config({"BLOCK_SIZE": 8192}, num_warps=8),
     ],
-    key=["
+    key=["size_bucket"]
 )
 @triton.jit
 def _silu_mul_kernel(
@@ -19,6 +28,7 @@ def _silu_mul_kernel(
     y_ptr: torch.Tensor,
     out_ptr: torch.Tensor,
     n_elements: int,
+    size_bucket: int,  # used for autotuning
     BLOCK_SIZE: tl.constexpr,
 ):
     # prepare
@@ -72,7 +82,8 @@ def silu_mul_forward(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
 
     _silu_mul_kernel[_grid](
         x, y, out,
-        n_elements
+        n_elements,
+        size_bucket=_size_bucket(n_elements)
     )
 
     return out
@@ -86,7 +97,7 @@ def silu_mul_forward(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         triton.Config({"BLOCK_SIZE": 4096}, num_warps=8),
         triton.Config({"BLOCK_SIZE": 8192}, num_warps=8),
     ],
-    key=["
+    key=["size_bucket"]
 )
 @triton.jit
 def _silu_mul_backward_kernel(
@@ -96,6 +107,7 @@ def _silu_mul_backward_kernel(
     grad_x_ptr: torch.Tensor,
     grad_y_ptr: torch.Tensor,
     n_elements: int,
+    size_bucket: int,  # used for autotuning
     BLOCK_SIZE: tl.constexpr
 ):
     # prepare
@@ -161,7 +173,8 @@ def silu_mul_backward(
     _silu_mul_backward_kernel[_grid](
         grad_output, x, y,
         grad_x, grad_y,
-        n_elements
+        n_elements,
+        size_bucket=_size_bucket(n_elements)
    )
 
     return grad_x, grad_y
d9d/loop/component/__init__.py
CHANGED
@@ -6,13 +6,13 @@ from .gradient_clipper import GradientClipper
 from .gradient_manager import GradientManager
 from .job_logger import JobLogger
 from .job_profiler import JobProfiler
-from .loss_computer import LossComputer
 from .model_stage_exporter import ModelStageExporter
 from .model_stage_factory import ModelStageFactory, TrackedModules
 from .optimizer_factory import OptimizerFactory
+from .pipeline_result_processing import InferenceProcessor, LossComputer, PipelineOutputsProcessor
 from .stepper import Stepper
+from .task_operator import ForwardResult, InferenceTaskOperator, TrainTaskOperator
 from .timeout_manager import TimeoutManager
-from .train_task_operator import ForwardResult, TrainTaskOperator
 
 __all__ = [
     "BatchMaths",
@@ -20,6 +20,8 @@ __all__ = [
     "ForwardResult",
     "GradientClipper",
     "GradientManager",
+    "InferenceProcessor",
+    "InferenceTaskOperator",
     "JobLogger",
     "JobProfiler",
     "LossComputer",
@@ -27,6 +29,7 @@ __all__ = [
     "ModelStageExporter",
     "ModelStageFactory",
     "OptimizerFactory",
+    "PipelineOutputsProcessor",
     "StateCheckpointer",
     "Stepper",
     "TimeoutManager",
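For orientation, the 0.2.0 surface of this module can be pulled in as below; the import list is illustrative and simply restates the re-exports above:

    from d9d.loop.component import (
        ForwardResult,
        InferenceProcessor,
        InferenceTaskOperator,
        LossComputer,
        PipelineOutputsProcessor,
        TrainTaskOperator,
    )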
d9d/loop/component/model_stage_factory.py
CHANGED
@@ -15,7 +15,7 @@ from d9d.pipelining.api import PipelineStageInfo
 from d9d.pipelining.factory.factory import PipelineScheduleInfo, build_schedule
 
 from .batch_maths import BatchMaths
-from .
+from .pipeline_result_processing import PipelineOutputsProcessor
 
 StatefulPredicate = Callable[[str, torch.Tensor], bool]
 """Determines if a specific parameter or buffer should be included in the state dictionary."""
@@ -51,28 +51,6 @@ class TrackedModules(Stateful):
         self._modules = modules
         self._stateful_predicate = stateful_predicate
 
-    def __call__(self, *args: Any, **kwargs: Any) -> Any:
-        """
-        Forwards execution to the only pipeline stage.
-
-        This method is only valid when pipeline parallelism is disabled.
-
-        Args:
-            *args: Positional arguments passed to the module.
-            **kwargs: Keyword arguments passed to the module.
-
-        Returns:
-            The output of the model execution.
-
-        Raises:
-            ValueError: If pipeline parallelism is configured.
-        """
-
-        if self._dist_context.mesh_params.has_pipeline_parallel:
-            raise ValueError("You cannot call tracked modules when using pipelining")
-
-        return self._modules[0](*args, **kwargs)
-
     @property
     def modules(self) -> list[nn.Module]:
         """Returns the list of underlying PyTorch model modules."""
@@ -159,8 +137,8 @@ class ModelStageFactory:
         dist_context: DistributedContext,
         batch_maths: BatchMaths,
         config_model: ModelStageFactoryConfig,
-        config_pipelining: PipeliningConfig
-
+        config_pipelining: PipeliningConfig,
+        pipeline_callback: PipelineOutputsProcessor
     ):
         """Constructs a ModelStageFactory object."""
 
@@ -169,7 +147,7 @@ class ModelStageFactory:
         self._config_model = config_model
         self._config_pipelining = config_pipelining
         self._batch_maths = batch_maths
-        self.
+        self._pipeline_callback = pipeline_callback
 
     def _build_model_stage(self, stage: PipelineStageInfo) -> nn.Module:
         # create a model with no real memory occupied
@@ -218,21 +196,13 @@ class ModelStageFactory:
 
     def build_pipeline_and_modules(
         self
-    ) -> tuple[PipelineScheduleInfo
+    ) -> tuple[PipelineScheduleInfo, TrackedModules]:
         """
         Constructs the execution schedule and the model container.
 
-        If pipeline parallelism is enabled, this orchestrates the creation of a
-        distributed pipeline schedule.
-
-        Otherwise, it simply builds a standalone model stage.
-
         Returns:
-            The pipeline schedule information
+            The pipeline schedule information.
             The `TrackedModules` instance wrapping the created model stage(s).
-
-        Raises:
-            ValueError: If pipelining configuration is missing but a pipeline is requested.
         """
 
         if self._config_model.checkpoint_only_trainable_parameters:
@@ -240,22 +210,12 @@ class ModelStageFactory:
         else:
             stateful_predicate = _stateful_predicate_always
 
-
-
-
-
-
-
-
-                dist_context=self._dist_context,
-                n_microbatches=self._batch_maths.num_microbatches_pipelining,
-                schedule_config=self._config_pipelining.schedule,
-                model_provider=self._build_model_stage,
-                loss_fn=loss_fn
-            )
-
-            return schedule, TrackedModules(self._dist_context, modules, stateful_predicate)
-        else:
-            model = self._build_model_stage(PipelineStageInfo(num_stages=1, current_stage=0))
+        schedule, modules = build_schedule(
+            dist_context=self._dist_context,
+            n_microbatches=self._batch_maths.num_microbatches_pipelining,
+            schedule_config=self._config_pipelining.schedule,
+            model_provider=self._build_model_stage,
+            callback=self._pipeline_callback
+        )
 
-
+        return schedule, TrackedModules(self._dist_context, modules, stateful_predicate)
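A hypothetical call site for the now unconditional pipeline path; only the names shown in the hunks above come from d9d, and the variables passed in are placeholders for config and context objects built elsewhere:

    # a minimal sketch, assuming dist_context / batch_maths / configs exist
    factory = ModelStageFactory(
        dist_context=dist_context,
        batch_maths=batch_maths,
        config_model=model_cfg,
        config_pipelining=pipelining_cfg,
        pipeline_callback=loss_computer,  # any PipelineOutputsProcessor
    )
    schedule_info, tracked = factory.build_pipeline_and_modules()
    for module in tracked.modules:  # TrackedModules no longer forwards __call__
        module.train()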
d9d/loop/component/{loss_computer.py → pipeline_result_processing.py}
RENAMED
@@ -1,7 +1,10 @@
+import abc
+from typing import Generic, TypeVar
+
 import torch
 
 from d9d.internals.pipeline_state import PipelineStateHandler
-from d9d.loop.control import ComputeLossContext, TrainTask
+from d9d.loop.control import ComputeLossContext, InferenceTask, ProcessOutputsContext, TrainTask
 
 from .stepper import Stepper
 
@@ -9,7 +12,20 @@ STATE_LOSS = "__internal_loss"
 STATE_LOSS_WEIGHT = "__internal_loss_weight"
 
 
-
+TOutput = TypeVar("TOutput")
+
+
+class PipelineOutputsProcessor(abc.ABC, Generic[TOutput]):
+    @abc.abstractmethod
+    def __call__(
+        self,
+        pipeline_outputs: dict[str, torch.Tensor],
+        microbatch_idx: int
+    ) -> TOutput:
+        ...
+
+
+class LossComputer(PipelineOutputsProcessor[torch.Tensor]):
     """
     Handles the computation of loss values and their integration into the pipeline state.
 
@@ -38,10 +54,10 @@ class LossComputer:
         self._task = task
         self._stepper = stepper
 
-    def
+    def __call__(
         self,
         pipeline_outputs: dict[str, torch.Tensor],
-        microbatch_idx: int
+        microbatch_idx: int
     ) -> torch.Tensor:
         """
         Computes the weighted loss for a specific sharded microbatch or the full microbatch.
@@ -61,12 +77,9 @@
             The calculated loss multiplied by its weight.
         """
 
-
-
-
-        state = self._state.sharded_state(
-            shard_id=microbatch_idx
-        )
+        state = self._state.sharded_state(
+            shard_id=microbatch_idx
+        )
 
         computation = self._task.compute_loss(ComputeLossContext(
             pipeline_results=pipeline_outputs,
@@ -84,3 +97,53 @@ class LossComputer:
         state[STATE_LOSS_WEIGHT] = loss_weight[None]
 
         return loss * loss_weight
+
+
+class InferenceProcessor(PipelineOutputsProcessor[None]):
+    """
+    Handles the processing of model outputs during inference or evaluation.
+
+    This component retrieves the appropriate state context
+    and delegates the output processing logic to the user-defined inference task.
+    """
+
+    def __init__(
+        self,
+        state: PipelineStateHandler,
+        task: InferenceTask
+    ):
+        """
+        Constructs a new InferenceProcessor.
+
+        Args:
+            state: Handler for managing global and sharded pipeline states.
+            task: The user-defined inference task containing processing logic.
+        """
+
+        self._state = state
+        self._task = task
+
+    def __call__(
+        self,
+        pipeline_outputs: dict[str, torch.Tensor],
+        microbatch_idx: int
+    ) -> None:
+        """
+        Processes model outputs for a specific microbatch or full batch.
+
+        This method retrieves the relevant state (scoped by microbatch index if provided)
+        and invokes the task's output processing logic.
+
+        Args:
+            pipeline_outputs: Dictionary containing model output tensors.
+            microbatch_idx: Index of the current microbatch, or None if not using microbatching.
+        """
+
+        state = self._state.sharded_state(
+            shard_id=microbatch_idx
+        )
+
+        self._task.process_outputs(ProcessOutputsContext(
+            pipeline_results=pipeline_outputs,
+            state=state
+        ))
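The new ABC makes the last-stage callback pluggable beyond loss computation. A sketch of a custom processor against the same contract; the accuracy logic and the "logits"/"labels" output names are hypothetical, not part of d9d:

    import torch

    from d9d.loop.component import PipelineOutputsProcessor


    class TokenAccuracyProcessor(PipelineOutputsProcessor[None]):
        """Accumulates argmax accuracy from last-stage outputs."""

        def __init__(self) -> None:
            self.correct = 0
            self.total = 0

        def __call__(
            self,
            pipeline_outputs: dict[str, torch.Tensor],
            microbatch_idx: int
        ) -> None:
            preds = pipeline_outputs["logits"].argmax(dim=-1)
            labels = pipeline_outputs["labels"]
            self.correct += int((preds == labels).sum())
            self.total += labels.numel()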
d9d/loop/component/{train_task_operator.py → task_operator.py}
RENAMED
@@ -5,12 +5,17 @@ import torch
 from d9d.core.dist_context import DistributedContext
 from d9d.core.types import PyTree
 from d9d.internals.pipeline_state import PipelineStateHandler
-from d9d.loop.control import
+from d9d.loop.control import (
+    BaseTask,
+    BuildForwardInputsContext,
+    InferenceTask,
+    TrainTask,
+    UpdateMetricsContext,
+)
 from d9d.metric.impl import ComposeMetric
 from d9d.pipelining.factory.factory import PipelineScheduleInfo
 
-from .
-from .model_stage_factory import TrackedModules
+from .pipeline_result_processing import STATE_LOSS, STATE_LOSS_WEIGHT
 
 
 @dataclasses.dataclass(kw_only=True)
@@ -27,12 +32,34 @@ class ForwardResult:
     loss_weight: torch.Tensor
 
 
+def _run_pipeline(
+    task: BaseTask,
+    pipeline: PipelineScheduleInfo,
+    pipeline_state: PipelineStateHandler,
+    batch: PyTree
+):
+    model_inputs = task.build_forward_inputs(
+        BuildForwardInputsContext(
+            batch=batch,
+            state=pipeline_state.global_state()
+        )
+    )
+    pipeline.schedule.configure_buffers(
+        inputs=model_inputs.inputs,
+        kwargs=model_inputs.kwargs,
+        sharding_spec=model_inputs.pipeline_sharding_spec
+    )
+    pipeline.schedule.step(
+        inputs=model_inputs.inputs,
+        kwargs=model_inputs.kwargs
+    )
+
+
 class TrainTaskOperator:
     """
     Orchestrates the execution of the forward and backward passes for a specific training task.
 
-
-    and pipeline-parallel execution. It manages input construction, schedule execution,
+    It manages input construction, schedule execution,
     loss computation, and metric updates within the lifecycle of a single step.
     """
 
@@ -40,9 +67,7 @@ class TrainTaskOperator:
         self,
         dist_context: DistributedContext,
         task: TrainTask,
-
-        tracked_modules: TrackedModules,
-        loss_computer: LossComputer,
+        pipeline: PipelineScheduleInfo,
         pipeline_state: PipelineStateHandler,
         metrics: ComposeMetric
     ):
@@ -52,48 +77,17 @@ class TrainTaskOperator:
         Args:
             dist_context: The distributed context.
             task: The user-defined training task logic.
-
-            tracked_modules: The model modules being trained.
-            loss_computer: Component responsible for calculating loss from outputs.
+            pipeline: Information about the pipeline schedule.
             pipeline_state: Handler for transient state storage during the step.
             metrics: Metric collection to update after the pass.
         """
 
         self._dist_context = dist_context
         self._task = task
-        self.
-        self._tracked_modules = tracked_modules
-        self._loss_computer = loss_computer
+        self._pipeline = pipeline
         self._pipeline_state = pipeline_state
         self._metrics = metrics
 
-    def _forward_backward_pipelining(self, model_inputs: BuildForwardInputsResult):
-        if self._pp_schedule is None:
-            raise ValueError("Cannot run pipelined pass if pipelining is disabled")
-
-        self._pp_schedule.schedule.configure_buffers(
-            inputs=model_inputs.inputs,
-            kwargs=model_inputs.kwargs,
-            sharding_spec=model_inputs.pipeline_sharding_spec
-        )
-        self._pp_schedule.schedule.step(
-            inputs=model_inputs.inputs,
-            kwargs=model_inputs.kwargs
-        )
-
-    def _forward_backward_regular(self, model_inputs: BuildForwardInputsResult):
-        pipeline_outputs = self._tracked_modules(
-            **model_inputs.inputs,
-            **model_inputs.kwargs
-        )
-        loss = self._loss_computer.compute_loss_mul_weight(
-            pipeline_outputs=pipeline_outputs,
-            microbatch_idx=None
-        )
-        # free to avoid bwd peaking memory
-        del pipeline_outputs
-        loss.backward()
-
     def forward_backward(self, batch: PyTree) -> ForwardResult | None:
         """
         Executes the forward and backward passes for a single batch.
@@ -117,27 +111,17 @@ class TrainTaskOperator:
 
         try:
             # Do forward and backward pass
-
-
-
-
-
+            _run_pipeline(
+                pipeline_state=self._pipeline_state,
+                task=self._task,
+                pipeline=self._pipeline,
+                batch=batch
             )
 
-            if self._dist_context.mesh_params.has_pipeline_parallel:
-                self._forward_backward_pipelining(model_inputs)
-            else:
-                self._forward_backward_regular(model_inputs)
-
             # Update metrics if possible
 
             pipeline_state = self._pipeline_state.global_state()
-
-            if (
-                self._dist_context.mesh_params.has_pipeline_parallel and
-                self._pp_schedule is not None and
-                not self._pp_schedule.has_last_stage
-            ):
+            if not self._pipeline.has_last_stage:
                 return None
 
             self._task.update_metrics(UpdateMetricsContext(
@@ -150,3 +134,59 @@
             )
         finally:
             self._pipeline_state.reset()
+
+
+class InferenceTaskOperator:
+    """
+    Orchestrates the execution of the forward pass for a specific inference task.
+
+    It manages input
+    construction, schedule execution, and state lifecycle management.
+    """
+
+    def __init__(
+        self,
+        dist_context: DistributedContext,
+        task: InferenceTask,
+        pipeline: PipelineScheduleInfo,
+        pipeline_state: PipelineStateHandler
+    ):
+        """
+        Constructs the InferenceTaskOperator.
+
+        Args:
+            dist_context: The distributed context.
+            task: The user-defined inference task logic.
+            pipeline: Information about the pipeline schedule.
+            pipeline_state: Handler for transient state storage during the step.
+        """
+
+        self._dist_context = dist_context
+        self._task = task
+        self._pipeline = pipeline
+        self._pipeline_state = pipeline_state
+
+    def forward(self, batch: PyTree) -> None:
+        """
+        Executes the forward pass for a single batch.
+
+        This method handles:
+
+        1. Context preparation and input building via the `InferenceTask`.
+        2. Execution via Pipeline Parallel schedule.
+        3. Reliable cleanup of the pipeline state.
+
+        Args:
+            batch: The input batch data.
+        """
+
+        try:
+            # Do forward pass
+            _run_pipeline(
+                pipeline_state=self._pipeline_state,
+                task=self._task,
+                pipeline=self._pipeline,
+                batch=batch
+            )
+        finally:
+            self._pipeline_state.reset()
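A hypothetical driver showing how the two operators split the loop; construction of the operators and the data iterables are placeholders, and only the `forward_backward`/`forward` semantics and the `None` return off the last stage come from this diff:

    def run_step(train_op, infer_op, train_batch, eval_batches):
        # training: backward pass included; ranks without the last
        # pipeline stage get None instead of a ForwardResult
        result = train_op.forward_backward(train_batch)
        if result is not None:
            print("loss weight:", result.loss_weight)

        # evaluation: forward only; outputs reach the InferenceTask via
        # its process_outputs hook rather than a return value
        for batch in eval_batches:
            infer_op.forward(batch)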
d9d/loop/config/config.py
CHANGED
@@ -190,7 +190,7 @@ class TrainerConfig(BaseModel):
     batching: BatchingConfig
     data_loading: DataLoadingConfig
     logging: JobLoggerConfig
-    pipelining: PipeliningConfig
+    pipelining: PipeliningConfig
     model_stage_factory: ModelStageFactoryConfig
     determinism: DeterminismConfig
     gc: GarbageCollectionConfig
d9d/loop/control/task.py
CHANGED
@@ -252,11 +252,11 @@ class ProcessOutputsContext:
     Context data provided to process outputs during inference.
 
     Attributes:
-
+        pipeline_results: The outputs returned by the model's forward pass.
         state: The current state of the pipeline.
     """
 
-
+    pipeline_results: dict[str, torch.Tensor]
    state: "PipelineState"
 
 
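A minimal sketch of a task consuming the renamed field; `InferenceTask`'s full interface is not shown in this diff, and the "logits"/"predictions" keys are hypothetical:

    from d9d.loop.control import InferenceTask, ProcessOutputsContext


    class ArgmaxTask(InferenceTask):
        def process_outputs(self, context: ProcessOutputsContext) -> None:
            # pipeline_results holds the named output tensors of the model's
            # forward pass; the state supports item assignment (cf. LossComputer)
            logits = context.pipeline_results["logits"]
            context.state["predictions"] = logits.argmax(dim=-1)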