d9d 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,256 @@
+
+import torch
+from tqdm import tqdm
+
+from d9d.core.dist_context import DeviceMeshParameters
+from d9d.internals.determinism import set_seeds
+from d9d.internals.pipeline_state import PipelineStateHandler
+from d9d.loop.component import (
+    BatchMaths,
+    DataLoaderFactory,
+    InferenceProcessor,
+    InferenceTaskOperator,
+    JobProfiler,
+    ManualGarbageCollector,
+    ModelStageFactory,
+    StateCheckpointer,
+    Stepper,
+    TimeoutManager,
+)
+from d9d.loop.config import InferenceConfig, PipeliningConfig
+from d9d.loop.control import (
+    DatasetProvider,
+    FinalizeContext,
+    InferenceTaskProvider,
+    InferenceTaskProviderContext,
+    ModelProvider,
+)
+from d9d.loop.state import InferenceJobState
+from d9d.pipelining.factory import PipelineScheduleInferenceConfig
+
+
+class InferenceConfigurator:
+    """
+    Orchestrates the assembly of the distributed inference environment.
+
+    This class binds the infrastructure configuration (DeviceMesh), the inference
+    parameters, and the user-defined logic (Providers) to create a fully
+    initialized state object capable of running the inference loop.
+    """
+
+    def __init__(
+        self,
+        mesh: DeviceMeshParameters,
+        parameters: InferenceConfig,
+        task_provider: InferenceTaskProvider,
+        model_provider: ModelProvider,
+        data_provider: DatasetProvider
+    ):
+        """
+        Constructs a configurator capable of building the full inference state.
+
+        Args:
+            mesh: Definition of the distributed device mesh topology.
+            parameters: The global configuration object for inference.
+            task_provider: Factory for creating the inference task logic.
+            model_provider: Factory for defining and creating model stages.
+            data_provider: Factory for providing inference datasets.
+        """
+
+        self._mesh = mesh
+        self._parameters = parameters
+        self._task_provider = task_provider
+        self._model_provider = model_provider
+        self._data_provider = data_provider
+
+    def _build_new_state(self) -> InferenceJobState:
+        dist_context = self._mesh.build()
+
+        pipelining_config = PipeliningConfig(
+            schedule=PipelineScheduleInferenceConfig()
+        )
+
+        set_seeds(dist_context, seed=self._parameters.determinism.base_seed)
+
+        task = self._task_provider(InferenceTaskProviderContext(
+            dist_context=dist_context
+        ))
+
+        batch_maths = BatchMaths(
+            dist_context=dist_context,
+            config_batching=self._parameters.batching,
+            config_pipelining=pipelining_config
+        )
+
+        data_loader_factory = DataLoaderFactory(
+            dist_context=dist_context,
+            provider=self._data_provider,
+            config_data_loading=self._parameters.data_loading,
+            batch_maths=batch_maths
+        )
+        data_loader_infer = data_loader_factory.build_dataloader_for_infer_job()
+
+        stepper = Stepper(
+            initial_step=1,
+            total_steps=len(data_loader_infer)
+        )
+
+        pipeline_state_handler = PipelineStateHandler(
+            sharding_spec={},
+            num_shards=batch_maths.num_microbatches_pipelining
+        )
+
+        processor = InferenceProcessor(
+            state=pipeline_state_handler,
+            task=task
+        )
+
+        schedule, modules = ModelStageFactory(
+            model_provider=self._model_provider,
+            dist_context=dist_context,
+            config_model=self._parameters.model_stage_factory,
+            config_pipelining=pipelining_config,
+            batch_maths=batch_maths,
+            pipeline_callback=processor
+        ).build_pipeline_and_modules()
+
+        task_operator = InferenceTaskOperator(
+            dist_context=dist_context,
+            task=task,
+            pipeline=schedule,
+            pipeline_state=pipeline_state_handler
+        )
+
+        gc = ManualGarbageCollector(
+            dist_ctx=dist_context,
+            config=self._parameters.gc,
+            step=stepper
+        )
+
+        checkpointer = StateCheckpointer(
+            dist_context=dist_context,
+            stepper=stepper,
+            config=self._parameters.checkpointing,
+            gc=gc,
+            run_name=None
+        )
+
+        profiler = JobProfiler(
+            dist_context=dist_context,
+            stepper=stepper,
+            config=self._parameters.profiling
+        )
+
+        timeout_manager = TimeoutManager(
+            dist_context=dist_context,
+            config=self._parameters.timeout
+        )
+
+        return InferenceJobState(
+            dist_context=dist_context,
+            data_loader=data_loader_infer,
+            stepper=stepper,
+            tracked_modules=modules,
+            garbage_collector=gc,
+            batch_maths=batch_maths,
+            checkpointer=checkpointer,
+            task=task,
+            profiler=profiler,
+            timeout_manager=timeout_manager,
+            task_operator=task_operator
+        )
+
+    def configure(self) -> "Inference":
+        """
+        Instantiates all inference components and returns a configured Inference engine.
+
+        This method triggers the creation of the distributed context, sets seeds,
+        builds the model, data loaders, and attaches all auxiliary components.
+
+        Returns:
+            Inference: A ready-to-use inference engine instance encapsulating the job state.
+        """
+
+        state = self._build_new_state()
+
+        return Inference(state)
+
+
+class Inference:
+    """
+    The main execution engine for running a distributed inference job.
+
+    This class manages the inference loop, lifecycle events, distributed synchronization,
+    and periodic side-effects (profiling, checkpointing). It ensures the model is in
+    evaluation mode and runs within a `torch.inference_mode` context.
+    """
+
+    def __init__(self, state: InferenceJobState):
+        """
+        Constructs an Inference engine from a pre-built job state.
+
+        Args:
+            state: The encapsulated state object containing all initialized components.
+        """
+
+        self._state = state
+
+    def _enable_eval_mode(self):
+        for module in self._state.tracked_modules.modules:
+            module.eval()
+
+    def infer(self):
+        """
+        Executes the full inference workflow.
+
+        This method:
+
+        1. Waits for world synchronization.
+        2. Loads the latest checkpoint if available.
+        3. Iterates through the data loader.
+        4. Executes the pipeline forward pass for every batch.
+        5. Handles periodic garbage collection and profiling.
+        6. Finalizes the task upon completion.
+        """
+
+        with torch.inference_mode():
+            self._enable_eval_mode()
+
+            self._state.dist_context.logger.info("Waiting for the world to start job")
+            self._state.dist_context.wait_world()
+            self._state.dist_context.logger.info("Trying to load last checkpoint before doing anything else")
+            self._state.checkpointer.load_last_checkpoint(self._state)
+
+            if self._state.stepper.current_step >= self._state.stepper.total_steps:
+                self._state.dist_context.logger.info("Already ran, will do nothing")
+                return
+
+            self._state.dist_context.wait_world()
+
+            with (
+                tqdm(
+                    desc="Inference",
+                    total=self._state.stepper.total_steps,
+                    disable=not self._state.dist_context.is_local_main_process,
+                    initial=self._state.stepper.current_step
+                ) as bar,
+                self._state.garbage_collector as gc,
+                self._state.profiler.open() as profiler
+            ):
+                self._state.timeout_manager.step()
+
+                for batch_group in self._state.data_loader:
+                    for batch in batch_group:
+                        self._state.task_operator.forward(batch)

+                    gc.collect_periodic()
+                    self._state.stepper.step()
+                    bar.update()

+                    # checkpoint at the end of the step
+                    self._state.checkpointer.checkpoint_if_needed(self._state)

+                    if profiler:
+                        profiler.step()

+            self._state.task.finalize(FinalizeContext())
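
For orientation, here is a minimal sketch of how the new entry point above appears intended to be wired up. It is not taken from the package: the `run_inference` helper and the bare provider arguments are assumptions based solely on the signatures shown in this file, and the import of `InferenceConfigurator` is omitted because this diff does not name the module's path.

    from d9d.core.dist_context import DeviceMeshParameters
    from d9d.loop.config import InferenceConfig

    def run_inference(
        mesh: DeviceMeshParameters,
        parameters: InferenceConfig,
        task_provider,     # an InferenceTaskProvider implementation
        model_provider,    # a ModelProvider implementation
        data_provider,     # a DatasetProvider implementation
    ):
        # Bind topology, config and user-defined providers exactly as the constructor expects.
        configurator = InferenceConfigurator(
            mesh=mesh,
            parameters=parameters,
            task_provider=task_provider,
            model_provider=model_provider,
            data_provider=data_provider,
        )
        # configure() builds the distributed context, model stages and data loader;
        # infer() then runs the checkpoint-aware loop and finalizes the task.
        configurator.configure().infer()
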
d9d/loop/run/train.py CHANGED
@@ -121,7 +121,7 @@ class TrainingConfigurator:
             config_model=self._parameters.model_stage_factory,
             config_pipelining=self._parameters.pipelining,
             batch_maths=batch_maths,
-            loss_computer=loss_computer
+            pipeline_callback=loss_computer
         ).build_pipeline_and_modules()

         metrics = ComposeMetric(task.create_metrics(CreateMetricsContext()).metrics)
@@ -130,9 +130,7 @@ class TrainingConfigurator:
         task_operator = TrainTaskOperator(
             dist_context=dist_context,
             task=task,
-            pp_schedule=schedule,
-            tracked_modules=modules,
-            loss_computer=loss_computer,
+            pipeline=schedule,
             pipeline_state=pipeline_state_handler,
             metrics=metrics
         )
d9d/loop/state.py CHANGED
@@ -10,6 +10,7 @@ from d9d.loop.component import (
     BatchMaths,
     GradientClipper,
     GradientManager,
+    InferenceTaskOperator,
     JobLogger,
     JobProfiler,
     ManualGarbageCollector,
@@ -123,14 +124,16 @@ class TrainJobState(JobState):


 @dataclasses.dataclass(kw_only=True)
-class InferJobState(JobState):
+class InferenceJobState(JobState):
     """
     Container for the state of an inference job.

     Attributes:
         task: The specific inference task logic definition.
+        task_operator: Executor for running forward and backward passes.
     """
     task: InferenceTask
+    task_operator: InferenceTaskOperator

     def state_dict(self) -> dict[str, Any]:
         return {
@@ -9,9 +9,12 @@ from .module import (
 )
 from .schedule import PipelineSchedule
 from .sharding import PipelineShardingSpec
+from .types import PipelineLossFn, PipelineResultFn

 __all__ = [
     "ModuleSupportsPipelining",
+    "PipelineLossFn",
+    "PipelineResultFn",
     "PipelineSchedule",
     "PipelineShardingSpec",
     "PipelineStageInfo",
@@ -0,0 +1,28 @@
+from collections.abc import Callable
+from typing import Any
+
+import torch
+
+PipelineResultFn = Callable[[dict[str, torch.Tensor], int], Any]
+"""
+Callback function type for handling results from a final pipeline stage.
+
+Args:
+    outputs: A dictionary mapping output names to tensors produced by the stage.
+    microbatch_idx: The index of the current micro-batch being processed.
+
+Returns:
+    Anything - not used.
+"""
+
+PipelineLossFn = Callable[[dict[str, torch.Tensor], int], torch.Tensor]
+"""
+Callback function type for calculating loss in the final pipeline stage.
+
+Args:
+    outputs: A dictionary mapping output names to tensors produced by the model.
+    microbatch_idx: The index of the current micro-batch being processed.
+
+Returns:
+    The computed loss tensor (scalar).
+"""
@@ -1,19 +1,19 @@
 import dataclasses
 from collections.abc import Callable

-import torch
 from torch import nn

 from ...core.dist_context import REGULAR_DOMAIN, DistributedContext
-from ..api import PipelineSchedule, PipelineStageInfo
+from ..api import PipelineLossFn, PipelineResultFn, PipelineSchedule, PipelineStageInfo
 from ..infra.schedule.component.program import (
     build_stage_to_host_rank_topology,
     invert_stage_to_host_rank_topology,
 )
-from ..infra.schedule.component.runtime import PipelineScheduleExecutor
+from ..infra.schedule.component.runtime import OfflinePipelineExecutor, PipelineScheduleExecutor
 from ..infra.stage import PipelineStage
 from .config import (
     AnyPipelineScheduleConfig,
+    PipelineScheduleInferenceConfig,
 )
 from .registry import PIPELINE_PROGRAM_REGISTRY

@@ -27,38 +27,31 @@ class PipelineScheduleInfo:
     has_last_stage: bool


-def build_schedule(
-    dist_context: DistributedContext,
-    n_microbatches: int,
+def _build_schedule_local(
     schedule_config: AnyPipelineScheduleConfig,
     model_provider: Callable[[PipelineStageInfo], nn.Module],
-    loss_fn: Callable[[dict[str, torch.Tensor], int], torch.Tensor] | None,
+    callback: PipelineLossFn | PipelineResultFn
 ) -> tuple[PipelineScheduleInfo, list[nn.Module]]:
-    """
-    Constructs the pipeline schedule and instantiates model stages.
+    stage_info = PipelineStageInfo(num_stages=1, current_stage=0)

-    This function coordinates the creation of the distributed pipeline. It:
-    1. Selects the appropriate `PipelineProgramBuilder` based on the config.
-    2. Calculates the global stage topology mapping stages to ranks.
-    3. Instantiates the local model stages for the current rank using `model_provider`.
-    4. Wraps models in `PipelineStage` containers.
-    5. Generates the execution program (action list).
-    6. Builds the runtime executor.
+    model = model_provider(stage_info)
+    has_backward = not isinstance(schedule_config, PipelineScheduleInferenceConfig)
+    scheduler = OfflinePipelineExecutor(model=model, callback=callback, do_backward=has_backward)

-    Args:
-        dist_context: The distributed context.
-        n_microbatches: Number of microbatches per global step.
-        schedule_config: Configuration object determining the schedule strategy.
-        model_provider: A factory function that accepts stage info and returns an `nn.Module`
-            for that specific stage.
-        loss_fn: Optional loss function. Required if training (backward pass needed).
+    return PipelineScheduleInfo(
+        schedule=scheduler,
+        has_first_stage=True,
+        has_last_stage=True
+    ), [model]

-    Returns:
-        A tuple containing:
-        1. `PipelineScheduleInfo`: The executable schedule and metadata.
-        2. `list[nn.Module]`: The local PyTorch modules created for this rank.
-    """

+def _build_schedule_distributed(
+    dist_context: DistributedContext,
+    n_microbatches: int,
+    schedule_config: AnyPipelineScheduleConfig,
+    model_provider: Callable[[PipelineStageInfo], nn.Module],
+    callback: PipelineLossFn | PipelineResultFn,
+) -> tuple[PipelineScheduleInfo, list[nn.Module]]:
     program_builder = PIPELINE_PROGRAM_REGISTRY.program_for(schedule_config)
     mesh = dist_context.mesh_for(REGULAR_DOMAIN)["pp"]

@@ -103,7 +96,7 @@ def build_schedule(
         dist_context=dist_context,
         stages=stages,
         num_microbatches=n_microbatches,
-        loss_fn=loss_fn,
+        callback=callback,
         program=program
     )

@@ -112,3 +105,49 @@ def build_schedule(
         has_first_stage=has_first_stage,
         has_last_stage=has_last_stage
     ), modules
+
+
+def build_schedule(
+    dist_context: DistributedContext,
+    n_microbatches: int,
+    schedule_config: AnyPipelineScheduleConfig,
+    model_provider: Callable[[PipelineStageInfo], nn.Module],
+    callback: PipelineLossFn | PipelineResultFn,
+) -> tuple[PipelineScheduleInfo, list[nn.Module]]:
+    """
+    Constructs the pipeline schedule and instantiates model stages.
+
+    This function coordinates the creation of the pipeline. If the context is
+    distributed, it builds a parallel schedule (`PipelineScheduleExecutor`) by
+    calculating topology and creating stages for the current rank. If the
+    context is local, it builds an offline schedule (`OfflinePipelineExecutor`)
+    for direct execution.
+
+    Args:
+        dist_context: The distributed context.
+        n_microbatches: Number of microbatches per global step.
+        schedule_config: Configuration object determining the schedule strategy.
+        model_provider: A factory function that accepts stage info and returns an `nn.Module`
+            for that specific stage.
+        callback: Callback either computing loss function (if training) or just processing pipeline outputs
+            (if not training).
+
+    Returns:
+        A tuple containing the schedule info (executor and metadata) and a list
+        of local PyTorch modules created for this rank.
+    """
+
+    if dist_context.mesh_params.is_distributed:
+        return _build_schedule_distributed(
+            dist_context=dist_context,
+            n_microbatches=n_microbatches,
+            schedule_config=schedule_config,
+            model_provider=model_provider,
+            callback=callback
+        )
+    else:
+        return _build_schedule_local(
+            schedule_config=schedule_config,
+            model_provider=model_provider,
+            callback=callback
+        )
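
A hedged call sketch of the refactored `build_schedule`: the single-layer model, the no-op callback, and the pre-built `dist_context` are placeholders, and this diff does not show the function's own import path.

    from torch import nn
    from d9d.pipelining.api import PipelineStageInfo
    from d9d.pipelining.factory import PipelineScheduleInferenceConfig

    def model_provider(stage: PipelineStageInfo) -> nn.Module:
        # On a non-distributed mesh the local path calls this once with
        # num_stages=1, current_stage=0 and wraps the module in OfflinePipelineExecutor.
        return nn.Linear(16, 4)

    def on_result(outputs, microbatch_idx):
        # PipelineResultFn-style callback; an inference schedule config implies
        # do_backward=False, so no loss handler is needed.
        pass

    info, modules = build_schedule(
        dist_context=dist_context,  # assumed to be built elsewhere, e.g. via DeviceMeshParameters.build()
        n_microbatches=4,
        schedule_config=PipelineScheduleInferenceConfig(),
        model_provider=model_provider,
        callback=on_result,
    )
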
@@ -14,6 +14,7 @@ from .action import (
     ForwardSendAction,
 )
 from .executor import PipelineScheduleExecutor
+from .offline import OfflinePipelineExecutor

 __all__ = [
     "ActionBase",
@@ -25,5 +26,6 @@ __all__ = [
     "ForwardComputeAction",
     "ForwardReceiveAction",
     "ForwardSendAction",
-    "PipelineScheduleExecutor",
+    "OfflinePipelineExecutor",
+    "PipelineScheduleExecutor"
 ]
@@ -7,8 +7,8 @@ import torch

 from d9d.pipelining.infra.stage import PipelineStage

+from .callback import PipelineLossHandler, PipelineResultHandler
 from .communications import PipelineCommunicationHandler
-from .loss import PipelineLossHandler


 @dataclasses.dataclass(kw_only=True, slots=True)
@@ -21,7 +21,7 @@ class ActionContext:
         pipeline_kwargs_microbatches: The global keyword arguments sharded by microbatch.
         stages: A mapping of stage indices to their active PipelineStage instances.
         communications: The handler for P2P communications.
-        loss: The handler for loss computation, or None if not available.
+        callback: The handler for either loss computation or result processing.
     """

     pipeline_inputs_microbatches: tuple[dict[str, torch.Tensor], ...]
@@ -29,7 +29,7 @@ class ActionContext:

     stages: dict[int, PipelineStage]
     communications: PipelineCommunicationHandler
-    loss: PipelineLossHandler | None
+    callback: PipelineLossHandler | PipelineResultHandler


 class ActionWorkType(StrEnum):
@@ -208,7 +208,6 @@ class ForwardComputeAction(ActionBase):
     microbatch_idx: int

     def apply(self, ctx: ActionContext):
-        # todo check unsharded
         stage = ctx.stages[self.stage_idx]

         if not stage.info.is_current_stage_first and self.stage_idx - 1 not in ctx.stages:
@@ -221,8 +220,8 @@ class ForwardComputeAction(ActionBase):
         )
         result = stage.get_local_fwd_output(self.microbatch_idx)

-        if stage.info.is_current_stage_last and ctx.loss is not None:
-            ctx.loss.compute_loss(result, self.microbatch_idx)
+        if stage.info.is_current_stage_last:
+            ctx.callback.trigger(result, self.microbatch_idx)

         if not stage.info.is_current_stage_last and self.stage_idx + 1 in ctx.stages:
             ctx.stages[self.stage_idx + 1].set_local_fwd_input(
@@ -260,14 +259,13 @@ class BackwardFullInputComputeAction(ActionBase):
     full_backward: bool

     def apply(self, ctx: ActionContext):
-        # todo unshard
         stage = ctx.stages[self.stage_idx]

         if not stage.info.is_current_stage_last and self.stage_idx + 1 not in ctx.stages:
             ctx.communications.wait_bwd_recv(self.stage_idx, self.microbatch_idx)

-        if stage.info.is_current_stage_last and ctx.loss is not None:
-            loss = ctx.loss.acquire_loss(self.microbatch_idx)
+        if stage.info.is_current_stage_last and isinstance(ctx.callback, PipelineLossHandler):
+            loss = ctx.callback.acquire_loss(self.microbatch_idx)
         else:
             loss = None

@@ -310,7 +308,6 @@ class BackwardWeightComputeAction(ActionBase):
     microbatch_idx: int

     def apply(self, ctx: ActionContext):
-        # todo unshard
         stage = ctx.stages[self.stage_idx]

         stage.backward_weight_one_chunk(
@@ -1,14 +1,41 @@
-from collections.abc import Callable
-
 import torch

-LossFn = Callable[[dict[str, torch.Tensor], int], torch.Tensor]
+from d9d.pipelining.api import PipelineLossFn, PipelineResultFn
+
+
+class PipelineResultHandler:
+    """
+    Wraps a callback function to handle results from pipeline execution.
+    """
+
+    def __init__(self, callback_fn: PipelineResultFn):
+        """
+        Constructs PipelineResultHandler object.
+
+        Args:
+            callback_fn: The function called with results.
+        """
+
+        self._callback_fn = callback_fn
+
+    def trigger(self, forward_result: dict[str, torch.Tensor], microbatch_index: int):
+        """
+        Invokes the underlying callback with the provided results.
+
+        Args:
+            forward_result: Dictionary of output tensors from the pipeline.
+            microbatch_index: The index of the current micro-batch.
+        """
+
+        self._callback_fn(forward_result, microbatch_index)


 class PipelineLossHandler:
-    """Manages loss computation and state caching across forward and backward passes."""
+    """
+    Manages loss computation and state caching across forward and backward passes.
+    """

-    def __init__(self, loss_fn: LossFn):
+    def __init__(self, loss_fn: PipelineLossFn):
         """
         Constructs the loss handler.

@@ -19,21 +46,17 @@ class PipelineLossHandler:
         self._loss_fn = loss_fn
         self._cached_values: dict[int, torch.Tensor] = {}

-    def compute_loss(self, forward_result: dict[str, torch.Tensor], microbatch_index: int) -> torch.Tensor:
+    def trigger(self, forward_result: dict[str, torch.Tensor], microbatch_index: int):
         """
         Computes loss for a given microbatch result and caches it.

         Args:
             forward_result: The output from the last stage of the model.
             microbatch_index: The index of the microbatch being processed.
-
-        Returns:
-            The computed loss tensor.
         """

         result = self._loss_fn(forward_result, microbatch_index)
         self._cached_values[microbatch_index] = result
-        return result

     def acquire_loss(self, microbatch_index: int) -> torch.Tensor:
         """