d9d-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. d9d/__init__.py +0 -0
  2. d9d/core/__init__.py +0 -0
  3. d9d/core/autograd/__init__.py +7 -0
  4. d9d/core/autograd/grad_context.py +85 -0
  5. d9d/core/dist_context/__init__.py +19 -0
  6. d9d/core/dist_context/configured.py +215 -0
  7. d9d/core/dist_context/device_mesh_domains.py +185 -0
  8. d9d/core/dist_context/log.py +30 -0
  9. d9d/core/dist_context/params.py +113 -0
  10. d9d/core/dist_ops/__init__.py +16 -0
  11. d9d/core/dist_ops/object.py +68 -0
  12. d9d/core/dist_ops/tensor.py +192 -0
  13. d9d/core/protocol/__init__.py +8 -0
  14. d9d/core/protocol/training.py +38 -0
  15. d9d/core/sharding/__init__.py +15 -0
  16. d9d/core/sharding/auto_spec.py +66 -0
  17. d9d/core/sharding/shard.py +154 -0
  18. d9d/core/sharding/spec.py +28 -0
  19. d9d/core/sharding/unshard.py +117 -0
  20. d9d/core/types/__init__.py +12 -0
  21. d9d/core/types/data.py +14 -0
  22. d9d/core/types/pytree.py +26 -0
  23. d9d/dataset/__init__.py +17 -0
  24. d9d/dataset/buffer_sorted.py +143 -0
  25. d9d/dataset/padding.py +79 -0
  26. d9d/dataset/sharded.py +195 -0
  27. d9d/internals/__init__.py +0 -0
  28. d9d/internals/determinism/__init__.py +10 -0
  29. d9d/internals/determinism/seed.py +63 -0
  30. d9d/internals/grad_norm/__init__.py +8 -0
  31. d9d/internals/grad_norm/group.py +87 -0
  32. d9d/internals/grad_norm/norm.py +169 -0
  33. d9d/internals/grad_sync/__init__.py +14 -0
  34. d9d/internals/grad_sync/bucket.py +317 -0
  35. d9d/internals/grad_sync/placement_helper.py +23 -0
  36. d9d/internals/grad_sync/synchronizer.py +257 -0
  37. d9d/internals/pipeline_state/__init__.py +14 -0
  38. d9d/internals/pipeline_state/api.py +45 -0
  39. d9d/internals/pipeline_state/handler.py +111 -0
  40. d9d/internals/pipeline_state/storage.py +236 -0
  41. d9d/internals/profiling/__init__.py +7 -0
  42. d9d/internals/profiling/profile.py +112 -0
  43. d9d/internals/state/__init__.py +6 -0
  44. d9d/internals/state/main_process.py +44 -0
  45. d9d/kernel/__init__.py +0 -0
  46. d9d/kernel/cce/__init__.py +5 -0
  47. d9d/kernel/cce/cce.py +298 -0
  48. d9d/kernel/cce/main.py +282 -0
  49. d9d/kernel/general/__init__.py +5 -0
  50. d9d/kernel/general/get_int_dtype.py +7 -0
  51. d9d/kernel/gmm/__init__.py +5 -0
  52. d9d/kernel/gmm/function.py +78 -0
  53. d9d/kernel/moe/__init__.py +8 -0
  54. d9d/kernel/moe/indices_to_multihot.py +268 -0
  55. d9d/kernel/moe/permute_with_probs.py +1035 -0
  56. d9d/kernel/stochastic/__init__.py +11 -0
  57. d9d/kernel/stochastic/adamw_step.py +204 -0
  58. d9d/kernel/stochastic/copy.py +104 -0
  59. d9d/kernel/stochastic/ops/__init__.py +5 -0
  60. d9d/kernel/stochastic/ops/round.py +22 -0
  61. d9d/kernel/swiglu/__init__.py +5 -0
  62. d9d/kernel/swiglu/function.py +36 -0
  63. d9d/kernel/swiglu/op.py +167 -0
  64. d9d/loop/__init__.py +0 -0
  65. d9d/loop/auto/__init__.py +9 -0
  66. d9d/loop/auto/auto_lr_scheduler.py +46 -0
  67. d9d/loop/auto/auto_optimizer.py +196 -0
  68. d9d/loop/component/__init__.py +35 -0
  69. d9d/loop/component/batch_maths.py +106 -0
  70. d9d/loop/component/checkpointer.py +172 -0
  71. d9d/loop/component/data_loader_factory.py +258 -0
  72. d9d/loop/component/garbage_collector.py +94 -0
  73. d9d/loop/component/gradient_clipper.py +89 -0
  74. d9d/loop/component/gradient_manager.py +149 -0
  75. d9d/loop/component/job_logger.py +146 -0
  76. d9d/loop/component/job_profiler.py +62 -0
  77. d9d/loop/component/loss_computer.py +86 -0
  78. d9d/loop/component/model_stage_exporter.py +37 -0
  79. d9d/loop/component/model_stage_factory.py +261 -0
  80. d9d/loop/component/optimizer_factory.py +88 -0
  81. d9d/loop/component/stepper.py +52 -0
  82. d9d/loop/component/timeout_manager.py +54 -0
  83. d9d/loop/component/train_task_operator.py +152 -0
  84. d9d/loop/config/__init__.py +36 -0
  85. d9d/loop/config/config.py +225 -0
  86. d9d/loop/config/types.py +24 -0
  87. d9d/loop/control/__init__.py +61 -0
  88. d9d/loop/control/dataset_provider.py +58 -0
  89. d9d/loop/control/lr_scheduler_provider.py +47 -0
  90. d9d/loop/control/model_provider.py +162 -0
  91. d9d/loop/control/optimizer_provider.py +45 -0
  92. d9d/loop/control/task.py +304 -0
  93. d9d/loop/run/__init__.py +6 -0
  94. d9d/loop/run/train.py +355 -0
  95. d9d/loop/state.py +143 -0
  96. d9d/lr_scheduler/__init__.py +9 -0
  97. d9d/lr_scheduler/piecewise/__init__.py +18 -0
  98. d9d/lr_scheduler/piecewise/builder.py +152 -0
  99. d9d/lr_scheduler/piecewise/config.py +176 -0
  100. d9d/lr_scheduler/piecewise/curves.py +75 -0
  101. d9d/lr_scheduler/piecewise/engine.py +76 -0
  102. d9d/lr_scheduler/visualizer.py +74 -0
  103. d9d/metric/__init__.py +10 -0
  104. d9d/metric/abc.py +79 -0
  105. d9d/metric/impl/__init__.py +7 -0
  106. d9d/metric/impl/compose.py +54 -0
  107. d9d/metric/impl/mean.py +94 -0
  108. d9d/model_state/__init__.py +0 -0
  109. d9d/model_state/io/__init__.py +21 -0
  110. d9d/model_state/io/dto.py +30 -0
  111. d9d/model_state/io/module_reader.py +75 -0
  112. d9d/model_state/io/module_writer.py +123 -0
  113. d9d/model_state/io/reader.py +125 -0
  114. d9d/model_state/io/writer.py +309 -0
  115. d9d/model_state/mapper/__init__.py +10 -0
  116. d9d/model_state/mapper/abc.py +70 -0
  117. d9d/model_state/mapper/adapters/__init__.py +12 -0
  118. d9d/model_state/mapper/adapters/mapper.py +27 -0
  119. d9d/model_state/mapper/adapters/module.py +22 -0
  120. d9d/model_state/mapper/compose/__init__.py +17 -0
  121. d9d/model_state/mapper/compose/helper.py +22 -0
  122. d9d/model_state/mapper/compose/parallel.py +58 -0
  123. d9d/model_state/mapper/compose/sequential.py +131 -0
  124. d9d/model_state/mapper/compose/shard.py +36 -0
  125. d9d/model_state/mapper/leaf/__init__.py +18 -0
  126. d9d/model_state/mapper/leaf/dtensor.py +56 -0
  127. d9d/model_state/mapper/leaf/identity.py +23 -0
  128. d9d/model_state/mapper/leaf/rename.py +26 -0
  129. d9d/model_state/mapper/leaf/select_child.py +37 -0
  130. d9d/model_state/mapper/leaf/stack.py +29 -0
  131. d9d/module/__init__.py +0 -0
  132. d9d/module/base/__init__.py +7 -0
  133. d9d/module/base/late_init.py +10 -0
  134. d9d/module/block/__init__.py +0 -0
  135. d9d/module/block/attention/__init__.py +7 -0
  136. d9d/module/block/attention/grouped_query.py +139 -0
  137. d9d/module/block/attention/sdpa/__init__.py +5 -0
  138. d9d/module/block/attention/sdpa/flash.py +52 -0
  139. d9d/module/block/embedding/__init__.py +7 -0
  140. d9d/module/block/embedding/shard_token_embedding.py +103 -0
  141. d9d/module/block/ffn/__init__.py +5 -0
  142. d9d/module/block/ffn/swiglu.py +60 -0
  143. d9d/module/block/head/__init__.py +6 -0
  144. d9d/module/block/head/language_modelling.py +87 -0
  145. d9d/module/block/hidden_states_aggregator/__init__.py +12 -0
  146. d9d/module/block/hidden_states_aggregator/base.py +35 -0
  147. d9d/module/block/hidden_states_aggregator/factory.py +48 -0
  148. d9d/module/block/hidden_states_aggregator/mean.py +61 -0
  149. d9d/module/block/hidden_states_aggregator/noop.py +27 -0
  150. d9d/module/block/moe/__init__.py +13 -0
  151. d9d/module/block/moe/communications/__init__.py +11 -0
  152. d9d/module/block/moe/communications/base.py +58 -0
  153. d9d/module/block/moe/communications/deepep.py +300 -0
  154. d9d/module/block/moe/communications/naive.py +68 -0
  155. d9d/module/block/moe/grouped_experts.py +81 -0
  156. d9d/module/block/moe/grouped_linear.py +78 -0
  157. d9d/module/block/moe/layer.py +122 -0
  158. d9d/module/block/moe/router.py +103 -0
  159. d9d/module/block/positional/__init__.py +8 -0
  160. d9d/module/block/positional/rope.py +150 -0
  161. d9d/module/model/__init__.py +0 -0
  162. d9d/module/model/qwen3_moe/__init__.py +16 -0
  163. d9d/module/model/qwen3_moe/decoder_layer.py +110 -0
  164. d9d/module/model/qwen3_moe/model.py +373 -0
  165. d9d/module/model/qwen3_moe/params.py +69 -0
  166. d9d/module/parallelism/__init__.py +0 -0
  167. d9d/module/parallelism/api/__init__.py +18 -0
  168. d9d/module/parallelism/api/expert_parallel.py +36 -0
  169. d9d/module/parallelism/api/fully_sharded.py +43 -0
  170. d9d/module/parallelism/api/hybrid_sharded.py +49 -0
  171. d9d/module/parallelism/api/replicate_parallel.py +33 -0
  172. d9d/module/parallelism/model/__init__.py +0 -0
  173. d9d/module/parallelism/model/qwen3_moe.py +99 -0
  174. d9d/module/parallelism/style/__init__.py +7 -0
  175. d9d/module/parallelism/style/shard_experts.py +60 -0
  176. d9d/module/parallelism/style/to_local.py +86 -0
  177. d9d/optim/__init__.py +0 -0
  178. d9d/optim/stochastic/__init__.py +5 -0
  179. d9d/optim/stochastic/adamw.py +158 -0
  180. d9d/peft/__init__.py +13 -0
  181. d9d/peft/all/__init__.py +12 -0
  182. d9d/peft/all/config.py +31 -0
  183. d9d/peft/all/method.py +76 -0
  184. d9d/peft/applicator.py +47 -0
  185. d9d/peft/base.py +70 -0
  186. d9d/peft/full_tune/__init__.py +11 -0
  187. d9d/peft/full_tune/config.py +20 -0
  188. d9d/peft/full_tune/method.py +46 -0
  189. d9d/peft/lora/__init__.py +15 -0
  190. d9d/peft/lora/config.py +35 -0
  191. d9d/peft/lora/layer.py +177 -0
  192. d9d/peft/lora/method.py +132 -0
  193. d9d/pipelining/__init__.py +0 -0
  194. d9d/pipelining/api/__init__.py +19 -0
  195. d9d/pipelining/api/module.py +149 -0
  196. d9d/pipelining/api/schedule.py +50 -0
  197. d9d/pipelining/api/sharding.py +9 -0
  198. d9d/pipelining/factory/__init__.py +21 -0
  199. d9d/pipelining/factory/config.py +89 -0
  200. d9d/pipelining/factory/factory.py +114 -0
  201. d9d/pipelining/factory/registry.py +82 -0
  202. d9d/pipelining/infra/__init__.py +0 -0
  203. d9d/pipelining/infra/schedule/__init__.py +0 -0
  204. d9d/pipelining/infra/schedule/component/__init__.py +0 -0
  205. d9d/pipelining/infra/schedule/component/program/__init__.py +22 -0
  206. d9d/pipelining/infra/schedule/component/program/base.py +35 -0
  207. d9d/pipelining/infra/schedule/component/program/communications.py +203 -0
  208. d9d/pipelining/infra/schedule/component/program/topology.py +78 -0
  209. d9d/pipelining/infra/schedule/component/runtime/__init__.py +29 -0
  210. d9d/pipelining/infra/schedule/component/runtime/action.py +361 -0
  211. d9d/pipelining/infra/schedule/component/runtime/communications.py +101 -0
  212. d9d/pipelining/infra/schedule/component/runtime/executor.py +113 -0
  213. d9d/pipelining/infra/schedule/component/runtime/loss.py +55 -0
  214. d9d/pipelining/infra/schedule/program/__init__.py +15 -0
  215. d9d/pipelining/infra/schedule/program/bfs.py +86 -0
  216. d9d/pipelining/infra/schedule/program/dualpipev.py +234 -0
  217. d9d/pipelining/infra/schedule/program/interleaved.py +240 -0
  218. d9d/pipelining/infra/schedule/program/zerobubblev.py +227 -0
  219. d9d/pipelining/infra/stage/__init__.py +5 -0
  220. d9d/pipelining/infra/stage/communications.py +274 -0
  221. d9d/pipelining/infra/stage/computations.py +317 -0
  222. d9d/pipelining/infra/stage/splitgrad.py +377 -0
  223. d9d/pipelining/infra/stage/stage.py +321 -0
  224. d9d/pipelining/infra/stage/struct_helper.py +46 -0
  225. d9d/pipelining/training/__init__.py +7 -0
  226. d9d/pipelining/training/optimizer.py +41 -0
  227. d9d/pipelining/training/scheduler.py +34 -0
  228. d9d/tracker/__init__.py +14 -0
  229. d9d/tracker/base.py +124 -0
  230. d9d/tracker/factory.py +57 -0
  231. d9d/tracker/provider/__init__.py +0 -0
  232. d9d/tracker/provider/aim/__init__.py +0 -0
  233. d9d/tracker/provider/aim/config.py +23 -0
  234. d9d/tracker/provider/aim/tracker.py +114 -0
  235. d9d/tracker/provider/null.py +61 -0
  236. d9d-0.1.0.dist-info/METADATA +90 -0
  237. d9d-0.1.0.dist-info/RECORD +238 -0
  238. d9d-0.1.0.dist-info/WHEEL +4 -0

d9d/core/dist_ops/object.py
@@ -0,0 +1,68 @@
+ from typing import TypeVar, cast
+
+ import torch.distributed as dist
+
+ T = TypeVar("T")
+
+
+ def gather_object(
+     obj: T,
+     group: dist.ProcessGroup,
+     group_dst: int
+ ) -> list[T] | None:
+     """
+     Gathers picklable objects from the whole process group to a specific destination rank.
+
+     This acts as a wrapper around torch.distributed.gather_object that automatically
+     initializes the output buffer list on the destination rank.
+
+     Args:
+         obj: The local object to send. Must be picklable.
+         group: The process group to work on.
+         group_dst: The rank within the group that will receive the objects.
+
+     Returns:
+         A list of objects from all ranks on the destination rank; None on other ranks.
+     """
+
+     if group.rank() == group_dst:
+         # We initialize with None, but we cast to list[T] because we know
+         # dist.gather_object will populate these slots with actual objects.
+         save_list = cast(list[T], [None for _ in range(group.size())])
+     else:
+         save_list = None
+     dist.gather_object(
+         obj,
+         save_list,
+         group=group,
+         group_dst=group_dst
+     )
+     return save_list
+
+
+ def all_gather_object(
+     obj: T,
+     group: dist.ProcessGroup
+ ) -> list[T]:
+     """
+     Gathers picklable objects from the whole process group to all ranks.
+
+     This acts as a wrapper around torch.distributed.all_gather_object that automatically
+     initializes the output buffer list on all ranks.
+
+     Args:
+         obj: The local object to send. Must be picklable.
+         group: The process group to work on.
+
+     Returns:
+         A list of objects containing the data gathered from all ranks.
+     """
+     # We initialize with None, but we cast to list[T] because we know
+     # dist.all_gather_object will populate these slots with actual objects.
+     save_list = cast(list[T], [None for _ in range(group.size())])
+     dist.all_gather_object(
+         save_list,
+         obj,
+         group=group
+     )
+     return save_list
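
The two wrappers above mirror the underlying torch.distributed object collectives while hiding the output-buffer bookkeeping. A minimal usage sketch follows; it is not part of the package contents and assumes several processes launched via torchrun with a reachable backend, with the payload fields purely illustrative:

    import torch.distributed as dist

    from d9d.core.dist_ops.object import all_gather_object, gather_object

    dist.init_process_group(backend="gloo")
    group = dist.group.WORLD  # default process group after initialization

    local_stats = {"rank": group.rank(), "samples_seen": 128}

    # Every rank receives the full list of per-rank dicts.
    all_stats = all_gather_object(local_stats, group=group)

    # Only the destination rank receives the list; other ranks get None.
    maybe_stats = gather_object(local_stats, group=group, group_dst=0)
    if maybe_stats is not None:
        print(f"rank 0 collected {len(maybe_stats)} objects")

    dist.destroy_process_group()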

d9d/core/dist_ops/tensor.py
@@ -0,0 +1,192 @@
+ from collections.abc import Sequence
+ from typing import cast
+
+ import torch
+ import torch.distributed as dist
+
+
+ def gather(
+     tensor: torch.Tensor,
+     group: dist.ProcessGroup,
+     group_dst: int,
+     async_op: bool = False
+ ) -> list[torch.Tensor] | tuple[list[torch.Tensor] | None, dist.Work] | None:
+     """
+     Gathers tensors from the process group to a specific destination rank.
+
+     This function assumes that tensors on all ranks have the same shape and dtype
+     as the tensor on the current rank. It automatically allocates the output
+     buffer list on the destination.
+
+     Args:
+         tensor: The local tensor to send.
+         group: The process group to work on.
+         group_dst: The rank within the group that will receive the tensors.
+         async_op: Whether the operation should be asynchronous.
+
+     Returns:
+         If async_op is False: A list of tensors on the destination rank, None elsewhere.
+         If async_op is True: A tuple containing (buffer_list, work_handle).
+     """
+
+     if group.rank() == group_dst:
+         save_list = [torch.empty_like(tensor) for _ in range(group.size())]
+     else:
+         save_list = None
+
+     work = dist.gather(
+         tensor,
+         save_list,
+         group=group,
+         group_dst=group_dst,
+         async_op=async_op
+     )
+
+     if async_op:
+         return save_list, work
+     else:
+         return save_list
+
+
+ def all_gather(
+     tensor: torch.Tensor,
+     group: dist.ProcessGroup,
+     async_op: bool = False
+ ) -> list[torch.Tensor] | tuple[list[torch.Tensor], dist.Work]:
+     """
+     Gathers tensors from the whole process group to all ranks.
+
+     This function assumes that tensors on all ranks have the same shape and dtype
+     as the tensor on the current rank. It automatically allocates the output
+     buffer list.
+
+     Args:
+         tensor: The local tensor to send.
+         group: The process group to work on.
+         async_op: Whether the operation should be asynchronous.
+
+     Returns:
+         If async_op is False: A list of gathered tensors.
+         If async_op is True: A tuple containing (buffer_list, work_handle).
+     """
+
+     save_list = [torch.empty_like(tensor) for _ in range(group.size())]
+     work = dist.all_gather(
+         save_list,
+         tensor,
+         group=group,
+         async_op=async_op
+     )
+     if async_op:
+         return save_list, work
+     else:
+         return save_list
+
+
+ def _all_gather_shapes(
+     tensor: torch.Tensor,
+     group: dist.ProcessGroup,
+ ) -> Sequence[torch.Tensor]:
+     all_ndim = [torch.empty((), dtype=torch.long, device=tensor.device) for _ in range(group.size())]
+     all_ndim_wait = dist.all_gather(
+         all_ndim,
+         torch.tensor(tensor.ndim, dtype=torch.long, device=tensor.device),
+         group=group,
+         async_op=True
+     )
+     all_ndim_wait.wait()
+
+     all_shape = [torch.empty(cast(int, ndim.item()), dtype=torch.long, device=tensor.device) for ndim in all_ndim]
+     all_shape_wait = dist.all_gather(
+         all_shape,
+         torch.tensor(tensor.shape, dtype=torch.long, device=tensor.device),
+         group=group,
+         async_op=True
+     )
+     all_shape_wait.wait()
+
+     return all_shape
+
+
+ def all_gather_variadic_shape(
+     tensor: torch.Tensor,
+     group: dist.ProcessGroup,
+     async_op: bool = False
+ ) -> list[torch.Tensor] | tuple[list[torch.Tensor], dist.Work]:
+     """
+     Gathers tensors of different shapes from the whole process group to all ranks.
+
+     Unlike standard all_gather, this function first communicates the shape of the
+     tensor on every rank, allowing for dynamic sizing.
+
+     Args:
+         tensor: The local tensor to send.
+         group: The process group to work on.
+         async_op: Whether the final data gathering should be asynchronous.
+             Note that shape gathering is always synchronous.
+
+     Returns:
+         If async_op is False: A list of gathered tensors of varying shapes.
+         If async_op is True: A tuple containing (buffer_list, work_handle).
+     """
+
+     all_shape = _all_gather_shapes(tensor, group)
+
+     all_result = [torch.empty(tuple(shape), dtype=tensor.dtype, device=tensor.device) for shape in all_shape]
+     all_result_wait = dist.all_gather(
+         all_result,
+         tensor,
+         group=group,
+         async_op=async_op
+     )
+     if async_op:
+         return all_result, all_result_wait
+     else:
+         return all_result
+
+
+ def gather_variadic_shape(
+     tensor: torch.Tensor,
+     group: dist.ProcessGroup,
+     group_dst: int
+ ) -> list[torch.Tensor] | None:
+     """
+     Gathers tensors of different shapes from the process group to a specific rank.
+
+     This function coordinates shape exchange and uses point-to-point communication
+     (isend/irecv) to gather tensors that may differ in shape across ranks.
+
+     Currently does not support async_op.
+
+     Args:
+         tensor: The local tensor to send.
+         group: The process group to work on.
+         group_dst: The rank within the group that will receive the tensors.
+
+     Returns:
+         A list of tensors of varying shapes on the destination rank; None on other ranks.
+     """
+
+     is_current_dst = group.rank() == group_dst
+
+     all_shape = _all_gather_shapes(tensor, group)
+
+     if is_current_dst:
+         all_recv_futures: list[dist.Work] = []
+         all_result: list[torch.Tensor] = cast(list[torch.Tensor], [None for _ in range(group.size())])
+         for group_src_i in range(group.size()):
+             if group_src_i == group_dst:
+                 all_result[group_src_i] = tensor
+                 continue
+             all_result[group_src_i] = torch.empty(
+                 tuple(all_shape[group_src_i]), dtype=tensor.dtype, device=tensor.device
+             )
+             all_recv_future = dist.irecv(all_result[group_src_i], group=group, group_src=group_src_i)
+             all_recv_future = cast(dist.Work, all_recv_future)  # we know we are on dst rank
+             all_recv_futures.append(all_recv_future)
+         for recv_future in all_recv_futures:
+             recv_future.wait()
+         return all_result
+     else:
+         dist.isend(tensor=tensor, group=group, group_dst=group_dst)
+         return None
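
The variadic-shape helpers above first exchange per-rank shapes and then move the data. A short sketch, not part of the package contents, assuming an already initialized process group and CPU tensors on the gloo backend (the per-rank sizes are made up for illustration):

    import torch
    import torch.distributed as dist

    from d9d.core.dist_ops.tensor import all_gather_variadic_shape, gather_variadic_shape

    group = dist.group.WORLD  # assumes dist.init_process_group(...) has already run

    # Each rank contributes a different number of rows; shapes are exchanged first.
    local = torch.randn(group.rank() + 1, 4)

    gathered = all_gather_variadic_shape(local, group=group)         # list of tensors on every rank
    on_dst = gather_variadic_shape(local, group=group, group_dst=0)  # list on rank 0, None elsewhere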

d9d/core/protocol/__init__.py
@@ -0,0 +1,8 @@
+ """Package providing protocol definitions for standard PyTorch objects."""
+
+ from .training import LRSchedulerProtocol, OptimizerProtocol
+
+ __all__ = [
+     "LRSchedulerProtocol",
+     "OptimizerProtocol"
+ ]

d9d/core/protocol/training.py
@@ -0,0 +1,38 @@
+ from typing import Protocol, runtime_checkable
+
+ from torch.distributed.checkpoint.stateful import Stateful
+
+
+ @runtime_checkable
+ class OptimizerProtocol(Stateful, Protocol):
+     """
+     Protocol defining an interface for a standard PyTorch Optimizer object.
+
+     This protocol ensures that the wrapped optimizer supports the standard
+     API and state checkpointing via the Stateful interface.
+     """
+
+     def step(self):
+         """Performs a single optimization step."""
+
+         ...
+
+     def zero_grad(self):
+         """Sets the gradients of all optimized tensors to zero."""
+
+         ...
+
+
+ @runtime_checkable
+ class LRSchedulerProtocol(Stateful, Protocol):
+     """
+     Protocol defining an interface for a Learning Rate Scheduler.
+
+     This protocol ensures that the wrapped scheduler supports stepping
+     and state checkpointing via the Stateful interface.
+     """
+
+     def step(self):
+         """Performs a single learning rate scheduling step."""
+
+         ...
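
Because both protocols are runtime_checkable, stock PyTorch objects can be validated structurally: isinstance only checks that the required methods exist. A small sketch, not part of the package contents:

    import torch

    from d9d.core.protocol import LRSchedulerProtocol, OptimizerProtocol

    model = torch.nn.Linear(8, 8)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: 1.0)

    # Both expose step/state_dict/load_state_dict (and zero_grad for the optimizer),
    # so the structural checks should pass.
    assert isinstance(optimizer, OptimizerProtocol)
    assert isinstance(scheduler, LRSchedulerProtocol)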

d9d/core/sharding/__init__.py
@@ -0,0 +1,15 @@
+ from .auto_spec import shard_spec_nothing, shard_spec_on_dim
+ from .shard import shard_tree
+ from .spec import ShardingSpec, ShardingSpecLeaf, SpecReplicate, SpecShard
+ from .unshard import unshard_tree
+
+ __all__ = [
+     "ShardingSpec",
+     "ShardingSpecLeaf",
+     "SpecReplicate",
+     "SpecShard",
+     "shard_spec_nothing",
+     "shard_spec_on_dim",
+     "shard_tree",
+     "unshard_tree"
+ ]

d9d/core/sharding/auto_spec.py
@@ -0,0 +1,66 @@
+ from typing import Any
+
+ import torch
+ import torch.utils._pytree as pytree  # noqa: PLC2701
+
+ from d9d.core.types import PyTree
+
+ from .spec import ShardingSpec, ShardingSpecLeaf, SpecReplicate, SpecShard
+
+
+ def _tree_item_to_shard(item: Any, shard_on_dim: int) -> ShardingSpecLeaf:
+     if isinstance(item, list):
+         if shard_on_dim != 0:
+             raise ValueError(f"Cannot shard list on dim {shard_on_dim}. Lists behave as 1D sequences.")
+         return SpecShard(0)
+     elif isinstance(item, torch.Tensor):
+         if item.ndim == 0:
+             return SpecReplicate()
+         if item.ndim <= shard_on_dim:
+             raise ValueError(f"Cannot shard {item.ndim}-dimensional tensor on dim {shard_on_dim}")
+         return SpecShard(shard_on_dim)
+     else:
+         return SpecReplicate()
+
+
+ def shard_spec_on_dim(tree: PyTree[Any], dim: int) -> ShardingSpec:
+     """
+     Creates a sharding specification to split all tensors in the tree on a specific dimension.
+
+     Iterates over the input tree:
+     * If a leaf is a Tensor with enough dimensions, it is mapped to a SpecShard(dim) object.
+     * If a leaf is a list, it is mapped to a SpecShard(0) object (only dim=0 is allowed for lists).
+     * Other types and 0-dim tensors are mapped to SpecReplicate.
+
+     Args:
+         tree: The input PyTree structure.
+         dim: The dimension index to shard eligible tensors on.
+
+     Returns:
+         A new PyTree matching the input structure, containing SpecShard or SpecReplicate objects.
+
+     Raises:
+         ValueError: If a non-scalar tensor has rank less than or equal to 'dim', or if the tree contains a list and 'dim' is not 0.
+     """
+
+     return pytree.tree_map(
+         lambda x: _tree_item_to_shard(x, dim),
+         tree,
+         is_leaf=lambda x: isinstance(x, (torch.Tensor, list))
+     )
+
+
+ def shard_spec_nothing(tree: PyTree[Any]) -> ShardingSpec:
+     """
+     Creates a sharding specification where no sharding is performed.
+
+     This effectively clones the tree structure but replaces every leaf with SpecReplicate.
+
+     Args:
+         tree: The input PyTree structure.
+
+     Returns:
+         A new PyTree matching the input structure, containing strictly SpecReplicate for all leaves.
+     """
+
+     return pytree.tree_map(lambda _: SpecReplicate(), tree, is_leaf=lambda x: isinstance(x, (torch.Tensor, list)))
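
A small sketch of spec construction, not part of the package contents; the batch keys are illustrative:

    import torch

    from d9d.core.sharding import SpecReplicate, SpecShard, shard_spec_nothing, shard_spec_on_dim

    batch = {
        "input_ids": torch.zeros(8, 128, dtype=torch.long),  # tensor -> SpecShard(0)
        "lengths": [7, 12, 3, 9, 20, 5, 8, 11],              # list -> SpecShard(0)
        "pad_token_id": 0,                                    # plain leaf -> SpecReplicate()
    }

    spec = shard_spec_on_dim(batch, dim=0)
    assert spec["input_ids"] == SpecShard(0)
    assert spec["pad_token_id"] == SpecReplicate()

    replicate_all = shard_spec_nothing(batch)  # every leaf becomes SpecReplicate()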

d9d/core/sharding/shard.py
@@ -0,0 +1,154 @@
+ from collections.abc import Sequence
+ from typing import TypeVar, cast
+
+ import torch
+ import torch.utils._pytree as pytree  # noqa: PLC2701
+
+ from d9d.core.types import PyTree
+
+ from .spec import ShardingSpec, SpecReplicate, SpecShard
+
+ TLeaf = TypeVar("TLeaf")
+ TSameTree = TypeVar("TSameTree", bound=PyTree)
+
+
+ def _shard_list(
+     item: list[TLeaf],
+     spec: SpecShard,
+     num_shards: int,
+     enforce_even_split: bool
+ ) -> Sequence[list[TLeaf] | TLeaf]:
+     if spec.dim != 0:
+         raise ValueError(f"Lists can only be sharded on dim 0, got {spec.dim}")
+
+     if spec.do_stack:
+         if len(item) != num_shards:
+             raise ValueError(
+                 f"do_stack=True requires list length ({len(item)}) to match num_shards ({num_shards})"
+             )
+         return item
+
+     if enforce_even_split and len(item) % num_shards != 0:
+         raise ValueError(
+             f"Tried to shard a list with length {len(item)} "
+             f"to {num_shards} shards, but the length is not perfectly divisible."
+         )
+
+     shard_size, shard_extra = divmod(len(item), num_shards)
+     return [
+         item[
+             shard_id * shard_size + min(shard_id, shard_extra):
+             (shard_id + 1) * shard_size + min(shard_id + 1, shard_extra)
+         ]
+         for shard_id in range(num_shards)
+     ]
+
+
+ def _shard_tensor(
+     item: torch.Tensor,
+     spec: SpecShard,
+     num_shards: int,
+     enforce_even_split: bool
+ ) -> Sequence[torch.Tensor]:
+     if item.ndim == 0:
+         raise ValueError("Found a 0-dim Tensor for sharding")
+
+     if spec.do_stack:
+         if item.shape[spec.dim] != num_shards:
+             raise ValueError(
+                 f"do_stack=True requires tensor shape[{spec.dim}] ({item.shape[spec.dim]}) "
+                 f"to match num_shards ({num_shards})"
+             )
+         return torch.unbind(item, dim=spec.dim)
+
+     if enforce_even_split and item.shape[spec.dim] % num_shards != 0:
+         raise ValueError(
+             f"Tried to shard a tensor with shape {item.shape} on dim {spec.dim} "
+             f"to {num_shards} shards, but the dimension is not perfectly divisible."
+         )
+
+     return torch.tensor_split(item, sections=num_shards, dim=spec.dim)
+
+
+ def _shard_leaf_to_list(
+     item: TLeaf,
+     spec: SpecShard | SpecReplicate,
+     num_shards: int,
+     enforce_even_split: bool
+ ) -> Sequence[TLeaf]:
+     """Helper to split an item into a list of items for each rank."""
+     if isinstance(spec, SpecReplicate):
+         # Replicated: every shard receives the same object reference
+         return [item] * num_shards
+
+     if not isinstance(spec, SpecShard):
+         raise TypeError(f"Unknown sharding spec object type: {type(spec)}")
+
+     if isinstance(item, torch.Tensor):
+         return cast(Sequence[TLeaf], _shard_tensor(
+             item=item,
+             num_shards=num_shards,
+             enforce_even_split=enforce_even_split,
+             spec=spec
+         ))
+     elif isinstance(item, list):
+         return cast(Sequence[TLeaf], _shard_list(
+             item=item,
+             num_shards=num_shards,
+             enforce_even_split=enforce_even_split,
+             spec=spec
+         ))
+     else:
+         raise TypeError(
+             f"Sharding spec found a SpecShard object, but the item was not a Tensor and not a list (got {type(item)})"
+         )
+
+
+ def shard_tree(
+     tree: TSameTree,
+     sharding_spec: ShardingSpec,
+     num_shards: int,
+     enforce_even_split: bool
+ ) -> tuple[TSameTree, ...]:
+     """
+     Shards a PyTree into a tuple of PyTrees, one for each shard rank.
+
+     This function takes a single global data structure and splits it into `num_shards`
+     structures.
+
+     * If a spec leaf is a ``SpecShard(dim)``, the tensor or list is split along that dimension,
+       and the ``i``-th slice goes to the ``i``-th output tree.
+     * If a spec leaf is ``SpecReplicate``, the item is replicated (reference copy) to all
+       output trees.
+
+     Args:
+         tree: The structure containing tensors to be sharded.
+         sharding_spec: A structure matching 'tree' containing ``SpecShard`` or ``SpecReplicate`` objects.
+         num_shards: The total number of shards to split the tensors into.
+         enforce_even_split: If True, raises a ValueError if a tensor's dimension
+             size is not perfectly divisible by ``num_shards``.
+
+     Returns:
+         A tuple of length ``num_shards``. Each element is a PyTree matching
+         the structure of the input ``tree``, containing the local data for
+         that specific rank.
+
+     Raises:
+         ValueError: If tree structures do not match, or the sharding constraints
+             are not met.
+     """
+     flat_spec, spec_struct = pytree.tree_flatten(sharding_spec)
+
+     try:
+         flat_tree = spec_struct.flatten_up_to(tree)
+     except (ValueError, TypeError) as e:
+         raise ValueError("Tree structure does not match sharding spec") from e
+
+     sharded_leaves_per_node = [
+         _shard_leaf_to_list(item, spec, num_shards, enforce_even_split)
+         for item, spec in zip(flat_tree, flat_spec, strict=True)
+     ]
+
+     rank_leaves = list(zip(*sharded_leaves_per_node, strict=True))
+
+     return tuple(spec_struct.unflatten(leaves) for leaves in rank_leaves)
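
A minimal shard_tree sketch, not part of the package contents, splitting a toy batch into four per-rank trees:

    import torch

    from d9d.core.sharding import shard_spec_on_dim, shard_tree

    tree = {
        "input_ids": torch.arange(32).reshape(8, 4),  # sharded on dim 0
        "temperature": 0.7,                           # replicated by reference
    }
    spec = shard_spec_on_dim(tree, dim=0)

    shards = shard_tree(tree, spec, num_shards=4, enforce_even_split=True)
    assert len(shards) == 4
    assert shards[0]["input_ids"].shape == (2, 4)
    assert shards[3]["temperature"] == 0.7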

d9d/core/sharding/spec.py
@@ -0,0 +1,28 @@
+ import dataclasses
+
+ from d9d.core.types import PyTree
+
+
+ @dataclasses.dataclass(slots=True, frozen=True)
+ class SpecReplicate:
+     """
+     Specifies that a leaf node should be replicated across all shards.
+     """
+
+
+ @dataclasses.dataclass(slots=True, frozen=True)
+ class SpecShard:
+     """
+     Specifies that a leaf node should be split along a specific dimension.
+
+     Attributes:
+         dim: The dimension to split.
+         do_stack: If True, the sharded dimension is squeezed away when sharding (its size must equal the number of shards) and stacked back when unsharding.
+     """
+
+     dim: int
+     do_stack: bool = False
+
+
+ ShardingSpecLeaf = SpecReplicate | SpecShard
+ ShardingSpec = PyTree[ShardingSpecLeaf]
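
The do_stack flag treats the sharded dimension as the shard index itself: sharding squeezes it away and unsharding stacks it back. A short sketch, not part of the package contents:

    import torch

    from d9d.core.sharding import SpecShard, shard_tree, unshard_tree

    tree = {"per_rank_loss": torch.tensor([0.1, 0.2, 0.3, 0.4])}  # length == num_shards
    spec = {"per_rank_loss": SpecShard(dim=0, do_stack=True)}

    shards = shard_tree(tree, spec, num_shards=4, enforce_even_split=True)
    assert shards[2]["per_rank_loss"].ndim == 0  # unbind removed the stacked dim

    merged = unshard_tree(shards, spec)
    assert torch.equal(merged["per_rank_loss"], tree["per_rank_loss"])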

d9d/core/sharding/unshard.py
@@ -0,0 +1,117 @@
+ from collections.abc import Sequence
+ from typing import TypeVar, cast
+
+ import torch
+ import torch.utils._pytree as pytree  # noqa: PLC2701
+
+ from d9d.core.types import PyTree
+
+ from .spec import ShardingSpec, ShardingSpecLeaf, SpecReplicate, SpecShard
+
+ TLeaf = TypeVar("TLeaf")
+ TSameTree = TypeVar("TSameTree", bound=PyTree)
+
+
+ def _unshard_list(
+     group: Sequence[list[TLeaf] | TLeaf],
+     spec: SpecShard
+ ) -> list[TLeaf]:
+     if spec.dim != 0:
+         raise ValueError(f"Lists can only be unsharded on dim 0, got {spec.dim}")
+
+     if spec.do_stack:
+         return cast(list[TLeaf], list(group))
+
+     merged_list: list[TLeaf] = []
+     for x in group:
+         merged_list.extend(cast(list[TLeaf], x))
+     return merged_list
+
+
+ def _unshard_tensor(
+     group: list[torch.Tensor],
+     spec: SpecShard
+ ) -> torch.Tensor:
+     if spec.do_stack:
+         return torch.stack(group, dim=spec.dim)
+
+     return torch.cat(group, dim=spec.dim)
+
+
+ def _unshard_leaf_from_group(
+     group: Sequence[TLeaf],
+     spec: ShardingSpecLeaf
+ ) -> TLeaf:
+     """Helper to merge a group of items from different ranks into one."""
+     if isinstance(spec, SpecReplicate):
+         return group[0]
+
+     if not isinstance(spec, SpecShard):
+         raise TypeError(f"Unknown sharding spec object type: {type(spec)}")
+
+     first_item = group[0]
+
+     if isinstance(first_item, torch.Tensor):
+         return cast(TLeaf, _unshard_tensor(
+             cast(list[torch.Tensor], group),
+             spec
+         ))
+     elif spec.do_stack or isinstance(first_item, list):
+         return cast(TLeaf, _unshard_list(group, spec))
+     else:
+         raise TypeError(f"Expected Tensor or list instances, got {type(group[0])}")
+
+
+ def unshard_tree(
+     sharded_trees: Sequence[TSameTree],
+     sharding_spec: ShardingSpec
+ ) -> TSameTree:
+     """
+     Combines a sequence of PyTrees (one per rank) into a single global PyTree.
+
+     This is the inverse of ``shard_tree``. It iterates over the provided trees,
+     gathering corresponding leaves from each rank.
+
+     * If the spec for a leaf is ``SpecShard(dim)``, the tensors from all ranks are
+       concatenated (or stacked if ``do_stack=True``) along that dimension.
+     * If the spec is ``SpecReplicate``, it assumes the data is replicated
+       and takes the item from the first rank.
+
+     Args:
+         sharded_trees: A sequence (list or tuple) of PyTrees. There must be
+             one tree for each shard rank, and they must all share the same
+             structure as ``sharding_spec``.
+         sharding_spec: A structure matching the input trees containing
+             ``SpecShard`` or ``SpecReplicate`` objects.
+
+     Returns:
+         A single PyTree where distinct shards have been merged into full tensors.
+
+     Raises:
+         ValueError: If ``sharded_trees`` is empty, or if the tree structures do
+             not match the spec.
+     """
+     if not sharded_trees:
+         raise ValueError("sharded_trees sequence cannot be empty")
+
+     flat_spec, spec_struct = pytree.tree_flatten(sharding_spec)
+
+     flat_shards_per_rank = []
+     for i, tree in enumerate(sharded_trees):
+         try:
+             leaves = spec_struct.flatten_up_to(tree)
+         except (ValueError, TypeError) as e:
+             raise ValueError(
+                 f"Structure mismatch at shard {i}: tree does not match sharding spec structure"
+             ) from e
+
+         flat_shards_per_rank.append(leaves)
+
+     grouped_leaves = list(zip(*flat_shards_per_rank, strict=True))
+
+     reconstructed_leaves = [
+         _unshard_leaf_from_group(group, spec)
+         for group, spec in zip(grouped_leaves, flat_spec, strict=True)
+     ]
+
+     return spec_struct.unflatten(reconstructed_leaves)
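
A round-trip sketch tying the sharding helpers together, not part of the package contents; the uneven sizes show that enforce_even_split=False tolerates remainders:

    import torch

    from d9d.core.sharding import shard_spec_on_dim, shard_tree, unshard_tree

    tree = {
        "tokens": torch.arange(10).reshape(10, 1),
        "doc_ids": ["a", "b", "c", "d", "e"],
    }
    spec = shard_spec_on_dim(tree, dim=0)

    # The 10 tensor rows are split 4/3/3 and the 5-element list 2/2/1.
    shards = shard_tree(tree, spec, num_shards=3, enforce_even_split=False)

    restored = unshard_tree(shards, spec)
    assert torch.equal(restored["tokens"], tree["tokens"])
    assert restored["doc_ids"] == tree["doc_ids"]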