d9d-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. d9d/__init__.py +0 -0
  2. d9d/core/__init__.py +0 -0
  3. d9d/core/autograd/__init__.py +7 -0
  4. d9d/core/autograd/grad_context.py +85 -0
  5. d9d/core/dist_context/__init__.py +19 -0
  6. d9d/core/dist_context/configured.py +215 -0
  7. d9d/core/dist_context/device_mesh_domains.py +185 -0
  8. d9d/core/dist_context/log.py +30 -0
  9. d9d/core/dist_context/params.py +113 -0
  10. d9d/core/dist_ops/__init__.py +16 -0
  11. d9d/core/dist_ops/object.py +68 -0
  12. d9d/core/dist_ops/tensor.py +192 -0
  13. d9d/core/protocol/__init__.py +8 -0
  14. d9d/core/protocol/training.py +38 -0
  15. d9d/core/sharding/__init__.py +15 -0
  16. d9d/core/sharding/auto_spec.py +66 -0
  17. d9d/core/sharding/shard.py +154 -0
  18. d9d/core/sharding/spec.py +28 -0
  19. d9d/core/sharding/unshard.py +117 -0
  20. d9d/core/types/__init__.py +12 -0
  21. d9d/core/types/data.py +14 -0
  22. d9d/core/types/pytree.py +26 -0
  23. d9d/dataset/__init__.py +17 -0
  24. d9d/dataset/buffer_sorted.py +143 -0
  25. d9d/dataset/padding.py +79 -0
  26. d9d/dataset/sharded.py +195 -0
  27. d9d/internals/__init__.py +0 -0
  28. d9d/internals/determinism/__init__.py +10 -0
  29. d9d/internals/determinism/seed.py +63 -0
  30. d9d/internals/grad_norm/__init__.py +8 -0
  31. d9d/internals/grad_norm/group.py +87 -0
  32. d9d/internals/grad_norm/norm.py +169 -0
  33. d9d/internals/grad_sync/__init__.py +14 -0
  34. d9d/internals/grad_sync/bucket.py +317 -0
  35. d9d/internals/grad_sync/placement_helper.py +23 -0
  36. d9d/internals/grad_sync/synchronizer.py +257 -0
  37. d9d/internals/pipeline_state/__init__.py +14 -0
  38. d9d/internals/pipeline_state/api.py +45 -0
  39. d9d/internals/pipeline_state/handler.py +111 -0
  40. d9d/internals/pipeline_state/storage.py +236 -0
  41. d9d/internals/profiling/__init__.py +7 -0
  42. d9d/internals/profiling/profile.py +112 -0
  43. d9d/internals/state/__init__.py +6 -0
  44. d9d/internals/state/main_process.py +44 -0
  45. d9d/kernel/__init__.py +0 -0
  46. d9d/kernel/cce/__init__.py +5 -0
  47. d9d/kernel/cce/cce.py +298 -0
  48. d9d/kernel/cce/main.py +282 -0
  49. d9d/kernel/general/__init__.py +5 -0
  50. d9d/kernel/general/get_int_dtype.py +7 -0
  51. d9d/kernel/gmm/__init__.py +5 -0
  52. d9d/kernel/gmm/function.py +78 -0
  53. d9d/kernel/moe/__init__.py +8 -0
  54. d9d/kernel/moe/indices_to_multihot.py +268 -0
  55. d9d/kernel/moe/permute_with_probs.py +1035 -0
  56. d9d/kernel/stochastic/__init__.py +11 -0
  57. d9d/kernel/stochastic/adamw_step.py +204 -0
  58. d9d/kernel/stochastic/copy.py +104 -0
  59. d9d/kernel/stochastic/ops/__init__.py +5 -0
  60. d9d/kernel/stochastic/ops/round.py +22 -0
  61. d9d/kernel/swiglu/__init__.py +5 -0
  62. d9d/kernel/swiglu/function.py +36 -0
  63. d9d/kernel/swiglu/op.py +167 -0
  64. d9d/loop/__init__.py +0 -0
  65. d9d/loop/auto/__init__.py +9 -0
  66. d9d/loop/auto/auto_lr_scheduler.py +46 -0
  67. d9d/loop/auto/auto_optimizer.py +196 -0
  68. d9d/loop/component/__init__.py +35 -0
  69. d9d/loop/component/batch_maths.py +106 -0
  70. d9d/loop/component/checkpointer.py +172 -0
  71. d9d/loop/component/data_loader_factory.py +258 -0
  72. d9d/loop/component/garbage_collector.py +94 -0
  73. d9d/loop/component/gradient_clipper.py +89 -0
  74. d9d/loop/component/gradient_manager.py +149 -0
  75. d9d/loop/component/job_logger.py +146 -0
  76. d9d/loop/component/job_profiler.py +62 -0
  77. d9d/loop/component/loss_computer.py +86 -0
  78. d9d/loop/component/model_stage_exporter.py +37 -0
  79. d9d/loop/component/model_stage_factory.py +261 -0
  80. d9d/loop/component/optimizer_factory.py +88 -0
  81. d9d/loop/component/stepper.py +52 -0
  82. d9d/loop/component/timeout_manager.py +54 -0
  83. d9d/loop/component/train_task_operator.py +152 -0
  84. d9d/loop/config/__init__.py +36 -0
  85. d9d/loop/config/config.py +225 -0
  86. d9d/loop/config/types.py +24 -0
  87. d9d/loop/control/__init__.py +61 -0
  88. d9d/loop/control/dataset_provider.py +58 -0
  89. d9d/loop/control/lr_scheduler_provider.py +47 -0
  90. d9d/loop/control/model_provider.py +162 -0
  91. d9d/loop/control/optimizer_provider.py +45 -0
  92. d9d/loop/control/task.py +304 -0
  93. d9d/loop/run/__init__.py +6 -0
  94. d9d/loop/run/train.py +355 -0
  95. d9d/loop/state.py +143 -0
  96. d9d/lr_scheduler/__init__.py +9 -0
  97. d9d/lr_scheduler/piecewise/__init__.py +18 -0
  98. d9d/lr_scheduler/piecewise/builder.py +152 -0
  99. d9d/lr_scheduler/piecewise/config.py +176 -0
  100. d9d/lr_scheduler/piecewise/curves.py +75 -0
  101. d9d/lr_scheduler/piecewise/engine.py +76 -0
  102. d9d/lr_scheduler/visualizer.py +74 -0
  103. d9d/metric/__init__.py +10 -0
  104. d9d/metric/abc.py +79 -0
  105. d9d/metric/impl/__init__.py +7 -0
  106. d9d/metric/impl/compose.py +54 -0
  107. d9d/metric/impl/mean.py +94 -0
  108. d9d/model_state/__init__.py +0 -0
  109. d9d/model_state/io/__init__.py +21 -0
  110. d9d/model_state/io/dto.py +30 -0
  111. d9d/model_state/io/module_reader.py +75 -0
  112. d9d/model_state/io/module_writer.py +123 -0
  113. d9d/model_state/io/reader.py +125 -0
  114. d9d/model_state/io/writer.py +309 -0
  115. d9d/model_state/mapper/__init__.py +10 -0
  116. d9d/model_state/mapper/abc.py +70 -0
  117. d9d/model_state/mapper/adapters/__init__.py +12 -0
  118. d9d/model_state/mapper/adapters/mapper.py +27 -0
  119. d9d/model_state/mapper/adapters/module.py +22 -0
  120. d9d/model_state/mapper/compose/__init__.py +17 -0
  121. d9d/model_state/mapper/compose/helper.py +22 -0
  122. d9d/model_state/mapper/compose/parallel.py +58 -0
  123. d9d/model_state/mapper/compose/sequential.py +131 -0
  124. d9d/model_state/mapper/compose/shard.py +36 -0
  125. d9d/model_state/mapper/leaf/__init__.py +18 -0
  126. d9d/model_state/mapper/leaf/dtensor.py +56 -0
  127. d9d/model_state/mapper/leaf/identity.py +23 -0
  128. d9d/model_state/mapper/leaf/rename.py +26 -0
  129. d9d/model_state/mapper/leaf/select_child.py +37 -0
  130. d9d/model_state/mapper/leaf/stack.py +29 -0
  131. d9d/module/__init__.py +0 -0
  132. d9d/module/base/__init__.py +7 -0
  133. d9d/module/base/late_init.py +10 -0
  134. d9d/module/block/__init__.py +0 -0
  135. d9d/module/block/attention/__init__.py +7 -0
  136. d9d/module/block/attention/grouped_query.py +139 -0
  137. d9d/module/block/attention/sdpa/__init__.py +5 -0
  138. d9d/module/block/attention/sdpa/flash.py +52 -0
  139. d9d/module/block/embedding/__init__.py +7 -0
  140. d9d/module/block/embedding/shard_token_embedding.py +103 -0
  141. d9d/module/block/ffn/__init__.py +5 -0
  142. d9d/module/block/ffn/swiglu.py +60 -0
  143. d9d/module/block/head/__init__.py +6 -0
  144. d9d/module/block/head/language_modelling.py +87 -0
  145. d9d/module/block/hidden_states_aggregator/__init__.py +12 -0
  146. d9d/module/block/hidden_states_aggregator/base.py +35 -0
  147. d9d/module/block/hidden_states_aggregator/factory.py +48 -0
  148. d9d/module/block/hidden_states_aggregator/mean.py +61 -0
  149. d9d/module/block/hidden_states_aggregator/noop.py +27 -0
  150. d9d/module/block/moe/__init__.py +13 -0
  151. d9d/module/block/moe/communications/__init__.py +11 -0
  152. d9d/module/block/moe/communications/base.py +58 -0
  153. d9d/module/block/moe/communications/deepep.py +300 -0
  154. d9d/module/block/moe/communications/naive.py +68 -0
  155. d9d/module/block/moe/grouped_experts.py +81 -0
  156. d9d/module/block/moe/grouped_linear.py +78 -0
  157. d9d/module/block/moe/layer.py +122 -0
  158. d9d/module/block/moe/router.py +103 -0
  159. d9d/module/block/positional/__init__.py +8 -0
  160. d9d/module/block/positional/rope.py +150 -0
  161. d9d/module/model/__init__.py +0 -0
  162. d9d/module/model/qwen3_moe/__init__.py +16 -0
  163. d9d/module/model/qwen3_moe/decoder_layer.py +110 -0
  164. d9d/module/model/qwen3_moe/model.py +373 -0
  165. d9d/module/model/qwen3_moe/params.py +69 -0
  166. d9d/module/parallelism/__init__.py +0 -0
  167. d9d/module/parallelism/api/__init__.py +18 -0
  168. d9d/module/parallelism/api/expert_parallel.py +36 -0
  169. d9d/module/parallelism/api/fully_sharded.py +43 -0
  170. d9d/module/parallelism/api/hybrid_sharded.py +49 -0
  171. d9d/module/parallelism/api/replicate_parallel.py +33 -0
  172. d9d/module/parallelism/model/__init__.py +0 -0
  173. d9d/module/parallelism/model/qwen3_moe.py +99 -0
  174. d9d/module/parallelism/style/__init__.py +7 -0
  175. d9d/module/parallelism/style/shard_experts.py +60 -0
  176. d9d/module/parallelism/style/to_local.py +86 -0
  177. d9d/optim/__init__.py +0 -0
  178. d9d/optim/stochastic/__init__.py +5 -0
  179. d9d/optim/stochastic/adamw.py +158 -0
  180. d9d/peft/__init__.py +13 -0
  181. d9d/peft/all/__init__.py +12 -0
  182. d9d/peft/all/config.py +31 -0
  183. d9d/peft/all/method.py +76 -0
  184. d9d/peft/applicator.py +47 -0
  185. d9d/peft/base.py +70 -0
  186. d9d/peft/full_tune/__init__.py +11 -0
  187. d9d/peft/full_tune/config.py +20 -0
  188. d9d/peft/full_tune/method.py +46 -0
  189. d9d/peft/lora/__init__.py +15 -0
  190. d9d/peft/lora/config.py +35 -0
  191. d9d/peft/lora/layer.py +177 -0
  192. d9d/peft/lora/method.py +132 -0
  193. d9d/pipelining/__init__.py +0 -0
  194. d9d/pipelining/api/__init__.py +19 -0
  195. d9d/pipelining/api/module.py +149 -0
  196. d9d/pipelining/api/schedule.py +50 -0
  197. d9d/pipelining/api/sharding.py +9 -0
  198. d9d/pipelining/factory/__init__.py +21 -0
  199. d9d/pipelining/factory/config.py +89 -0
  200. d9d/pipelining/factory/factory.py +114 -0
  201. d9d/pipelining/factory/registry.py +82 -0
  202. d9d/pipelining/infra/__init__.py +0 -0
  203. d9d/pipelining/infra/schedule/__init__.py +0 -0
  204. d9d/pipelining/infra/schedule/component/__init__.py +0 -0
  205. d9d/pipelining/infra/schedule/component/program/__init__.py +22 -0
  206. d9d/pipelining/infra/schedule/component/program/base.py +35 -0
  207. d9d/pipelining/infra/schedule/component/program/communications.py +203 -0
  208. d9d/pipelining/infra/schedule/component/program/topology.py +78 -0
  209. d9d/pipelining/infra/schedule/component/runtime/__init__.py +29 -0
  210. d9d/pipelining/infra/schedule/component/runtime/action.py +361 -0
  211. d9d/pipelining/infra/schedule/component/runtime/communications.py +101 -0
  212. d9d/pipelining/infra/schedule/component/runtime/executor.py +113 -0
  213. d9d/pipelining/infra/schedule/component/runtime/loss.py +55 -0
  214. d9d/pipelining/infra/schedule/program/__init__.py +15 -0
  215. d9d/pipelining/infra/schedule/program/bfs.py +86 -0
  216. d9d/pipelining/infra/schedule/program/dualpipev.py +234 -0
  217. d9d/pipelining/infra/schedule/program/interleaved.py +240 -0
  218. d9d/pipelining/infra/schedule/program/zerobubblev.py +227 -0
  219. d9d/pipelining/infra/stage/__init__.py +5 -0
  220. d9d/pipelining/infra/stage/communications.py +274 -0
  221. d9d/pipelining/infra/stage/computations.py +317 -0
  222. d9d/pipelining/infra/stage/splitgrad.py +377 -0
  223. d9d/pipelining/infra/stage/stage.py +321 -0
  224. d9d/pipelining/infra/stage/struct_helper.py +46 -0
  225. d9d/pipelining/training/__init__.py +7 -0
  226. d9d/pipelining/training/optimizer.py +41 -0
  227. d9d/pipelining/training/scheduler.py +34 -0
  228. d9d/tracker/__init__.py +14 -0
  229. d9d/tracker/base.py +124 -0
  230. d9d/tracker/factory.py +57 -0
  231. d9d/tracker/provider/__init__.py +0 -0
  232. d9d/tracker/provider/aim/__init__.py +0 -0
  233. d9d/tracker/provider/aim/config.py +23 -0
  234. d9d/tracker/provider/aim/tracker.py +114 -0
  235. d9d/tracker/provider/null.py +61 -0
  236. d9d-0.1.0.dist-info/METADATA +90 -0
  237. d9d-0.1.0.dist-info/RECORD +238 -0
  238. d9d-0.1.0.dist-info/WHEEL +4 -0
d9d/model_state/io/dto.py
@@ -0,0 +1,30 @@
+ from pydantic import BaseModel
+
+
+ class ModelStateIndexMeta(BaseModel):
+     """
+     Metadata for the model state index.
+
+     Attributes:
+         total_size: Total size of the model parameters in bytes.
+     """
+
+     total_size: int
+
+
+ class ModelStateIndex(BaseModel):
+     """
+     Represents the content of the `model.safetensors.index.json` file.
+
+     This index maps every weight name to the specific .safetensors file containing it.
+
+     Attributes:
+         metadata: Global metadata about the checkpoint.
+         weight_map: Mapping from parameter name to filename.
+     """
+
+     metadata: ModelStateIndexMeta
+     weight_map: dict[str, str]
+
+
+ MODEL_STATE_INDEX_FILE_NAME = "model.safetensors.index.json"
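The `ModelStateIndex` schema above mirrors the usual `model.safetensors.index.json` layout. As a quick illustration (not part of the package), here is a minimal sketch of building and round-tripping an index; the weight names and shard file names are invented, and only the pydantic v2 methods already used elsewhere in this diff are assumed:

```python
# Illustrative only: weight names and shard file names below are made up.
from d9d.model_state.io.dto import ModelStateIndex, ModelStateIndexMeta

index = ModelStateIndex(
    metadata=ModelStateIndexMeta(total_size=2 * 1024**3),  # total parameter bytes
    weight_map={
        "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
        "model.layers.0.mlp.gate.weight": "model-00002-of-00002.safetensors",
    },
)

# Round-trip through JSON using the same pydantic v2 calls the reader/writer use.
as_json = index.model_dump_json(indent=4)
assert ModelStateIndex.model_validate_json(as_json) == index
```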
d9d/model_state/io/module_reader.py
@@ -0,0 +1,75 @@
+ from pathlib import Path
+
+ import torch
+ from torch import nn
+ from torch.distributed.tensor import DTensor
+
+ from d9d.model_state.mapper import ModelStateMapper
+ from d9d.model_state.mapper.compose import (
+     ModelStateMapperParallel,
+     ModelStateMapperSequential,
+ )
+ from d9d.model_state.mapper.leaf import (
+     ModelStateMapperDistribute,
+     ModelStateMapperIdentity,
+ )
+
+ from .reader import read_model_state
+
+
+ def _build_injection_mapper(name: str, state: torch.Tensor) -> ModelStateMapper:
+     if isinstance(state, DTensor):
+         return ModelStateMapperDistribute(name=name, placements=state.placements, device_mesh=state.device_mesh)
+     else:
+         return ModelStateMapperIdentity(name)
+
+
+ def _augment_mapper_for_injection(model: nn.Module, mapper: ModelStateMapper) -> ModelStateMapper:
+     states_to_load = {output for group in mapper.state_dependency_groups() for output in group.outputs}
+     current_state_dict = model.state_dict()
+     mapper = ModelStateMapperSequential([
+         mapper,
+         ModelStateMapperParallel([_build_injection_mapper(name, current_state_dict[name]) for name in states_to_load])
+     ])
+     return mapper
+
+
+ def load_model_state(
+     src_dir: Path,
+     mapper: ModelStateMapper,
+     device: str,
+     model: nn.Module,
+     show_progress: bool = True,
+ ):
+     """
+     High-level utility to stream a checkpoint directly into a PyTorch module.
+
+     This function orchestrates the full loading lifecycle:
+
+     1. Topology Mapping: Uses `mapper` to rename/stack/reshape on-disk states to model states.
+
+     2. Automatic Distribution: If the `model` contains `DTensor`s, the loaded local tensors are automatically
+        sharded/replicated to match the model's placement schema.
+
+     3. Streaming Read & Inject: After loading and transforming a model state, it will be injected into `model`
+        using `load_state_dict(...)`.
+
+     NOTICE: Only states specified in `mapper` will be loaded! You can use
+     `d9d.model_state.mapper.adapters.identity_mapper_from_module(module)` to create a mapper that will load every
+     model state without changing it.
+
+     Args:
+         src_dir: Directory containing .safetensors and index files.
+         mapper: The topology defining how mapping from disk keys to model keys works.
+         device: The device to load tensors onto (usually "cpu" or "cuda").
+         model: The model instance to load weights into.
+         show_progress: Whether to display the loading progress bar.
+     """
+
+     for state_name, state_value in read_model_state(
+         src_dir=src_dir,
+         mapper=_augment_mapper_for_injection(model, mapper),
+         device=device,
+         show_progress=show_progress
+     ):
+         model.load_state_dict({state_name: state_value}, strict=False)
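For context, a plausible call site for `load_model_state` could look like the sketch below. The import path follows the file listing above (`d9d/model_state/io/module_reader.py`), `identity_mapper_from_module` is the helper named in the docstring's NOTICE, and the checkpoint directory and toy model are placeholders:

```python
# Hypothetical usage sketch; the checkpoint directory and toy model are placeholders.
from pathlib import Path

from torch import nn

from d9d.model_state.io.module_reader import load_model_state
from d9d.model_state.mapper.adapters import identity_mapper_from_module

model = nn.Linear(16, 16)  # stands in for a real (possibly DTensor-sharded) model

load_model_state(
    src_dir=Path("/checkpoints/my-model"),      # holds *.safetensors + model.safetensors.index.json
    mapper=identity_mapper_from_module(model),  # load every state under its existing name
    device="cpu",
    model=model,
)
```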
d9d/model_state/io/module_writer.py
@@ -0,0 +1,123 @@
+ from collections.abc import Iterable
+ from pathlib import Path
+
+ import torch
+ from torch import nn
+ from torch.distributed import DeviceMesh
+ from torch.distributed.tensor import DTensor
+
+ from d9d.model_state.mapper import ModelStateMapper
+ from d9d.model_state.mapper.compose import (
+     ModelStateMapperParallel,
+     ModelStateMapperSequential,
+ )
+ from d9d.model_state.mapper.leaf import (
+     ModelStateMapperGatherFullTensor,
+     ModelStateMapperIdentity,
+ )
+
+ from .writer import (
+     write_model_state_local,
+     write_model_state_pipeline_parallel,
+ )
+
+
+ def _build_extraction_mapper(name: str, state: torch.Tensor) -> ModelStateMapper:
+     if isinstance(state, DTensor):
+         return ModelStateMapperGatherFullTensor(name)
+     else:
+         return ModelStateMapperIdentity(name)
+
+
+ def _augment_mapper_for_extraction(models: list[nn.Module], mapper: ModelStateMapper) -> ModelStateMapper:
+     states_to_save = {input_state for group in mapper.state_dependency_groups() for input_state in group.inputs}
+
+     current_state_dict = {}
+     for model in models:
+         current_state_dict.update(model.state_dict())
+     mapper = ModelStateMapperSequential([
+         ModelStateMapperParallel([_build_extraction_mapper(name, current_state_dict[name]) for name in states_to_save]),
+         mapper
+     ])
+     return mapper
+
+
+ def _state_generator(models: list[nn.Module]) -> Iterable[tuple[str, torch.Tensor]]:
+     for model in models:
+         yield from model.state_dict().items()
+
+
+ def save_model_state(
+     dest_dir: Path,
+     mapper: ModelStateMapper,
+     model: nn.Module,
+     shard_size_gb: float = 4.0,
+     show_progress: bool = True
+ ):
+     """
+     High-level utility to save a PyTorch model to disk on a **single** process.
+
+     NOTICE: Only states specified in `mapper` will be saved! You can use
+     `d9d.model_state.mapper.adapters.identity_mapper_from_module(module)` to create a mapper that will save every
+     model state without changing it.
+
+     Args:
+         dest_dir: The directory to save .safetensors shards and index.
+         mapper: Topology defining how model keys map to disk keys.
+         model: The PyTorch module to save.
+         shard_size_gb: Max size per shard file in Gigabytes.
+         show_progress: Whether to display a progress bar.
+     """
+
+     write_model_state_local(
+         dest_dir=dest_dir,
+         mapper=_augment_mapper_for_extraction([model], mapper),
+         state_generator=_state_generator([model]),
+         shard_size_gb=shard_size_gb,
+         show_progress=show_progress
+     )
+
+
+ def save_model_state_pipeline_parallel(
+     dest_dir: Path,
+     mapper: ModelStateMapper,
+     device_mesh: DeviceMesh,
+     pipeline_dim_name: str,
+     models: list[nn.Module],
+     shard_size_gb: float = 4.0,
+     show_progress: bool = True
+ ):
+     """
+     High-level utility to save a model in a Distributed Pipeline Parallel environment to disk.
+
+     Features:
+
+     1. **Auto-Gather**: Converts `DTensor` parameters to full tensors before saving.
+
+     2. **Distribution Awareness**: Uses the `device_mesh` to ensure that for a given pipeline stage,
+        only the master rank writes the checkpoint, preventing Write-After-Write conflicts.
+
+     3. **Index Merging**: Aggregates metadata from all independent pipeline stages into one global index file.
+
+     NOTICE: Only states specified in `mapper` will be saved! You can use
+     `d9d.model_state.mapper.adapters.identity_mapper_from_module(module)` to create a mapper that will save every
+     model state without changing it.
+
+     Args:
+         dest_dir: directory to save .safetensors shards and index file.
+         mapper: Topology defining how model keys map to disk keys.
+         device_mesh: The cluster topology mesh.
+         pipeline_dim_name: The specific dimension name in the mesh used for pipelining.
+         models: A list of modules (pipeline stages) processed by this PP rank.
+         shard_size_gb: Max size per shard file in Gigabytes.
+         show_progress: Whether to display a progress bar.
+     """
+     write_model_state_pipeline_parallel(
+         dest_dir=dest_dir,
+         mapper=_augment_mapper_for_extraction(models, mapper),
+         state_generator=_state_generator(models),
+         device_mesh=device_mesh,
+         pipeline_dim_name=pipeline_dim_name,
+         shard_size_gb=shard_size_gb,
+         show_progress=show_progress
+     )
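A corresponding single-process save might be invoked as sketched below. The import path is again inferred from the file listing (`d9d/model_state/io/module_writer.py`), and the destination path and toy model are placeholders:

```python
# Hypothetical usage sketch; destination path and toy model are placeholders.
from pathlib import Path

from torch import nn

from d9d.model_state.io.module_writer import save_model_state
from d9d.model_state.mapper.adapters import identity_mapper_from_module

model = nn.Linear(16, 16)

save_model_state(
    dest_dir=Path("/checkpoints/export"),
    mapper=identity_mapper_from_module(model),  # save every state under its existing name
    model=model,
    shard_size_gb=4.0,  # keep each .safetensors shard under ~4 GB
)
```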
d9d/model_state/io/reader.py
@@ -0,0 +1,125 @@
+ from collections import defaultdict
+ from collections.abc import Generator, Iterable
+ from pathlib import Path
+
+ import torch
+ from safetensors import safe_open
+ from tqdm import tqdm
+
+ from d9d.model_state.io.dto import MODEL_STATE_INDEX_FILE_NAME, ModelStateIndex
+ from d9d.model_state.mapper import ModelStateMapper
+
+
+ class _StateLoadingFlow:
+     """
+     Internal orchestration logic for loading and transforming model states in a streamed manner.
+     """
+
+     def __init__(
+         self,
+         src_dir: Path,
+         mapper: ModelStateMapper,
+         device: str,
+         show_progress: bool
+     ):
+         self._src_dir = src_dir
+         self._mapper = mapper
+         self._device = device
+
+         # I/O in constructor!
+         self._index = self._load_index()
+         self._groups_to_process = set(mapper.state_dependency_groups())
+
+         self._stored_states: dict[str, torch.Tensor] = {}
+
+         self._check_index()
+
+         self._pbar = tqdm(
+             desc="Loading Model States",
+             total=len([output_name for group in self._groups_to_process for output_name in group.outputs]),
+             disable=not show_progress
+         )
+
+     def _load_index(self) -> ModelStateIndex:
+         index_file = self._src_dir / MODEL_STATE_INDEX_FILE_NAME
+         index_data = index_file.read_text(encoding="utf-8")
+         index = ModelStateIndex.model_validate_json(index_data)
+         return index
+
+     def _check_index(self):
+         will_process_inputs: set[str] = set()
+         for group in self._groups_to_process:
+             will_process_inputs.update(group.inputs)
+
+         on_disk_inputs = set(self._index.weight_map.keys())
+
+         missing_inputs = will_process_inputs.difference(on_disk_inputs)
+
+         if len(missing_inputs) > 0:
+             raise ValueError(f"Cannot run state loading: states {missing_inputs} are missing!")
+
+     def _update_in_memory_states(self, file_to_load: str, params_to_load: set[str]):
+         with safe_open(str(self._src_dir / file_to_load), framework="pt", device=str(self._device)) as st:
+             for param_to_load in params_to_load:
+                 self._stored_states[param_to_load] = st.get_tensor(param_to_load)
+
+     def _process_available_groups(self) -> Generator[tuple[str, torch.Tensor], None, None]:
+         for group in self._groups_to_process.copy():
+             if not group.inputs.issubset(self._stored_states.keys()):
+                 continue
+
+             self._groups_to_process.remove(group)
+
+             loaded_states = self._mapper.apply(
+                 {k: v for k, v in self._stored_states.items() if k in group.inputs}
+             )
+             yield from loaded_states.items()
+             self._pbar.update(len(loaded_states))
+
+             for input_name in group.inputs:
+                 del self._stored_states[input_name]
+
+     def _build_file_loading_plan(self) -> dict[str, set[str]]:
+         plan = defaultdict(set)
+         for group in self._mapper.state_dependency_groups():
+             for key in group.inputs:
+                 require_file = self._index.weight_map[key]
+                 plan[require_file].add(key)
+         return plan
+
+     def load(self) -> Iterable[tuple[str, torch.Tensor]]:
+         with self._pbar:
+             for file_to_load, params_to_load in self._build_file_loading_plan().items():
+                 self._update_in_memory_states(file_to_load, params_to_load)
+                 yield from self._process_available_groups()
+
+
+ def read_model_state(
+     src_dir: Path,
+     mapper: ModelStateMapper,
+     device: str,
+     show_progress: bool = True
+ ) -> Iterable[tuple[str, torch.Tensor]]:
+     """
+     Reads a model checkpoint from disk, transforming it on-the-fly according to the state mapper.
+
+     This function uses a streaming approach. It analyzes the mapper to determine which files
+     need to be loaded. Tensors are loaded into memory only when needed and evicted immediately
+     after the mapper processes them.
+
+     Args:
+         src_dir: The directory containing .safetensors files and `model.safetensors.index.json` file.
+         mapper: The transformation graph defining how to map on-disk keys to output keys.
+         device: The device to load tensors onto (e.g., "cpu", "cuda:0").
+         show_progress: Whether to display a progress bar.
+
+     Yields:
+         A tuple containing the transformed parameter name and its tensor value.
+     """
+
+     yield from _StateLoadingFlow(
+         src_dir=src_dir,
+         device=device,
+         mapper=mapper,
+         show_progress=show_progress
+     ).load()
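Because `read_model_state` is a generator, a caller can scan a large checkpoint without materialising it all at once. The following sketch (placeholder checkpoint path, toy module, `identity_mapper_from_module` as named in the docstrings above) illustrates the streaming pattern:

```python
# Hypothetical usage sketch; the checkpoint directory and toy module are placeholders.
from pathlib import Path

from torch import nn

from d9d.model_state.io.reader import read_model_state
from d9d.model_state.mapper.adapters import identity_mapper_from_module

model = nn.Linear(16, 16)

total_params = 0
for name, tensor in read_model_state(
    src_dir=Path("/checkpoints/my-model"),
    mapper=identity_mapper_from_module(model),
    device="cpu",
    show_progress=False,
):
    # the flow drops its internal copies after each dependency group is processed,
    # so keep only what you actually need here
    total_params += tensor.numel()

print(f"streamed {total_params} parameters")
```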
d9d/model_state/io/writer.py
@@ -0,0 +1,309 @@
+ import warnings
+ from collections.abc import Iterable
+ from pathlib import Path
+ from typing import cast
+
+ import torch
+ from safetensors.torch import save_file
+ from torch.distributed import DeviceMesh, ProcessGroup
+ from tqdm import tqdm
+
+ from d9d.core.dist_ops import all_gather_object
+ from d9d.model_state.io.dto import (
+     MODEL_STATE_INDEX_FILE_NAME,
+     ModelStateIndex,
+     ModelStateIndexMeta,
+ )
+ from d9d.model_state.mapper import ModelStateMapper
+
+
+ class _StateWritingFlowLocal:
+     """
+     Internal orchestration logic for buffering, transforming, and sharding model states during save.
+     """
+
+     def __init__(
+         self,
+         dest_dir: Path,
+         mapper: ModelStateMapper,
+         shard_size_gb: float,
+         show_progress: bool,
+         sharding_rank: int,
+         # the writing flow has to be called from all processes, but only the master rank buffers and writes shards
+         is_current_process_rank_master: bool
+     ):
+         self._dest_dir = dest_dir
+         self._mapper = mapper
+         self._shard_size_bytes = int(shard_size_gb * (1024 ** 3))
+
+         self._groups_to_process = set(mapper.state_dependency_groups())
+
+         self._available_source_states: dict[str, torch.Tensor] = {}
+
+         self._total_size = 0
+         self._pending_write_tensors: dict[str, torch.Tensor] = {}
+         self._current_shard_size = 0
+
+         self._sharding_rank = sharding_rank
+         self._weight_name_to_local_shard_idx: dict[str, int] = {}
+         self._local_shard_idx_to_tmp_path: dict[int, Path] = {}
+
+         self._is_current_process_rank_master = is_current_process_rank_master
+         total_num_outputs = len([out_name for group in self._groups_to_process for out_name in group.outputs])
+         self._pbar = tqdm(
+             desc="Saving Model States",
+             total=total_num_outputs,
+             disable=not (show_progress and is_current_process_rank_master)
+         )
+
+     def _flush_shard(self):
+         if not self._pending_write_tensors:
+             return
+
+         local_shard_num = len(self._local_shard_idx_to_tmp_path) + 1
+         shard_tmp_path = self._dest_dir / f".tmp-rank{self._sharding_rank}-shard-{local_shard_num}.safetensors"
+
+         self._local_shard_idx_to_tmp_path[local_shard_num] = shard_tmp_path
+         save_file(self._pending_write_tensors, str(shard_tmp_path))
+
+         for state_name in self._pending_write_tensors:
+             self._weight_name_to_local_shard_idx[state_name] = local_shard_num
+
+         self._pbar.update(len(self._pending_write_tensors))
+
+         self._total_size += self._current_shard_size
+
+         self._pending_write_tensors.clear()
+         self._current_shard_size = 0
+
+     def _process_available_groups(self):
+         for group in self._groups_to_process.copy():
+             if not group.inputs.issubset(self._available_source_states.keys()):
+                 continue
+
+             self._groups_to_process.remove(group)
+
+             states_to_save = self._mapper.apply(
+                 {k: self._available_source_states[k] for k in group.inputs}
+             )
+
+             for input_name in group.inputs:
+                 del self._available_source_states[input_name]
+
+             # proceed with stateful saving only on master rank
+             if self._is_current_process_rank_master:
+                 for name, tensor in states_to_save.items():
+                     update_size = tensor.numel() * tensor.element_size()
+
+                     if update_size > self._shard_size_bytes:
+                         raise ValueError(f"Cannot save state {name} that is larger than shard size")
+
+                     if self._current_shard_size + update_size > self._shard_size_bytes:
+                         self._flush_shard()
+
+                     self._pending_write_tensors[name] = tensor
+                     self._current_shard_size += update_size
+
+     def _finalize_locally(self) -> ModelStateIndex:
+         self._flush_shard()
+
+         if self._groups_to_process:
+             missing_groups = {g.inputs for g in self._groups_to_process}
+             raise ValueError(
+                 f"Writing failed: not all source tensors were provided to satisfy mapper dependencies. "
+                 f"Missing inputs for groups: {missing_groups}"
+             )
+
+         if self._available_source_states:
+             warnings.warn(
+                 f"State Writing: The following source tensors were provided but not consumed by any "
+                 f"mapper group and will be ignored: {sorted(self._available_source_states.keys())}",
+                 stacklevel=2
+             )
+
+         weight_map_local = {
+             name: self._local_shard_idx_to_tmp_path[shard_idx].name
+             for name, shard_idx in self._weight_name_to_local_shard_idx.items()
+         }
+
+         return ModelStateIndex(
+             metadata=ModelStateIndexMeta(total_size=self._total_size),
+             weight_map=weight_map_local
+         )
+
+     def write(self, state_generator: Iterable[tuple[str, torch.Tensor]]) -> ModelStateIndex | None:
+         with self._pbar:
+             self._dest_dir.mkdir(parents=True, exist_ok=True)
+
+             for name, tensor in state_generator:
+                 self._available_source_states[name] = tensor
+                 self._process_available_groups()
+
+             if self._is_current_process_rank_master:
+                 return self._finalize_locally()
+             else:
+                 return None
+
+
+ def _finalize_master(dest_dir: Path, indices: list[ModelStateIndex]):
+     total_size = sum(index.metadata.total_size for index in indices)
+     total_weight_map_local = dict(pair for index in indices for pair in index.weight_map.items())
+     shard_count = len({file_name for index in indices for _, file_name in index.weight_map.items()})
+
+     total_weight_map = {}
+
+     local_file_to_global_file = {}
+     used_global_files = 0
+
+     for weight_name, old_file_name in total_weight_map_local.items():
+         if old_file_name not in local_file_to_global_file:
+             used_global_files += 1
+             new_file_name = f"model-{used_global_files:05d}-of-{shard_count:05d}.safetensors"
+
+             (dest_dir / old_file_name).rename(dest_dir / new_file_name)
+
+             local_file_to_global_file[old_file_name] = new_file_name
+
+         total_weight_map[weight_name] = local_file_to_global_file[old_file_name]
+
+     index_path = dest_dir / MODEL_STATE_INDEX_FILE_NAME
+     index_path.write_text(
+         ModelStateIndex(
+             metadata=ModelStateIndexMeta(total_size=total_size),
+             weight_map=total_weight_map
+         ).model_dump_json(indent=4),
+         encoding="utf-8"
+     )
+
+
+ def write_model_state_local(
+     dest_dir: Path,
+     mapper: ModelStateMapper,
+     state_generator: Iterable[tuple[str, torch.Tensor]],
+     shard_size_gb: float = 4.0,
+     show_progress: bool = True
+ ):
+     """
+     Saves model states to disk in a single local process.
+
+     This function uses a streaming approach. It analyzes the mapper to determine which files
+     need to be saved. Tensors are loaded into memory only when needed and evicted immediately
+     after the mapper processes them.
+
+     Args:
+         dest_dir: Destination directory.
+         mapper: Mapping to apply to states before saving.
+         state_generator: Stream of (name, tensor) pairs to save.
+         shard_size_gb: Maximum size of a single .safetensors file in GB.
+         show_progress: Whether to show the progress bar.
+     """
+     idx = _StateWritingFlowLocal(
+         dest_dir=dest_dir,
+         mapper=mapper,
+         shard_size_gb=shard_size_gb,
+         show_progress=show_progress,
+         sharding_rank=0,
+         is_current_process_rank_master=True
+     ).write(state_generator=state_generator)
+
+     idx = cast(ModelStateIndex, idx)  # we are sure is_current_process_rank_master=True
+
+     _finalize_master(dest_dir, [idx])
+
+
+ def write_model_state_distributed(
+     dest_dir: Path,
+     mapper: ModelStateMapper,
+     state_generator: Iterable[tuple[str, torch.Tensor]],
+     process_group: ProcessGroup,
+     shard_size_gb: float = 4.0,
+     show_progress: bool = True
+ ):
+     """
+     Saves model states in a distributed setup (multiple processes).
+
+     This function uses a streaming approach. It analyzes the mapper to determine which files
+     need to be saved. Tensors are loaded into memory only when needed and evicted immediately
+     after the mapper processes them.
+
+     Each rank writes its own shard. Rank 0 gathers indices and finalizes the checkpoint.
+
+     Args:
+         dest_dir: Destination directory.
+         mapper: Mapping to apply to states before saving.
+         state_generator: Stream of (name, tensor) pairs from the model.
+         process_group: The distributed process group.
+         shard_size_gb: Maximum shard size in GB.
+         show_progress: Whether to show the progress bar.
+     """
+
+     current_idx = _StateWritingFlowLocal(
+         dest_dir=dest_dir,
+         mapper=mapper,
+         shard_size_gb=shard_size_gb,
+         show_progress=show_progress,
+         sharding_rank=process_group.rank(),
+         is_current_process_rank_master=True
+     ).write(state_generator=state_generator)
+     gather_idx = all_gather_object(current_idx, process_group)
+     gather_idx_filter = [x for x in gather_idx if x is not None]
+     if process_group.rank() == 0:
+         _finalize_master(dest_dir, gather_idx_filter)
+
+
+ def write_model_state_pipeline_parallel(
+     dest_dir: Path,
+     mapper: ModelStateMapper,
+     state_generator: Iterable[tuple[str, torch.Tensor]],
+     device_mesh: DeviceMesh,
+     pipeline_dim_name: str,
+     shard_size_gb: float = 4.0,
+     show_progress: bool = True
+ ):
+     """
+     Saves model states in a complex ND distributed training setting.
+
+     This function uses a streaming approach. It analyzes the mapper to determine which files
+     need to be saved. Tensors are loaded into memory only when needed and evicted immediately
+     after the mapper processes them.
+
+     This handles Pipeline Parallelism by ensuring that only one rank per pipeline stage
+     actually writes data to disk to avoid duplication.
+
+     Args:
+         dest_dir: Destination directory.
+         mapper: Mapping to apply to states before saving.
+         state_generator: Stream of (name, tensor) pairs from the model.
+         device_mesh: The PyTorch DeviceMesh representing the cluster layout.
+         pipeline_dim_name: The name of the mesh dimension responsible for pipeline parallelism.
+         shard_size_gb: Maximum shard size in GB.
+         show_progress: Whether to show the progress bar.
+     """
+
+     pipeline_rank = device_mesh[pipeline_dim_name].get_rank()
+
+     mesh_dim_names = device_mesh.mesh_dim_names
+     coords = device_mesh.get_coordinate()
+     if mesh_dim_names is None or coords is None:
+         raise ValueError("Cannot save state using a DeviceMesh with no dim names or coords")
+
+     non_pipeline_coord_sum = sum(
+         coord
+         for name, coord
+         in zip(mesh_dim_names, coords, strict=True)
+         if name != pipeline_dim_name
+     )
+     master_within_pipeline_rank = non_pipeline_coord_sum == 0
+
+     current_idx = _StateWritingFlowLocal(
+         dest_dir=dest_dir,
+         mapper=mapper,
+         shard_size_gb=shard_size_gb,
+         show_progress=show_progress,
+         sharding_rank=pipeline_rank,
+         is_current_process_rank_master=master_within_pipeline_rank
+     ).write(state_generator=state_generator)
+     gather_idx = all_gather_object(current_idx, device_mesh.get_group(0))
+     gather_idx_filter = [x for x in gather_idx if x is not None]
+     if pipeline_rank == 0 and master_within_pipeline_rank:
+         _finalize_master(dest_dir, gather_idx_filter)
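The lower-level `write_model_state_local` accepts any `(name, tensor)` stream, so it can shard arbitrary tensors rather than only `nn.Module` state dicts. A minimal sketch follows, with made-up tensor names and destination path, using only the mapper classes imported elsewhere in this diff:

```python
# Hypothetical usage sketch; tensor names, shapes, and the destination path are invented.
from pathlib import Path

import torch

from d9d.model_state.io.writer import write_model_state_local
from d9d.model_state.mapper.compose import ModelStateMapperParallel
from d9d.model_state.mapper.leaf import ModelStateMapperIdentity

states = {
    "layer.weight": torch.zeros(8, 8),
    "layer.bias": torch.zeros(8),
}

write_model_state_local(
    dest_dir=Path("/checkpoints/raw-export"),
    # pass each named tensor through unchanged; only names listed here get written
    mapper=ModelStateMapperParallel([ModelStateMapperIdentity(name) for name in states]),
    state_generator=iter(states.items()),
    shard_size_gb=1.0,
    show_progress=False,
)
```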
d9d/model_state/mapper/__init__.py
@@ -0,0 +1,10 @@
+ """
+ This package provides core components of the state mapping system.
+ """
+
+ from .abc import ModelStateMapper, StateGroup
+
+ __all__ = [
+     "ModelStateMapper",
+     "StateGroup"
+ ]