checkpoint-engine 0.2.3__tar.gz → 0.3.0rc1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/PKG-INFO +1 -1
  2. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/checkpoint_engine/_version.py +3 -3
  3. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/checkpoint_engine/ps.py +119 -92
  4. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/checkpoint_engine.egg-info/PKG-INFO +1 -1
  5. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/checkpoint_engine.egg-info/SOURCES.txt +2 -1
  6. checkpoint_engine-0.3.0rc1/tests/test_inplace_unpin.py +81 -0
  7. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/tests/test_update.py +7 -3
  8. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/.github/workflows/cpu-tests.yml +0 -0
  9. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/.github/workflows/pre-commit.yaml +0 -0
  10. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/.github/workflows/python-publish.yml +0 -0
  11. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/.gitignore +0 -0
  12. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/.pre-commit-config.yaml +0 -0
  13. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/LICENCE +0 -0
  14. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/README.md +0 -0
  15. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/checkpoint_engine/__init__.py +0 -0
  16. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/checkpoint_engine/device_utils.py +0 -0
  17. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/checkpoint_engine/worker.py +0 -0
  18. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/checkpoint_engine.egg-info/dependency_links.txt +0 -0
  19. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/checkpoint_engine.egg-info/requires.txt +0 -0
  20. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/checkpoint_engine.egg-info/top_level.txt +0 -0
  21. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/docs/npu_start.md +0 -0
  22. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/examples/update.py +0 -0
  23. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/figures/checkpoint-engine.png +0 -0
  24. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/figures/overlap-update-and-copy.png +0 -0
  25. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/figures/pipeline.png +0 -0
  26. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/patches/vllm_fp8.patch +0 -0
  27. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/pyproject.toml +0 -0
  28. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/setup.cfg +0 -0
  29. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/tests/test_assign_receiver_ranks.py +0 -0
  30. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/tests/test_rdma_parser.py +0 -0
  31. /checkpoint_engine-0.2.3/tests/test_pin_memory.py → /checkpoint_engine-0.3.0rc1/tests/test_reuse_pin_memory.py +0 -0
{checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: checkpoint-engine
-Version: 0.2.3
+Version: 0.3.0rc1
 Summary: checkpoint-engine is a lightweight, decoupling and efficient weight update middleware
 Project-URL: Homepage, https://github.com/MoonshotAI/checkpoint-engine
 Project-URL: Repository, https://github.com/MoonshotAI/checkpoint-engine
{checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/checkpoint_engine/_version.py
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID

-__version__ = version = '0.2.3'
-__version_tuple__ = version_tuple = (0, 2, 3)
+__version__ = version = '0.3.0rc1'
+__version_tuple__ = version_tuple = (0, 3, 0, 'rc1')

-__commit_id__ = commit_id = 'g0a6244951'
+__commit_id__ = commit_id = 'g88370e267'
{checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/checkpoint_engine/ps.py
@@ -118,6 +118,7 @@ class MemoryBuffer(BaseModel):
     buffer: _TorchTensor
     size: int
     metas: list[ParameterMeta]
+    manually_pinned: bool = False


 class MemoryBufferMetaList(BaseModel):
@@ -520,7 +521,7 @@ def _inplace_pin_memory(files: list[str], rank: int | None = None) -> list[Memor
         logger.info(
             f"[rank{rank}] inplace pin memory for file {file_path} finished, size {buffer.nbytes / 1024 / 1024:.2f}MiB"
         )
-        return MemoryBuffer(buffer=buffer, size=buffer.nbytes, metas=metas)
+        return MemoryBuffer(buffer=buffer, size=buffer.nbytes, metas=metas, manually_pinned=True)

     memory_buffers: list[MemoryBuffer] = []
     with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
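Note: as a point of reference, a minimal sketch of the idea behind `_inplace_pin_memory` (illustrative only, not the package's implementation; the helper name and the use of the `cudaHostRegisterMapped` flag are assumptions, and safetensors metadata parsing is omitted): map the /dev/shm file as shared storage so no host copy is made, then page-lock the mapping — exactly the kind of buffer that now gets tagged `manually_pinned=True`.

    import os
    import torch

    def pin_shm_file_inplace(path: str) -> torch.Tensor:
        """Hypothetical sketch: page-lock a /dev/shm file's mapping without copying it."""
        nbytes = os.path.getsize(path)
        # Shared mapping of the tmpfs file; the bytes stay where they are.
        storage = torch.UntypedStorage.from_file(path, shared=True, nbytes=nbytes)
        buffer = torch.empty(0, dtype=torch.uint8)
        buffer.set_(storage)  # 1-D uint8 view over the mapped bytes
        # 0x02 == cudaHostRegisterMapped, the flag the new _unpin path asserts on later in this diff.
        ret = torch.cuda.cudart().cudaHostRegister(buffer.data_ptr(), nbytes, 0x02)
        assert ret == 0, f"cudaHostRegister failed with error code {ret}"
        return buffer  # a MemoryBuffer built from this would carry manually_pinned=True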
@@ -792,20 +793,6 @@ def _get_master_port(master_port: int | None = None) -> int:
     return master_port


-def _get_bcast_rank_map(world_size: int, ranks: list[int] | None) -> dict[int, int]:
-    """
-    map the real ranks (receiver_rank) to the bcast ranks (0 ~ len(ranks) - 1),
-    which are generated in self.init_process_group_for_ranks
-    """
-    bcast_rank_map: dict[int, int] = {}
-    if not ranks:
-        bcast_rank_map = {r: r for r in range(world_size)}
-    else:
-        for i, r in enumerate(ranks):
-            bcast_rank_map[r] = i
-    return bcast_rank_map
-
-
 class P2PStore:
     def __init__(self, device_manager: DeviceManager):
         from mooncake.engine import TransferEngine
@@ -888,7 +875,7 @@ class ParameterServer:
         *,
         rank: int | None = None,
         world_size: int | None = None,
-        auto_pg: bool = False,
+        auto_pg: bool = True,
         gpu_count: int | None = None,
         mem_fraction: float | None = None,
     ):
@@ -897,7 +884,7 @@

        Args:
            auto_pg: Whether to automatically initialize the process group.
-                Notice that if auto_pg is True, will destroy the process group after update.
+                Notice that if auto_pg is True, will destroy the process group after update. It is recommended to set auto_pg to True!
            mem_fraction: The proportion (as a fraction) of the current free device memory for allocation.
        """
        self._rank = rank or int(os.environ.get("RANK", None))
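Note: the constructor call below is taken from the new test added later in this diff; the torchrun launch comment is an assumption about the surrounding setup.

    # Each rank is started by torchrun, so RANK and the rest of the rendezvous
    # environment are already set when the constructor reads them.
    from checkpoint_engine.ps import ParameterServer

    ps = ParameterServer(auto_pg=True)  # same as ParameterServer() now that auto_pg defaults to True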
@@ -979,12 +966,12 @@
         files: list[str] | None = None,
         named_tensors: dict[str, torch.Tensor] | None = None,
         use_shared_memory_pool: bool = False,
-        use_inplace_pin_memory: bool = False,
+        use_inplace_pin_memory: bool = True,
     ) -> None:
         """
         Register a checkpoint to the parameter server. Both files and named_tensors will be registered together.
         Warning: if `use_inplace_pin_memory` is True, .safetensors files in /dev/shm/ will be pinned in-place, and the files will be REMOVED after pinning.
-        Please make sure to copy the files to disks if you need to keep them.
+        Please make sure to copy the files to disks if you need to keep them. NPU does not support inplace pin memory.

         Args:
             checkpoint_name: The name of the checkpoint.
@@ -995,9 +982,14 @@
                 cannot accommodate checkpoints with different memory requirements.
                 To free the actual memory of the shared pool or to modify its shape,
                 please unregister the current user of the shared memory pool using `unregister_checkpoint` with `force=True`.
-            use_inplace_pin_memory: If True, allows inplace pin memory for /dev/shm/ safetensors files. This option is ignored when ``use_shared_memory_pool`` is True.
-                Currently, this feature is experimental and may crash.
+            use_inplace_pin_memory: If True (default), allows inplace pin memory for /dev/shm/ safetensors files.
+                This option is ignored when ``use_shared_memory_pool`` is True.
         """
+        if self.device_manager.device_type != "cuda" and use_inplace_pin_memory:
+            logger.warning(
+                f"[rank{self._rank}] Only cuda devices support in-place pin memory, set use_inplace_pin_memory to False"
+            )
+            use_inplace_pin_memory = False
         try:
             if use_shared_memory_pool:
                 logger.info(
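Note: a short usage sketch of the new default (the checkpoint name and file path are made up; the call shape follows the tests in this diff). On CUDA the /dev/shm file is pinned in place and then removed; on non-CUDA devices the flag silently falls back to False with the warning above.

    # Hypothetical registration. Keep a copy of the file on disk if it is still needed:
    # with use_inplace_pin_memory=True (now the default) the /dev/shm file is pinned
    # in place and then deleted.
    ps.register_checkpoint(
        "ckpt-step-100",
        files=["/dev/shm/model-00001-of-00002.safetensors"],
    )
    ps.gather_metas("ckpt-step-100")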
@@ -1016,6 +1008,7 @@
                     named_tensors=named_tensors or {},
                     rank=self._rank,
                     shared_pin_memory=self._memory_pool[self.shared_memory_pool_name],
+                    inplace_pin=False,  # inplace pin memory is not compatible with shared memory pool
                 )
                 self._current_shared_memory_pool_user = checkpoint_name
                 if self._p2p_store is not None and _is_first_time:
@@ -1074,6 +1067,46 @@
             del self._memory_pool[self.shared_memory_pool_name]
             self._memory_pool[self.shared_memory_pool_name] = []
         else:
+
+            def _unpin(t: torch.Tensor):
+                """
+                Un-pin the pinned memory.
+                """
+                p_flags = ctypes.c_uint()
+                try:
+                    libc = ctypes.CDLL(None)  # get all symbols from the current process
+                    cuda_host_get_flags = libc.cudaHostGetFlags
+                    cuda_host_get_flags.argtypes = [ctypes.POINTER(ctypes.c_uint), ctypes.c_void_p]
+                    cuda_host_get_flags.restype = ctypes.c_int
+                except AttributeError:
+                    logger.error("cudaHostGetFlags not found in libc, cannot unpin memory manually")
+                    raise
+                r = cuda_host_get_flags(ctypes.byref(p_flags), ctypes.c_void_p(t.data_ptr()))
+                assert r == 0, f"get pin flags error, error code: {r}"
+                # p_flags value meaning from cuda/include/driver_types.h
+                # cudaHostRegisterDefault 0x00 /**< Default host memory registration flag */
+                # cudaHostRegisterPortable 0x01 /**< Pinned memory accessible by all CUDA contexts */
+                # cudaHostRegisterMapped 0x02 /**< Map registered memory into device space */
+                # cudaHostRegisterIoMemory 0x04 /**< Memory-mapped I/O space */
+                # cudaHostRegisterReadOnly 0x08 /**< Memory-mapped read-only */
+                assert p_flags.value == 0x02, (
+                    f"pin memory flag error, expected: 0x02 (cudaHostRegisterMapped), got flag: {p_flags.value}"
+                )
+                cudart = torch.cuda.cudart()
+                r = cudart.cudaHostUnregister(t.data_ptr())
+                assert r == 0, f"unpin memory error, error code: {r}"
+
+            # if the checkpoint is pinned by cudaHostRegister manually, we need to unpin it manually
+            try:
+                for memory_buffer in self._memory_pool.get(checkpoint_name, []):
+                    if memory_buffer.manually_pinned:
+                        _unpin(memory_buffer.buffer)
+            except Exception as e:
+                logger.error(
+                    f"[rank{self._rank}] fail to unpin memory for checkpoint {checkpoint_name}: {e}"
+                )
+                raise
+            # we won't delete the memory pool if unpinning fails.
             del self._memory_pool[checkpoint_name]
             # see https://github.com/pytorch/pytorch/blob/31d5c675394705f8a6bc767f80ae14bf4f01246b/torch/csrc/cuda/Module.cpp#L2018
             # this works by using torch>=2.5.0
@@ -1176,15 +1209,41 @@
         )
         logger.info(f"[rank{self._rank}] init process group successfully.")

+    def store_based_barrier(
+        self, store: dist.TCPStore, timeout: timedelta = timedelta(minutes=5)
+    ) -> None:
+        """
+        Perform a store-based barrier synchronization across all ranks.
+
+        This barrier uses a TCP store directly rather than a process group,
+        allowing all ranks to synchronize regardless of which process group
+        they belong to.
+
+        Args:
+            store: The TCPStore instance to use for synchronization.
+        """
+        dist.distributed_c10d._store_based_barrier(
+            rank=self._rank,
+            store=store,
+            group_name="parameter_server_barrier",
+            rendezvous_count=self._world_size,
+            timeout=timeout,
+        )
+
     def update(
         self,
         checkpoint_name: str,
         req_func: Callable[[list[tuple[str, str]]], None],
         *,
+        timeout: timedelta = timedelta(minutes=10),
         ranks: list[int] | None = None,
+        master_addr: str | None = None,
+        master_port: int | None = None,
     ) -> None:
         """
         Update the checkpoint to inference engine. This function should be called after gather_metas.
+        Warning: if _auto_pg is False when initializing ParameterServer, please make sure ALL ranks in the WORLD_SIZE call `update` function,
+        otherwise, it will hang.

         Args:
             checkpoint_name: The name of the checkpoint.
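Note: `dist.distributed_c10d._store_based_barrier` is a private torch.distributed helper; conceptually it is a counter rendezvous on the store. The toy equivalent below shows why this can synchronize ranks regardless of process-group membership; the key name and polling interval are arbitrary, and the real helper additionally handles key bookkeeping, timeouts and logging.

    import time
    import torch.distributed as dist

    def toy_store_barrier(store: dist.TCPStore, world_size: int, key: str = "toy_barrier") -> None:
        # Every rank bumps a shared counter once...
        arrived = store.add(key, 1)
        # ...then polls until all ranks have arrived. Only the store is involved,
        # so it does not matter which process group (if any) a rank belongs to.
        while arrived < world_size:
            time.sleep(0.01)
            arrived = int(store.get(key))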
@@ -1193,34 +1252,45 @@
                 which is the fastest way to update weights, especially in colocated architecture.
                 If set, will use p2p to update to the ranks, this is flexible to update to a group of ranks,
                 which is useful in disaggregated architecture.
+            master_addr: The master address for process group initialization. If not set, will use env MASTER_ADDR.
+            master_port: The master port for process group initialization. If not set, will use _get_master_port to get the port, which will use MASTER_PORT+1.
+            timeout: The timeout of the barrier operation.
         """
         assert req_func is not None, "req_func is required"
+        ranks_group = None
         try:
-            # if both ranks is None or [], it will use fully broadcast to update to all ranks
-            if not ranks:
-                if self._auto_pg and not dist.is_initialized():
-                    self.init_process_group()
-                self._update_per_bucket(checkpoint_name, req_func)
+            master_addr = os.getenv("MASTER_ADDR") or master_addr
+            assert master_addr, "master_addr is required"
+            if self._auto_pg:
+                if not dist.is_initialized():
+                    self.init_process_group(
+                        timeout=timeout, master_addr=master_addr, master_port=master_port
+                    )
+                manager_store = dist.distributed_c10d._get_default_store()
             else:
-                if self._auto_pg:
-                    if dist.is_initialized():
-                        dist.destroy_process_group()
-                        # HACK: wait 2s to ensure destroy is finished
-                        time.sleep(2)
-                    self.init_process_group_for_ranks(ranks)
-                if self._rank not in ranks:
-                    return
-                self._update_per_bucket(checkpoint_name, req_func, ranks)
-
+                # HACK: MASTER_PORT+2 for barrier store if master_port is not provided, _get_master_port() returns MASTER_PORT+1
+                # If master_port is provided, use master_port+1 for barrier store
+                manager_store = dist.TCPStore(
+                    master_addr,
+                    _get_master_port(master_port) + 1,
+                    self._world_size,
+                    timeout=timeout,
+                    is_master=self._rank == 0,
+                )
+            # if ranks is None or [], it will use fully broadcast to update to all ranks
+            ranks_group = dist.new_group(ranks) if ranks else None
+            self._update_per_bucket(checkpoint_name, req_func, ranks_group, ranks)
+            self.store_based_barrier(manager_store)
         except Exception as e:
             logger.exception(
                 f"[rank{self._rank}] update checkpoint {checkpoint_name} with ranks {ranks} error {e}"
             )
             raise
         finally:
-            if self._auto_pg and (not ranks or self._rank in ranks):
+            if ranks_group:
+                dist.destroy_process_group(ranks_group)
+            if self._auto_pg and dist.is_initialized():
                 dist.destroy_process_group()
-
             self.device_manager.device_module.empty_cache()
             logger.info(
                 f"[rank{self._rank}] update checkpoint {checkpoint_name} with ranks {ranks} done. "
@@ -1238,7 +1308,9 @@
         self._zmq_addr_counter += 1
         return socket, socket_paths

-    def _detect_bucket_size(self, *, disable_h2d_buffer: bool = False) -> tuple[int, bool]:
+    def _detect_bucket_size(
+        self, ranks_group: dist.ProcessGroup | None, *, disable_h2d_buffer: bool = False
+    ) -> tuple[int, bool]:
         GiB = 1 << 30  # noqa: N806
         # auto detect bucket size
         tensor = torch.tensor(
@@ -1254,7 +1326,7 @@
             dtype=torch.int64,
             device=self.device_manager.device_type,
         )
-        dist.all_reduce(tensor, op=dist.ReduceOp.MIN)
+        dist.all_reduce(tensor, op=dist.ReduceOp.MIN, group=ranks_group)
         tensor = tensor.cpu()
         free_bytes, self._zmq_addr_counter = tensor[0].item(), -tensor[1].item()
         max_tensor_bytes = 0
@@ -1317,51 +1389,6 @@
         self._p2p_store.batch_transfer_sync_read(target_addr, buf_ptrs, remote_ptrs, lens)
         self.device_manager.device_module.synchronize()

-    def init_process_group_for_ranks(
-        self,
-        ranks: list[int],
-        *,
-        master_port: int | None = None,
-        timeout: timedelta = timedelta(minutes=10),
-    ):
-        """
-        Initialize the process group for the ranks. This global group can be easily destroyed by calling dist.destroy_process_group.
-
-        Args:
-            ranks: The ranks to initialize the process group. ranks should be a subset of all ranks.
-            master_port: The specified port of the master node. If not set, will use _get_master_port to get the port.
-            timeout: The timeout of the process group.
-        """
-        assert not dist.is_initialized()
-        assert ranks, "ranks should be set"
-        if self._rank not in ranks:
-            return
-        assert self._all_hosts, "all_hosts should be set"
-        assert len(self._all_hosts) == self._world_size // self._gpu_count, (
-            f"world_size {self._world_size} should be equal to all_hosts {len(self._all_hosts)}"
-        )
-        rank = ranks.index(self._rank)
-        master_addr = self._all_hosts[ranks[0] // self._gpu_count]
-        master_port = _get_master_port(master_port)
-        logger.info(
-            f"[rank{self._rank}] start to init process group as virtual_rank {rank}, "
-            f"master_addr {master_addr}, master_port {master_port}, world_size {len(ranks)}, "
-        )
-        # only initialize process group and store for ranks, other nodes are not initialized
-        # and will not participate in this update. Since they have registered memory addresses
-        # to p2p_store at the beginning, update ranks can directly get the memory addresses
-        # from other nodes and put the weights into the buffer.
-        store = dist.TCPStore(
-            master_addr, master_port, len(ranks), is_master=rank == 0, timeout=timeout
-        )
-        dist.init_process_group(
-            backend=self.device_manager.backend,
-            world_size=len(ranks),
-            rank=rank,
-            timeout=timeout,
-            store=store,
-        )
-
     def _get_addr_ptrs(self, owner_rank: int) -> tuple[str, list[tuple[int, int]]]:
         addr = self._current_global_parameter_metas[owner_rank].p2p_store_addr
         metas_list = self._current_global_parameter_metas[owner_rank].memory_buffer_metas_list
@@ -1401,10 +1428,12 @@
         self,
         checkpoint_name: str,
         req_func: Callable[[list[tuple[str, str]]], None],
+        ranks_group: dist.ProcessGroup | None,
         ranks: list[int] | None = None,
     ):
         assert len(self._current_global_parameter_metas) != 0, "parameter metas is empty"
         assert dist.is_initialized(), "process group is not initialized"
+
         # if both ranks is None or [], it will use fully broadcast to update to all ranks
         if not ranks:
             logger.info(f"[rank{self._rank}] update checkpoint {checkpoint_name}")
@@ -1422,9 +1451,9 @@
         if not need_update:
             return
         # first execute a barrier to avoid subsequent device oom
-        dist.barrier()
+        dist.barrier(group=ranks_group)

-        bucket_size, disable_h2d_buffer = self._detect_bucket_size()
+        bucket_size, disable_h2d_buffer = self._detect_bucket_size(ranks_group)
         buckets = _gen_h2d_buckets(
             self._current_global_parameter_metas,
             bucket_size,
@@ -1471,7 +1500,6 @@

         gidx = 0
         ret_code = torch.zeros((), device=self.device_manager.device_type, dtype=torch.int64)
-        bcast_rank_map = _get_bcast_rank_map(self._world_size, ranks)
         try:
             for i in range(max_len):
                 if i < len(receiver_rank_buckets) and not disable_h2d_buffer:
@@ -1501,8 +1529,7 @@
                     self._copy_to_buffer(checkpoint_name, bucket, buffer_b)
                 else:
                     buffer_b.data.copy_(h2d_buffer[: bucket.size])
-                brank = bcast_rank_map[receiver_rank]
-                dist.broadcast(buffer_b, src=brank)
+                dist.broadcast(buffer_b, src=receiver_rank, group=ranks_group)
                 resp = socket.recv()
                 if resp != b"":
                     msg = resp.decode("utf-8")
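Note: `_get_bcast_rank_map` could be dropped because the subgroup is now created with `dist.new_group` over the existing world group, and `torch.distributed.broadcast` interprets `src` as a global rank even when a `group` argument is passed. A toy illustration (world size, rank subset, and tensor are assumed):

    import torch
    import torch.distributed as dist

    # Assume a world of 8 ranks of which only 2, 3, 6 and 7 take part in this update.
    # new_group must be called by every rank, including those outside the subgroup.
    group = dist.new_group([2, 3, 6, 7])

    buf = torch.empty(1 << 20, dtype=torch.uint8, device="cuda")
    if dist.get_rank() in (2, 3, 6, 7):
        # src=6 is the sender's *global* rank; no remapping to a group-local index is needed.
        dist.broadcast(buf, src=6, group=group)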
@@ -1510,7 +1537,7 @@
                         f"[rank{self._rank}] receive error response from rank {receiver_rank} for bucket {gidx} in checkpoint {checkpoint_name}: {msg}"
                     )
                     ret_code.fill_(1)
-                dist.all_reduce(ret_code, op=dist.ReduceOp.SUM)
+                dist.all_reduce(ret_code, op=dist.ReduceOp.SUM, group=ranks_group)
                 self.device_manager.device_module.synchronize()
                 if ret_code.item() != 0:
                     # quit early if any rank failed
@@ -1524,7 +1551,7 @@
                 socket.recv()
         finally:
             req_thread.join()
-            dist.barrier()
+            dist.barrier(group=ranks_group)
             socket.close()
             if ranks and h2d_buffer is not None:
                 self._p2p_store.unregister_named_tensors([h2d_buffer_name])
{checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/checkpoint_engine.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: checkpoint-engine
-Version: 0.2.3
+Version: 0.3.0rc1
 Summary: checkpoint-engine is a lightweight, decoupling and efficient weight update middleware
 Project-URL: Homepage, https://github.com/MoonshotAI/checkpoint-engine
 Project-URL: Repository, https://github.com/MoonshotAI/checkpoint-engine
{checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/checkpoint_engine.egg-info/SOURCES.txt
@@ -23,6 +23,7 @@ figures/overlap-update-and-copy.png
 figures/pipeline.png
 patches/vllm_fp8.patch
 tests/test_assign_receiver_ranks.py
-tests/test_pin_memory.py
+tests/test_inplace_unpin.py
 tests/test_rdma_parser.py
+tests/test_reuse_pin_memory.py
 tests/test_update.py
checkpoint_engine-0.3.0rc1/tests/test_inplace_unpin.py (new file)
@@ -0,0 +1,81 @@
+import os
+import subprocess
+import time
+
+import pytest
+import torch.distributed as dist
+from test_update import device_manager, gen_test_tensors, get_world_size
+
+from checkpoint_engine.ps import ParameterServer
+
+
+dev_shm_dir = "/dev/shm/checkpoint_engine_tests"  # noqa: S108
+
+
+def get_files() -> list[str]:
+    rank = int(os.getenv("RANK"))
+    named_tensors = dict(gen_test_tensors(rank))
+    import safetensors.torch
+
+    files = []
+    os.makedirs(dev_shm_dir, exist_ok=True)
+    tensors_in_dev_shm = named_tensors
+    time.sleep(1)
+    dev_shm_files = [
+        os.path.join(dev_shm_dir, f"rank{rank}_checkpoint.safetensors")
+        for _ in range(get_world_size())
+    ]
+    safetensors.torch.save_file(tensors_in_dev_shm, dev_shm_files[rank])
+    time.sleep(1)
+    files.append(dev_shm_files[rank])
+    return files
+
+
+def run_pin_and_unpin(num_runs: int):
+    ps = ParameterServer(auto_pg=True)
+    checkpoint_name = "test_with_files"
+    for _ in range(num_runs):
+        files = get_files()
+        ps.register_checkpoint(checkpoint_name, files=files)
+        ps.gather_metas(checkpoint_name)
+        dist.barrier()
+        ps.unregister_checkpoint(checkpoint_name)
+        if ps._rank == 0:
+            import shutil
+
+            shutil.rmtree(dev_shm_dir)
+
+    dist.destroy_process_group()
+
+
+@pytest.mark.gpu
+def test_unpin_files():
+    world_size = device_manager.device_module.device_count()
+    assert world_size >= 2, "This test requires at least 2 GPUs."
+    master_addr = "localhost"
+    master_port = 25400
+    cmd = [
+        "torchrun",
+        "--nproc_per_node",
+        str(world_size),
+        "--master_addr",
+        master_addr,
+        "--master_port",
+        str(master_port),
+        __file__,
+    ]
+
+    result = subprocess.run(  # noqa: S603
+        cmd,
+        capture_output=False,
+        text=True,
+        cwd=os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
+        shell=False,
+        check=False,
+    )
+
+    assert result.returncode == 0
+
+
+if __name__ == "__main__":
+    run_pin_and_unpin(3)
{checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc1}/tests/test_update.py
@@ -185,7 +185,6 @@ def run_with_files(
     os.makedirs(dev_shm_dir, exist_ok=True)
     os.makedirs(disk_dir, exist_ok=True)
     tensors_items = list(named_tensors.items())
-    tensors_in_dev_shm = named_tensors
     tensors_in_dev_shm = dict(tensors_items[: len(tensors_items) // 2])
     tensors_in_disk = dict(tensors_items[len(tensors_items) // 3 : 2 * len(tensors_items) // 3])
     tensors_in_memory = dict(tensors_items[1 * len(tensors_items) // 2 :])
@@ -218,7 +217,6 @@ def run_with_files(
     if rank == 0:
         import shutil

-        # this test should be run under use_inplace_pin_memory=False. Otherwise, the files in /dev/shm/ will be deleted.
         shutil.rmtree(dev_shm_dir)
         shutil.rmtree(disk_dir)
     assert proc.exitcode == 0
@@ -238,7 +236,13 @@ def run_with_files(
             ],
         ),
         ("test_with_remote_error", [[]]),
-        # ("long_test_no_error", [list(random.sample(range(get_world_size()), k=num_ranks)) for num_ranks in range(get_world_size() + 1)]),
+        (
+            "test_no_error",
+            [
+                list(random.sample(range(get_world_size()), k=num_ranks))
+                for num_ranks in range(get_world_size() + 1)
+            ],
+        ),
     ],
 )
 def test_update(test_name: str, rank_list: list[list[int]] | None):