checkpoint-engine 0.3.0rc0.tar.gz → 0.3.0rc1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. {checkpoint_engine-0.3.0rc0 → checkpoint_engine-0.3.0rc1}/PKG-INFO +1 -1
  2. {checkpoint_engine-0.3.0rc0 → checkpoint_engine-0.3.0rc1}/checkpoint_engine/_version.py +3 -3
  3. {checkpoint_engine-0.3.0rc0 → checkpoint_engine-0.3.0rc1}/checkpoint_engine/ps.py +76 -15
  4. {checkpoint_engine-0.3.0rc0 → checkpoint_engine-0.3.0rc1}/checkpoint_engine.egg-info/PKG-INFO +1 -1
  5. {checkpoint_engine-0.3.0rc0 → checkpoint_engine-0.3.0rc1}/checkpoint_engine.egg-info/SOURCES.txt +2 -1
  6. checkpoint_engine-0.3.0rc1/tests/test_inplace_unpin.py +81 -0
  7. {checkpoint_engine-0.3.0rc0 → checkpoint_engine-0.3.0rc1}/tests/test_update.py +1 -2
  8. {checkpoint_engine-0.3.0rc0 → checkpoint_engine-0.3.0rc1}/.github/workflows/cpu-tests.yml +0 -0
  9. {checkpoint_engine-0.3.0rc0 → checkpoint_engine-0.3.0rc1}/.github/workflows/pre-commit.yaml +0 -0
  10. {checkpoint_engine-0.3.0rc0 → checkpoint_engine-0.3.0rc1}/.github/workflows/python-publish.yml +0 -0
  11. {checkpoint_engine-0.3.0rc0 → checkpoint_engine-0.3.0rc1}/.gitignore +0 -0
  12. {checkpoint_engine-0.3.0rc0 → checkpoint_engine-0.3.0rc1}/.pre-commit-config.yaml +0 -0
  13. {checkpoint_engine-0.3.0rc0 → checkpoint_engine-0.3.0rc1}/LICENCE +0 -0
  14. {checkpoint_engine-0.3.0rc0 → checkpoint_engine-0.3.0rc1}/README.md +0 -0
  15. {checkpoint_engine-0.3.0rc0 → checkpoint_engine-0.3.0rc1}/checkpoint_engine/__init__.py +0 -0
  16. {checkpoint_engine-0.3.0rc0 → checkpoint_engine-0.3.0rc1}/checkpoint_engine/device_utils.py +0 -0
  17. {checkpoint_engine-0.3.0rc0 → checkpoint_engine-0.3.0rc1}/checkpoint_engine/worker.py +0 -0
  18. {checkpoint_engine-0.3.0rc0 → checkpoint_engine-0.3.0rc1}/checkpoint_engine.egg-info/dependency_links.txt +0 -0
  19. {checkpoint_engine-0.3.0rc0 → checkpoint_engine-0.3.0rc1}/checkpoint_engine.egg-info/requires.txt +0 -0
  20. {checkpoint_engine-0.3.0rc0 → checkpoint_engine-0.3.0rc1}/checkpoint_engine.egg-info/top_level.txt +0 -0
  21. {checkpoint_engine-0.3.0rc0 → checkpoint_engine-0.3.0rc1}/docs/npu_start.md +0 -0
  22. {checkpoint_engine-0.3.0rc0 → checkpoint_engine-0.3.0rc1}/examples/update.py +0 -0
  23. {checkpoint_engine-0.3.0rc0 → checkpoint_engine-0.3.0rc1}/figures/checkpoint-engine.png +0 -0
  24. {checkpoint_engine-0.3.0rc0 → checkpoint_engine-0.3.0rc1}/figures/overlap-update-and-copy.png +0 -0
  25. {checkpoint_engine-0.3.0rc0 → checkpoint_engine-0.3.0rc1}/figures/pipeline.png +0 -0
  26. {checkpoint_engine-0.3.0rc0 → checkpoint_engine-0.3.0rc1}/patches/vllm_fp8.patch +0 -0
  27. {checkpoint_engine-0.3.0rc0 → checkpoint_engine-0.3.0rc1}/pyproject.toml +0 -0
  28. {checkpoint_engine-0.3.0rc0 → checkpoint_engine-0.3.0rc1}/setup.cfg +0 -0
  29. {checkpoint_engine-0.3.0rc0 → checkpoint_engine-0.3.0rc1}/tests/test_assign_receiver_ranks.py +0 -0
  30. {checkpoint_engine-0.3.0rc0 → checkpoint_engine-0.3.0rc1}/tests/test_rdma_parser.py +0 -0
  31. /checkpoint_engine-0.3.0rc0/tests/test_pin_memory.py → /checkpoint_engine-0.3.0rc1/tests/test_reuse_pin_memory.py +0 -0
--- checkpoint_engine-0.3.0rc0/PKG-INFO
+++ checkpoint_engine-0.3.0rc1/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: checkpoint-engine
-Version: 0.3.0rc0
+Version: 0.3.0rc1
 Summary: checkpoint-engine is a lightweight, decoupling and efficient weight update middleware
 Project-URL: Homepage, https://github.com/MoonshotAI/checkpoint-engine
 Project-URL: Repository, https://github.com/MoonshotAI/checkpoint-engine
--- checkpoint_engine-0.3.0rc0/checkpoint_engine/_version.py
+++ checkpoint_engine-0.3.0rc1/checkpoint_engine/_version.py
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.3.0rc0'
-__version_tuple__ = version_tuple = (0, 3, 0, 'rc0')
+__version__ = version = '0.3.0rc1'
+__version_tuple__ = version_tuple = (0, 3, 0, 'rc1')
 
-__commit_id__ = commit_id = 'gbaf6f6196'
+__commit_id__ = commit_id = 'g88370e267'
--- checkpoint_engine-0.3.0rc0/checkpoint_engine/ps.py
+++ checkpoint_engine-0.3.0rc1/checkpoint_engine/ps.py
@@ -118,6 +118,7 @@ class MemoryBuffer(BaseModel):
     buffer: _TorchTensor
     size: int
     metas: list[ParameterMeta]
+    manually_pinned: bool = False
 
 
 class MemoryBufferMetaList(BaseModel):
@@ -520,7 +521,7 @@ def _inplace_pin_memory(files: list[str], rank: int | None = None) -> list[Memor
         logger.info(
             f"[rank{rank}] inplace pin memory for file {file_path} finished, size {buffer.nbytes / 1024 / 1024:.2f}MiB"
         )
-        return MemoryBuffer(buffer=buffer, size=buffer.nbytes, metas=metas)
+        return MemoryBuffer(buffer=buffer, size=buffer.nbytes, metas=metas, manually_pinned=True)
 
     memory_buffers: list[MemoryBuffer] = []
     with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
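A note for readers unfamiliar with the mechanism: `_inplace_pin_memory` page-locks safetensors files that already live on tmpfs, so no copy into a freshly allocated pinned buffer is needed. The sketch below shows the general technique, not the library's exact code; the helper name is hypothetical, and the 0x02 flag mirrors the `cudaHostRegisterMapped` value that `_unpin` asserts on later in this diff.

```python
import os

import torch


def pin_shm_file_inplace(path: str) -> torch.Tensor:
    """Page-lock a tmpfs-backed file without copying it (hypothetical helper)."""
    nbytes = os.path.getsize(path)
    # Memory-map the file; on /dev/shm the pages already live in RAM,
    # so no read or copy happens here.
    storage = torch.UntypedStorage.from_file(path, shared=True, nbytes=nbytes)
    buffer = torch.empty(0, dtype=torch.uint8)
    buffer.set_(storage)
    # Register the mapped pages with the CUDA runtime so H2D copies can DMA
    # directly from them. 0x02 = cudaHostRegisterMapped (an assumption here,
    # consistent with the flag _unpin checks).
    r = torch.cuda.cudart().cudaHostRegister(buffer.data_ptr(), nbytes, 0x02)
    assert r == 0, f"cudaHostRegister failed, error code: {r}"
    return buffer
```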
@@ -622,6 +623,7 @@ def _register_checkpoint(
     named_tensors: dict[str, torch.Tensor],
     rank: int | None = None,
     shared_pin_memory: list[MemoryBuffer] | None = None,
+    inplace_pin: bool = False,
 ) -> list[MemoryBuffer]:
     logger.info(
         f"[rank{rank}] start to register checkpoint with {len(files)} files and {len(named_tensors)} named_tensors"
@@ -629,12 +631,17 @@ def _register_checkpoint(
     if not files and not named_tensors:
         return []
     memory_buffers: list[MemoryBuffer] = []
-    files_to_inplace_pin = [
-        file
-        for file in files
-        if file.startswith("/dev/shm/") and file.endswith(".safetensors")  # noqa: S108
-    ]
-    files_to_normal_pin = [file for file in files if file not in files_to_inplace_pin]
+    if inplace_pin:
+        logger.info(f"[rank{rank}] allow inplace pin memory for /dev/shm/ safetensors files")
+        files_to_inplace_pin = [
+            file
+            for file in files
+            if file.startswith("/dev/shm/") and file.endswith(".safetensors")  # noqa: S108
+        ]
+        files_to_normal_pin = [file for file in files if file not in files_to_inplace_pin]
+    else:
+        files_to_normal_pin = files
+        files_to_inplace_pin = []
     if files_to_normal_pin or named_tensors:
         memory_buffers.extend(
             _normal_pin_memory(
@@ -868,7 +875,7 @@ class ParameterServer:
         *,
         rank: int | None = None,
         world_size: int | None = None,
-        auto_pg: bool = False,
+        auto_pg: bool = True,
         gpu_count: int | None = None,
         mem_fraction: float | None = None,
     ):
@@ -877,7 +884,7 @@ class ParameterServer:
 
         Args:
             auto_pg: Whether to automatically initialize the process group.
-                Notice that if auto_pg is True, will destroy the process group after update.
+                Notice that if auto_pg is True, will destroy the process group after update. It is recommended to set auto_pg to True!
             mem_fraction: The proportion (as a fraction) of the current free device memory for allocation.
         """
         self._rank = rank or int(os.environ.get("RANK", None))
@@ -959,11 +966,12 @@ class ParameterServer:
         files: list[str] | None = None,
         named_tensors: dict[str, torch.Tensor] | None = None,
         use_shared_memory_pool: bool = False,
+        use_inplace_pin_memory: bool = True,
     ) -> None:
         """
         Register a checkpoint to the parameter server. Both files and named_tensors will be registered together.
-        Warning: .safetensors files in /dev/shm/ will be pinned in-place, and the files will be REMOVED after pinning.
-        Please make sure to copy the files to disks if you need to keep them.
+        Warning: if `use_inplace_pin_memory` is True, .safetensors files in /dev/shm/ will be pinned in-place, and the files will be REMOVED after pinning.
+        Please make sure to copy the files to disks if you need to keep them. NPU does not support inplace pin memory.
 
         Args:
             checkpoint_name: The name of the checkpoint.
@@ -974,7 +982,14 @@ class ParameterServer:
                 cannot accommodate checkpoints with different memory requirements.
                 To free the actual memory of the shared pool or to modify its shape,
                 please unregister the current user of the shared memory pool using `unregister_checkpoint` with `force=True`.
+            use_inplace_pin_memory: If True (default), allows inplace pin memory for /dev/shm/ safetensors files.
+                This option is ignored when ``use_shared_memory_pool`` is True.
         """
+        if self.device_manager.device_type != "cuda" and use_inplace_pin_memory:
+            logger.warning(
+                f"[rank{self._rank}] Only cuda devices support in-place pin memory, set use_inplace_pin_memory to False"
+            )
+            use_inplace_pin_memory = False
         try:
             if use_shared_memory_pool:
                 logger.info(
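A usage note: the new flag is opt-out and only affects `/dev/shm/*.safetensors` inputs. Below is a minimal sketch of disabling it so the files survive registration; the checkpoint path and name are hypothetical, and a torchrun-style launch is assumed so the RANK/WORLD_SIZE environment variables exist.

```python
from checkpoint_engine.ps import ParameterServer

# auto_pg now defaults to True, so the process group is set up automatically.
# Assumes a torchrun launch (RANK / WORLD_SIZE set in the environment).
ps = ParameterServer()

# Hypothetical shard path: only /dev/shm/*.safetensors files are candidates
# for in-place pinning; everything else always takes the normal pin path.
files = ["/dev/shm/model/rank0.safetensors"]

# Opt out to keep the files on /dev/shm after registration; the default
# (True) would pin them in place and REMOVE them.
ps.register_checkpoint("demo-ckpt", files=files, use_inplace_pin_memory=False)
```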
@@ -993,6 +1008,7 @@ class ParameterServer:
                     named_tensors=named_tensors or {},
                     rank=self._rank,
                     shared_pin_memory=self._memory_pool[self.shared_memory_pool_name],
+                    inplace_pin=False,  # inplace pin memory is not compatible with shared memory pool
                 )
                 self._current_shared_memory_pool_user = checkpoint_name
                 if self._p2p_store is not None and _is_first_time:
@@ -1002,7 +1018,10 @@ class ParameterServer:
                     f"checkpoint {checkpoint_name} already registered"
                 )
                 self._memory_pool[checkpoint_name] = _register_checkpoint(
-                    files=files or [], named_tensors=named_tensors or {}, rank=self._rank
+                    files=files or [],
+                    named_tensors=named_tensors or {},
+                    rank=self._rank,
+                    inplace_pin=use_inplace_pin_memory,
                 )
                 if self._p2p_store is not None:
                     self._register_parameters_to_p2p_store(checkpoint_name)
@@ -1048,6 +1067,46 @@ class ParameterServer:
             del self._memory_pool[self.shared_memory_pool_name]
             self._memory_pool[self.shared_memory_pool_name] = []
         else:
+
+            def _unpin(t: torch.Tensor):
+                """
+                Un-pin the pinned memory.
+                """
+                p_flags = ctypes.c_uint()
+                try:
+                    libc = ctypes.CDLL(None)  # get all symbols from the current process
+                    cuda_host_get_flags = libc.cudaHostGetFlags
+                    cuda_host_get_flags.argtypes = [ctypes.POINTER(ctypes.c_uint), ctypes.c_void_p]
+                    cuda_host_get_flags.restype = ctypes.c_int
+                except AttributeError:
+                    logger.error("cudaHostGetFlags not found in libc, cannot unpin memory manually")
+                    raise
+                r = cuda_host_get_flags(ctypes.byref(p_flags), ctypes.c_void_p(t.data_ptr()))
+                assert r == 0, f"get pin flags error, error code: {r}"
+                # p_flags value meaning from cuda/include/driver_types.h
+                # cudaHostRegisterDefault  0x00  /**< Default host memory registration flag */
+                # cudaHostRegisterPortable 0x01  /**< Pinned memory accessible by all CUDA contexts */
+                # cudaHostRegisterMapped   0x02  /**< Map registered memory into device space */
+                # cudaHostRegisterIoMemory 0x04  /**< Memory-mapped I/O space */
+                # cudaHostRegisterReadOnly 0x08  /**< Memory-mapped read-only */
+                assert p_flags.value == 0x02, (
+                    f"pin memory flag error, expected: 0x02 (cudaHostRegisterMapped), got flag: {p_flags.value}"
+                )
+                cudart = torch.cuda.cudart()
+                r = cudart.cudaHostUnregister(t.data_ptr())
+                assert r == 0, f"unpin memory error, error code: {r}"
+
+            # if the checkpoint is pinned by cudaHostRegister manually, we need to unpin it manually
+            try:
+                for memory_buffer in self._memory_pool.get(checkpoint_name, []):
+                    if memory_buffer.manually_pinned:
+                        _unpin(memory_buffer.buffer)
+            except Exception as e:
+                logger.error(
+                    f"[rank{self._rank}] fail to unpin memory for checkpoint {checkpoint_name}: {e}"
+                )
+                raise
+            # we won't delete the memory pool if unpinning fails.
             del self._memory_pool[checkpoint_name]
             # see https://github.com/pytorch/pytorch/blob/31d5c675394705f8a6bc767f80ae14bf4f01246b/torch/csrc/cuda/Module.cpp#L2018
             # this works by using torch>=2.5.0
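The `ctypes.CDLL(None)` call works because torch already loads the CUDA runtime into the process, so `cudaHostGetFlags` resolves from the process's global symbol table rather than from libc proper. Here is a standalone sketch of the same query on any pinned tensor; the helper name is illustrative, and the printed value depends on how the memory was pinned.

```python
import ctypes

import torch


def host_register_flags(t: torch.Tensor) -> int:
    """Return the CUDA host-registration flags for a pinned tensor's memory."""
    # CDLL(None) searches symbols already loaded into this process; torch
    # links the CUDA runtime, so cudaHostGetFlags is reachable without
    # naming a specific .so file.
    libc = ctypes.CDLL(None)
    libc.cudaHostGetFlags.argtypes = [ctypes.POINTER(ctypes.c_uint), ctypes.c_void_p]
    libc.cudaHostGetFlags.restype = ctypes.c_int
    flags = ctypes.c_uint()
    r = libc.cudaHostGetFlags(ctypes.byref(flags), ctypes.c_void_p(t.data_ptr()))
    assert r == 0, f"cudaHostGetFlags failed, error code: {r}"
    return flags.value


t = torch.empty(1 << 20, dtype=torch.uint8, pin_memory=True)
# Value depends on the pinning path: 0x02 for the in-place cudaHostRegister
# buffers above; cudaHostAlloc'd memory reports its allocation flags instead.
print(hex(host_register_flags(t)))
```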
@@ -1183,6 +1242,8 @@ class ParameterServer:
     ) -> None:
         """
         Update the checkpoint to inference engine. This function should be called after gather_metas.
+        Warning: if _auto_pg is False when initializing ParameterServer, please make sure ALL ranks in the WORLD_SIZE call `update` function,
+        otherwise, it will hang.
 
         Args:
             checkpoint_name: The name of the checkpoint.
@@ -1217,7 +1278,7 @@ class ParameterServer:
                 is_master=self._rank == 0,
             )
             # if ranks is None or [], it will use fully broadcast to update to all ranks
-            ranks_group = dist.new_group(ranks if ranks else None)
+            ranks_group = dist.new_group(ranks) if ranks else None
             self._update_per_bucket(checkpoint_name, req_func, ranks_group, ranks)
             self.store_based_barrier(manager_store)
         except Exception as e:
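Context for the one-line fix above: `dist.new_group(None)` still constructs a brand-new process group spanning every rank, whereas passing `group=None` to a collective reuses the default WORLD group. A short sketch of the corrected selection logic, assuming the process group is already initialized:

```python
import torch.distributed as dist


def make_update_group(ranks: list[int] | None) -> dist.ProcessGroup | None:
    # None (or []) means "update all ranks": returning None lets collectives
    # fall back to the default WORLD group instead of building a redundant
    # duplicate group on every update.
    return dist.new_group(ranks) if ranks else None
```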
@@ -1248,7 +1309,7 @@ class ParameterServer:
         return socket, socket_paths
 
     def _detect_bucket_size(
-        self, ranks_group: dist.ProcessGroup, *, disable_h2d_buffer: bool = False
+        self, ranks_group: dist.ProcessGroup | None, *, disable_h2d_buffer: bool = False
     ) -> tuple[int, bool]:
         GiB = 1 << 30  # noqa: N806
         # auto detect bucket size
@@ -1367,7 +1428,7 @@ class ParameterServer:
         self,
         checkpoint_name: str,
         req_func: Callable[[list[tuple[str, str]]], None],
-        ranks_group: dist.ProcessGroup,
+        ranks_group: dist.ProcessGroup | None,
         ranks: list[int] | None = None,
     ):
         assert len(self._current_global_parameter_metas) != 0, "parameter metas is empty"
--- checkpoint_engine-0.3.0rc0/checkpoint_engine.egg-info/PKG-INFO
+++ checkpoint_engine-0.3.0rc1/checkpoint_engine.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: checkpoint-engine
-Version: 0.3.0rc0
+Version: 0.3.0rc1
 Summary: checkpoint-engine is a lightweight, decoupling and efficient weight update middleware
 Project-URL: Homepage, https://github.com/MoonshotAI/checkpoint-engine
 Project-URL: Repository, https://github.com/MoonshotAI/checkpoint-engine
--- checkpoint_engine-0.3.0rc0/checkpoint_engine.egg-info/SOURCES.txt
+++ checkpoint_engine-0.3.0rc1/checkpoint_engine.egg-info/SOURCES.txt
@@ -23,6 +23,7 @@ figures/overlap-update-and-copy.png
 figures/pipeline.png
 patches/vllm_fp8.patch
 tests/test_assign_receiver_ranks.py
-tests/test_pin_memory.py
+tests/test_inplace_unpin.py
 tests/test_rdma_parser.py
+tests/test_reuse_pin_memory.py
 tests/test_update.py
--- /dev/null
+++ checkpoint_engine-0.3.0rc1/tests/test_inplace_unpin.py
@@ -0,0 +1,81 @@
+import os
+import subprocess
+import time
+
+import pytest
+import torch.distributed as dist
+from test_update import device_manager, gen_test_tensors, get_world_size
+
+from checkpoint_engine.ps import ParameterServer
+
+
+dev_shm_dir = "/dev/shm/checkpoint_engine_tests"  # noqa: S108
+
+
+def get_files() -> list[str]:
+    rank = int(os.getenv("RANK"))
+    named_tensors = dict(gen_test_tensors(rank))
+    import safetensors.torch
+
+    files = []
+    os.makedirs(dev_shm_dir, exist_ok=True)
+    tensors_in_dev_shm = named_tensors
+    time.sleep(1)
+    dev_shm_files = [
+        os.path.join(dev_shm_dir, f"rank{rank}_checkpoint.safetensors")
+        for _ in range(get_world_size())
+    ]
+    safetensors.torch.save_file(tensors_in_dev_shm, dev_shm_files[rank])
+    time.sleep(1)
+    files.append(dev_shm_files[rank])
+    return files
+
+
+def run_pin_and_unpin(num_runs: int):
+    ps = ParameterServer(auto_pg=True)
+    checkpoint_name = "test_with_files"
+    for _ in range(num_runs):
+        files = get_files()
+        ps.register_checkpoint(checkpoint_name, files=files)
+        ps.gather_metas(checkpoint_name)
+        dist.barrier()
+        ps.unregister_checkpoint(checkpoint_name)
+        if ps._rank == 0:
+            import shutil
+
+            shutil.rmtree(dev_shm_dir)
+
+    dist.destroy_process_group()
+
+
+@pytest.mark.gpu
+def test_unpin_files():
+    world_size = device_manager.device_module.device_count()
+    assert world_size >= 2, "This test requires at least 2 GPUs."
+    master_addr = "localhost"
+    master_port = 25400
+    cmd = [
+        "torchrun",
+        "--nproc_per_node",
+        str(world_size),
+        "--master_addr",
+        master_addr,
+        "--master_port",
+        str(master_port),
+        __file__,
+    ]
+
+    result = subprocess.run(  # noqa: S603
+        cmd,
+        capture_output=False,
+        text=True,
+        cwd=os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
+        shell=False,
+        check=False,
+    )
+
+    assert result.returncode == 0
+
+
+if __name__ == "__main__":
+    run_pin_and_unpin(3)
--- checkpoint_engine-0.3.0rc0/tests/test_update.py
+++ checkpoint_engine-0.3.0rc1/tests/test_update.py
@@ -185,7 +185,6 @@ def run_with_files(
     os.makedirs(dev_shm_dir, exist_ok=True)
     os.makedirs(disk_dir, exist_ok=True)
     tensors_items = list(named_tensors.items())
-    tensors_in_dev_shm = named_tensors
     tensors_in_dev_shm = dict(tensors_items[: len(tensors_items) // 2])
     tensors_in_disk = dict(tensors_items[len(tensors_items) // 3 : 2 * len(tensors_items) // 3])
     tensors_in_memory = dict(tensors_items[1 * len(tensors_items) // 2 :])
@@ -218,7 +217,7 @@ def run_with_files(
     if rank == 0:
         import shutil
 
-        os.removedirs(dev_shm_dir)
+        shutil.rmtree(dev_shm_dir)
         shutil.rmtree(disk_dir)
     assert proc.exitcode == 0
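The last fix matters because `os.removedirs` only removes empty directories and raises `OSError` when files remain, which is exactly the state of the test's `/dev/shm` directory; `shutil.rmtree` removes the tree unconditionally. A quick self-contained illustration:

```python
import os
import shutil
import tempfile

d = tempfile.mkdtemp()
open(os.path.join(d, "leftover.safetensors"), "w").close()

try:
    os.removedirs(d)  # fails: the directory is not empty
except OSError:
    shutil.rmtree(d)  # succeeds regardless of contents

assert not os.path.exists(d)
```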