checkpoint-engine 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checkpoint_engine/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
  commit_id: COMMIT_ID
  __commit_id__: COMMIT_ID
 
- __version__ = version = '0.2.0'
- __version_tuple__ = version_tuple = (0, 2, 0)
+ __version__ = version = '0.2.2'
+ __version_tuple__ = version_tuple = (0, 2, 2)
 
  __commit_id__ = commit_id = None
checkpoint_engine/device_utils.py ADDED
@@ -0,0 +1,86 @@
+ import os
+ import re
+ import socket
+ import subprocess
+ from functools import lru_cache
+
+ import torch
+ from loguru import logger
+
+
+ @lru_cache(maxsize=1)
+ def get_ip() -> str:
+     try:
+         # try to get ip from network interface
+         with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
+             s.connect(("8.8.8.8", 80))
+             return s.getsockname()[0]
+     except Exception as e:  # noqa: BLE001
+         # fallback to get ip from hostname
+         logger.warning(
+             f"fail to get ip from network interface, fallback to get ip from hostname: {e}"
+         )
+         return socket.gethostbyname(socket.gethostname())
+
+
+ def npu_generate_uuid() -> str:
+     str_pid = str(os.getpid())
+     npu_num = 8
+     try:
+         for npu_id in range(npu_num):
+             cmd = ["npu-smi", "info", "-t", "proc-mem", "-i", str(npu_id)]
+             result = subprocess.run(cmd, check=True, capture_output=True, text=True)  # noqa: S603
+             str_result = str(result.stdout)
+             if str_pid in str_result:
+                 # In A3 server, one NPU has two chips.
+                 match_chip_count = re.search(r"Chip Count[^\d]*(\d+)", str_result)
+                 chip_count = int(match_chip_count.group(1))
+                 search_after_pid = str_result[str_result.find(str_pid) + len(str_pid) :]
+                 match_chip_id = re.search(r"Chip ID[^\d]*(\d+)", search_after_pid)
+                 chip_id = int(match_chip_id.group(1))
+                 return f"{get_ip()}-{npu_id * chip_count + chip_id}"
+         raise ValueError("The current process is not running on the npu device")
+     except subprocess.CalledProcessError as e:
+         raise ValueError("The current process is not running on the npu device") from e
+
+
+ class DeviceManager:
+     def __init__(self):
+         self.device_type = self._detect_device_type()
+         self._setup_device_module()
+
+     def _is_torch_npu_available(self) -> bool:
+         try:
+             if hasattr(torch, "npu") and callable(getattr(torch.npu, "is_available", None)):
+                 return torch.npu.is_available()
+             else:
+                 return False
+         except ImportError:
+             return False
+
+     def _detect_device_type(self) -> str:
+         if self._is_torch_npu_available():
+             return "npu"
+         elif torch.cuda.is_available():
+             return "cuda"
+         else:
+             raise TypeError("The current device type is not supported")
+
+     def _setup_device_module(self):
+         if self.device_type == "npu":
+             import torch_npu
+
+             self.device_module = torch_npu.npu
+         elif self.device_type == "cuda":
+             self.device_module = torch.cuda
+         else:
+             raise TypeError("The current device type is not supported")
+
+     @property
+     def backend(self) -> str:
+         if self.device_type == "npu":
+             return "hccl"
+         elif self.device_type == "cuda":
+             return "nccl"
+         else:
+             raise TypeError("The current device type is not supported")
checkpoint_engine/ps.py CHANGED
@@ -1,16 +1,15 @@
  import argparse
  import concurrent.futures
  import ctypes
+ import json
  import os
  import pickle
  import random
- import socket
  import threading
  import time
  from collections import defaultdict
  from collections.abc import Callable
  from datetime import timedelta
- from functools import lru_cache
  from typing import TYPE_CHECKING, Annotated, Any, BinaryIO, NamedTuple
 
  import httpx
@@ -20,9 +19,11 @@ import torch.distributed as dist
  import zmq
  from loguru import logger
  from pydantic import BaseModel, PlainSerializer, PlainValidator, WithJsonSchema
- from safetensors.torch import safe_open
+ from safetensors.torch import _getdtype, safe_open
  from torch.multiprocessing.reductions import reduce_tensor
 
+ from checkpoint_engine.device_utils import DeviceManager, get_ip, npu_generate_uuid
+
 
  if TYPE_CHECKING:
      from typing import TypeVar
@@ -92,6 +93,7 @@ class ParameterMeta(BaseModel):
      name: str
      dtype: _TorchDtype
      shape: _TorchSize
+     aligned_size: int
 
 
  class BucketRange(NamedTuple):
@@ -140,7 +142,7 @@ def _align_size(dtype: torch.dtype, shape: torch.Size) -> int:
  def _to_named_tensor(metas: list[ParameterMeta], offset: int = 0) -> list[dict]:
      ret = []
      for meta in metas:
-         size = _align_size(meta.dtype, meta.shape)
+         size = meta.aligned_size
          ret.append(
             {
                 "name": meta.name,
@@ -254,28 +256,16 @@ def _concat_tp_weights(
      return torch.cat([w for w in tp_weights], dim=tp_concat_dim)
 
 
- def _get_physical_gpu_id(device_index: int | None = None) -> str:
+ def _get_physical_gpu_id(device_manager: DeviceManager, device_index: int | None = None) -> str:
      try:
-         return f"GPU-{torch.cuda.get_device_properties(device_index).uuid!s}"
+         if device_manager.device_type == "npu":
+             return f"NPU-{npu_generate_uuid()}"
+         else:
+             return f"GPU-{device_manager.device_module.get_device_properties(device_index).uuid!s}"
      except AssertionError as e:
          raise ValueError(f"fail to get physical gpu id {device_index}") from e
 
 
- @lru_cache(maxsize=1)
- def _get_ip() -> str:
-     try:
-         # try to get ip from network interface
-         with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
-             s.connect(("8.8.8.8", 80))
-             return s.getsockname()[0]
-     except Exception as e:  # noqa: BLE001
-         # fallback to get ip from hostname
-         logger.warning(
-             f"fail to get ip from network interface, fallback to get ip from hostname: {e}"
-         )
-         return socket.gethostbyname(socket.gethostname())
-
-
  def _ibv_get_device_list() -> list[str]:
      lib = ctypes.CDLL("libibverbs.so.1")
      lib.ibv_get_device_list.argtypes = [ctypes.POINTER(ctypes.c_int)]  # int *num_devices
@@ -317,13 +307,21 @@ def _get_my_rdma_device(local_rank: int, gpu_count: int, devices: list[str]) ->
      """
      if not devices:
          raise RuntimeError("no rdma devices found")
-     assert len(devices) <= gpu_count, (
-         f"rdma devices count {len(devices)} should be less than or equal to gpu count {gpu_count}"
-     )
-     assert gpu_count % len(devices) == 0, (
-         f"gpu count {gpu_count} should be divisible by rdma devices count {len(devices)}"
-     )
-     return devices[local_rank // (gpu_count // len(devices))]
+     try:
+         assert len(devices) <= gpu_count, (
+             f"rdma devices count {len(devices)} should be less than or equal to gpu count {gpu_count}"
+         )
+         assert gpu_count % len(devices) == 0, (
+             f"gpu count {gpu_count} should be divisible by rdma devices count {len(devices)}"
+         )
+         return devices[local_rank // (gpu_count // len(devices))]
+     except AssertionError:
+         logger.error(
+             "Please set 'NCCL_IB_HCA' or 'PS_P2P_STORE_RDMA_DEVICES' environment variable to choose proper number of RDMA devices."
+             "The number of RDMA devices should be less than or equal to GPU count, and GPU count should be divisible by the number of RDMA devices."
+             "The acceptable value by NCCL_IB_HCA is documented in 'https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#id8'."
+         )
+         raise
 
 
  def _parse_NCCL_IB_HCA(value: str, available_devices: list[str]) -> list[str]:
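`_get_my_rdma_device` above maps each local rank to one RDMA NIC by integer division, after checking that the device count divides the GPU count. A worked example of that mapping with hypothetical device names:

```python
# devices[local_rank // (gpu_count // len(devices))] with 8 GPUs and 4 RDMA devices
devices = ["mlx5_0", "mlx5_1", "mlx5_2", "mlx5_3"]   # hypothetical NIC names
gpu_count = 8
ranks_per_device = gpu_count // len(devices)         # 2
for local_rank in range(gpu_count):
    print(local_rank, "->", devices[local_rank // ranks_per_device])
# local ranks 0-1 -> mlx5_0, 2-3 -> mlx5_1, 4-5 -> mlx5_2, 6-7 -> mlx5_3
```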
@@ -426,6 +424,7 @@ def _load_checkpoint(files: list[str]) -> dict[str, torch.Tensor]:
                  name=parameter_name,
                  shape=meta["shape"],
                  dtype=meta["dtype"],
+                 aligned_size=_align_size(meta["dtype"], meta["shape"]),
              )
              tp_meta = tp_metas[parameter_name]
              if tp_meta.concat_dim != -1:
@@ -435,7 +434,10 @@ def _load_checkpoint(files: list[str]) -> dict[str, torch.Tensor]:
          shape = list(parameter_metas[name].shape)
          shape[tp_meta.concat_dim] = shape[tp_meta.concat_dim] * tp_meta.size
          parameter_metas[name] = ParameterMeta(
-             name=name, shape=torch.Size(shape), dtype=parameter_metas[name].dtype
+             name=name,
+             shape=torch.Size(shape),
+             dtype=parameter_metas[name].dtype,
+             aligned_size=_align_size(parameter_metas[name].dtype, torch.Size(shape)),
          )
          weights_in_cpu = [parameters_with_tp[name][key] for key in sorted(parameters_with_tp[name])]
          # TODO: here concat is serial, which may be slow
@@ -453,17 +455,85 @@ def _load_checkpoint(files: list[str]) -> dict[str, torch.Tensor]:
      return parameters
 
 
- def _register_checkpoint(
-     *,
+ def _inplace_pin_memory(files: list[str], rank: int | None = None) -> list[MemoryBuffer]:
+     def _parse_and_pin_from_safetensors(file_path: str) -> MemoryBuffer:
+         """
+         safetensors format see https://huggingface.co/docs/safetensors/en/index#format.
+         We load the safetensors file as bytes, then parse the header manually to get parameter metas.
+         The actual tensor data is in the remaining bytes and is naturally aligned.
+         We pin the remaining bytes as the buffer, making pinning faster.
+         """
+
+         def _pin(t: torch.Tensor):
+             """
+             Pin the memory of tensor in-place.
+             See: https://github.com/pytorch/pytorch/issues/32167
+             """
+             cudart = torch.cuda.cudart()
+             r = cudart.cudaHostRegister(t.data_ptr(), t.numel() * t.element_size(), 0)
+             assert r == 0, f"pin memory error, error code: {r}"
+
+         # TODO: should only support /dev/shm? but we found files in disk also work?
+         size = os.stat(file_path).st_size
+         flag_size = 8
+         t = torch.from_file(file_path, True, size, dtype=torch.uint8)
+         assert t.nbytes > flag_size, (
+             f"tensor nbytes {t.nbytes} should be greater than flag_size {flag_size}"
+         )
+         start_pos = (
+             int.from_bytes(t[0:flag_size].numpy().tobytes(), byteorder="little", signed=False)
+             + flag_size
+         )
+         header_tensor = t[flag_size:start_pos]
+         header = json.loads(header_tensor.numpy().tobytes())
+         if "__metadata__" in header:
+             header.pop("__metadata__")
+
+         metas: list[ParameterMeta] = []
+         offset = 0
+         try:
+             for name, meta in sorted(header.items(), key=lambda x: x[1]["data_offsets"]):
+                 start, end = meta["data_offsets"]
+                 # safetensors format ensures offsets are aligned
+                 assert offset == start, f"offset {offset} should be equal to start {start}"
+                 metas.append(
+                     ParameterMeta(
+                         name=name,
+                         dtype=_getdtype(meta["dtype"]),
+                         shape=torch.Size(meta["shape"]),
+                         aligned_size=end - start,
+                     )
+                 )
+                 offset = end
+         except Exception as e:
+             logger.error(f"fail to parse safetensors header from {file_path}: {e}")
+             raise
+
+         buffer = t[start_pos:]
+         assert offset == buffer.nbytes, (
+             f"offset {offset} should be equal to buffer.nbytes {buffer.nbytes}"
+         )
+         # Remove the file after successfully loading. This will avoid doubling the memory usage.
+         # We assume files in /dev/shm/ are temporary files. So it's safe to remove them after loading.
+         os.remove(file_path)
+         _pin(buffer)
+         logger.info(
+             f"[rank{rank}] inplace pin memory for file {file_path} finished, size {buffer.nbytes / 1024 / 1024:.2f}MiB"
+         )
+         return MemoryBuffer(buffer=buffer, size=buffer.nbytes, metas=metas)
+
+     memory_buffers: list[MemoryBuffer] = []
+     with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
+         memory_buffers = list(executor.map(_parse_and_pin_from_safetensors, files))
+     return memory_buffers
+
+
+ def _normal_pin_memory(
      files: list[str],
      named_tensors: dict[str, torch.Tensor],
      rank: int | None = None,
+     shared_pin_memory: list[MemoryBuffer] | None = None,
  ) -> list[MemoryBuffer]:
-     logger.info(
-         f"[rank{rank}] start to register checkpoint with {len(files)} files and {len(named_tensors)} named_tensors"
-     )
-     if not files and not named_tensors:
-         return []
      parameters = _load_checkpoint(files)
      if named_tensors:
          parameters.update(named_tensors)
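`_parse_and_pin_from_safetensors` above relies on the safetensors on-disk layout: an 8-byte little-endian header length, a JSON header mapping tensor names to dtype, shape, and `data_offsets`, then the raw tensor bytes. A standalone sketch of reading just that header (the file path in the usage comment is a hypothetical example):

```python
import json
import struct


def read_safetensors_header(path: str) -> dict:
    """Return the JSON header of a .safetensors file without loading tensor data."""
    with open(path, "rb") as f:
        (header_len,) = struct.unpack("<Q", f.read(8))  # little-endian uint64 length prefix
        header = json.loads(f.read(header_len))
    header.pop("__metadata__", None)
    # each entry looks like {"dtype": "F16", "shape": [...], "data_offsets": [start, end]},
    # with offsets relative to the first byte after the header
    return header


# e.g. read_safetensors_header("/dev/shm/model-00001-of-00009.safetensors")
```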
@@ -473,13 +543,16 @@ def _register_checkpoint(
          size: int
          metas: list[ParameterMeta]
 
-     buckets: list[MemoryBucket] = [MemoryBucket(size=0, metas=[])]
+     buckets: list[MemoryBucket] = []
+     buckets.append(MemoryBucket(size=0, metas=[]))
      for name, tensor in sorted(parameters.items()):
          size = _align_size(tensor.dtype, tensor.shape)
          if buckets[-1].size + size > bucket_size:
              assert buckets[-1], f"buckets[{len(buckets) - 1}] should not be empty"
              buckets.append(MemoryBucket(size=0, metas=[]))
-         buckets[-1].metas.append(ParameterMeta(name=name, shape=tensor.shape, dtype=tensor.dtype))
+         buckets[-1].metas.append(
+             ParameterMeta(name=name, shape=tensor.shape, dtype=tensor.dtype, aligned_size=size)
+         )
          buckets[-1].size += size
 
      memory_buffers = [
@@ -487,16 +560,34 @@ def _register_checkpoint(
          for bucket in buckets
      ]
 
-     def register_pin_memory(idx: int, size: int) -> tuple[int, torch.Tensor]:
-         buffer = torch.empty(size, dtype=torch.uint8, pin_memory=True)
-         return idx, buffer
+     def register_pin_memory(
+         idx: int, size: int, shared_pin_memory: list[MemoryBuffer] | None = None
+     ) -> tuple[int, torch.Tensor]:
+         if shared_pin_memory:
+             # If shared_pin_memory is provided, reuse the pin memory buffer, do not allocate new one
+             # Reusing pin memory only support fixed shape of checkpoints, which is registered the first time
+             assert idx < len(shared_pin_memory), (
+                 f"idx {idx} should be less than shared_pin_memory length {len(shared_pin_memory)}"
+             )
+             assert shared_pin_memory[idx].size == size, (
+                 f"shared_pin_memory[{idx}].size {shared_pin_memory[idx].size} should be equal to {size}"
+             )
+             return idx, shared_pin_memory[idx].buffer
+         else:
+             buffer = torch.empty(size, dtype=torch.uint8, pin_memory=True)
+             return idx, buffer
 
      def register_tensor(buffer: torch.Tensor, offset: int, tensor: torch.Tensor):
          buffer[offset : offset + tensor.nbytes] = tensor.view(-1).view(dtype=torch.uint8)
 
      with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
          futures = [
-             executor.submit(register_pin_memory, idx, bucket.size)
+             executor.submit(
+                 register_pin_memory,
+                 idx,
+                 bucket.size,
+                 shared_pin_memory,
+             )
              for idx, bucket in enumerate(buckets)
          ]
          new_futures = []
@@ -522,6 +613,39 @@ def _register_checkpoint(
              offset += size
      for future in concurrent.futures.as_completed(new_futures):
          future.result()
+     return memory_buffers
+
+
+ def _register_checkpoint(
+     *,
+     files: list[str],
+     named_tensors: dict[str, torch.Tensor],
+     rank: int | None = None,
+     shared_pin_memory: list[MemoryBuffer] | None = None,
+ ) -> list[MemoryBuffer]:
+     logger.info(
+         f"[rank{rank}] start to register checkpoint with {len(files)} files and {len(named_tensors)} named_tensors"
+     )
+     if not files and not named_tensors:
+         return []
+     memory_buffers: list[MemoryBuffer] = []
+     files_to_inplace_pin = [
+         file
+         for file in files
+         if file.startswith("/dev/shm/") and file.endswith(".safetensors")  # noqa: S108
+     ]
+     files_to_normal_pin = [file for file in files if file not in files_to_inplace_pin]
+     if files_to_normal_pin or named_tensors:
+         memory_buffers.extend(
+             _normal_pin_memory(
+                 files=files_to_normal_pin,
+                 named_tensors=named_tensors,
+                 rank=rank,
+                 shared_pin_memory=shared_pin_memory,
+             )
+         )
+     if files_to_inplace_pin:
+         memory_buffers.extend(_inplace_pin_memory(files_to_inplace_pin, rank=rank))
      return memory_buffers
 
 
@@ -570,7 +694,7 @@ def _gen_h2d_buckets(
      for idx, metas in enumerate(items.memory_buffer_metas_list):
          start_offset, offset = 0, 0
          for meta in metas.metas:
-             s = _align_size(meta.dtype, meta.shape)
+             s = meta.aligned_size
              if buckets[-1][1].size + s > bucket_size:
                  if offset - start_offset > 0:
                      buckets[-1][1].ranges.append(
@@ -677,20 +801,29 @@ def _get_bcast_rank_map(world_size: int, ranks: list[int] | None) -> dict[int, i
 
 
  class P2PStore:
-     def __init__(self):
+     def __init__(self, device_manager: DeviceManager):
          from mooncake.engine import TransferEngine
 
          self.rank = int(os.getenv("RANK"))
-         gpu_count = torch.cuda.device_count()
+         gpu_count = device_manager.device_module.device_count()
          local_rank = self.rank % gpu_count
-         self.device = _get_my_rdma_device(local_rank, gpu_count, _get_rdma_devices())
-         self.ip = _get_ip()
+         device_type = device_manager.device_type
+         if device_type == "npu" and os.getenv("PS_P2P_STORE_RDMA_DEVICES") is None:
+             self.device = ""
+         else:
+             self.device = _get_my_rdma_device(local_rank, gpu_count, _get_rdma_devices())
+         self.ip = get_ip()
 
          # we will start at most 8 ps processes, so we use 8 retries to avoid port conflicts in extreme cases
          retry_count = 8
          for i in range(retry_count):
              self.engine = TransferEngine()
-             ret = self.engine.initialize(self.ip, "P2PHANDSHAKE", "rdma", self.device)
+             ret = self.engine.initialize(
+                 self.ip,
+                 "P2PHANDSHAKE",
+                 "ascend_direct" if device_type == "npu" else "rdma",
+                 self.device,
+             )
              if ret == 0:
                  break
              # sleep 0.5 ~ 2.0s, to avoid port conflicts when two processes retry at the same time
@@ -742,6 +875,8 @@ class P2PStore:
 
 
  class ParameterServer:
+     shared_memory_pool_name = "__shared_memory_pool__"
+
      def __init__(
          self,
          *,
@@ -757,11 +892,12 @@ class ParameterServer:
          Args:
              auto_pg: Whether to automatically initialize the process group.
                  Notice that if auto_pg is True, will destroy the process group after update.
-             mem_fraction: The proportion (as a fraction) of the current free CUDA memory for allocation.
+             mem_fraction: The proportion (as a fraction) of the current free device memory for allocation.
          """
          self._rank = rank or int(os.environ.get("RANK", None))
          self._world_size = world_size or int(os.environ.get("WORLD_SIZE", None))
-         self._gpu_count = gpu_count or torch.cuda.device_count()
+         self.device_manager = DeviceManager()
+         self._gpu_count = gpu_count or self.device_manager.device_module.device_count()
          self._local_rank = self._rank % self._gpu_count
          self._auto_pg = auto_pg
          self._all_hosts = []
@@ -775,7 +911,7 @@ class ParameterServer:
          assert (
              self._gpu_count is not None
              and self._gpu_count > 0
-             and self._gpu_count <= torch.cuda.device_count()
+             and self._gpu_count <= self.device_manager.device_module.device_count()
          ), self._gpu_count
          assert (
              self._mem_fraction is not None and self._mem_fraction > 0 and self._mem_fraction <= 1
@@ -784,20 +920,35 @@ class ParameterServer:
          self._zmq_ctx = zmq.Context()
          self._zmq_addr_counter = 0
 
+         # stores the name of the checkpoint currently using the shared memory pool, or empty string if none
+         self._current_shared_memory_pool_user: str = ""
          self._memory_pool: dict[str, list[MemoryBuffer]] = {}
+         self._memory_pool[self.shared_memory_pool_name] = []
          # dict key is owner_rank, value is a bucket metas list in owner_rank
          self._current_global_parameter_metas: dict[int, MemoryBufferMetaList] = {}
+         # NPU transfer engine initialization requires prior set_device.
+         device_index = self._local_rank
+         self.device_manager.device_module.set_device(device_index)
          try:
-             self._p2p_store = P2PStore()
+             self._p2p_store = P2PStore(self.device_manager)
          except ImportError as e:
              logger.warning(f"[rank{self._rank}] fail to initialize p2p store due to {e}")
              self._p2p_store = None
 
-         device_index = self._local_rank
-         torch.cuda.set_device(device_index)
-         self._device_uuid = _get_physical_gpu_id(device_index)
+         self._device_uuid = _get_physical_gpu_id(self.device_manager, device_index)
          self._rdma_device = None if self._p2p_store is None else self._p2p_store.device
 
+     def _get_memory_pool(self, checkpoint_name: str) -> list[MemoryBuffer]:
+         if checkpoint_name == self._current_shared_memory_pool_user:
+             assert self._memory_pool[self.shared_memory_pool_name], (
+                 f"shared memory pool is not initialized, but checkpoint {checkpoint_name} is using it"
+             )
+             return self._memory_pool[self.shared_memory_pool_name]
+         elif checkpoint_name in self._memory_pool:
+             return self._memory_pool[checkpoint_name]
+         else:
+             raise RuntimeError(f"checkpoint {checkpoint_name} is not registered")
+
      def _logger_rank0(self, msg: str):
          if self._local_rank == 0:
              logger.info(msg)
@@ -821,46 +972,97 @@ class ParameterServer:
          *,
          files: list[str] | None = None,
          named_tensors: dict[str, torch.Tensor] | None = None,
+         use_shared_memory_pool: bool = False,
      ) -> None:
          """
          Register a checkpoint to the parameter server. Both files and named_tensors will be registered together.
+         Warning: .safetensors files in /dev/shm/ will be pinned in-place, and the files will be REMOVED after pinning.
+         Please make sure to copy the files to disks if you need to keep them.
 
          Args:
              checkpoint_name: The name of the checkpoint.
             files: The safetensors files to register.
             named_tensors: The named tensors to register.
+             use_shared_memory_pool: If True, uses a reusable shared pin memory pool instead of allocating new memory.
+                 Only one checkpoint can use the shared pool at a time. The pool's shape is fixed on first use and
+                 cannot accommodate checkpoints with different memory requirements.
+                 To free the actual memory of the shared pool or to modify its shape,
+                 please unregister the current user of the shared memory pool using `unregister_checkpoint` with `force=True`.
          """
          try:
-             assert checkpoint_name not in self._memory_pool, (
-                 f"checkpoint {checkpoint_name} already registered"
-             )
-             self._memory_pool[checkpoint_name] = _register_checkpoint(
-                 files=files or [], named_tensors=named_tensors or {}, rank=self._rank
-             )
-             if self._p2p_store is not None:
-                 self._register_parameters_to_p2p_store(checkpoint_name)
+             if use_shared_memory_pool:
+                 logger.info(
+                     f"[rank{self._rank}] checkpoint {checkpoint_name} use shared memory pool"
+                 )
+                 assert self._current_shared_memory_pool_user == "", (
+                     f"cannot register checkpoint {checkpoint_name} to shared memory pool, "
+                     f"since checkpoint {self._current_shared_memory_pool_user} is already using shared memory pool. "
+                     f"This registration may cause unexpected conflicts."
+                 )
+                 # Since we set the uninitialized shared memory pool to empty list,
+                 # we can check whether this is the first time to use shared memory pool
+                 _is_first_time = not self._memory_pool[self.shared_memory_pool_name]
+                 self._memory_pool[self.shared_memory_pool_name] = _register_checkpoint(
+                     files=files or [],
+                     named_tensors=named_tensors or {},
+                     rank=self._rank,
+                     shared_pin_memory=self._memory_pool[self.shared_memory_pool_name],
+                 )
+                 self._current_shared_memory_pool_user = checkpoint_name
+                 if self._p2p_store is not None and _is_first_time:
+                     self._register_parameters_to_p2p_store(checkpoint_name)
+             else:
+                 assert checkpoint_name not in self._memory_pool, (
+                     f"checkpoint {checkpoint_name} already registered"
+                 )
+                 self._memory_pool[checkpoint_name] = _register_checkpoint(
+                     files=files or [], named_tensors=named_tensors or {}, rank=self._rank
+                 )
+                 if self._p2p_store is not None:
+                     self._register_parameters_to_p2p_store(checkpoint_name)
          except Exception:
              logger.exception(
                  f"[rank{self._rank}] fail to register checkpoint {checkpoint_name} with files {files}"
              )
-             if self._p2p_store is not None:
+             if self._p2p_store is not None and not use_shared_memory_pool:
                  self._unregister_parameters_from_p2p_store(checkpoint_name)
              self.unregister_checkpoint(checkpoint_name)
              raise
 
-     def unregister_checkpoint(self, checkpoint_name: str):
+     def unregister_checkpoint(self, checkpoint_name: str, force: bool = False) -> None:
          """
          Unregister a checkpoint from the parameter server. This function will also unregister the checkpoint
          from p2p store if p2p store is initialized.
+         Args:
+             checkpoint_name: The name of the checkpoint.
+             force: This flag is designed for shared memory pool user. If True, the memory for shared memory pool itself will be freed.
+                 If False, only the checkpoint name will be unregistered, and the shared memory pool will be kept for future use.
          """
-         if checkpoint_name not in self._memory_pool:
+         if (
+             checkpoint_name not in self._memory_pool
+             and checkpoint_name != self._current_shared_memory_pool_user
+         ):
+             logger.warning(
+                 f"[rank{self._rank}] unregister checkpoint name {checkpoint_name} not found"
+             )
              return
+
+         if checkpoint_name == self._current_shared_memory_pool_user and not force:
+             self._current_shared_memory_pool_user = ""
+             return
+
          if self._p2p_store is not None:
              num_unregistered = self._unregister_parameters_from_p2p_store(checkpoint_name)
              logger.info(
                  f"[rank{self._rank}] unregister {num_unregistered} parameters from p2p store for checkpoint {checkpoint_name}"
              )
-         del self._memory_pool[checkpoint_name]
+
+         if checkpoint_name == self._current_shared_memory_pool_user:
+             self._current_shared_memory_pool_user = ""
+             del self._memory_pool[self.shared_memory_pool_name]
+             self._memory_pool[self.shared_memory_pool_name] = []
+         else:
+             del self._memory_pool[checkpoint_name]
          # see https://github.com/pytorch/pytorch/blob/31d5c675394705f8a6bc767f80ae14bf4f01246b/torch/csrc/cuda/Module.cpp#L2018
          # this works by using torch>=2.5.0
          torch._C._host_emptyCache()
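The `use_shared_memory_pool` and `force` flags documented above are easiest to see as a sequence of calls. A hedged usage sketch for one rank; `step_100_files`, `step_200_files`, and `req_func` (the per-request callable passed to `update`) are hypothetical placeholders, and the constructor arguments depend on your launch environment:

```python
from checkpoint_engine.ps import ParameterServer

ps = ParameterServer(auto_pg=True)

# First registration allocates the shared pool and fixes its bucket layout.
ps.register_checkpoint("ckpt-step-100", files=step_100_files, use_shared_memory_pool=True)
ps.update("ckpt-step-100", req_func)
ps.unregister_checkpoint("ckpt-step-100")              # name released, pinned pool kept

# A later checkpoint with the same memory layout reuses the pinned buffers.
ps.register_checkpoint("ckpt-step-200", files=step_200_files, use_shared_memory_pool=True)
ps.update("ckpt-step-200", req_func)
ps.unregister_checkpoint("ckpt-step-200", force=True)  # force=True frees the pool itself
```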
@@ -875,6 +1077,10 @@ class ParameterServer:
              self.init_process_group()
          assert dist.is_initialized(), "process group is not initialized"
          metas_lst: list[DataToGather | None] = [None for _ in range(self._world_size)]  # type: ignore
+         try:
+             memory_pool = self._get_memory_pool(checkpoint_name)
+         except RuntimeError:
+             memory_pool = []
          metas = DataToGather(
              memory_buffer_metas_list=[
                  MemoryBufferMetas(
@@ -882,16 +1088,18 @@ class ParameterServer:
                      ptr=x.buffer.data_ptr(),
                      size=x.size,
                  )
-                 for x in self._memory_pool.get(checkpoint_name, [])
+                 for x in memory_pool
              ],
              p2p_store_addr=None if self._p2p_store is None else self._p2p_store.addr,
-             host_ip=_get_ip(),
+             host_ip=get_ip(),
              device_uuid=self._device_uuid,
              rdma_device=self._rdma_device or "",
          )
 
          dist.all_gather_object(metas_lst, metas)
 
+         self._current_global_parameter_metas = {}
+
          num_parameters = 0
          all_hosts: list[str] = []
          global_device_uuids: list[str] = []
@@ -948,7 +1156,7 @@ class ParameterServer:
              is_master=self._rank == 0,
          )
          dist.init_process_group(
-             backend="nccl",
+             backend=self.device_manager.backend,
              world_size=self._world_size,
              rank=self._rank,
              timeout=timeout,
@@ -991,21 +1199,22 @@ class ParameterServer:
              if self._rank not in ranks:
                  return
              self._update_per_bucket(checkpoint_name, req_func, ranks)
-             if self._auto_pg:
-                 dist.destroy_process_group()
-
-             torch.cuda.empty_cache()
 
-             logger.info(
-                 f"[rank{self._rank}] update checkpoint {checkpoint_name} with ranks {ranks} done. "
-                 f"Current CUDA allocated {torch.cuda.memory_allocated() / 1024 / 1024} MB, "
-                 f"reserved {torch.cuda.memory_reserved() / 1024 / 1024} MB."
-             )
          except Exception as e:
              logger.exception(
                  f"[rank{self._rank}] update checkpoint {checkpoint_name} with ranks {ranks} error {e}"
              )
              raise
+         finally:
+             if self._auto_pg and (not ranks or self._rank in ranks):
+                 dist.destroy_process_group()
+
+             self.device_manager.device_module.empty_cache()
+             logger.info(
+                 f"[rank{self._rank}] update checkpoint {checkpoint_name} with ranks {ranks} done. "
+                 f"Current device allocated {self.device_manager.device_module.memory_allocated() / 1024 / 1024} MB, "
+                 f"reserved {self.device_manager.device_module.memory_reserved() / 1024 / 1024} MB."
+             )
 
      def _bind_zmq_socket(self) -> tuple[zmq.Socket, list[tuple[str, str]]]:
          def zmq_handle(device_uuid: str) -> str:
@@ -1022,14 +1231,16 @@ class ParameterServer:
          # auto detect bucket size
          tensor = torch.tensor(
              [
-                 # proportion of current cuda free memory bytes
-                 int(float(torch.cuda.mem_get_info()[0]) * self._mem_fraction),
+                 # proportion of current device free memory bytes
+                 int(
+                     float(self.device_manager.device_module.mem_get_info()[0]) * self._mem_fraction
+                 ),
                  # we use negative value to reuse allreduce min operation
                  # for getting the max value of zmq_addr_counter in all ranks
                  -self._zmq_addr_counter,
              ],
              dtype=torch.int64,
-             device="cuda",
+             device=self.device_manager.device_type,
          )
          dist.all_reduce(tensor, op=dist.ReduceOp.MIN)
          tensor = tensor.cpu()
@@ -1038,7 +1249,7 @@ class ParameterServer:
          for items in self._current_global_parameter_metas.values():
              for metas_list in items.memory_buffer_metas_list:
                  for meta in metas_list.metas:
-                     max_tensor_bytes = max(max_tensor_bytes, _align_size(meta.dtype, meta.shape))
+                     max_tensor_bytes = max(max_tensor_bytes, meta.aligned_size)
          free_bytes_divided_3 = free_bytes // (3 * _ALIGN_SIZE) * _ALIGN_SIZE
          if max_tensor_bytes <= free_bytes_divided_3 and not disable_h2d_buffer:
              self._logger_rank0(f"[rank{self._rank}] use h2d buffer")
@@ -1083,7 +1294,7 @@ class ParameterServer:
                  remote_ptrs.append(ptrs[b.idx][0] + b.offset)
                  lens.append(b.size)
              else:
-                 pool = self._memory_pool[checkpoint_name][b.idx]
+                 pool = self._get_memory_pool(checkpoint_name)[b.idx]
                  buffer[offset : offset + b.size].data.copy_(
                      pool.buffer[b.offset : b.offset + b.size],
                      non_blocking=True,
@@ -1092,7 +1303,7 @@ class ParameterServer:
          assert offset == bucket.size, f"offset {offset} != bucket_size {bucket.size}"
          if owner_rank is not None:
              self._p2p_store.batch_transfer_sync_read(target_addr, buf_ptrs, remote_ptrs, lens)
-         torch.cuda.synchronize()
+         self.device_manager.device_module.synchronize()
 
      def init_process_group_for_ranks(
          self,
@@ -1132,7 +1343,11 @@ class ParameterServer:
              master_addr, master_port, len(ranks), is_master=rank == 0, timeout=timeout
          )
          dist.init_process_group(
-             backend="nccl", world_size=len(ranks), rank=rank, timeout=timeout, store=store
+             backend=self.device_manager.backend,
+             world_size=len(ranks),
+             rank=rank,
+             timeout=timeout,
+             store=store,
          )
 
      def _get_addr_ptrs(self, owner_rank: int) -> tuple[str, list[tuple[int, int]]]:
@@ -1142,22 +1357,32 @@ class ParameterServer:
 
      def _register_parameters_to_p2p_store(self, checkpoint_name: str):
          assert self._p2p_store is not None, "p2p store is not initialized"
-         pool = self._memory_pool[checkpoint_name]
+         pool = self._get_memory_pool(checkpoint_name)
          if len(pool) == 0:
              return
          named_tensors, tensor_ptrs = {}, []
+         register_name = (
+             checkpoint_name
+             if checkpoint_name != self._current_shared_memory_pool_user
+             else self.shared_memory_pool_name
+         )
          for idx, memory_buffer in enumerate(pool):
-             named_tensors[f"memory_pool_{checkpoint_name}_{idx}"] = memory_buffer.buffer
+             named_tensors[f"memory_pool_{register_name}_{idx}"] = memory_buffer.buffer
              tensor_ptrs.append((memory_buffer.buffer.data_ptr(), memory_buffer.size))
          self._p2p_store.register_named_tensors(named_tensors)
 
      def _unregister_parameters_from_p2p_store(self, checkpoint_name: str) -> int:
          assert self._p2p_store is not None, "p2p store is not initialized"
-         pool = self._memory_pool[checkpoint_name]
+         pool = self._get_memory_pool(checkpoint_name)
          if len(pool) == 0:
              return 0
+         unregister_name = (
+             checkpoint_name
+             if checkpoint_name != self._current_shared_memory_pool_user
+             else self.shared_memory_pool_name
+         )
          return self._p2p_store.unregister_named_tensors(
-             [f"memory_pool_{checkpoint_name}_{idx}" for idx, _ in enumerate(pool)]
+             [f"memory_pool_{unregister_name}_{idx}" for idx, _ in enumerate(pool)]
          )
 
      def _update_per_bucket(
@@ -1184,7 +1409,7 @@ class ParameterServer:
 
          if not need_update:
              return
-         # first execute a barrier to avoid subsequent cuda oom
+         # first execute a barrier to avoid subsequent device oom
          dist.barrier()
 
          bucket_size, disable_h2d_buffer = self._detect_bucket_size()
@@ -1199,7 +1424,7 @@ class ParameterServer:
          h2d_buffer: torch.Tensor | None = (
              None
              if disable_h2d_buffer
-             else torch.empty(bucket_size, dtype=torch.uint8, device="cuda")
+             else torch.empty(bucket_size, dtype=torch.uint8, device=self.device_manager.device_type)
          )
          # p2p store need to register h2d_buffer to let other ranks read
          if ranks:
@@ -1212,7 +1437,9 @@ class ParameterServer:
                  continue
              receiver_rank_buckets.append((owner_rank, bucket))
 
-         buffer = torch.empty(bucket_size * 2, dtype=torch.uint8, device="cuda")
+         buffer = torch.empty(
+             bucket_size * 2, dtype=torch.uint8, device=self.device_manager.device_type
+         )
          handle = reduce_tensor(buffer)
 
          buckets_by_receiver_rank: dict[int, list[H2DBucket]] = defaultdict(list)
@@ -1231,52 +1458,66 @@ class ParameterServer:
          socket.send_pyobj(handle)
 
          gidx = 0
+         ret_code = torch.zeros((), device=self.device_manager.device_type, dtype=torch.int64)
          bcast_rank_map = _get_bcast_rank_map(self._world_size, ranks)
-         for i in range(max_len):
-             if i < len(receiver_rank_buckets) and not disable_h2d_buffer:
-                 self._copy_to_buffer(
-                     checkpoint_name,
-                     receiver_rank_buckets[i][1],
-                     h2d_buffer,
-                     receiver_rank_buckets[i][0] if ranks else None,
-                 )
-             for receiver_rank, _buckets in buckets_by_receiver_rank.items():
-                 if i >= len(_buckets):
-                     continue
-                 bucket = _buckets[i]
-                 alloc, reserved = (
-                     torch.cuda.memory_allocated() / 1024 / 1024,
-                     torch.cuda.memory_reserved() / 1024 / 1024,
-                 )
-                 self._logger_rank0(
-                     f"[rank{self._rank}] begin to update bucket {gidx + 1}/{len(buckets)} receiver_rank {receiver_rank} in checkpoint {checkpoint_name}, bucket_size: {bucket.size / 1024 / 1024:.2f}MiB, length: {len(bucket.items)}. "
-                     f"Current CUDA allocated {alloc:.2f} MB, "
-                     f"reserved {reserved:.2f} MB."
-                 )
-                 start = gidx % 2 * bucket_size
-                 buffer_b: torch.Tensor = buffer[start : start + bucket.size]
-                 if receiver_rank == self._rank:
-                     if disable_h2d_buffer:
-                         self._copy_to_buffer(checkpoint_name, bucket, buffer_b)
-                     else:
-                         buffer_b.data.copy_(h2d_buffer[: bucket.size])
-                 brank = bcast_rank_map[receiver_rank]
-                 dist.broadcast(buffer_b, src=brank)
-                 socket.recv()
-                 dist.barrier()
-                 socket.send_pyobj(_to_named_tensor(bucket.items, gidx % 2 * bucket_size))
-                 gidx += 1
-
-         socket.recv()
-         socket.send_pyobj(None)
-         socket.recv()
-         req_thread.join()
-         dist.barrier()
-         socket.close()
-         if ranks and h2d_buffer is not None:
-             self._p2p_store.unregister_named_tensors([h2d_buffer_name])
-
-         torch.cuda.empty_cache()
+         try:
+             for i in range(max_len):
+                 if i < len(receiver_rank_buckets) and not disable_h2d_buffer:
+                     self._copy_to_buffer(
+                         checkpoint_name,
+                         receiver_rank_buckets[i][1],
+                         h2d_buffer,
+                         receiver_rank_buckets[i][0] if ranks else None,
+                     )
+                 for receiver_rank, _buckets in buckets_by_receiver_rank.items():
+                     if i >= len(_buckets):
+                         continue
+                     bucket = _buckets[i]
+                     alloc, reserved = (
+                         self.device_manager.device_module.memory_allocated() / 1024 / 1024,
+                         self.device_manager.device_module.memory_reserved() / 1024 / 1024,
+                     )
+                     self._logger_rank0(
+                         f"[rank{self._rank}] begin to update bucket {gidx + 1}/{len(buckets)} receiver_rank {receiver_rank} in checkpoint {checkpoint_name}, bucket_size: {bucket.size / 1024 / 1024:.2f}MiB, length: {len(bucket.items)}. "
+                         f"Current device allocated {alloc:.2f} MB, "
+                         f"reserved {reserved:.2f} MB."
+                     )
+                     start = gidx % 2 * bucket_size
+                     buffer_b: torch.Tensor = buffer[start : start + bucket.size]
+                     if receiver_rank == self._rank:
+                         if disable_h2d_buffer:
+                             self._copy_to_buffer(checkpoint_name, bucket, buffer_b)
+                         else:
+                             buffer_b.data.copy_(h2d_buffer[: bucket.size])
+                     brank = bcast_rank_map[receiver_rank]
+                     dist.broadcast(buffer_b, src=brank)
+                     resp = socket.recv()
+                     if resp != b"":
+                         msg = resp.decode("utf-8")
+                         logger.error(
+                             f"[rank{self._rank}] receive error response from rank {receiver_rank} for bucket {gidx} in checkpoint {checkpoint_name}: {msg}"
+                         )
+                         ret_code.fill_(1)
+                     dist.all_reduce(ret_code, op=dist.ReduceOp.SUM)
+                     self.device_manager.device_module.synchronize()
+                     if ret_code.item() != 0:
+                         # quit early if any rank failed
+                         socket.send_pyobj(RuntimeError("Some workers failed to update weights"))
+                         raise RuntimeError("Failed to update weights due to remote errors")
+                     socket.send_pyobj(_to_named_tensor(bucket.items, gidx % 2 * bucket_size))
+                     gidx += 1
+
+             socket.recv()
+             socket.send_pyobj(None)
+             socket.recv()
+         finally:
+             req_thread.join()
+             dist.barrier()
+             socket.close()
+             if ranks and h2d_buffer is not None:
+                 self._p2p_store.unregister_named_tensors([h2d_buffer_name])
+
+             self.device_manager.device_module.empty_cache()
 
 
  def _init_api(ps: ParameterServer) -> Any:
@@ -1294,6 +1535,7 @@ def _init_api(ps: ParameterServer) -> Any:
          update_url: str | None = None
          inference_group_ranks: list[int] = []
          timeout: float = 300.0
+         uds: str | None = None
 
      def wrap_exception(func: Callable[[], None]) -> Response:
          try:
@@ -1326,7 +1568,9 @@ def _init_api(ps: ParameterServer) -> Any:
                  return
              if req.inference_group_ranks:
                  socket_paths = [socket_paths[i] for i in req.inference_group_ranks]
-             request_inference_to_update(req.update_url, dict(socket_paths), timeout=req.timeout)
+             request_inference_to_update(
+                 req.update_url, dict(socket_paths), timeout=req.timeout, uds=req.uds
+             )
 
          return wrap_exception(lambda: ps.update(checkpoint_name, update_func, ranks=req.ranks))
checkpoint_engine/worker.py CHANGED
@@ -1,10 +1,13 @@
  import gc
+ import traceback
  from collections.abc import Callable
  from typing import TypedDict
 
  import torch
  import zmq
 
+ from checkpoint_engine.device_utils import DeviceManager, npu_generate_uuid
+
 
  def _rebuild_ipc(handle: tuple[Callable, tuple], device_id: int | None = None) -> torch.Tensor:
      func, args = handle
@@ -53,51 +56,107 @@ def update_weights_from_ipc(
      socket = zmq_ctx.socket(zmq.REP)
      socket.connect(zmq_handle)
      buffer: torch.Tensor | None = None
-     while True:
-         payload: tuple[Callable, tuple] | list[FlattenedTensorMetadata] | None = socket.recv_pyobj()
-         if payload is None:
-             # means the update is done
-             if post_hook is not None:
-                 post_hook()
-             torch.cuda.synchronize()
-             socket.send(b"")
-             break
-         if isinstance(payload, tuple):
-             # an ipc handle that vLLM can use `func, args = handle`
-             # and `func(*args)` to rebuild GPU tensor.
-             buffer = _rebuild_ipc(payload, device_id)
-             assert buffer.dtype == torch.uint8
-             socket.send(b"")
-             continue
-         assert isinstance(payload, list)
-         run(_extract_weights(payload, buffer))
-         torch.cuda.synchronize()
+     device_manager = DeviceManager()
+     try:
+         ipc_handle: tuple[Callable, tuple] = socket.recv_pyobj()
+         assert isinstance(ipc_handle, tuple)
+         buffer = _rebuild_ipc(ipc_handle, device_id)
+         assert buffer.dtype == torch.uint8
          socket.send(b"")
+     except Exception as e:
+         msg = "".join(traceback.format_exception(type(e), e, e.__traceback__))
+         socket.send_string(msg)
+         socket.recv()  # wait for ack
+         raise
+     try:
+         while True:
+             payload: list[FlattenedTensorMetadata] | Exception | None = socket.recv_pyobj()
+             if payload is None:  # done signal
+                 if post_hook is not None:
+                     post_hook()
+                 device_manager.device_module.synchronize()
+                 socket.send(b"")
+                 break
+             if isinstance(payload, list):  # still updating weights
+                 try:
+                     run(_extract_weights(payload, buffer))
+                     device_manager.device_module.synchronize()
+                     socket.send(b"")
+                 except Exception as e:  # noqa: BLE001
+                     # Send exception back to Parameter Server.
+                     # Don't raise here. Because all workers should quit in the same way by receiving the exception from PS
+                     msg = "".join(traceback.format_exception(type(e), e, e.__traceback__))
+                     socket.send_string(msg)
+             elif isinstance(
+                 payload, Exception
+             ):  # error occurred, got force quit signal from Parameter Server
+                 raise payload
+             else:
+                 raise TypeError(f"Unexpected payload type: {type(payload)}")
 
-     socket.close()
-     del buffer
-     gc.collect()
-     torch.cuda.empty_cache()
+     finally:
+         socket.close()
+         del buffer
+         gc.collect()
+         device_manager.device_module.empty_cache()
 
 
  class VllmColocateWorkerExtension:
      """
-     The class for vLLM's worker to inherit from, in the colocate setting.
-     By defining an extension class, the code can work no matter what is
-     the underlying worker class. This way, the code can be compatible
-     with both vLLM V0 and V1.
-     NOTE: we define this class in a separate module, and the main module
-     should pass the full qualified name as `worker_extension_cls` argument.
+     Worker extension for vLLM to update weights from checkpoint-engine.
+
+     This class provides a worker extension mechanism that allows vLLM workers to receive
+     and apply weight updates from the checkpoint-engine via IPC (Inter-Process Communication).
+     The methods in this worker extension will be injected into the vLLM worker class and
+     are callable from the `collective_rpc` API, enabling seamless weight updates for both
+     vLLM V0 and V1 versions.
+
+     Note:
+         This class is defined in a separate module. The fully qualified name
+         `checkpoint_engine.worker.VllmColocateWorkerExtension` should be passed as the
+         `worker_extension_cls` argument when initializing the vLLM worker.
      """
 
      def update_weights_from_ipc(self, zmq_handles: dict[str, str]):
+         """
+         Update model weights from checkpoint-engine via IPC communication.
+
+         This method establishes a ZMQ connection to the checkpoint-engine and receives
+         weight updates through a shared memory buffer. The update process includes:
+         1. Receiving IPC handles to reconstruct shared memory tensors
+         2. Extracting flattened metadata describing tensor weights in the shared memory tensor
+         3. Loading weights into the model
+         4. Post-processing weights after loading
+
+         Args:
+             zmq_handles: A dictionary mapping device UUIDs to ZMQ socket handles.
+                 The device UUID is platform-specific:
+                 - For CUDA: UUID from `current_platform.get_device_uuid()`
+                 - For NPU: Format "NPU-{generated_uuid}"
+
+         Raises:
+             ValueError: If the device type is not supported (not CUDA or NPU).
+             AssertionError: If the device is not properly initialized.
+
+         Note:
+             This method is called by vLLM's collective RPC mechanism. The ZMQ context
+             is lazily initialized on first call and reused for subsequent updates.
+         """
          from vllm.model_executor.model_loader.utils import process_weights_after_loading
          from vllm.platforms import current_platform
 
+         # vllm-ascend not init device
+         if current_platform.device_type == "npu" and self.device is None:
+             self.device = torch.device(f"npu:{self.local_rank}")
          assert self.device is not None
          if not hasattr(self, "_zmq_ctx") or self._zmq_ctx is None:
              self._zmq_ctx = zmq.Context()
-         device_uuid = current_platform.get_device_uuid(self.device.index)
+         if current_platform.device_type == "cuda":
+             device_uuid = current_platform.get_device_uuid(self.device.index)
+         elif current_platform.device_type == "npu":
+             device_uuid = f"NPU-{npu_generate_uuid()}"
+         else:
+             raise ValueError(f"Unsupported device type: {current_platform.device_type}")
          update_weights_from_ipc(
              self._zmq_ctx,
              zmq_handles[device_uuid],
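The docstrings above describe how the extension is wired into vLLM. A hedged sketch of that wiring; the exact `LLM` arguments depend on the vLLM version, and the model name and `zmq_handles` mapping are hypothetical placeholders (in practice the parameter server publishes the handles):

```python
from vllm import LLM

llm = LLM(
    model="Qwen/Qwen3-8B",  # hypothetical model
    worker_extension_cls="checkpoint_engine.worker.VllmColocateWorkerExtension",
)

# zmq_handles maps each worker's device UUID to the ZMQ socket path published by
# the parameter server for that device (NPU UUIDs look like "NPU-<ip>-<chip_id>").
zmq_handles = {"GPU-11111111-2222-3333-4444-555555555555": "ipc:///tmp/ce_update.sock"}
llm.collective_rpc("update_weights_from_ipc", args=(zmq_handles,))
```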
checkpoint_engine-0.2.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: checkpoint-engine
- Version: 0.2.0
+ Version: 0.2.2
  Summary: checkpoint-engine is a lightweight, decoupling and efficient weight update middleware
  Project-URL: Homepage, https://github.com/MoonshotAI/checkpoint-engine
  Project-URL: Repository, https://github.com/MoonshotAI/checkpoint-engine
@@ -99,17 +99,15 @@ Use the flexible P2P implementation, notice this will install `mooncake-transfer
  pip install 'checkpoint-engine[p2p]'
  ```
 
- If set `NCCL_IB_HCA` env, checkpoint-engine will use it to auto select net devices for different ranks. Available patterns can be found from [NCCL documentation](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#id8). If not set, it will read all RDMA devices and try to divide them into each rank.
-
  ## Getting Started
 
- Prepare an H800 or H20 machine with 8 GPUs with latest vLLM. Be sure to include [/collective_rpc API endpoint](https://github.com/vllm-project/vllm/commit/f7cf5b512ee41f36613deb2471a44de5f304f70d) commit (available in main branch) since checkpoint-engine will use this endpoint to update weights.
+ Prepare an H800 or H20 machine with 8 GPUs and vLLM. Be sure to include the [/collective_rpc API endpoint](https://github.com/vllm-project/vllm/commit/f7cf5b512ee41f36613deb2471a44de5f304f70d) commit (available in the main branch), since checkpoint-engine uses this endpoint to update weights. vLLM version `v0.10.2` is fully tested and recommended.
 
  ```Bash
- cd /opt && git clone https://github.com/vllm-project/vllm && cd vllm
+ mkdir -p /opt/vLLM && cd /opt/vLLM
  uv venv --python 3.12 --seed
  source .venv/bin/activate
- VLLM_USE_PRECOMPILED=1 uv pip install --editable .
+ uv pip install vllm==0.10.2
  ```
 
  Install checkpoint-engine
@@ -169,13 +167,68 @@ A [PR](https://github.com/vllm-project/vllm/pull/24488) is opened to the vLLM pr
  Run a simple correctness test for checkpoint_engine
 
  ```bash
- torchrun --nproc-per-node 8 tests/test_update.py
+ pytest tests/test_update.py
  ```
 
- Other unit tests can be done with pytest.
+ `test_update.py` is designed to run only with `pytest`. Please don't run it directly with `torchrun`.
+
+ Other unit tests can also be run with pytest. Only `test_update.py` requires GPUs; the other tests can run on CPUs. To run only the CPU tests, use:
+
+ ```bash
+ pytest tests/ -m "not gpu"
+ ```
+
+ ### Environment Variables
+ - `PS_MAX_BUCKET_SIZE_GB`: An integer that sets the maximum bucket size for checkpoint-engine. Defaults to 8 GB if not set.
+ - `PS_P2P_STORE_RDMA_DEVICES`: Comma-separated RDMA device names for P2P transfer. If not set, checkpoint-engine falls back to `NCCL_IB_HCA` to detect RDMA devices.
+ - `NCCL_IB_HCA`: Available patterns are documented in the [NCCL documentation](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#id8). If this is also not set, all RDMA devices are used and divided evenly among the ranks.
+
+ ## SGLang Integration
+
+ Checkpoint Engine provides efficient distributed checkpoint loading for SGLang inference servers, significantly reducing model loading time for large models and multi-node setups.
+
+ ### Quick Start
+
+ **1. Install checkpoint-engine:**
+ ```bash
+ pip install 'checkpoint-engine[p2p]'
+ ```
+
+ **2. Launch SGLang server:**
+ ```bash
+ python -m sglang.launch_server \
+     --model-path $MODEL_PATH \
+     --tp 8 \
+     --load-format dummy \
+     --wait-for-initial-weights
+ ```
+
+ **3. Run checkpoint engine:**
+ ```bash
+ python -m sglang.srt.checkpoint_engine.update \
+     --update-method broadcast \
+     --checkpoint-path $MODEL_PATH \
+     --inference-parallel-size 8
+ ```
+
+ ### Multi-Node Setup
+
+ For a 2-node setup, run the same commands on both nodes with the appropriate `--host` and distributed training parameters.
+
+ ### Key Options
+
+ **SGLang Server:**
+ - `--wait-for-initial-weights`: Wait for the checkpoint engine before becoming ready
+ - `--load-format dummy`: Enable overlapping initialization tasks
+
+ **Checkpoint Engine:**
+ - `--update-method`: Choose `broadcast`, `p2p`, or `all`
+ - `--inference-parallel-size`: Number of parallel processes
+ - `--checkpoint-path`: Model checkpoint directory
+
  ## Limitations and Future Work
 
- - This project is currently only tested with vLLM. But it is easy to integrate with other frameworks like SGLang.
+ - This project is currently tested with vLLM and SGLang. Integration with other frameworks is planned for future releases.
  - The perfect three-stage pipeline mentioned in our paper is currently not implemented. This could be useful for architectures where H2D and broadcast do not conflict in PCIE.
 
  ## Acknowledgments
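The Environment Variables section in the README above is the whole configuration surface for bucket sizing and RDMA device selection. A hedged sketch of setting them from the launching Python process before the parameter server is constructed (values are illustrative):

```python
import os

# Cap pinned-memory buckets at 8 GiB (the documented default when unset).
os.environ.setdefault("PS_MAX_BUCKET_SIZE_GB", "8")

# Pin the P2P store to explicit RDMA devices; the device names are hypothetical.
os.environ.setdefault("PS_P2P_STORE_RDMA_DEVICES", "mlx5_0,mlx5_1,mlx5_2,mlx5_3")

# If PS_P2P_STORE_RDMA_DEVICES is unset, NCCL_IB_HCA is consulted next; if that is
# also unset, all detected RDMA devices are divided evenly among the local ranks.
```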
checkpoint_engine-0.2.2.dist-info/RECORD ADDED
@@ -0,0 +1,10 @@
+ checkpoint_engine/__init__.py,sha256=Zj4I008kn9R6fYr0lVBzcQSnvckLpX2s1ljCOOqV1c8,87
+ checkpoint_engine/_version.py,sha256=o3ZTescp-19Z9cvBGq9dQnbppljgzdUYUf98Nov0spY,704
+ checkpoint_engine/device_utils.py,sha256=iKrof60j3CY3fStRTq3DRTt_kE1vYoEWHhAeyh0lByA,3020
+ checkpoint_engine/ps.py,sha256=cu8Qp5daY1iL30iN69jXP4grlHoAKILblngcKQPA5Bg,67692
+ checkpoint_engine/worker.py,sha256=f6kS1ushIXxkRCEHXM5wVofUer9OxRiVY03vmKYLzgo,6757
+ checkpoint_engine-0.2.2.dist-info/licenses/LICENCE,sha256=D3gPmHKpGtF1yxYNhqjtBtZY_brZjDotJTzpnmClzlY,1067
+ checkpoint_engine-0.2.2.dist-info/METADATA,sha256=_bBxy27d0GMc7KzuIBAdw-Lno3-UrVLUhH63YDbY1YA,11559
+ checkpoint_engine-0.2.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ checkpoint_engine-0.2.2.dist-info/top_level.txt,sha256=66sik_1eLakLYmcllOEJzFaNbSfjsueuP0tHYEzhMSs,18
+ checkpoint_engine-0.2.2.dist-info/RECORD,,
checkpoint_engine-0.2.0.dist-info/RECORD REMOVED
@@ -1,9 +0,0 @@
- checkpoint_engine/__init__.py,sha256=Zj4I008kn9R6fYr0lVBzcQSnvckLpX2s1ljCOOqV1c8,87
- checkpoint_engine/_version.py,sha256=Dg8AmJomLVpjKL6prJylOONZAPRtB86LOce7dorQS_A,704
- checkpoint_engine/ps.py,sha256=OpGocqJv0TfGgVC1cPKARfz6qehfCLMzQ5KpDQNxb0o,55291
- checkpoint_engine/worker.py,sha256=ZmJTHeNPbnE8sPInfrghj9jeRDkMUSQO906o1UoJv-E,3748
- checkpoint_engine-0.2.0.dist-info/licenses/LICENCE,sha256=D3gPmHKpGtF1yxYNhqjtBtZY_brZjDotJTzpnmClzlY,1067
- checkpoint_engine-0.2.0.dist-info/METADATA,sha256=tbAq45YlRvRAfQHDB0XV8w4ZP0zmVJ3RMTAx_wTm154,9896
- checkpoint_engine-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- checkpoint_engine-0.2.0.dist-info/top_level.txt,sha256=66sik_1eLakLYmcllOEJzFaNbSfjsueuP0tHYEzhMSs,18
- checkpoint_engine-0.2.0.dist-info/RECORD,,