checkpoint-engine 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checkpoint_engine/_version.py +2 -2
- checkpoint_engine/pin_memory.py +9 -1
- checkpoint_engine/ps.py +12 -2
- checkpoint_engine/worker.py +28 -8
- {checkpoint_engine-0.3.2.dist-info → checkpoint_engine-0.3.4.dist-info}/METADATA +1 -1
- checkpoint_engine-0.3.4.dist-info/RECORD +15 -0
- {checkpoint_engine-0.3.2.dist-info → checkpoint_engine-0.3.4.dist-info}/WHEEL +1 -1
- checkpoint_engine-0.3.2.dist-info/RECORD +0 -15
- {checkpoint_engine-0.3.2.dist-info → checkpoint_engine-0.3.4.dist-info}/licenses/LICENCE +0 -0
- {checkpoint_engine-0.3.2.dist-info → checkpoint_engine-0.3.4.dist-info}/top_level.txt +0 -0
checkpoint_engine/_version.py
CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID

-__version__ = version = '0.3.2'
-__version_tuple__ = version_tuple = (0, 3, 2)
+__version__ = version = '0.3.4'
+__version_tuple__ = version_tuple = (0, 3, 4)

 __commit_id__ = commit_id = None
checkpoint_engine/pin_memory.py
CHANGED
@@ -209,7 +209,9 @@ def _inplace_pin_memory(files: list[str], rank: int | None = None) -> list[MemoryBuffer]:
     torch.cuda.set_device(device_index)
     cudart = torch.cuda.cudart()
     r = cudart.cudaHostRegister(t.data_ptr(), t.numel() * t.element_size(), 0)
-
+    if r != 0:
+        error_msg = cudart.cudaGetErrorString(r)
+        raise RuntimeError(f"pin memory error, error code: {r}, error message: {error_msg}")

     # TODO: should only support /dev/shm? but we found files in disk also work?
     size = os.stat(file_path).st_size
@@ -254,6 +256,12 @@ def _inplace_pin_memory(files: list[str], rank: int | None = None) -> list[MemoryBuffer]:
     # Remove the file after successfully loading. This will avoid doubling the memory usage.
     # We assume files in /dev/shm/ are temporary files. So it's safe to remove them after loading.
     os.remove(file_path)
+    if not metas:
+        # TODO: should we still return this buffer?
+        assert buffer.nbytes == 0, f"buffer nbytes {buffer.nbytes} should be 0"
+        logger.warning(f"[rank{rank}] no metas found in {file_path}, skip pin memory")
+        return MemoryBuffer(buffer=buffer, size=buffer.nbytes, metas=[], manually_pinned=False)
+
     _pin(buffer)
     logger.info(
         f"[rank{rank}] inplace pin memory for file {file_path} finished, size {buffer.nbytes / 1024 / 1024:.2f}MiB"
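The first hunk adds an explicit check of the `cudaHostRegister` return code instead of letting a failed registration pass silently, and the second hunk short-circuits when a checkpoint file carries no tensor metadata, returning an empty, unpinned `MemoryBuffer`. A minimal sketch of the same return-code pattern, using a hypothetical `register_pinned` helper (not part of checkpoint-engine's API):

```python
# Sketch only: mirrors the error handling added above.
# Assumes a CUDA build of PyTorch and a CPU tensor backed by pageable memory.
import torch

def register_pinned(t: torch.Tensor) -> None:
    """Page-lock (pin) the host memory backing `t` via cudaHostRegister."""
    cudart = torch.cuda.cudart()
    r = cudart.cudaHostRegister(t.data_ptr(), t.numel() * t.element_size(), 0)
    if r != 0:
        # Raise instead of continuing with memory that is silently left unpinned.
        error_msg = cudart.cudaGetErrorString(r)
        raise RuntimeError(f"pin memory error, error code: {r}, error message: {error_msg}")
```

The ps.py change below applies the same pattern to the matching `cudaHostUnregister` call.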
checkpoint_engine/ps.py
CHANGED
@@ -391,7 +391,11 @@ class ParameterServer:
         )
         cudart = torch.cuda.cudart()
         r = cudart.cudaHostUnregister(t.data_ptr())
-
+        if r != 0:
+            error_msg = cudart.cudaGetErrorString(r)
+            raise RuntimeError(
+                f"unpin memory error, error code: {r}, error message: {error_msg}"
+            )

         # if the checkpoint is pinned by cudaHostRegister manually, we need to unpin it manually
         try:
@@ -407,7 +411,13 @@
         del self._memory_pool[checkpoint_name]
         # see https://github.com/pytorch/pytorch/blob/31d5c675394705f8a6bc767f80ae14bf4f01246b/torch/csrc/cuda/Module.cpp#L2018
         # this works by using torch>=2.5.0
-        torch._C._host_emptyCache()
+        if self.device_manager.device_type == "cuda":
+            torch._C._host_emptyCache()
+        else:
+            # torch._C._host_emptyCache() is not supported on NPU, so we call gc.collect() to empty host cache.
+            import gc
+
+            gc.collect()

     def gather_metas(self, checkpoint_name: str):
         """
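Both hunks harden the checkpoint-unregister path: the first raises on a non-zero `cudaHostUnregister` status, and the second calls the CUDA-specific `torch._C._host_emptyCache()` only when the device type is `"cuda"`, falling back to a plain `gc.collect()` on other backends such as NPU. A minimal sketch of that branch as a standalone function (the `empty_host_cache` name is illustrative; the real code lives inside `ParameterServer` and reads `self.device_manager.device_type`):

```python
# Sketch only: device-aware host-cache cleanup after a checkpoint is dropped.
import gc
import torch

def empty_host_cache(device_type: str) -> None:
    """Free cached pinned-host allocations once a checkpoint's buffers are released."""
    if device_type == "cuda":
        # Private PyTorch hook (torch>=2.5); see the Module.cpp link in the diff above.
        torch._C._host_emptyCache()
    else:
        # Not available on NPU builds, so fall back to a garbage-collection pass.
        gc.collect()
```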
checkpoint_engine/worker.py
CHANGED
@@ -10,6 +10,9 @@ import zmq
 from checkpoint_engine.device_utils import DeviceManager, npu_generate_uuid


+_WEIGHTS_TYPE = list[tuple[str, torch.Tensor]]
+
+
 def _rebuild_ipc(handle: tuple[Callable, tuple], device_id: int | None = None) -> torch.Tensor:
     func, args = handle
     list_args = list(args)
@@ -29,11 +32,9 @@ class FlattenedTensorMetadata(TypedDict):
     offset: int


-def _extract_weights(
-    payload: list[FlattenedTensorMetadata], buffer: torch.Tensor
-) -> list[tuple[str, torch.Tensor]]:
+def _extract_weights(payload: list[FlattenedTensorMetadata], buffer: torch.Tensor) -> _WEIGHTS_TYPE:
     assert buffer is not None
-    weights: list[tuple[str, torch.Tensor]] = []
+    weights: _WEIGHTS_TYPE = []
     for item in payload:
         shape = item["shape"]
         if isinstance(shape, list | tuple):
@@ -166,12 +167,31 @@ class VllmColocateWorkerExtension:
             self.device = torch.device(f"npu:{self.local_rank}")
         assert self.device is not None

+        def _load_weights(weights: _WEIGHTS_TYPE):
+            # Load main model weights
+            self.model_runner.model.load_weights(weights)
+            # Load drafter model weights if MTP/speculative decoding is enabled
+            if (
+                getattr(self.model_runner, "drafter", None) is not None
+                and getattr(self.model_runner.drafter, "model", None) is not None
+            ):
+                self.model_runner.drafter.model.load_weights(weights=weights)
+
+        def _post_hook():
+            process_weights_after_loading(self.model_runner.model, self.model_config, self.device)
+            # Also trigger drafter model's post processing if MTP is enabled
+            if (
+                getattr(self.model_runner, "drafter", None) is not None
+                and getattr(self.model_runner.drafter, "model", None) is not None
+            ):
+                process_weights_after_loading(
+                    self.model_runner.drafter.model, self.model_config, self.device
+                )
+
         update_weights_from_ipc(
             self._zmq_ctx,
             zmq_handles[self._device_uuid],
             device_id=self.device.index,
-            run=lambda weights: self.model_runner.model.load_weights(weights),
-            post_hook=lambda: process_weights_after_loading(
-                self.model_runner.model, self.model_config, self.device
-            ),
+            run=_load_weights,
+            post_hook=_post_hook,
         )
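The worker change promotes the former inline callbacks into named `_load_weights` / `_post_hook` closures so that a speculative-decoding drafter model (MTP), when present on the model runner, receives the same weight load and post-processing as the main model. A self-contained sketch of that dispatch shape; the `_Model`, `_Drafter`, and `_ModelRunner` stubs are illustrative stand-ins, not vLLM's actual classes:

```python
from typing import Any, Callable

Weights = list[tuple[str, Any]]  # stands in for list[tuple[str, torch.Tensor]]

class _Model:
    def load_weights(self, weights: Weights) -> None:
        print(f"loaded {len(weights)} tensors")

class _Drafter:
    def __init__(self) -> None:
        self.model = _Model()

class _ModelRunner:
    def __init__(self, with_drafter: bool) -> None:
        self.model = _Model()
        # drafter is only present when MTP / speculative decoding is enabled
        self.drafter = _Drafter() if with_drafter else None

def make_run_callback(model_runner: _ModelRunner) -> Callable[[Weights], None]:
    """Build a `run=` callback that updates the main model and, if set, the drafter."""
    def _load_weights(weights: Weights) -> None:
        model_runner.model.load_weights(weights)
        drafter = getattr(model_runner, "drafter", None)
        if drafter is not None and getattr(drafter, "model", None) is not None:
            drafter.model.load_weights(weights)
    return _load_weights

run = make_run_callback(_ModelRunner(with_drafter=True))
run([("layers.0.weight", object())])  # loads into both the main model and the drafter
```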
{checkpoint_engine-0.3.2.dist-info → checkpoint_engine-0.3.4.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: checkpoint-engine
-Version: 0.3.2
+Version: 0.3.4
 Summary: checkpoint-engine is a lightweight, decoupling and efficient weight update middleware
 Project-URL: Homepage, https://github.com/MoonshotAI/checkpoint-engine
 Project-URL: Repository, https://github.com/MoonshotAI/checkpoint-engine
checkpoint_engine-0.3.4.dist-info/RECORD
ADDED
@@ -0,0 +1,15 @@
+checkpoint_engine/__init__.py,sha256=OeWxe9mxl2sZ6cW-blSTg6JbFlOMpGbBghLZtxGOqXk,942
+checkpoint_engine/__main__.py,sha256=yzQlApuYo6eIOqtqM018RosyxNzXzB5a-stxUvsh-dg,709
+checkpoint_engine/_version.py,sha256=3nDaC5e0d_scBB1bUEKPlItbvbY0PmXNNyyOTNFNWNI,704
+checkpoint_engine/api.py,sha256=JDiQ4i3Gb6GoaBhlp8lNuUPaVURoFFdeGJY9ZDDGvPc,3518
+checkpoint_engine/data_types.py,sha256=O9uAXjwB20iwrOHfEEQd8Y9CmaFspNJ9ks9noHqwQKk,2716
+checkpoint_engine/device_utils.py,sha256=iKrof60j3CY3fStRTq3DRTt_kE1vYoEWHhAeyh0lByA,3020
+checkpoint_engine/p2p_store.py,sha256=abiCDVmRISPt9QFfavHB9Jo7ZpBbSjUS1NevGuB-AVA,8721
+checkpoint_engine/pin_memory.py,sha256=b7nABKJV2bSIsOfX2YTHzUk1OkOze6AQjCaOIFaQnbA,16708
+checkpoint_engine/ps.py,sha256=wBsHu2qWy5oRBrvLc7aEOroG_j58UJoWT6lFH4ylMRk,41092
+checkpoint_engine/worker.py,sha256=CDWbxwvMpid19yriuwAsyZLUZtqfkh9Lybn8KpiuKCw,7781
+checkpoint_engine-0.3.4.dist-info/licenses/LICENCE,sha256=D3gPmHKpGtF1yxYNhqjtBtZY_brZjDotJTzpnmClzlY,1067
+checkpoint_engine-0.3.4.dist-info/METADATA,sha256=P23Txz8z5WvM3km3EHFtKBEc5299c5UZcd0UTABN-u8,11559
+checkpoint_engine-0.3.4.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+checkpoint_engine-0.3.4.dist-info/top_level.txt,sha256=66sik_1eLakLYmcllOEJzFaNbSfjsueuP0tHYEzhMSs,18
+checkpoint_engine-0.3.4.dist-info/RECORD,,
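Each RECORD row follows the standard wheel layout `path,sha256=<digest>,<size in bytes>`, where the digest is an unpadded URL-safe base64 SHA-256 of the file (generic wheel convention per PEP 376/427, not specific to checkpoint-engine). A small sketch for recomputing that token when verifying an entry:

```python
import base64
import hashlib
from pathlib import Path

def record_hash(path: str) -> str:
    """Return the 'sha256=...' token a wheel RECORD stores for this file."""
    digest = hashlib.sha256(Path(path).read_bytes()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# e.g. record_hash("checkpoint_engine/_version.py") should match the row above
```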
checkpoint_engine-0.3.2.dist-info/RECORD
REMOVED
@@ -1,15 +0,0 @@
-checkpoint_engine/__init__.py,sha256=OeWxe9mxl2sZ6cW-blSTg6JbFlOMpGbBghLZtxGOqXk,942
-checkpoint_engine/__main__.py,sha256=yzQlApuYo6eIOqtqM018RosyxNzXzB5a-stxUvsh-dg,709
-checkpoint_engine/_version.py,sha256=e8NqPtZ8fggRgk3GPrqZ_U_BDV8aSULw1u_Gn9NNbnk,704
-checkpoint_engine/api.py,sha256=JDiQ4i3Gb6GoaBhlp8lNuUPaVURoFFdeGJY9ZDDGvPc,3518
-checkpoint_engine/data_types.py,sha256=O9uAXjwB20iwrOHfEEQd8Y9CmaFspNJ9ks9noHqwQKk,2716
-checkpoint_engine/device_utils.py,sha256=iKrof60j3CY3fStRTq3DRTt_kE1vYoEWHhAeyh0lByA,3020
-checkpoint_engine/p2p_store.py,sha256=abiCDVmRISPt9QFfavHB9Jo7ZpBbSjUS1NevGuB-AVA,8721
-checkpoint_engine/pin_memory.py,sha256=9XgE3Tn4XrEjXvA-XG70OgErDmlBU-cUVDP8ysB_9us,16237
-checkpoint_engine/ps.py,sha256=IJiA2zvZucFzFvnaLCYJMK7FHl2M2Z-g1tlDeoeZ-Rs,40689
-checkpoint_engine/worker.py,sha256=ghj9d2u8hY_U2uiOZWIN2CqRNZH6PrzujT22fHUFBWI,6879
-checkpoint_engine-0.3.2.dist-info/licenses/LICENCE,sha256=D3gPmHKpGtF1yxYNhqjtBtZY_brZjDotJTzpnmClzlY,1067
-checkpoint_engine-0.3.2.dist-info/METADATA,sha256=a2BEqlP0yca80Djg9WZD3IWj0DLPv9hfk6j1pgnZiR0,11559
-checkpoint_engine-0.3.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-checkpoint_engine-0.3.2.dist-info/top_level.txt,sha256=66sik_1eLakLYmcllOEJzFaNbSfjsueuP0tHYEzhMSs,18
-checkpoint_engine-0.3.2.dist-info/RECORD,,
{checkpoint_engine-0.3.2.dist-info → checkpoint_engine-0.3.4.dist-info}/licenses/LICENCE
File without changes

{checkpoint_engine-0.3.2.dist-info → checkpoint_engine-0.3.4.dist-info}/top_level.txt
File without changes