PyPI - checkpoint-engine - Versions diffs - 0.3.1rc0__py3-none-any.whl → 0.3.2__py3-none-any.whl - Mend

checkpoint-engine 0.3.1rc0__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

checkpoint_engine/_version.py CHANGED Viewed

@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
-__version__ = version = '0.3.1rc0'
-__version_tuple__ = version_tuple = (0, 3, 1, 'rc0')
+__version__ = version = '0.3.2'
+__version_tuple__ = version_tuple = (0, 3, 2)
 __commit_id__ = commit_id = None

checkpoint_engine/pin_memory.py CHANGED Viewed

@@ -191,6 +191,8 @@ def _load_checkpoint(files: list[str]) -> dict[str, torch.Tensor]:
 def _inplace_pin_memory(files: list[str], rank: int | None = None) -> list[MemoryBuffer]:
+    device_index = torch.cuda.current_device()
     def _parse_and_pin_from_safetensors(file_path: str) -> MemoryBuffer:
         """
         safetensors format see https://huggingface.co/docs/safetensors/en/index#format.
@@ -204,6 +206,7 @@ def _inplace_pin_memory(files: list[str], rank: int | None = None) -> list[Memor
             Pin the memory of tensor in-place.
             See: https://github.com/pytorch/pytorch/issues/32167
             """
+            torch.cuda.set_device(device_index)
             cudart = torch.cuda.cudart()
             r = cudart.cudaHostRegister(t.data_ptr(), t.numel() * t.element_size(), 0)
             assert r == 0, f"pin memory error, error code: {r}"

checkpoint_engine/ps.py CHANGED Viewed

@@ -731,6 +731,7 @@ class ParameterServer:
         assert len(self._current_global_parameter_metas) != 0, "parameter metas is empty"
         assert dist.is_initialized(), "process group is not initialized"
+        p2p_update = False
         # if both ranks is None or [], it will use fully broadcast to update to all ranks
         if not ranks:
             logger.info(f"[rank{self._rank}] update checkpoint {checkpoint_name}")
@@ -739,6 +740,7 @@ class ParameterServer:
             assert self._p2p_store is not None, "p2p store is not initialized"
             assert ranks, "ranks should be set"
+            p2p_update = True
             need_update = self._rank in ranks
             logger.info(
                 f"[rank{self._rank}] update checkpoint {checkpoint_name} p2p, {need_update=} with {ranks=}, "
@@ -764,11 +766,6 @@ class ParameterServer:
             if disable_h2d_buffer
             else torch.empty(bucket_size, dtype=torch.uint8, device=self.device_manager.device_type)
         )
-        # p2p store need to register h2d_buffer to let other ranks read
-        if ranks:
-            h2d_buffer_name = "__h2d_buffer__"
-            if h2d_buffer is not None and self._p2p_store is not None:
-                self._p2p_store.register_named_tensors({h2d_buffer_name: h2d_buffer})
         receiver_rank_buckets: list[tuple[int, H2DBucket]] = []
         for receiver_rank, owner_rank, bucket in buckets:
             if receiver_rank != self._rank:
@@ -778,6 +775,12 @@ class ParameterServer:
         buffer = torch.empty(
             bucket_size * 2, dtype=torch.uint8, device=self.device_manager.device_type
         )
+        if p2p_update:
+            # p2p store need to register buffer to let other ranks read
+            p2p_ipc_buffer_name = "__ipc_buffer__"
+            self._p2p_store.register_named_tensors(
+                {p2p_ipc_buffer_name: buffer if disable_h2d_buffer else h2d_buffer}
+            )
         handle = reduce_tensor(buffer)
         buckets_by_receiver_rank: dict[int, list[H2DBucket]] = defaultdict(list)
@@ -823,7 +826,14 @@ class ParameterServer:
                     buffer_b: torch.Tensor = buffer[start : start + bucket.size]
                     if receiver_rank == self._rank:
                         if disable_h2d_buffer:
-                            self._copy_to_buffer(checkpoint_name, bucket, buffer_b)
+                            if p2p_update:
+                                assert bucket == receiver_rank_buckets[i][1]
+                            self._copy_to_buffer(
+                                checkpoint_name,
+                                bucket,
+                                buffer_b,
+                                receiver_rank_buckets[i][0] if p2p_update else None,
+                            )
                         else:
                             buffer_b.data.copy_(h2d_buffer[: bucket.size])
                     dist.broadcast(buffer_b, src=receiver_rank, group=ranks_group)
@@ -850,8 +860,8 @@ class ParameterServer:
             req_thread.join()
             dist.barrier(group=ranks_group)
             socket.close()
-            if ranks and h2d_buffer is not None:
-                self._p2p_store.unregister_named_tensors([h2d_buffer_name])
+            if p2p_update:
+                self._p2p_store.unregister_named_tensors([p2p_ipc_buffer_name])
             self.device_manager.device_module.empty_cache()

{checkpoint_engine-0.3.1rc0.dist-info → checkpoint_engine-0.3.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: checkpoint-engine
-Version: 0.3.1rc0
+Version: 0.3.2
 Summary: checkpoint-engine is a lightweight, decoupling and efficient weight update middleware
 Project-URL: Homepage, https://github.com/MoonshotAI/checkpoint-engine
 Project-URL: Repository, https://github.com/MoonshotAI/checkpoint-engine

checkpoint_engine-0.3.2.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,15 @@
+checkpoint_engine/__init__.py,sha256=OeWxe9mxl2sZ6cW-blSTg6JbFlOMpGbBghLZtxGOqXk,942
+checkpoint_engine/__main__.py,sha256=yzQlApuYo6eIOqtqM018RosyxNzXzB5a-stxUvsh-dg,709
+checkpoint_engine/_version.py,sha256=e8NqPtZ8fggRgk3GPrqZ_U_BDV8aSULw1u_Gn9NNbnk,704
+checkpoint_engine/api.py,sha256=JDiQ4i3Gb6GoaBhlp8lNuUPaVURoFFdeGJY9ZDDGvPc,3518
+checkpoint_engine/data_types.py,sha256=O9uAXjwB20iwrOHfEEQd8Y9CmaFspNJ9ks9noHqwQKk,2716
+checkpoint_engine/device_utils.py,sha256=iKrof60j3CY3fStRTq3DRTt_kE1vYoEWHhAeyh0lByA,3020
+checkpoint_engine/p2p_store.py,sha256=abiCDVmRISPt9QFfavHB9Jo7ZpBbSjUS1NevGuB-AVA,8721
+checkpoint_engine/pin_memory.py,sha256=9XgE3Tn4XrEjXvA-XG70OgErDmlBU-cUVDP8ysB_9us,16237
+checkpoint_engine/ps.py,sha256=IJiA2zvZucFzFvnaLCYJMK7FHl2M2Z-g1tlDeoeZ-Rs,40689
+checkpoint_engine/worker.py,sha256=ghj9d2u8hY_U2uiOZWIN2CqRNZH6PrzujT22fHUFBWI,6879
+checkpoint_engine-0.3.2.dist-info/licenses/LICENCE,sha256=D3gPmHKpGtF1yxYNhqjtBtZY_brZjDotJTzpnmClzlY,1067
+checkpoint_engine-0.3.2.dist-info/METADATA,sha256=a2BEqlP0yca80Djg9WZD3IWj0DLPv9hfk6j1pgnZiR0,11559
+checkpoint_engine-0.3.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+checkpoint_engine-0.3.2.dist-info/top_level.txt,sha256=66sik_1eLakLYmcllOEJzFaNbSfjsueuP0tHYEzhMSs,18
+checkpoint_engine-0.3.2.dist-info/RECORD,,

checkpoint_engine-0.3.1rc0.dist-info/RECORD DELETED Viewed

@@ -1,15 +0,0 @@
-checkpoint_engine/__init__.py,sha256=OeWxe9mxl2sZ6cW-blSTg6JbFlOMpGbBghLZtxGOqXk,942
-checkpoint_engine/__main__.py,sha256=yzQlApuYo6eIOqtqM018RosyxNzXzB5a-stxUvsh-dg,709
-checkpoint_engine/_version.py,sha256=dHrWkv1sAsMQC6tqqCYOmWtbcT9G_s2WFJDo0_AYy_0,714
-checkpoint_engine/api.py,sha256=JDiQ4i3Gb6GoaBhlp8lNuUPaVURoFFdeGJY9ZDDGvPc,3518
-checkpoint_engine/data_types.py,sha256=O9uAXjwB20iwrOHfEEQd8Y9CmaFspNJ9ks9noHqwQKk,2716
-checkpoint_engine/device_utils.py,sha256=iKrof60j3CY3fStRTq3DRTt_kE1vYoEWHhAeyh0lByA,3020
-checkpoint_engine/p2p_store.py,sha256=abiCDVmRISPt9QFfavHB9Jo7ZpBbSjUS1NevGuB-AVA,8721
-checkpoint_engine/pin_memory.py,sha256=gpoe_z5XxbWkCvFLaXXpyUUFetBXUjsOrxBSX-ksZTw,16141
-checkpoint_engine/ps.py,sha256=0d68Sqb_y3H6b5H37exMbghDJ294VKaGqoWkcKE-Ao8,40316
-checkpoint_engine/worker.py,sha256=ghj9d2u8hY_U2uiOZWIN2CqRNZH6PrzujT22fHUFBWI,6879
-checkpoint_engine-0.3.1rc0.dist-info/licenses/LICENCE,sha256=D3gPmHKpGtF1yxYNhqjtBtZY_brZjDotJTzpnmClzlY,1067
-checkpoint_engine-0.3.1rc0.dist-info/METADATA,sha256=VCgsnIGn1CcO9-ILevego92QDldqyGn-frzs4weGIwQ,11562
-checkpoint_engine-0.3.1rc0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-checkpoint_engine-0.3.1rc0.dist-info/top_level.txt,sha256=66sik_1eLakLYmcllOEJzFaNbSfjsueuP0tHYEzhMSs,18
-checkpoint_engine-0.3.1rc0.dist-info/RECORD,,

{checkpoint_engine-0.3.1rc0.dist-info → checkpoint_engine-0.3.2.dist-info}/WHEEL RENAMED Viewed

File without changes

{checkpoint_engine-0.3.1rc0.dist-info → checkpoint_engine-0.3.2.dist-info}/licenses/LICENCE RENAMED Viewed

File without changes

{checkpoint_engine-0.3.1rc0.dist-info → checkpoint_engine-0.3.2.dist-info}/top_level.txt RENAMED Viewed

File without changes

checkpoint-engine 0.3.1rc0__py3-none-any.whl → 0.3.2__py3-none-any.whl

checkpoint-engine 0.3.1rc0__py3-none-any.whl → 0.3.2__py3-none-any.whl