PyPI - checkpoint-engine - Versions diffs - 0.1.2__tar.gz → 0.1.3__tar.gz - Mend

checkpoint-engine 0.1.2 (tar.gz) → 0.1.3 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

{checkpoint_engine-0.1.2 → checkpoint_engine-0.1.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: checkpoint-engine
-Version: 0.1.2
+Version: 0.1.3
 Summary: checkpoint-engine is a lightweight, decoupling and efficient weight update middleware
 Project-URL: Homepage, https://github.com/MoonshotAI/checkpoint-engine
 Project-URL: Repository, https://github.com/MoonshotAI/checkpoint-engine

{checkpoint_engine-0.1.2 → checkpoint_engine-0.1.3}/checkpoint_engine/_version.py RENAMED Viewed

@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
-__version__ = version = '0.1.2'
-__version_tuple__ = version_tuple = (0, 1, 2)
+__version__ = version = '0.1.3'
+__version_tuple__ = version_tuple = (0, 1, 3)
-__commit_id__ = commit_id = 'g716c0dad9'
+__commit_id__ = commit_id = 'g8a60e65ba'

{checkpoint_engine-0.1.2 → checkpoint_engine-0.1.3}/checkpoint_engine/ps.py RENAMED Viewed

@@ -1,5 +1,3 @@
-from __future__ import annotations
 import argparse
 import concurrent.futures
 import ctypes
@@ -10,6 +8,7 @@ import socket
 import threading
 import time
 from collections import defaultdict
+from collections.abc import Callable
 from datetime import timedelta
 from functools import lru_cache
 from typing import TYPE_CHECKING, Annotated, Any, BinaryIO, NamedTuple
@@ -26,8 +25,6 @@ from torch.multiprocessing.reductions import reduce_tensor
 if TYPE_CHECKING:
-    from collections.abc import Callable
     from typing_extensions import TypedDict
     class FileMeta(TypedDict):
@@ -151,8 +148,8 @@ def _to_named_tensor(metas: list[ParameterMeta], offset: int = 0) -> list[dict]:
     return ret
-def _load_checkpoint_file(file_path: str) -> tuple[int, dict[str, tuple[FileMeta, torch.Tensor]]]:
-    def _safetensors_load(fn: str) -> dict[str, tuple[FileMeta, torch.Tensor]]:
+def _load_checkpoint_file(file_path: str) -> tuple[int, dict[str, tuple["FileMeta", torch.Tensor]]]:
+    def _safetensors_load(fn: str) -> dict[str, tuple["FileMeta", torch.Tensor]]:
         ret = {}
         with safe_open(fn, framework="pt") as f:
             for name in f.keys():  # noqa: SIM118
@@ -168,7 +165,7 @@ def _load_checkpoint_file(file_path: str) -> tuple[int, dict[str, tuple[FileMeta
         return ret
     # deprecated, will be removed in the future
-    def _fast_np_load(fn: str) -> dict[str, tuple[FileMeta, torch.Tensor]]:
+    def _fast_np_load(fn: str) -> dict[str, tuple["FileMeta", torch.Tensor]]:
         """load *.np file and return memmap and related tensor meta"""
         def parse_npy_header(fin: BinaryIO) -> dict[str, Any]:
@@ -595,7 +592,13 @@ class P2PStore:
 class ParameterServer:
     def __init__(
-        self, *, rank: int | None = None, world_size: int | None = None, auto_pg: bool = False
+        self,
+        *,
+        rank: int | None = None,
+        world_size: int | None = None,
+        auto_pg: bool = False,
+        gpu_count: int | None = None,
+        mem_fraction: float | None = None,
     ):
         """
         Initialize the parameter server. env RANK, WORLD_SIZE and MASTER_ADDR must be set.
@@ -603,17 +606,27 @@ class ParameterServer:
         Args:
             auto_pg: Whether to automatically initialize the process group.
                 Notice that if auto_pg is True, will destroy the process group after update.
+            mem_fraction: The proportion (as a fraction) of the current free CUDA memory for allocation.
         """
         self._rank = rank or int(os.environ.get("RANK", None))
         self._world_size = world_size or int(os.environ.get("WORLD_SIZE", None))
-        self._gpu_count = torch.cuda.device_count()
+        self._gpu_count = gpu_count or torch.cuda.device_count()
         self._local_rank = self._rank % self._gpu_count
         self._auto_pg = auto_pg
         self._all_hosts = []
         self._global_device_uuids: list[str] = []
+        self._mem_fraction = mem_fraction or 0.9
         assert self._rank is not None and self._rank >= 0, self._rank
         assert self._world_size and self._world_size > 0, self._world_size
+        assert (
+            self._gpu_count is not None
+            and self._gpu_count > 0
+            and self._gpu_count <= torch.cuda.device_count()
+        ), self._gpu_count
+        assert (
+            self._mem_fraction is not None and self._mem_fraction > 0 and self._mem_fraction <= 1
+        ), self._mem_fraction
         self._zmq_ctx = zmq.Context()
         self._zmq_addr_counter = 0
@@ -795,13 +808,15 @@ class ParameterServer:
                     self.init_process_group()
                 self._update_per_bucket(checkpoint_name, req_func)
             else:
-                if self._rank not in ranks:
+                if not self._auto_pg and self._rank not in ranks:
                     return
                 if self._auto_pg:
                     if dist.is_initialized():
                         dist.destroy_process_group()
                         # HACK: wait 2s to ensure destroy is finished
                         time.sleep(2)
+                    if self._rank not in ranks:
+                        return
                     self.init_process_group_for_ranks(ranks)
                 self._update_per_bucket_p2p(checkpoint_name, req_func, ranks)
             if self._auto_pg:
@@ -835,8 +850,8 @@ class ParameterServer:
         # auto detect bucket size
         tensor = torch.tensor(
             [
-                # 90% of current cuda free memory bytes
-                int(float(torch.cuda.mem_get_info()[0]) * 0.9),
+                # proportion of current cuda free memory bytes
+                int(float(torch.cuda.mem_get_info()[0]) * self._mem_fraction),
                 # we use negative value to reuse allreduce min operation
                 # for getting the max value of zmq_addr_counter in all ranks
                 -self._zmq_addr_counter,

{checkpoint_engine-0.1.2 → checkpoint_engine-0.1.3}/checkpoint_engine.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: checkpoint-engine
-Version: 0.1.2
+Version: 0.1.3
 Summary: checkpoint-engine is a lightweight, decoupling and efficient weight update middleware
 Project-URL: Homepage, https://github.com/MoonshotAI/checkpoint-engine
 Project-URL: Repository, https://github.com/MoonshotAI/checkpoint-engine