checkpoint-engine 0.2.3.tar.gz → 0.3.0rc0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/PKG-INFO +1 -1
  2. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/checkpoint_engine/_version.py +3 -3
  3. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/checkpoint_engine/ps.py +71 -105
  4. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/checkpoint_engine.egg-info/PKG-INFO +1 -1
  5. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/tests/test_update.py +8 -3
  6. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/.github/workflows/cpu-tests.yml +0 -0
  7. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/.github/workflows/pre-commit.yaml +0 -0
  8. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/.github/workflows/python-publish.yml +0 -0
  9. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/.gitignore +0 -0
  10. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/.pre-commit-config.yaml +0 -0
  11. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/LICENCE +0 -0
  12. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/README.md +0 -0
  13. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/checkpoint_engine/__init__.py +0 -0
  14. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/checkpoint_engine/device_utils.py +0 -0
  15. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/checkpoint_engine/worker.py +0 -0
  16. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/checkpoint_engine.egg-info/SOURCES.txt +0 -0
  17. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/checkpoint_engine.egg-info/dependency_links.txt +0 -0
  18. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/checkpoint_engine.egg-info/requires.txt +0 -0
  19. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/checkpoint_engine.egg-info/top_level.txt +0 -0
  20. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/docs/npu_start.md +0 -0
  21. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/examples/update.py +0 -0
  22. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/figures/checkpoint-engine.png +0 -0
  23. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/figures/overlap-update-and-copy.png +0 -0
  24. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/figures/pipeline.png +0 -0
  25. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/patches/vllm_fp8.patch +0 -0
  26. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/pyproject.toml +0 -0
  27. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/setup.cfg +0 -0
  28. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/tests/test_assign_receiver_ranks.py +0 -0
  29. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/tests/test_pin_memory.py +0 -0
  30. {checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/tests/test_rdma_parser.py +0 -0
{checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: checkpoint-engine
-Version: 0.2.3
+Version: 0.3.0rc0
 Summary: checkpoint-engine is a lightweight, decoupling and efficient weight update middleware
 Project-URL: Homepage, https://github.com/MoonshotAI/checkpoint-engine
 Project-URL: Repository, https://github.com/MoonshotAI/checkpoint-engine
{checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/checkpoint_engine/_version.py
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.2.3'
-__version_tuple__ = version_tuple = (0, 2, 3)
+__version__ = version = '0.3.0rc0'
+__version_tuple__ = version_tuple = (0, 3, 0, 'rc0')
 
-__commit_id__ = commit_id = 'g0a6244951'
+__commit_id__ = commit_id = 'gbaf6f6196'
{checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/checkpoint_engine/ps.py
@@ -622,7 +622,6 @@ def _register_checkpoint(
     named_tensors: dict[str, torch.Tensor],
     rank: int | None = None,
     shared_pin_memory: list[MemoryBuffer] | None = None,
-    inplace_pin: bool = False,
 ) -> list[MemoryBuffer]:
     logger.info(
         f"[rank{rank}] start to register checkpoint with {len(files)} files and {len(named_tensors)} named_tensors"
@@ -630,17 +629,12 @@ def _register_checkpoint(
     if not files and not named_tensors:
         return []
     memory_buffers: list[MemoryBuffer] = []
-    if inplace_pin:
-        logger.info(f"[rank{rank}] allow inplace pin memory for /dev/shm/ safetensors files")
-        files_to_inplace_pin = [
-            file
-            for file in files
-            if file.startswith("/dev/shm/") and file.endswith(".safetensors")  # noqa: S108
-        ]
-        files_to_normal_pin = [file for file in files if file not in files_to_inplace_pin]
-    else:
-        files_to_normal_pin = files
-        files_to_inplace_pin = []
+    files_to_inplace_pin = [
+        file
+        for file in files
+        if file.startswith("/dev/shm/") and file.endswith(".safetensors")  # noqa: S108
+    ]
+    files_to_normal_pin = [file for file in files if file not in files_to_inplace_pin]
     if files_to_normal_pin or named_tensors:
         memory_buffers.extend(
             _normal_pin_memory(
@@ -792,20 +786,6 @@ def _get_master_port(master_port: int | None = None) -> int:
     return master_port
 
 
-def _get_bcast_rank_map(world_size: int, ranks: list[int] | None) -> dict[int, int]:
-    """
-    map the real ranks (receiver_rank) to the bcast ranks (0 ~ len(ranks) - 1),
-    which are generated in self.init_process_group_for_ranks
-    """
-    bcast_rank_map: dict[int, int] = {}
-    if not ranks:
-        bcast_rank_map = {r: r for r in range(world_size)}
-    else:
-        for i, r in enumerate(ranks):
-            bcast_rank_map[r] = i
-    return bcast_rank_map
-
-
 class P2PStore:
     def __init__(self, device_manager: DeviceManager):
         from mooncake.engine import TransferEngine
@@ -979,11 +959,10 @@ class ParameterServer:
         files: list[str] | None = None,
         named_tensors: dict[str, torch.Tensor] | None = None,
         use_shared_memory_pool: bool = False,
-        use_inplace_pin_memory: bool = False,
    ) -> None:
        """
        Register a checkpoint to the parameter server. Both files and named_tensors will be registered together.
-        Warning: if `use_inplace_pin_memory` is True, .safetensors files in /dev/shm/ will be pinned in-place, and the files will be REMOVED after pinning.
+        Warning: .safetensors files in /dev/shm/ will be pinned in-place, and the files will be REMOVED after pinning.
         Please make sure to copy the files to disks if you need to keep them.
 
         Args:
@@ -995,8 +974,6 @@ class ParameterServer:
                 cannot accommodate checkpoints with different memory requirements.
                 To free the actual memory of the shared pool or to modify its shape,
                 please unregister the current user of the shared memory pool using `unregister_checkpoint` with `force=True`.
-            use_inplace_pin_memory: If True, allows inplace pin memory for /dev/shm/ safetensors files. This option is ignored when ``use_shared_memory_pool`` is True.
-                Currently, this feature is experimental and may crash.
         """
         try:
             if use_shared_memory_pool:
@@ -1025,10 +1002,7 @@ class ParameterServer:
                     f"checkpoint {checkpoint_name} already registered"
                 )
             self._memory_pool[checkpoint_name] = _register_checkpoint(
-                files=files or [],
-                named_tensors=named_tensors or {},
-                rank=self._rank,
-                inplace_pin=use_inplace_pin_memory,
+                files=files or [], named_tensors=named_tensors or {}, rank=self._rank
             )
             if self._p2p_store is not None:
                 self._register_parameters_to_p2p_store(checkpoint_name)
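Because in-place pinning of /dev/shm safetensors is now unconditional, a caller that still needs those files afterwards has to copy them out before registering. A minimal caller-side sketch (not part of the diff; the helper name and backup path are hypothetical), mirroring the selection predicate used in `_register_checkpoint`:

    import shutil
    from pathlib import Path

    def backup_inplace_pinned_files(files: list[str], backup_dir: str) -> None:
        """Copy the files that 0.3.0rc0 will pin in-place (and then remove)."""
        will_be_removed = [
            f for f in files
            if f.startswith("/dev/shm/") and f.endswith(".safetensors")
        ]
        dest = Path(backup_dir)
        dest.mkdir(parents=True, exist_ok=True)
        for f in will_be_removed:
            shutil.copy2(f, dest / Path(f).name)  # durable copy survives register_checkpoint

    # backup_inplace_pinned_files(ckpt_files, "/data/ckpt_backup")
    # ps.register_checkpoint(...)  # safe to register afterwards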
@@ -1176,12 +1150,36 @@ class ParameterServer:
         )
         logger.info(f"[rank{self._rank}] init process group successfully.")
 
+    def store_based_barrier(
+        self, store: dist.TCPStore, timeout: timedelta = timedelta(minutes=5)
+    ) -> None:
+        """
+        Perform a store-based barrier synchronization across all ranks.
+
+        This barrier uses a TCP store directly rather than a process group,
+        allowing all ranks to synchronize regardless of which process group
+        they belong to.
+
+        Args:
+            store: The TCPStore instance to use for synchronization.
+        """
+        dist.distributed_c10d._store_based_barrier(
+            rank=self._rank,
+            store=store,
+            group_name="parameter_server_barrier",
+            rendezvous_count=self._world_size,
+            timeout=timeout,
+        )
+
     def update(
         self,
         checkpoint_name: str,
         req_func: Callable[[list[tuple[str, str]]], None],
         *,
+        timeout: timedelta = timedelta(minutes=10),
         ranks: list[int] | None = None,
+        master_addr: str | None = None,
+        master_port: int | None = None,
     ) -> None:
         """
         Update the checkpoint to inference engine. This function should be called after gather_metas.
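The new `store_based_barrier` delegates to the private `torch.distributed.distributed_c10d._store_based_barrier` helper. For readers unfamiliar with the pattern, an equivalent-in-spirit sketch using only public `TCPStore` calls (not part of the diff; the function and key names are illustrative):

    from datetime import timedelta
    import torch.distributed as dist

    def tcp_store_barrier(store: dist.TCPStore, rank: int, world_size: int,
                          key: str = "barrier_demo") -> None:
        # Each rank announces itself, then blocks until every rank's key exists.
        store.set(f"{key}/arrived/{rank}", "1")
        store.wait([f"{key}/arrived/{r}" for r in range(world_size)],
                   timedelta(minutes=5))

Because it only needs the store, such a barrier can span ranks that never joined the broadcast subgroup.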
@@ -1193,34 +1191,45 @@ class ParameterServer:
                 which is the fastest way to update weights, especially in colocated architecture.
                 If set, will use p2p to update to the ranks, this is flexible to update to a group of ranks,
                 which is useful in disaggregated architecture.
+            master_addr: The master address for process group initialization. If not set, will use env MASTER_ADDR.
+            master_port: The master port for process group initialization. If not set, will use _get_master_port to get the port, which will use MASTER_PORT+1.
+            timeout: The timeout of the barrier operation.
         """
         assert req_func is not None, "req_func is required"
+        ranks_group = None
         try:
-            # if both ranks is None or [], it will use fully broadcast to update to all ranks
-            if not ranks:
-                if self._auto_pg and not dist.is_initialized():
-                    self.init_process_group()
-                self._update_per_bucket(checkpoint_name, req_func)
+            master_addr = os.getenv("MASTER_ADDR") or master_addr
+            assert master_addr, "master_addr is required"
+            if self._auto_pg:
+                if not dist.is_initialized():
+                    self.init_process_group(
+                        timeout=timeout, master_addr=master_addr, master_port=master_port
+                    )
+                manager_store = dist.distributed_c10d._get_default_store()
             else:
-                if self._auto_pg:
-                    if dist.is_initialized():
-                        dist.destroy_process_group()
-                        # HACK: wait 2s to ensure destroy is finished
-                        time.sleep(2)
-                    self.init_process_group_for_ranks(ranks)
-                if self._rank not in ranks:
-                    return
-                self._update_per_bucket(checkpoint_name, req_func, ranks)
-
+                # HACK: MASTER_PORT+2 for barrier store if master_port is not provided, _get_master_port() returns MASTER_PORT+1
+                # If master_port is provided, use master_port+1 for barrier store
+                manager_store = dist.TCPStore(
+                    master_addr,
+                    _get_master_port(master_port) + 1,
+                    self._world_size,
+                    timeout=timeout,
+                    is_master=self._rank == 0,
+                )
+            # if ranks is None or [], it will use fully broadcast to update to all ranks
+            ranks_group = dist.new_group(ranks if ranks else None)
+            self._update_per_bucket(checkpoint_name, req_func, ranks_group, ranks)
+            self.store_based_barrier(manager_store)
         except Exception as e:
             logger.exception(
                 f"[rank{self._rank}] update checkpoint {checkpoint_name} with ranks {ranks} error {e}"
             )
             raise
         finally:
-            if self._auto_pg and (not ranks or self._rank in ranks):
+            if ranks_group:
+                dist.destroy_process_group(ranks_group)
+            if self._auto_pg and dist.is_initialized():
                 dist.destroy_process_group()
-
             self.device_manager.device_module.empty_cache()
             logger.info(
                 f"[rank{self._rank}] update checkpoint {checkpoint_name} with ranks {ranks} done. "
@@ -1238,7 +1247,9 @@ class ParameterServer:
         self._zmq_addr_counter += 1
         return socket, socket_paths
 
-    def _detect_bucket_size(self, *, disable_h2d_buffer: bool = False) -> tuple[int, bool]:
+    def _detect_bucket_size(
+        self, ranks_group: dist.ProcessGroup, *, disable_h2d_buffer: bool = False
+    ) -> tuple[int, bool]:
         GiB = 1 << 30  # noqa: N806
         # auto detect bucket size
         tensor = torch.tensor(
@@ -1254,7 +1265,7 @@ class ParameterServer:
             dtype=torch.int64,
             device=self.device_manager.device_type,
         )
-        dist.all_reduce(tensor, op=dist.ReduceOp.MIN)
+        dist.all_reduce(tensor, op=dist.ReduceOp.MIN, group=ranks_group)
         tensor = tensor.cpu()
         free_bytes, self._zmq_addr_counter = tensor[0].item(), -tensor[1].item()
         max_tensor_bytes = 0
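The MIN all-reduce now runs over `ranks_group`, so only the participating ranks negotiate the bucket size. A toy sketch of the idea (not part of the diff; names are illustrative):

    import torch
    import torch.distributed as dist

    def agree_on_bucket_size(local_free_bytes: int,
                             group: dist.ProcessGroup | None,
                             device: str = "cuda") -> int:
        # Every rank contributes its free-memory budget; the MIN reduction makes
        # them all adopt the most constrained value, so bucket sizes match.
        t = torch.tensor([local_free_bytes], dtype=torch.int64, device=device)
        dist.all_reduce(t, op=dist.ReduceOp.MIN, group=group)
        return int(t.item())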
@@ -1317,51 +1328,6 @@ class ParameterServer:
             self._p2p_store.batch_transfer_sync_read(target_addr, buf_ptrs, remote_ptrs, lens)
         self.device_manager.device_module.synchronize()
 
-    def init_process_group_for_ranks(
-        self,
-        ranks: list[int],
-        *,
-        master_port: int | None = None,
-        timeout: timedelta = timedelta(minutes=10),
-    ):
-        """
-        Initialize the process group for the ranks. This global group can be easily destroyed by calling dist.destroy_process_group.
-
-        Args:
-            ranks: The ranks to initialize the process group. ranks should be a subset of all ranks.
-            master_port: The specified port of the master node. If not set, will use _get_master_port to get the port.
-            timeout: The timeout of the process group.
-        """
-        assert not dist.is_initialized()
-        assert ranks, "ranks should be set"
-        if self._rank not in ranks:
-            return
-        assert self._all_hosts, "all_hosts should be set"
-        assert len(self._all_hosts) == self._world_size // self._gpu_count, (
-            f"world_size {self._world_size} should be equal to all_hosts {len(self._all_hosts)}"
-        )
-        rank = ranks.index(self._rank)
-        master_addr = self._all_hosts[ranks[0] // self._gpu_count]
-        master_port = _get_master_port(master_port)
-        logger.info(
-            f"[rank{self._rank}] start to init process group as virtual_rank {rank}, "
-            f"master_addr {master_addr}, master_port {master_port}, world_size {len(ranks)}, "
-        )
-        # only initialize process group and store for ranks, other nodes are not initialized
-        # and will not participate in this update. Since they have registered memory addresses
-        # to p2p_store at the beginning, update ranks can directly get the memory addresses
-        # from other nodes and put the weights into the buffer.
-        store = dist.TCPStore(
-            master_addr, master_port, len(ranks), is_master=rank == 0, timeout=timeout
-        )
-        dist.init_process_group(
-            backend=self.device_manager.backend,
-            world_size=len(ranks),
-            rank=rank,
-            timeout=timeout,
-            store=store,
-        )
-
     def _get_addr_ptrs(self, owner_rank: int) -> tuple[str, list[tuple[int, int]]]:
         addr = self._current_global_parameter_metas[owner_rank].p2p_store_addr
         metas_list = self._current_global_parameter_metas[owner_rank].memory_buffer_metas_list
@@ -1401,10 +1367,12 @@ class ParameterServer:
         self,
         checkpoint_name: str,
         req_func: Callable[[list[tuple[str, str]]], None],
+        ranks_group: dist.ProcessGroup,
         ranks: list[int] | None = None,
     ):
         assert len(self._current_global_parameter_metas) != 0, "parameter metas is empty"
         assert dist.is_initialized(), "process group is not initialized"
+
         # if both ranks is None or [], it will use fully broadcast to update to all ranks
         if not ranks:
             logger.info(f"[rank{self._rank}] update checkpoint {checkpoint_name}")
@@ -1422,9 +1390,9 @@ class ParameterServer:
         if not need_update:
             return
         # first execute a barrier to avoid subsequent device oom
-        dist.barrier()
+        dist.barrier(group=ranks_group)
 
-        bucket_size, disable_h2d_buffer = self._detect_bucket_size()
+        bucket_size, disable_h2d_buffer = self._detect_bucket_size(ranks_group)
         buckets = _gen_h2d_buckets(
             self._current_global_parameter_metas,
             bucket_size,
@@ -1471,7 +1439,6 @@ class ParameterServer:
 
         gidx = 0
         ret_code = torch.zeros((), device=self.device_manager.device_type, dtype=torch.int64)
-        bcast_rank_map = _get_bcast_rank_map(self._world_size, ranks)
         try:
             for i in range(max_len):
                 if i < len(receiver_rank_buckets) and not disable_h2d_buffer:
@@ -1501,8 +1468,7 @@ class ParameterServer:
                         self._copy_to_buffer(checkpoint_name, bucket, buffer_b)
                 else:
                     buffer_b.data.copy_(h2d_buffer[: bucket.size])
-                brank = bcast_rank_map[receiver_rank]
-                dist.broadcast(buffer_b, src=brank)
+                dist.broadcast(buffer_b, src=receiver_rank, group=ranks_group)
                 resp = socket.recv()
                 if resp != b"":
                     msg = resp.decode("utf-8")
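This change also explains why `_get_bcast_rank_map` could be deleted: when broadcasting inside a subgroup created by `dist.new_group`, `src` is still given as the global rank, so no translation to a group-local rank is needed. A small sketch (not part of the diff; the function name is illustrative):

    import torch
    import torch.distributed as dist

    def broadcast_from(receiver_rank: int, buf: torch.Tensor,
                       group: dist.ProcessGroup | None) -> None:
        # `src` is the rank in the global process group; PyTorch maps it into `group`.
        dist.broadcast(buf, src=receiver_rank, group=group)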
@@ -1510,7 +1476,7 @@ class ParameterServer:
                         f"[rank{self._rank}] receive error response from rank {receiver_rank} for bucket {gidx} in checkpoint {checkpoint_name}: {msg}"
                     )
                     ret_code.fill_(1)
-                dist.all_reduce(ret_code, op=dist.ReduceOp.SUM)
+                dist.all_reduce(ret_code, op=dist.ReduceOp.SUM, group=ranks_group)
                 self.device_manager.device_module.synchronize()
                 if ret_code.item() != 0:
                     # quit early if any rank failed
@@ -1524,7 +1490,7 @@ class ParameterServer:
                 socket.recv()
         finally:
             req_thread.join()
-            dist.barrier()
+            dist.barrier(group=ranks_group)
             socket.close()
             if ranks and h2d_buffer is not None:
                 self._p2p_store.unregister_named_tensors([h2d_buffer_name])
{checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/checkpoint_engine.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: checkpoint-engine
-Version: 0.2.3
+Version: 0.3.0rc0
 Summary: checkpoint-engine is a lightweight, decoupling and efficient weight update middleware
 Project-URL: Homepage, https://github.com/MoonshotAI/checkpoint-engine
 Project-URL: Repository, https://github.com/MoonshotAI/checkpoint-engine
{checkpoint_engine-0.2.3 → checkpoint_engine-0.3.0rc0}/tests/test_update.py
@@ -218,8 +218,7 @@ def run_with_files(
     if rank == 0:
         import shutil
 
-        # this test should be run under use_inplace_pin_memory=False. Otherwise, the files in /dev/shm/ will be deleted.
-        shutil.rmtree(dev_shm_dir)
+        os.removedirs(dev_shm_dir)
         shutil.rmtree(disk_dir)
     assert proc.exitcode == 0
 
@@ -238,7 +237,13 @@ def run_with_files(
         ],
     ),
     ("test_with_remote_error", [[]]),
-    # ("long_test_no_error", [list(random.sample(range(get_world_size()), k=num_ranks)) for num_ranks in range(get_world_size() + 1)]),
+    (
+        "test_no_error",
+        [
+            list(random.sample(range(get_world_size()), k=num_ranks))
+            for num_ranks in range(get_world_size() + 1)
+        ],
+    ),
     ],
 )
 def test_update(test_name: str, rank_list: list[list[int]] | None):