checkpoint-engine 0.3.1rc0__tar.gz → 0.3.3__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/PKG-INFO +1 -1
  2. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine/_version.py +3 -3
  3. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine/pin_memory.py +12 -1
  4. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine/ps.py +30 -10
  5. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine.egg-info/PKG-INFO +1 -1
  6. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/examples/update.py +2 -1
  7. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/.github/workflows/cpu-tests.yml +0 -0
  8. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/.github/workflows/pre-commit.yaml +0 -0
  9. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/.github/workflows/python-publish.yml +0 -0
  10. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/.gitignore +0 -0
  11. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/.pre-commit-config.yaml +0 -0
  12. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/LICENCE +0 -0
  13. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/README.md +0 -0
  14. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine/__init__.py +0 -0
  15. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine/__main__.py +0 -0
  16. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine/api.py +0 -0
  17. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine/data_types.py +0 -0
  18. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine/device_utils.py +0 -0
  19. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine/p2p_store.py +0 -0
  20. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine/worker.py +0 -0
  21. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine.egg-info/SOURCES.txt +0 -0
  22. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine.egg-info/dependency_links.txt +0 -0
  23. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine.egg-info/requires.txt +0 -0
  24. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine.egg-info/top_level.txt +0 -0
  25. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/docs/npu_start.md +0 -0
  26. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/figures/checkpoint-engine.png +0 -0
  27. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/figures/overlap-update-and-copy.png +0 -0
  28. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/figures/pipeline.png +0 -0
  29. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/patches/vllm_fp8.patch +0 -0
  30. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/pyproject.toml +0 -0
  31. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/setup.cfg +0 -0
  32. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/tests/test_assign_receiver_ranks.py +0 -0
  33. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/tests/test_inplace_unpin.py +0 -0
  34. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/tests/test_rdma_parser.py +0 -0
  35. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/tests/test_reuse_pin_memory.py +0 -0
  36. {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/tests/test_update.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: checkpoint-engine
3
- Version: 0.3.1rc0
3
+ Version: 0.3.3
4
4
  Summary: checkpoint-engine is a lightweight, decoupling and efficient weight update middleware
5
5
  Project-URL: Homepage, https://github.com/MoonshotAI/checkpoint-engine
6
6
  Project-URL: Repository, https://github.com/MoonshotAI/checkpoint-engine
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.3.1rc0'
32
- __version_tuple__ = version_tuple = (0, 3, 1, 'rc0')
31
+ __version__ = version = '0.3.3'
32
+ __version_tuple__ = version_tuple = (0, 3, 3)
33
33
 
34
- __commit_id__ = commit_id = 'g09c543af4'
34
+ __commit_id__ = commit_id = 'gf6910d646'
@@ -191,6 +191,8 @@ def _load_checkpoint(files: list[str]) -> dict[str, torch.Tensor]:
191
191
 
192
192
 
193
193
  def _inplace_pin_memory(files: list[str], rank: int | None = None) -> list[MemoryBuffer]:
194
+ device_index = torch.cuda.current_device()
195
+
194
196
  def _parse_and_pin_from_safetensors(file_path: str) -> MemoryBuffer:
195
197
  """
196
198
  safetensors format see https://huggingface.co/docs/safetensors/en/index#format.
@@ -204,9 +206,12 @@ def _inplace_pin_memory(files: list[str], rank: int | None = None) -> list[Memor
204
206
  Pin the memory of tensor in-place.
205
207
  See: https://github.com/pytorch/pytorch/issues/32167
206
208
  """
209
+ torch.cuda.set_device(device_index)
207
210
  cudart = torch.cuda.cudart()
208
211
  r = cudart.cudaHostRegister(t.data_ptr(), t.numel() * t.element_size(), 0)
209
- assert r == 0, f"pin memory error, error code: {r}"
212
+ if r != 0:
213
+ error_msg = cudart.cudaGetErrorString(r)
214
+ raise RuntimeError(f"pin memory error, error code: {r}, error message: {error_msg}")
210
215
 
211
216
  # TODO: should only support /dev/shm? but we found files in disk also work?
212
217
  size = os.stat(file_path).st_size
@@ -251,6 +256,12 @@ def _inplace_pin_memory(files: list[str], rank: int | None = None) -> list[Memor
251
256
  # Remove the file after successfully loading. This will avoid doubling the memory usage.
252
257
  # We assume files in /dev/shm/ are temporary files. So it's safe to remove them after loading.
253
258
  os.remove(file_path)
259
+ if not metas:
260
+ # TODO: should we still return this buffer?
261
+ assert buffer.nbytes == 0, f"buffer nbytes {buffer.nbytes} should be 0"
262
+ logger.warning(f"[rank{rank}] no metas found in {file_path}, skip pin memory")
263
+ return MemoryBuffer(buffer=buffer, size=buffer.nbytes, metas=[], manually_pinned=False)
264
+
254
265
  _pin(buffer)
255
266
  logger.info(
256
267
  f"[rank{rank}] inplace pin memory for file {file_path} finished, size {buffer.nbytes / 1024 / 1024:.2f}MiB"
@@ -391,7 +391,11 @@ class ParameterServer:
391
391
  )
392
392
  cudart = torch.cuda.cudart()
393
393
  r = cudart.cudaHostUnregister(t.data_ptr())
394
- assert r == 0, f"unpin memory error, error code: {r}"
394
+ if r != 0:
395
+ error_msg = cudart.cudaGetErrorString(r)
396
+ raise RuntimeError(
397
+ f"unpin memory error, error code: {r}, error message: {error_msg}"
398
+ )
395
399
 
396
400
  # if the checkpoint is pinned by cudaHostRegister manually, we need to unpin it manually
397
401
  try:
@@ -407,7 +411,13 @@ class ParameterServer:
407
411
  del self._memory_pool[checkpoint_name]
408
412
  # see https://github.com/pytorch/pytorch/blob/31d5c675394705f8a6bc767f80ae14bf4f01246b/torch/csrc/cuda/Module.cpp#L2018
409
413
  # this works by using torch>=2.5.0
410
- torch._C._host_emptyCache()
414
+ if self.device_manager.device_type == "cuda":
415
+ torch._C._host_emptyCache()
416
+ else:
417
+ # torch._C._host_emptyCache() is not supported on NPU, so we call gc.collect() to empty host cache.
418
+ import gc
419
+
420
+ gc.collect()
411
421
 
412
422
  def gather_metas(self, checkpoint_name: str):
413
423
  """
@@ -731,6 +741,7 @@ class ParameterServer:
731
741
  assert len(self._current_global_parameter_metas) != 0, "parameter metas is empty"
732
742
  assert dist.is_initialized(), "process group is not initialized"
733
743
 
744
+ p2p_update = False
734
745
  # if both ranks is None or [], it will use fully broadcast to update to all ranks
735
746
  if not ranks:
736
747
  logger.info(f"[rank{self._rank}] update checkpoint {checkpoint_name}")
@@ -739,6 +750,7 @@ class ParameterServer:
739
750
  assert self._p2p_store is not None, "p2p store is not initialized"
740
751
  assert ranks, "ranks should be set"
741
752
 
753
+ p2p_update = True
742
754
  need_update = self._rank in ranks
743
755
  logger.info(
744
756
  f"[rank{self._rank}] update checkpoint {checkpoint_name} p2p, {need_update=} with {ranks=}, "
@@ -764,11 +776,6 @@ class ParameterServer:
764
776
  if disable_h2d_buffer
765
777
  else torch.empty(bucket_size, dtype=torch.uint8, device=self.device_manager.device_type)
766
778
  )
767
- # p2p store need to register h2d_buffer to let other ranks read
768
- if ranks:
769
- h2d_buffer_name = "__h2d_buffer__"
770
- if h2d_buffer is not None and self._p2p_store is not None:
771
- self._p2p_store.register_named_tensors({h2d_buffer_name: h2d_buffer})
772
779
  receiver_rank_buckets: list[tuple[int, H2DBucket]] = []
773
780
  for receiver_rank, owner_rank, bucket in buckets:
774
781
  if receiver_rank != self._rank:
@@ -778,6 +785,12 @@ class ParameterServer:
778
785
  buffer = torch.empty(
779
786
  bucket_size * 2, dtype=torch.uint8, device=self.device_manager.device_type
780
787
  )
788
+ if p2p_update:
789
+ # p2p store need to register buffer to let other ranks read
790
+ p2p_ipc_buffer_name = "__ipc_buffer__"
791
+ self._p2p_store.register_named_tensors(
792
+ {p2p_ipc_buffer_name: buffer if disable_h2d_buffer else h2d_buffer}
793
+ )
781
794
  handle = reduce_tensor(buffer)
782
795
 
783
796
  buckets_by_receiver_rank: dict[int, list[H2DBucket]] = defaultdict(list)
@@ -823,7 +836,14 @@ class ParameterServer:
823
836
  buffer_b: torch.Tensor = buffer[start : start + bucket.size]
824
837
  if receiver_rank == self._rank:
825
838
  if disable_h2d_buffer:
826
- self._copy_to_buffer(checkpoint_name, bucket, buffer_b)
839
+ if p2p_update:
840
+ assert bucket == receiver_rank_buckets[i][1]
841
+ self._copy_to_buffer(
842
+ checkpoint_name,
843
+ bucket,
844
+ buffer_b,
845
+ receiver_rank_buckets[i][0] if p2p_update else None,
846
+ )
827
847
  else:
828
848
  buffer_b.data.copy_(h2d_buffer[: bucket.size])
829
849
  dist.broadcast(buffer_b, src=receiver_rank, group=ranks_group)
@@ -850,8 +870,8 @@ class ParameterServer:
850
870
  req_thread.join()
851
871
  dist.barrier(group=ranks_group)
852
872
  socket.close()
853
- if ranks and h2d_buffer is not None:
854
- self._p2p_store.unregister_named_tensors([h2d_buffer_name])
873
+ if p2p_update:
874
+ self._p2p_store.unregister_named_tensors([p2p_ipc_buffer_name])
855
875
 
856
876
  self.device_manager.device_module.empty_cache()
857
877
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: checkpoint-engine
3
- Version: 0.3.1rc0
3
+ Version: 0.3.3
4
4
  Summary: checkpoint-engine is a lightweight, decoupling and efficient weight update middleware
5
5
  Project-URL: Homepage, https://github.com/MoonshotAI/checkpoint-engine
6
6
  Project-URL: Repository, https://github.com/MoonshotAI/checkpoint-engine
@@ -14,7 +14,8 @@ import torch.distributed as dist
14
14
  from loguru import logger
15
15
  from safetensors import safe_open
16
16
 
17
- from checkpoint_engine.ps import ParameterServer, request_inference_to_update
17
+ from checkpoint_engine import request_inference_to_update
18
+ from checkpoint_engine.ps import ParameterServer
18
19
 
19
20
 
20
21
  @contextmanager