checkpoint-engine 0.3.1rc0__tar.gz → 0.3.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/PKG-INFO +1 -1
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine/_version.py +3 -3
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine/pin_memory.py +12 -1
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine/ps.py +30 -10
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine.egg-info/PKG-INFO +1 -1
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/examples/update.py +2 -1
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/.github/workflows/cpu-tests.yml +0 -0
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/.github/workflows/pre-commit.yaml +0 -0
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/.github/workflows/python-publish.yml +0 -0
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/.gitignore +0 -0
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/.pre-commit-config.yaml +0 -0
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/LICENCE +0 -0
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/README.md +0 -0
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine/__init__.py +0 -0
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine/__main__.py +0 -0
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine/api.py +0 -0
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine/data_types.py +0 -0
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine/device_utils.py +0 -0
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine/p2p_store.py +0 -0
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine/worker.py +0 -0
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine.egg-info/SOURCES.txt +0 -0
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine.egg-info/dependency_links.txt +0 -0
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine.egg-info/requires.txt +0 -0
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine.egg-info/top_level.txt +0 -0
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/docs/npu_start.md +0 -0
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/figures/checkpoint-engine.png +0 -0
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/figures/overlap-update-and-copy.png +0 -0
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/figures/pipeline.png +0 -0
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/patches/vllm_fp8.patch +0 -0
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/pyproject.toml +0 -0
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/setup.cfg +0 -0
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/tests/test_assign_receiver_ranks.py +0 -0
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/tests/test_inplace_unpin.py +0 -0
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/tests/test_rdma_parser.py +0 -0
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/tests/test_reuse_pin_memory.py +0 -0
- {checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/tests/test_update.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: checkpoint-engine
|
|
3
|
-
Version: 0.3.1rc0
|
|
3
|
+
Version: 0.3.3
|
|
4
4
|
Summary: checkpoint-engine is a lightweight, decoupling and efficient weight update middleware
|
|
5
5
|
Project-URL: Homepage, https://github.com/MoonshotAI/checkpoint-engine
|
|
6
6
|
Project-URL: Repository, https://github.com/MoonshotAI/checkpoint-engine
|
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '0.3.1rc0'
|
|
32
|
-
__version_tuple__ = version_tuple = (0, 3,
|
|
31
|
+
__version__ = version = '0.3.3'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 3, 3)
|
|
33
33
|
|
|
34
|
-
__commit_id__ = commit_id = '
|
|
34
|
+
__commit_id__ = commit_id = 'gf6910d646'
|
|
@@ -191,6 +191,8 @@ def _load_checkpoint(files: list[str]) -> dict[str, torch.Tensor]:
|
|
|
191
191
|
|
|
192
192
|
|
|
193
193
|
def _inplace_pin_memory(files: list[str], rank: int | None = None) -> list[MemoryBuffer]:
|
|
194
|
+
device_index = torch.cuda.current_device()
|
|
195
|
+
|
|
194
196
|
def _parse_and_pin_from_safetensors(file_path: str) -> MemoryBuffer:
|
|
195
197
|
"""
|
|
196
198
|
safetensors format see https://huggingface.co/docs/safetensors/en/index#format.
|
|
@@ -204,9 +206,12 @@ def _inplace_pin_memory(files: list[str], rank: int | None = None) -> list[Memor
|
|
|
204
206
|
Pin the memory of tensor in-place.
|
|
205
207
|
See: https://github.com/pytorch/pytorch/issues/32167
|
|
206
208
|
"""
|
|
209
|
+
torch.cuda.set_device(device_index)
|
|
207
210
|
cudart = torch.cuda.cudart()
|
|
208
211
|
r = cudart.cudaHostRegister(t.data_ptr(), t.numel() * t.element_size(), 0)
|
|
209
|
-
|
|
212
|
+
if r != 0:
|
|
213
|
+
error_msg = cudart.cudaGetErrorString(r)
|
|
214
|
+
raise RuntimeError(f"pin memory error, error code: {r}, error message: {error_msg}")
|
|
210
215
|
|
|
211
216
|
# TODO: should only support /dev/shm? but we found files in disk also work?
|
|
212
217
|
size = os.stat(file_path).st_size
|
|
@@ -251,6 +256,12 @@ def _inplace_pin_memory(files: list[str], rank: int | None = None) -> list[Memor
|
|
|
251
256
|
# Remove the file after successfully loading. This will avoid doubling the memory usage.
|
|
252
257
|
# We assume files in /dev/shm/ are temporary files. So it's safe to remove them after loading.
|
|
253
258
|
os.remove(file_path)
|
|
259
|
+
if not metas:
|
|
260
|
+
# TODO: should we still return this buffer?
|
|
261
|
+
assert buffer.nbytes == 0, f"buffer nbytes {buffer.nbytes} should be 0"
|
|
262
|
+
logger.warning(f"[rank{rank}] no metas found in {file_path}, skip pin memory")
|
|
263
|
+
return MemoryBuffer(buffer=buffer, size=buffer.nbytes, metas=[], manually_pinned=False)
|
|
264
|
+
|
|
254
265
|
_pin(buffer)
|
|
255
266
|
logger.info(
|
|
256
267
|
f"[rank{rank}] inplace pin memory for file {file_path} finished, size {buffer.nbytes / 1024 / 1024:.2f}MiB"
|
|
@@ -391,7 +391,11 @@ class ParameterServer:
|
|
|
391
391
|
)
|
|
392
392
|
cudart = torch.cuda.cudart()
|
|
393
393
|
r = cudart.cudaHostUnregister(t.data_ptr())
|
|
394
|
-
|
|
394
|
+
if r != 0:
|
|
395
|
+
error_msg = cudart.cudaGetErrorString(r)
|
|
396
|
+
raise RuntimeError(
|
|
397
|
+
f"unpin memory error, error code: {r}, error message: {error_msg}"
|
|
398
|
+
)
|
|
395
399
|
|
|
396
400
|
# if the checkpoint is pinned by cudaHostRegister manually, we need to unpin it manually
|
|
397
401
|
try:
|
|
@@ -407,7 +411,13 @@ class ParameterServer:
|
|
|
407
411
|
del self._memory_pool[checkpoint_name]
|
|
408
412
|
# see https://github.com/pytorch/pytorch/blob/31d5c675394705f8a6bc767f80ae14bf4f01246b/torch/csrc/cuda/Module.cpp#L2018
|
|
409
413
|
# this works by using torch>=2.5.0
|
|
410
|
-
|
|
414
|
+
if self.device_manager.device_type == "cuda":
|
|
415
|
+
torch._C._host_emptyCache()
|
|
416
|
+
else:
|
|
417
|
+
# torch._C._host_emptyCache() is not supported on NPU, so we call gc.collect() to empty host cache.
|
|
418
|
+
import gc
|
|
419
|
+
|
|
420
|
+
gc.collect()
|
|
411
421
|
|
|
412
422
|
def gather_metas(self, checkpoint_name: str):
|
|
413
423
|
"""
|
|
@@ -731,6 +741,7 @@ class ParameterServer:
|
|
|
731
741
|
assert len(self._current_global_parameter_metas) != 0, "parameter metas is empty"
|
|
732
742
|
assert dist.is_initialized(), "process group is not initialized"
|
|
733
743
|
|
|
744
|
+
p2p_update = False
|
|
734
745
|
# if both ranks is None or [], it will use fully broadcast to update to all ranks
|
|
735
746
|
if not ranks:
|
|
736
747
|
logger.info(f"[rank{self._rank}] update checkpoint {checkpoint_name}")
|
|
@@ -739,6 +750,7 @@ class ParameterServer:
|
|
|
739
750
|
assert self._p2p_store is not None, "p2p store is not initialized"
|
|
740
751
|
assert ranks, "ranks should be set"
|
|
741
752
|
|
|
753
|
+
p2p_update = True
|
|
742
754
|
need_update = self._rank in ranks
|
|
743
755
|
logger.info(
|
|
744
756
|
f"[rank{self._rank}] update checkpoint {checkpoint_name} p2p, {need_update=} with {ranks=}, "
|
|
@@ -764,11 +776,6 @@ class ParameterServer:
|
|
|
764
776
|
if disable_h2d_buffer
|
|
765
777
|
else torch.empty(bucket_size, dtype=torch.uint8, device=self.device_manager.device_type)
|
|
766
778
|
)
|
|
767
|
-
# p2p store need to register h2d_buffer to let other ranks read
|
|
768
|
-
if ranks:
|
|
769
|
-
h2d_buffer_name = "__h2d_buffer__"
|
|
770
|
-
if h2d_buffer is not None and self._p2p_store is not None:
|
|
771
|
-
self._p2p_store.register_named_tensors({h2d_buffer_name: h2d_buffer})
|
|
772
779
|
receiver_rank_buckets: list[tuple[int, H2DBucket]] = []
|
|
773
780
|
for receiver_rank, owner_rank, bucket in buckets:
|
|
774
781
|
if receiver_rank != self._rank:
|
|
@@ -778,6 +785,12 @@ class ParameterServer:
|
|
|
778
785
|
buffer = torch.empty(
|
|
779
786
|
bucket_size * 2, dtype=torch.uint8, device=self.device_manager.device_type
|
|
780
787
|
)
|
|
788
|
+
if p2p_update:
|
|
789
|
+
# p2p store need to register buffer to let other ranks read
|
|
790
|
+
p2p_ipc_buffer_name = "__ipc_buffer__"
|
|
791
|
+
self._p2p_store.register_named_tensors(
|
|
792
|
+
{p2p_ipc_buffer_name: buffer if disable_h2d_buffer else h2d_buffer}
|
|
793
|
+
)
|
|
781
794
|
handle = reduce_tensor(buffer)
|
|
782
795
|
|
|
783
796
|
buckets_by_receiver_rank: dict[int, list[H2DBucket]] = defaultdict(list)
|
|
@@ -823,7 +836,14 @@ class ParameterServer:
|
|
|
823
836
|
buffer_b: torch.Tensor = buffer[start : start + bucket.size]
|
|
824
837
|
if receiver_rank == self._rank:
|
|
825
838
|
if disable_h2d_buffer:
|
|
826
|
-
|
|
839
|
+
if p2p_update:
|
|
840
|
+
assert bucket == receiver_rank_buckets[i][1]
|
|
841
|
+
self._copy_to_buffer(
|
|
842
|
+
checkpoint_name,
|
|
843
|
+
bucket,
|
|
844
|
+
buffer_b,
|
|
845
|
+
receiver_rank_buckets[i][0] if p2p_update else None,
|
|
846
|
+
)
|
|
827
847
|
else:
|
|
828
848
|
buffer_b.data.copy_(h2d_buffer[: bucket.size])
|
|
829
849
|
dist.broadcast(buffer_b, src=receiver_rank, group=ranks_group)
|
|
@@ -850,8 +870,8 @@ class ParameterServer:
|
|
|
850
870
|
req_thread.join()
|
|
851
871
|
dist.barrier(group=ranks_group)
|
|
852
872
|
socket.close()
|
|
853
|
-
if ranks:
|
|
854
|
-
self._p2p_store.unregister_named_tensors([h2d_buffer_name])
|
|
873
|
+
if p2p_update:
|
|
874
|
+
self._p2p_store.unregister_named_tensors([p2p_ipc_buffer_name])
|
|
855
875
|
|
|
856
876
|
self.device_manager.device_module.empty_cache()
|
|
857
877
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: checkpoint-engine
|
|
3
|
-
Version: 0.3.1rc0
|
|
3
|
+
Version: 0.3.3
|
|
4
4
|
Summary: checkpoint-engine is a lightweight, decoupling and efficient weight update middleware
|
|
5
5
|
Project-URL: Homepage, https://github.com/MoonshotAI/checkpoint-engine
|
|
6
6
|
Project-URL: Repository, https://github.com/MoonshotAI/checkpoint-engine
|
|
@@ -14,7 +14,8 @@ import torch.distributed as dist
|
|
|
14
14
|
from loguru import logger
|
|
15
15
|
from safetensors import safe_open
|
|
16
16
|
|
|
17
|
-
from checkpoint_engine.ps import ParameterServer, request_inference_to_update
|
|
17
|
+
from checkpoint_engine import request_inference_to_update
|
|
18
|
+
from checkpoint_engine.ps import ParameterServer
|
|
18
19
|
|
|
19
20
|
|
|
20
21
|
@contextmanager
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine.egg-info/SOURCES.txt
RENAMED
|
File without changes
|
|
File without changes
|
{checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine.egg-info/requires.txt
RENAMED
|
File without changes
|
{checkpoint_engine-0.3.1rc0 → checkpoint_engine-0.3.3}/checkpoint_engine.egg-info/top_level.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|