checkpoint-engine 0.3.0rc0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checkpoint_engine/__init__.py +36 -0
- checkpoint_engine/__main__.py +28 -0
- checkpoint_engine/_version.py +2 -2
- checkpoint_engine/api.py +95 -0
- checkpoint_engine/data_types.py +111 -0
- checkpoint_engine/p2p_store.py +210 -0
- checkpoint_engine/pin_memory.py +390 -0
- checkpoint_engine/ps.py +85 -798
- checkpoint_engine/worker.py +18 -9
- {checkpoint_engine-0.3.0rc0.dist-info → checkpoint_engine-0.3.1.dist-info}/METADATA +1 -1
- checkpoint_engine-0.3.1.dist-info/RECORD +15 -0
- checkpoint_engine-0.3.0rc0.dist-info/RECORD +0 -10
- {checkpoint_engine-0.3.0rc0.dist-info → checkpoint_engine-0.3.1.dist-info}/WHEEL +0 -0
- {checkpoint_engine-0.3.0rc0.dist-info → checkpoint_engine-0.3.1.dist-info}/licenses/LICENCE +0 -0
- {checkpoint_engine-0.3.0rc0.dist-info → checkpoint_engine-0.3.1.dist-info}/top_level.txt +0 -0
checkpoint_engine/worker.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import gc
|
|
2
2
|
import traceback
|
|
3
3
|
from collections.abc import Callable
|
|
4
|
+
from functools import cached_property
|
|
4
5
|
from typing import TypedDict
|
|
5
6
|
|
|
6
7
|
import torch
|
|
@@ -117,6 +118,21 @@ class VllmColocateWorkerExtension:
|
|
|
117
118
|
`worker_extension_cls` argument when initializing the vLLM worker.
|
|
118
119
|
"""
|
|
119
120
|
|
|
121
|
+
@cached_property
|
|
122
|
+
def _device_uuid(self) -> str:
|
|
123
|
+
from vllm.platforms import current_platform
|
|
124
|
+
|
|
125
|
+
if current_platform.device_type == "cuda":
|
|
126
|
+
return current_platform.get_device_uuid(self.device.index)
|
|
127
|
+
elif current_platform.device_type == "npu":
|
|
128
|
+
return f"NPU-{npu_generate_uuid()}"
|
|
129
|
+
else:
|
|
130
|
+
raise ValueError(f"Unsupported device type: {current_platform.device_type}")
|
|
131
|
+
|
|
132
|
+
@cached_property
|
|
133
|
+
def _zmq_ctx(self) -> zmq.Context:
|
|
134
|
+
return zmq.Context()
|
|
135
|
+
|
|
120
136
|
def update_weights_from_ipc(self, zmq_handles: dict[str, str]):
|
|
121
137
|
"""
|
|
122
138
|
Update model weights from checkpoint-engine via IPC communication.
|
|
@@ -149,17 +165,10 @@ class VllmColocateWorkerExtension:
|
|
|
149
165
|
if current_platform.device_type == "npu" and self.device is None:
|
|
150
166
|
self.device = torch.device(f"npu:{self.local_rank}")
|
|
151
167
|
assert self.device is not None
|
|
152
|
-
|
|
153
|
-
self._zmq_ctx = zmq.Context()
|
|
154
|
-
if current_platform.device_type == "cuda":
|
|
155
|
-
device_uuid = current_platform.get_device_uuid(self.device.index)
|
|
156
|
-
elif current_platform.device_type == "npu":
|
|
157
|
-
device_uuid = f"NPU-{npu_generate_uuid()}"
|
|
158
|
-
else:
|
|
159
|
-
raise ValueError(f"Unsupported device type: {current_platform.device_type}")
|
|
168
|
+
|
|
160
169
|
update_weights_from_ipc(
|
|
161
170
|
self._zmq_ctx,
|
|
162
|
-
zmq_handles[device_uuid],
|
|
171
|
+
zmq_handles[self._device_uuid],
|
|
163
172
|
device_id=self.device.index,
|
|
164
173
|
run=self.model_runner.model.load_weights,
|
|
165
174
|
post_hook=lambda: process_weights_after_loading(
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: checkpoint-engine
|
|
3
|
-
Version: 0.3.0rc0
|
|
3
|
+
Version: 0.3.1
|
|
4
4
|
Summary: checkpoint-engine is a lightweight, decoupling and efficient weight update middleware
|
|
5
5
|
Project-URL: Homepage, https://github.com/MoonshotAI/checkpoint-engine
|
|
6
6
|
Project-URL: Repository, https://github.com/MoonshotAI/checkpoint-engine
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
checkpoint_engine/__init__.py,sha256=OeWxe9mxl2sZ6cW-blSTg6JbFlOMpGbBghLZtxGOqXk,942
|
|
2
|
+
checkpoint_engine/__main__.py,sha256=yzQlApuYo6eIOqtqM018RosyxNzXzB5a-stxUvsh-dg,709
|
|
3
|
+
checkpoint_engine/_version.py,sha256=gGLpQUQx-ty9SEy9PYw9OgJWWzJLBnCpfJOfzL7SjlI,704
|
|
4
|
+
checkpoint_engine/api.py,sha256=JDiQ4i3Gb6GoaBhlp8lNuUPaVURoFFdeGJY9ZDDGvPc,3518
|
|
5
|
+
checkpoint_engine/data_types.py,sha256=O9uAXjwB20iwrOHfEEQd8Y9CmaFspNJ9ks9noHqwQKk,2716
|
|
6
|
+
checkpoint_engine/device_utils.py,sha256=iKrof60j3CY3fStRTq3DRTt_kE1vYoEWHhAeyh0lByA,3020
|
|
7
|
+
checkpoint_engine/p2p_store.py,sha256=abiCDVmRISPt9QFfavHB9Jo7ZpBbSjUS1NevGuB-AVA,8721
|
|
8
|
+
checkpoint_engine/pin_memory.py,sha256=gpoe_z5XxbWkCvFLaXXpyUUFetBXUjsOrxBSX-ksZTw,16141
|
|
9
|
+
checkpoint_engine/ps.py,sha256=0d68Sqb_y3H6b5H37exMbghDJ294VKaGqoWkcKE-Ao8,40316
|
|
10
|
+
checkpoint_engine/worker.py,sha256=ghj9d2u8hY_U2uiOZWIN2CqRNZH6PrzujT22fHUFBWI,6879
|
|
11
|
+
checkpoint_engine-0.3.1.dist-info/licenses/LICENCE,sha256=D3gPmHKpGtF1yxYNhqjtBtZY_brZjDotJTzpnmClzlY,1067
|
|
12
|
+
checkpoint_engine-0.3.1.dist-info/METADATA,sha256=RSkQaKNs4euXk162_9AQzuNWaJQT4gMTnQ-3QJeYY_E,11559
|
|
13
|
+
checkpoint_engine-0.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
14
|
+
checkpoint_engine-0.3.1.dist-info/top_level.txt,sha256=66sik_1eLakLYmcllOEJzFaNbSfjsueuP0tHYEzhMSs,18
|
|
15
|
+
checkpoint_engine-0.3.1.dist-info/RECORD,,
|
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
checkpoint_engine/__init__.py,sha256=Zj4I008kn9R6fYr0lVBzcQSnvckLpX2s1ljCOOqV1c8,87
|
|
2
|
-
checkpoint_engine/_version.py,sha256=v0iyeXv9HxMc4JmYu_bJTIGKXRQVfpijACyjq2P_sk0,714
|
|
3
|
-
checkpoint_engine/device_utils.py,sha256=iKrof60j3CY3fStRTq3DRTt_kE1vYoEWHhAeyh0lByA,3020
|
|
4
|
-
checkpoint_engine/ps.py,sha256=eIvg_eI7HMedacoQQer62NRnGDjANtxsHVxgM93ccXQ,66977
|
|
5
|
-
checkpoint_engine/worker.py,sha256=f6kS1ushIXxkRCEHXM5wVofUer9OxRiVY03vmKYLzgo,6757
|
|
6
|
-
checkpoint_engine-0.3.0rc0.dist-info/licenses/LICENCE,sha256=D3gPmHKpGtF1yxYNhqjtBtZY_brZjDotJTzpnmClzlY,1067
|
|
7
|
-
checkpoint_engine-0.3.0rc0.dist-info/METADATA,sha256=iVd2qPdNyTPPX3XIEiuM0ASk8As72zSGfFIYicpZG3E,11562
|
|
8
|
-
checkpoint_engine-0.3.0rc0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
9
|
-
checkpoint_engine-0.3.0rc0.dist-info/top_level.txt,sha256=66sik_1eLakLYmcllOEJzFaNbSfjsueuP0tHYEzhMSs,18
|
|
10
|
-
checkpoint_engine-0.3.0rc0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|