checkpoint-engine 0.2.0.tar.gz → 0.2.2.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {checkpoint_engine-0.2.0 → checkpoint_engine-0.2.2}/PKG-INFO +62 -9
- {checkpoint_engine-0.2.0 → checkpoint_engine-0.2.2}/README.md +61 -8
- {checkpoint_engine-0.2.0 → checkpoint_engine-0.2.2}/checkpoint_engine/_version.py +3 -3
- checkpoint_engine-0.2.2/checkpoint_engine/device_utils.py +86 -0
- {checkpoint_engine-0.2.0 → checkpoint_engine-0.2.2}/checkpoint_engine/ps.py +383 -139
- checkpoint_engine-0.2.2/checkpoint_engine/worker.py +168 -0
- {checkpoint_engine-0.2.0 → checkpoint_engine-0.2.2}/checkpoint_engine.egg-info/PKG-INFO +62 -9
- {checkpoint_engine-0.2.0 → checkpoint_engine-0.2.2}/checkpoint_engine.egg-info/SOURCES.txt +3 -0
- checkpoint_engine-0.2.2/docs/npu_start.md +91 -0
- {checkpoint_engine-0.2.0 → checkpoint_engine-0.2.2}/examples/update.py +5 -2
- checkpoint_engine-0.2.2/tests/test_pin_memory.py +77 -0
- checkpoint_engine-0.2.2/tests/test_update.py +327 -0
- checkpoint_engine-0.2.0/checkpoint_engine/worker.py +0 -109
- checkpoint_engine-0.2.0/tests/test_update.py +0 -90
- {checkpoint_engine-0.2.0 → checkpoint_engine-0.2.2}/.github/workflows/cpu-tests.yml +0 -0
- {checkpoint_engine-0.2.0 → checkpoint_engine-0.2.2}/.github/workflows/pre-commit.yaml +0 -0
- {checkpoint_engine-0.2.0 → checkpoint_engine-0.2.2}/.github/workflows/python-publish.yml +0 -0
- {checkpoint_engine-0.2.0 → checkpoint_engine-0.2.2}/.gitignore +0 -0
- {checkpoint_engine-0.2.0 → checkpoint_engine-0.2.2}/.pre-commit-config.yaml +0 -0
- {checkpoint_engine-0.2.0 → checkpoint_engine-0.2.2}/LICENCE +0 -0
- {checkpoint_engine-0.2.0 → checkpoint_engine-0.2.2}/checkpoint_engine/__init__.py +0 -0
- {checkpoint_engine-0.2.0 → checkpoint_engine-0.2.2}/checkpoint_engine.egg-info/dependency_links.txt +0 -0
- {checkpoint_engine-0.2.0 → checkpoint_engine-0.2.2}/checkpoint_engine.egg-info/requires.txt +0 -0
- {checkpoint_engine-0.2.0 → checkpoint_engine-0.2.2}/checkpoint_engine.egg-info/top_level.txt +0 -0
- {checkpoint_engine-0.2.0 → checkpoint_engine-0.2.2}/figures/checkpoint-engine.png +0 -0
- {checkpoint_engine-0.2.0 → checkpoint_engine-0.2.2}/figures/overlap-update-and-copy.png +0 -0
- {checkpoint_engine-0.2.0 → checkpoint_engine-0.2.2}/figures/pipeline.png +0 -0
- {checkpoint_engine-0.2.0 → checkpoint_engine-0.2.2}/patches/vllm_fp8.patch +0 -0
- {checkpoint_engine-0.2.0 → checkpoint_engine-0.2.2}/pyproject.toml +0 -0
- {checkpoint_engine-0.2.0 → checkpoint_engine-0.2.2}/setup.cfg +0 -0
- {checkpoint_engine-0.2.0 → checkpoint_engine-0.2.2}/tests/test_assign_receiver_ranks.py +0 -0
- {checkpoint_engine-0.2.0 → checkpoint_engine-0.2.2}/tests/test_rdma_parser.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: checkpoint-engine
-Version: 0.2.0
+Version: 0.2.2
 Summary: checkpoint-engine is a lightweight, decoupling and efficient weight update middleware
 Project-URL: Homepage, https://github.com/MoonshotAI/checkpoint-engine
 Project-URL: Repository, https://github.com/MoonshotAI/checkpoint-engine

@@ -99,17 +99,15 @@ Use the flexible P2P implementation, notice this will install `mooncake-transfer
 pip install 'checkpoint-engine[p2p]'
 ```
 
-If set `NCCL_IB_HCA` env, checkpoint-engine will use it to auto select net devices for different ranks. Available patterns can be found from [NCCL documentation](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#id8). If not set, it will read all RDMA devices and try to divide them into each rank.
-
 ## Getting Started
 
-Prepare an H800 or H20 machine with 8 GPUs with
+Prepare an H800 or H20 machine with 8 GPUs with vLLM. Be sure to include [/collective_rpc API endpoint](https://github.com/vllm-project/vllm/commit/f7cf5b512ee41f36613deb2471a44de5f304f70d) commit (available in main branch) since checkpoint-engine will use this endpoint to update weights. vLLM version `v0.10.2` is fully tested and recommended.
 
 ```Bash
-
+mkdir -p /opt/vLLM && cd /opt/vLLM
 uv venv --python 3.12 --seed
 source .venv/bin/activate
-
+uv pip install vllm==0.10.2
 ```
 
 Install checkpoint-engine

@@ -169,13 +167,68 @@ A [PR](https://github.com/vllm-project/vllm/pull/24488) is opened to the vLLM pr
 Run a simple correctness test for checkpoint_engine
 
 ```bash
-
+pytest tests/test_update.py
 ```
 
-
+`test_update.py` are only designed to run with `pytest`. Please don't run it directly with `torchrun`.
+
+Other unit tests can also be done with pytest. Only test_update.py requires GPUs, other tests can be run on CPUs. Only to run CPU tests, use:
+
+```bash
+pytest tests/ -m "not gpu"
+```
+
+### Environment Variables
+- `PS_MAX_BUCKET_SIZE_GB`: An integer is used to set the maximum bucket size for checkpoint-engine. If not set, 8GB is used as default.
+- `PS_P2P_STORE_RDMA_DEVICES`: Comma-separated RDMA devices' names for P2P transfer. If not set, checkpoint-engine will fall back to use `NCCL_IB_HCA` to detect RDMA devices.
+- `NCCL_IB_HCA`: Available patterns can be found from [NCCL documentation](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#id8). If also not set, all RDMA devices will be used and divided evenly among the ranks.
+
+## SGLang Integration
+
+Checkpoint Engine provides efficient distributed checkpoint loading for SGLang inference servers, significantly reducing model loading time for large models and multi-node setups.
+
+### Quick Start
+
+**1. Install checkpoint-engine:**
+```bash
+pip install 'checkpoint-engine[p2p]'
+```
+
+**2. Launch SGLang server:**
+```bash
+python -m sglang.launch_server \
+    --model-path $MODEL_PATH \
+    --tp 8 \
+    --load-format dummy \
+    --wait-for-initial-weights
+```
+
+**3. Run checkpoint engine:**
+```bash
+python -m sglang.srt.checkpoint_engine.update \
+    --update-method broadcast \
+    --checkpoint-path $MODEL_PATH \
+    --inference-parallel-size 8
+```
+
+### Multi-Node Setup
+
+For 2-node setup, run the same commands on both nodes with appropriate `--host` and distributed training parameters.
+
+### Key Options
+
+**SGLang Server:**
+- `--wait-for-initial-weights`: Wait for checkpoint engine before becoming ready
+- `--load-format dummy`: Enable overlapping initialization tasks
+
+**Checkpoint Engine:**
+- `--update-method`: Choose `broadcast`, `p2p`, or `all`
+- `--inference-parallel-size`: Number of parallel processes
+- `--checkpoint-path`: Model checkpoint directory
+
 ## Limitations and Future Work
 
-- This project is currently
+- This project is currently tested with vLLM and SGLang. Integration with other frameworks is planned for future releases.
 - The perfect three-stage pipeline mentioned in our paper is currently not implemented. This could be useful for architectures where H2D and broadcast do not conflict in PCIE.
 
 ## Acknowledgments

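The environment variables documented in the added README section above are read by checkpoint-engine itself; a minimal sketch of pinning them from the launching process follows. The concrete values and the `mlx5_*` device names are illustrative assumptions, not values taken from the diff (other than the documented 8 GB default):

```python
import os

# Illustrative sketch only: these variables are documented in the README hunk above.
os.environ.setdefault("PS_MAX_BUCKET_SIZE_GB", "8")                  # max bucket size; 8 GB is the documented default
os.environ.setdefault("PS_P2P_STORE_RDMA_DEVICES", "mlx5_0,mlx5_1")  # comma-separated RDMA device names for P2P transfer
# If PS_P2P_STORE_RDMA_DEVICES is unset, checkpoint-engine falls back to NCCL_IB_HCA;
# if that is also unset, all RDMA devices are divided evenly among the ranks.
```
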
@@ -75,17 +75,15 @@ Use the flexible P2P implementation, notice this will install `mooncake-transfer
 pip install 'checkpoint-engine[p2p]'
 ```
 
-If set `NCCL_IB_HCA` env, checkpoint-engine will use it to auto select net devices for different ranks. Available patterns can be found from [NCCL documentation](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#id8). If not set, it will read all RDMA devices and try to divide them into each rank.
-
 ## Getting Started
 
-Prepare an H800 or H20 machine with 8 GPUs with
+Prepare an H800 or H20 machine with 8 GPUs with vLLM. Be sure to include [/collective_rpc API endpoint](https://github.com/vllm-project/vllm/commit/f7cf5b512ee41f36613deb2471a44de5f304f70d) commit (available in main branch) since checkpoint-engine will use this endpoint to update weights. vLLM version `v0.10.2` is fully tested and recommended.
 
 ```Bash
-
+mkdir -p /opt/vLLM && cd /opt/vLLM
 uv venv --python 3.12 --seed
 source .venv/bin/activate
-
+uv pip install vllm==0.10.2
 ```
 
 Install checkpoint-engine

@@ -145,13 +143,68 @@ A [PR](https://github.com/vllm-project/vllm/pull/24488) is opened to the vLLM pr
 Run a simple correctness test for checkpoint_engine
 
 ```bash
-
+pytest tests/test_update.py
 ```
 
-
+`test_update.py` are only designed to run with `pytest`. Please don't run it directly with `torchrun`.
+
+Other unit tests can also be done with pytest. Only test_update.py requires GPUs, other tests can be run on CPUs. Only to run CPU tests, use:
+
+```bash
+pytest tests/ -m "not gpu"
+```
+
+### Environment Variables
+- `PS_MAX_BUCKET_SIZE_GB`: An integer is used to set the maximum bucket size for checkpoint-engine. If not set, 8GB is used as default.
+- `PS_P2P_STORE_RDMA_DEVICES`: Comma-separated RDMA devices' names for P2P transfer. If not set, checkpoint-engine will fall back to use `NCCL_IB_HCA` to detect RDMA devices.
+- `NCCL_IB_HCA`: Available patterns can be found from [NCCL documentation](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#id8). If also not set, all RDMA devices will be used and divided evenly among the ranks.
+
+## SGLang Integration
+
+Checkpoint Engine provides efficient distributed checkpoint loading for SGLang inference servers, significantly reducing model loading time for large models and multi-node setups.
+
+### Quick Start
+
+**1. Install checkpoint-engine:**
+```bash
+pip install 'checkpoint-engine[p2p]'
+```
+
+**2. Launch SGLang server:**
+```bash
+python -m sglang.launch_server \
+    --model-path $MODEL_PATH \
+    --tp 8 \
+    --load-format dummy \
+    --wait-for-initial-weights
+```
+
+**3. Run checkpoint engine:**
+```bash
+python -m sglang.srt.checkpoint_engine.update \
+    --update-method broadcast \
+    --checkpoint-path $MODEL_PATH \
+    --inference-parallel-size 8
+```
+
+### Multi-Node Setup
+
+For 2-node setup, run the same commands on both nodes with appropriate `--host` and distributed training parameters.
+
+### Key Options
+
+**SGLang Server:**
+- `--wait-for-initial-weights`: Wait for checkpoint engine before becoming ready
+- `--load-format dummy`: Enable overlapping initialization tasks
+
+**Checkpoint Engine:**
+- `--update-method`: Choose `broadcast`, `p2p`, or `all`
+- `--inference-parallel-size`: Number of parallel processes
+- `--checkpoint-path`: Model checkpoint directory
+
 ## Limitations and Future Work
 
-- This project is currently
+- This project is currently tested with vLLM and SGLang. Integration with other frameworks is planned for future releases.
 - The perfect three-stage pipeline mentioned in our paper is currently not implemented. This could be useful for architectures where H2D and broadcast do not conflict in PCIE.
 
 ## Acknowledgments

@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.2.0'
-__version_tuple__ = version_tuple = (0, 2, 0)
+__version__ = version = '0.2.2'
+__version_tuple__ = version_tuple = (0, 2, 2)
 
-__commit_id__ = commit_id = '
+__commit_id__ = commit_id = 'g089d18598'

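Since `_version.py` exports `version` and `version_tuple` as shown above, code that depends on 0.2.2 behaviour can gate on them; a hypothetical sanity check:

```python
# Hypothetical guard, assuming checkpoint-engine is importable in the current environment.
from checkpoint_engine._version import version, version_tuple

if version_tuple[:3] < (0, 2, 2):
    raise RuntimeError(f"checkpoint-engine >= 0.2.2 required, found {version}")
```
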
@@ -0,0 +1,86 @@
+import os
+import re
+import socket
+import subprocess
+from functools import lru_cache
+
+import torch
+from loguru import logger
+
+
+@lru_cache(maxsize=1)
+def get_ip() -> str:
+    try:
+        # try to get ip from network interface
+        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
+            s.connect(("8.8.8.8", 80))
+            return s.getsockname()[0]
+    except Exception as e:  # noqa: BLE001
+        # fallback to get ip from hostname
+        logger.warning(
+            f"fail to get ip from network interface, fallback to get ip from hostname: {e}"
+        )
+        return socket.gethostbyname(socket.gethostname())
+
+
+def npu_generate_uuid() -> str:
+    str_pid = str(os.getpid())
+    npu_num = 8
+    try:
+        for npu_id in range(npu_num):
+            cmd = ["npu-smi", "info", "-t", "proc-mem", "-i", str(npu_id)]
+            result = subprocess.run(cmd, check=True, capture_output=True, text=True)  # noqa: S603
+            str_result = str(result.stdout)
+            if str_pid in str_result:
+                # In A3 server, one NPU has two chips.
+                match_chip_count = re.search(r"Chip Count[^\d]*(\d+)", str_result)
+                chip_count = int(match_chip_count.group(1))
+                search_after_pid = str_result[str_result.find(str_pid) + len(str_pid) :]
+                match_chip_id = re.search(r"Chip ID[^\d]*(\d+)", search_after_pid)
+                chip_id = int(match_chip_id.group(1))
+                return f"{get_ip()}-{npu_id * chip_count + chip_id}"
+        raise ValueError("The current process is not running on the npu device")
+    except subprocess.CalledProcessError as e:
+        raise ValueError("The current process is not running on the npu device") from e
+
+
+class DeviceManager:
+    def __init__(self):
+        self.device_type = self._detect_device_type()
+        self._setup_device_module()
+
+    def _is_torch_npu_available(self) -> bool:
+        try:
+            if hasattr(torch, "npu") and callable(getattr(torch.npu, "is_available", None)):
+                return torch.npu.is_available()
+            else:
+                return False
+        except ImportError:
+            return False
+
+    def _detect_device_type(self) -> str:
+        if self._is_torch_npu_available():
+            return "npu"
+        elif torch.cuda.is_available():
+            return "cuda"
+        else:
+            raise TypeError("The current device type is not supported")
+
+    def _setup_device_module(self):
+        if self.device_type == "npu":
+            import torch_npu
+
+            self.device_module = torch_npu.npu
+        elif self.device_type == "cuda":
+            self.device_module = torch.cuda
+        else:
+            raise TypeError("The current device type is not supported")
+
+    @property
+    def backend(self) -> str:
+        if self.device_type == "npu":
+            return "hccl"
+        elif self.device_type == "cuda":
+            return "nccl"
+        else:
+            raise TypeError("The current device type is not supported")