checkpoint-engine 0.1.2 (tar.gz)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checkpoint_engine-0.1.2/.github/workflows/pre-commit.yaml +22 -0
- checkpoint_engine-0.1.2/.github/workflows/python-publish.yml +70 -0
- checkpoint_engine-0.1.2/.gitignore +13 -0
- checkpoint_engine-0.1.2/.pre-commit-config.yaml +32 -0
- checkpoint_engine-0.1.2/LICENCE +21 -0
- checkpoint_engine-0.1.2/PKG-INFO +176 -0
- checkpoint_engine-0.1.2/README.md +152 -0
- checkpoint_engine-0.1.2/checkpoint_engine/__init__.py +4 -0
- checkpoint_engine-0.1.2/checkpoint_engine/_version.py +34 -0
- checkpoint_engine-0.1.2/checkpoint_engine/ps.py +1198 -0
- checkpoint_engine-0.1.2/checkpoint_engine/worker.py +109 -0
- checkpoint_engine-0.1.2/checkpoint_engine.egg-info/PKG-INFO +176 -0
- checkpoint_engine-0.1.2/checkpoint_engine.egg-info/SOURCES.txt +22 -0
- checkpoint_engine-0.1.2/checkpoint_engine.egg-info/dependency_links.txt +1 -0
- checkpoint_engine-0.1.2/checkpoint_engine.egg-info/requires.txt +12 -0
- checkpoint_engine-0.1.2/checkpoint_engine.egg-info/top_level.txt +1 -0
- checkpoint_engine-0.1.2/examples/update.py +194 -0
- checkpoint_engine-0.1.2/figures/checkpoint-engine.png +0 -0
- checkpoint_engine-0.1.2/figures/overlap-update-and-copy.png +0 -0
- checkpoint_engine-0.1.2/figures/pipeline.png +0 -0
- checkpoint_engine-0.1.2/patches/vllm_fp8.patch +98 -0
- checkpoint_engine-0.1.2/pyproject.toml +160 -0
- checkpoint_engine-0.1.2/setup.cfg +4 -0
- checkpoint_engine-0.1.2/tests/test_update.py +90 -0

checkpoint_engine-0.1.2/.github/workflows/pre-commit.yaml
@@ -0,0 +1,22 @@
name: pre-commit

on:
  push:
    branches: [main]
  pull_request:
    types: [opened, synchronize, reopened]

jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.x'
      - name: Run pre-commit hooks
        uses: pre-commit/action@v3.0.1
        with:
          extra_args: --all-files

checkpoint_engine-0.1.2/.github/workflows/python-publish.yml
@@ -0,0 +1,70 @@
# This workflow will upload a Python Package to PyPI when a release is created
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries

# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

name: Upload Python Package

on:
  release:
    types: [published]

permissions:
  contents: read

jobs:
  release-build:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: "3.x"

      - name: Build release distributions
        run: |
          # NOTE: put your own distribution build steps here.
          python -m pip install build
          python -m build

      - name: Upload distributions
        uses: actions/upload-artifact@v4
        with:
          name: release-dists
          path: dist/

  pypi-publish:
    runs-on: ubuntu-latest
    needs:
      - release-build
    permissions:
      # IMPORTANT: this permission is mandatory for trusted publishing
      id-token: write

    # Dedicated environments with protections for publishing are strongly recommended.
    # For more information, see: https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment#deployment-protection-rules
    environment:
      name: pypi
      # OPTIONAL: uncomment and update to include your PyPI project URL in the deployment status:
      # url: https://pypi.org/p/YOURPROJECT
      #
      # ALTERNATIVE: if your GitHub Release name is the PyPI project version string
      # ALTERNATIVE: exactly, uncomment the following line instead:
      # url: https://pypi.org/project/YOURPROJECT/${{ github.event.release.name }}

    steps:
      - name: Retrieve release distributions
        uses: actions/download-artifact@v4
        with:
          name: release-dists
          path: dist/

      - name: Publish release distributions to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          packages-dir: dist/

checkpoint_engine-0.1.2/.pre-commit-config.yaml
@@ -0,0 +1,32 @@
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0
    hooks:
      - id: check-symlinks
      - id: destroyed-symlinks
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
      - id: check-toml
      - id: check-ast
      - id: check-added-large-files
      - id: check-merge-conflict
      - id: check-executables-have-shebangs
      - id: check-shebang-scripts-are-executable
      - id: detect-private-key
      - id: debug-statements
      - id: check-added-large-files
        args: [--maxkb=10240]
  - repo: https://github.com/charliermarsh/ruff-pre-commit
    rev: v0.12.2
    hooks:
      - id: ruff
        args: [--fix, --exit-non-zero-on-fix]
      - id: ruff-format
  - repo: https://github.com/codespell-project/codespell
    rev: v2.4.1
    hooks:
      - id: codespell
        additional_dependencies: [".[toml]"]

exclude: ^patches

checkpoint_engine-0.1.2/LICENCE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Moonshot AI

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE

checkpoint_engine-0.1.2/PKG-INFO
@@ -0,0 +1,176 @@
Metadata-Version: 2.4
Name: checkpoint-engine
Version: 0.1.2
Summary: checkpoint-engine is a lightweight, decoupled, and efficient weight update middleware
Project-URL: Homepage, https://github.com/MoonshotAI/checkpoint-engine
Project-URL: Repository, https://github.com/MoonshotAI/checkpoint-engine
Project-URL: Documentation, https://github.com/MoonshotAI/checkpoint-engine
Project-URL: Issue Tracker, https://github.com/MoonshotAI/checkpoint-engine/issues
Requires-Python: >=3.10
Description-Content-Type: text/markdown
License-File: LICENCE
Requires-Dist: torch>=2.5.0
Requires-Dist: fastapi
Requires-Dist: pydantic>=2.0.0
Requires-Dist: safetensors
Requires-Dist: pyzmq
Requires-Dist: uvicorn
Requires-Dist: loguru
Requires-Dist: numpy
Requires-Dist: httpx
Provides-Extra: p2p
Requires-Dist: mooncake-transfer-engine>=0.3.5; extra == "p2p"
Dynamic: license-file

# Checkpoint Engine

Checkpoint-engine is a simple middleware to update model weights in LLM inference engines -- a critical step in reinforcement learning.
We provide an efficient and lightweight implementation for in-place weight updates:
updating our [Kimi-K2](https://github.com/MoonshotAI/Kimi-K2) model (1 trillion parameters) across thousands of GPUs takes about 20s.

<div align="center">
  <picture>
      <img src="figures/checkpoint-engine.png" width="80%" alt="ckpt-engine">
  </picture>
</div>

## Architecture

The core weight update logic is in the `ParameterServer` class, a service colocated with the inference engines. It provides two implementations of weight update: Broadcast and P2P.

- **Broadcast**: Used when a large number of inference instances need to update weights synchronously. This is the fastest implementation and should be used as the default update method. See `_update_per_bucket`.
- **P2P**: Used when new inference instances are dynamically added (due to restarts or dynamic availability) while the existing instances are already serving requests. In this scenario, to avoid affecting the workloads on existing instances, we use the [`mooncake-transfer-engine`](https://github.com/kvcache-ai/Mooncake?tab=readme-ov-file#use-python-package) to send weights P2P from the CPUs of existing instances to the GPUs of new instances. See `_update_per_bucket_p2p`.

### Optimized Weight Broadcast

In the *Broadcast* implementation, checkpoint-engine holds references to sharded weights in CPU memory and needs to broadcast them efficiently to a cluster of inference instances, often under a different sharding pattern.
We arrange the data transfer into 3 stages:

1. H2D: move the weights to GPU memory. These weights may come from disk or from the training engine.
2. Broadcast: broadcast among the checkpoint-engine workers; the data ends up in a CUDA IPC buffer shared with the inference engine.
3. Reload: the inference engine decides which subset of the weights to copy from the broadcasted data.

Checkpoint-engine orchestrates the entire transfer process. It first gathers the necessary metadata to create a plan, including deciding the proper bucket size for data transfer.
It then executes the transfer, controlling the inference engine through a ZeroMQ socket. To maximize performance, it organizes the data transfers into a pipeline with overlapped communication and copy, illustrated below. The details can be found in the [Kimi-K2 Technical Report](https://arxiv.org/abs/2507.20534).

<div align="center">
  <picture>
      <img src="figures/pipeline.png" width="80%" alt="pipeline">
  </picture>
</div>

Pipelining naturally requires more GPU memory. When memory is insufficient, checkpoint-engine falls back to serial execution.

## Benchmark

| Model                                 | Device Info  | GatherMetas | Update (Broadcast) | Update (P2P)     |
| :------------------------------------ | :----------- | :---------- | :----------------- | :--------------- |
| GLM-4.5-Air (BF16)                    | 8xH800 TP8   | 0.17s       | 3.94s (1.42GiB)    | 8.83s (4.77GiB)  |
| Qwen3-235B-A22B-Instruct-2507 (BF16)  | 8xH800 TP8   | 0.46s       | 6.75s (2.69GiB)    | 16.47s (4.05GiB) |
| DeepSeek-V3.1 (FP8)                   | 16xH20 TP16  | 1.44s       | 12.22s (2.38GiB)   | 25.77s (3.61GiB) |
| Kimi-K2-Instruct (FP8)                | 16xH20 TP16  | 1.81s       | 15.45s (2.93GiB)   | 36.24s (4.46GiB) |
| DeepSeek-V3.1 (FP8)                   | 256xH20 TP16 | 1.40s       | 13.88s (2.54GiB)   | 33.30s (3.86GiB) |
| Kimi-K2-Instruct (FP8)                | 256xH20 TP16 | 1.88s       | 21.50s (2.99GiB)   | 34.49s (4.57GiB) |

All results above were produced with [`examples/update.py`](./examples/update.py), using [vLLM v0.10.2rc1](https://github.com/vllm-project/vllm/tree/v0.10.2rc1) as the inference engine. Some notes:

* The FP8 tests need additional vLLM patches; see [FP8 quantization](#fp8-quantization).
* Device Info: we tested various combinations of devices and parallelism setups. For example, a 256-GPU TP16 setup means we deploy 16 vLLM instances, each with 16-way tensor parallelism.
* Since the update duration is related to the IPC bucket size, we include the bucket size in the table.
* The P2P times were measured while updating no more than two nodes (16 GPUs) (`ParameterServer.update(ranks=range(0, 16))`) out of the entire cluster.

## Installation

To use the fastest broadcast implementation:

```Bash
pip install checkpoint-engine
```

To use the flexible P2P implementation (note that this also installs `mooncake-transfer-engine` to support RDMA transfers between different ranks):

```Bash
pip install 'checkpoint-engine[p2p]'
```

If the `NCCL_IB_HCA` environment variable is set, checkpoint-engine uses it to auto-select network devices for different ranks. If it is not set, checkpoint-engine reads all RDMA devices and tries to divide them among the ranks.

## Getting Started

Prepare an H800 or H20 machine with 8 GPUs and the latest vLLM. Be sure to include the [/collective_rpc API endpoint](https://github.com/vllm-project/vllm/commit/f7cf5b512ee41f36613deb2471a44de5f304f70d) commit (available on the main branch), since checkpoint-engine uses this endpoint to update weights.

```Bash
cd /opt && git clone https://github.com/vllm-project/vllm && cd vllm
uv venv --python 3.12 --seed
source .venv/bin/activate
VLLM_USE_PRECOMPILED=1 uv pip install --editable .
```

Install checkpoint-engine:

```Bash
uv pip install 'checkpoint-engine[p2p]'
```

We use `Qwen/Qwen3-235B-A22B-Instruct-2507` (BF16) as the test model:

```Bash
hf download Qwen/Qwen3-235B-A22B-Instruct-2507 --local-dir /opt/models/Qwen/Qwen3-235B-A22B-Instruct-2507/
```

Start vLLM in dev mode with `--load-format dummy`. Note that we also set `--worker-extension-cls=checkpoint_engine.worker.VllmColocateWorkerExtension`:

```Bash
VLLM_SERVER_DEV_MODE=1 python3 -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 19730 --trust-remote-code \
    --tensor-parallel-size=8 --max-model-len 4096 --load-format dummy \
    --served-model-name checkpoint-engine-demo --model /opt/models/Qwen/Qwen3-235B-A22B-Instruct-2507/ \
    --worker-extension-cls checkpoint_engine.worker.VllmColocateWorkerExtension
```

Meanwhile, use this command to update weights with checkpoint-engine. There is no need to wait for vLLM to become ready.

```Bash
torchrun --nproc-per-node 8 examples/update.py --update-method all --checkpoint-path /opt/models/Qwen/Qwen3-235B-A22B-Instruct-2507/
```

### Reuse weights from existing instances

New checkpoint-engine instances can join existing instances and reuse their weights. This is simple to achieve.

First, start the existing instances with `--save-metas-file global_metas.pkl` to save the global metas to a file, and use `--sleep-time 300` to make sure they stay alive.

```Bash
torchrun --nproc-per-node 8 examples/update.py --checkpoint-path $MODEL_PATH \
    --sleep-time 300 --save-metas-file global_metas.pkl
```

After a checkpoint is registered, new instances can obtain a copy of the checkpoint by setting `--load-metas-file global_metas.pkl`.

```Bash
torchrun --nproc-per-node 8 examples/update.py --load-metas-file global_metas.pkl
```

### FP8 quantization

FP8 quantization currently does not work natively in vLLM when updating weights.
We provide a simple patch in [`patches/vllm_fp8.patch`](./patches/vllm_fp8.patch) to handle the weight update correctly.
Note that this patch has only been tested with DeepSeek-V3.1 and Kimi-K2; other models may run into compatibility issues.

A [PR](https://github.com/vllm-project/vllm/pull/24488) has been opened against the vLLM project and is awaiting discussion and review.

### Test

Run a simple correctness test for checkpoint_engine:

```bash
torchrun --nproc-per-node 8 tests/test_update.py
```

## Limitations and Future Work

- This project is currently only tested with vLLM, but it should be easy to integrate with other frameworks such as SGLang.
- The perfect three-stage pipeline mentioned in our paper is not implemented yet. It could be useful for architectures where H2D and broadcast do not contend on PCIe.
- The P2P update method is not optimal yet, since data is received only on rank 0 and then broadcast to the other ranks synchronously. This is a potential future optimization.

## Acknowledgments

This open-source project uses the same vLLM interface as in https://github.com/vllm-project/vllm/pull/24295. Thanks to [youkaichao](https://github.com/youkaichao) for the comments and insights.

checkpoint_engine-0.1.2/README.md
@@ -0,0 +1,152 @@
# Checkpoint Engine

Checkpoint-engine is a simple middleware to update model weights in LLM inference engines -- a critical step in reinforcement learning.
We provide an efficient and lightweight implementation for in-place weight updates:
updating our [Kimi-K2](https://github.com/MoonshotAI/Kimi-K2) model (1 trillion parameters) across thousands of GPUs takes about 20s.

<div align="center">
  <picture>
      <img src="figures/checkpoint-engine.png" width="80%" alt="ckpt-engine">
  </picture>
</div>

## Architecture

The core weight update logic is in the `ParameterServer` class, a service colocated with the inference engines. It provides two implementations of weight update: Broadcast and P2P.

- **Broadcast**: Used when a large number of inference instances need to update weights synchronously. This is the fastest implementation and should be used as the default update method. See `_update_per_bucket`.
- **P2P**: Used when new inference instances are dynamically added (due to restarts or dynamic availability) while the existing instances are already serving requests. In this scenario, to avoid affecting the workloads on existing instances, we use the [`mooncake-transfer-engine`](https://github.com/kvcache-ai/Mooncake?tab=readme-ov-file#use-python-package) to send weights P2P from the CPUs of existing instances to the GPUs of new instances. See `_update_per_bucket_p2p`.

### Optimized Weight Broadcast

In the *Broadcast* implementation, checkpoint-engine holds references to sharded weights in CPU memory and needs to broadcast them efficiently to a cluster of inference instances, often under a different sharding pattern.
We arrange the data transfer into 3 stages:

1. H2D: move the weights to GPU memory. These weights may come from disk or from the training engine.
2. Broadcast: broadcast among the checkpoint-engine workers; the data ends up in a CUDA IPC buffer shared with the inference engine.
3. Reload: the inference engine decides which subset of the weights to copy from the broadcasted data.

Checkpoint-engine orchestrates the entire transfer process. It first gathers the necessary metadata to create a plan, including deciding the proper bucket size for data transfer.
It then executes the transfer, controlling the inference engine through a ZeroMQ socket. To maximize performance, it organizes the data transfers into a pipeline with overlapped communication and copy, illustrated below. The details can be found in the [Kimi-K2 Technical Report](https://arxiv.org/abs/2507.20534).

<div align="center">
  <picture>
      <img src="figures/pipeline.png" width="80%" alt="pipeline">
  </picture>
</div>

Pipelining naturally requires more GPU memory. When memory is insufficient, checkpoint-engine falls back to serial execution.
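
To make the overlap concrete, here is a minimal, self-contained sketch of the double-buffered H2D/broadcast idea. It is an illustration only, not checkpoint-engine's actual implementation: the function name and bucket layout are ours, all buckets are assumed to share one dtype, and `torch.distributed` is assumed to be initialized with the NCCL backend (one process per GPU).

```python
# Illustrative sketch only -- not checkpoint-engine's implementation.
# Overlaps the H2D copy of bucket i with the broadcast of bucket i-1
# using two CUDA streams and two reusable device buffers.
import torch
import torch.distributed as dist


def broadcast_buckets(cpu_buckets: list[torch.Tensor], src: int = 0) -> list[torch.Tensor]:
    copy_stream = torch.cuda.Stream()
    comm_stream = torch.cuda.Stream()
    max_numel = max(b.numel() for b in cpu_buckets)
    dtype = cpu_buckets[0].dtype  # assumption: all buckets share one dtype
    bufs = [torch.empty(max_numel, dtype=dtype, device="cuda") for _ in range(2)]
    h2d_done = [torch.cuda.Event() for _ in range(2)]
    bcast_done = [torch.cuda.Event() for _ in range(2)]
    for ev in bcast_done:
        ev.record()  # mark both buffers as initially free
    out = []
    for i, bucket in enumerate(cpu_buckets):
        buf = bufs[i % 2][: bucket.numel()]
        with torch.cuda.stream(copy_stream):
            copy_stream.wait_event(bcast_done[i % 2])  # buffer is reusable only after its last broadcast
            if dist.get_rank() == src:
                # pin the CPU buckets beforehand for a truly asynchronous copy
                buf.copy_(bucket, non_blocking=True)
            h2d_done[i % 2].record(copy_stream)
        with torch.cuda.stream(comm_stream):
            comm_stream.wait_event(h2d_done[i % 2])  # broadcast starts once the H2D copy finished
            dist.broadcast(buf, src=src)
            out.append(buf.clone())  # stand-in for the inference engine's "reload" copy
            bcast_done[i % 2].record(comm_stream)
    torch.cuda.synchronize()
    return out
```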

## Benchmark

| Model                                 | Device Info  | GatherMetas | Update (Broadcast) | Update (P2P)     |
| :------------------------------------ | :----------- | :---------- | :----------------- | :--------------- |
| GLM-4.5-Air (BF16)                    | 8xH800 TP8   | 0.17s       | 3.94s (1.42GiB)    | 8.83s (4.77GiB)  |
| Qwen3-235B-A22B-Instruct-2507 (BF16)  | 8xH800 TP8   | 0.46s       | 6.75s (2.69GiB)    | 16.47s (4.05GiB) |
| DeepSeek-V3.1 (FP8)                   | 16xH20 TP16  | 1.44s       | 12.22s (2.38GiB)   | 25.77s (3.61GiB) |
| Kimi-K2-Instruct (FP8)                | 16xH20 TP16  | 1.81s       | 15.45s (2.93GiB)   | 36.24s (4.46GiB) |
| DeepSeek-V3.1 (FP8)                   | 256xH20 TP16 | 1.40s       | 13.88s (2.54GiB)   | 33.30s (3.86GiB) |
| Kimi-K2-Instruct (FP8)                | 256xH20 TP16 | 1.88s       | 21.50s (2.99GiB)   | 34.49s (4.57GiB) |

All results above were produced with [`examples/update.py`](./examples/update.py), using [vLLM v0.10.2rc1](https://github.com/vllm-project/vllm/tree/v0.10.2rc1) as the inference engine. Some notes:

* The FP8 tests need additional vLLM patches; see [FP8 quantization](#fp8-quantization).
* Device Info: we tested various combinations of devices and parallelism setups. For example, a 256-GPU TP16 setup means we deploy 16 vLLM instances, each with 16-way tensor parallelism.
* Since the update duration is related to the IPC bucket size, we include the bucket size in the table.
* The P2P times were measured while updating no more than two nodes (16 GPUs) (`ParameterServer.update(ranks=range(0, 16))`) out of the entire cluster.

## Installation

To use the fastest broadcast implementation:

```Bash
pip install checkpoint-engine
```

To use the flexible P2P implementation (note that this also installs `mooncake-transfer-engine` to support RDMA transfers between different ranks):

```Bash
pip install 'checkpoint-engine[p2p]'
```

If the `NCCL_IB_HCA` environment variable is set, checkpoint-engine uses it to auto-select network devices for different ranks. If it is not set, checkpoint-engine reads all RDMA devices and tries to divide them among the ranks.
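
As a rough sketch of that selection logic (our own illustration, not the package's code; the helper name and the `/sys/class/infiniband` discovery path are assumptions), device assignment might look like this:

```python
# Rough illustration of the device-selection behaviour described above; not
# checkpoint-engine's actual code. Assumes torchrun-style LOCAL_RANK /
# LOCAL_WORLD_SIZE variables and RDMA devices visible under /sys/class/infiniband.
import os
from pathlib import Path


def pick_rdma_devices(local_rank: int, local_world_size: int) -> list[str]:
    hca = os.environ.get("NCCL_IB_HCA")
    if hca:
        # e.g. NCCL_IB_HCA="mlx5_0,mlx5_1,mlx5_2,mlx5_3"
        devices = [d for d in hca.split(",") if d]
    else:
        devices = sorted(p.name for p in Path("/sys/class/infiniband").iterdir())
    if not devices:
        return []
    # Divide the available devices evenly across the local ranks.
    per_rank = max(1, len(devices) // local_world_size)
    start = (local_rank * per_rank) % len(devices)
    return devices[start : start + per_rank]


if __name__ == "__main__":
    print(pick_rdma_devices(int(os.environ.get("LOCAL_RANK", "0")),
                            int(os.environ.get("LOCAL_WORLD_SIZE", "1"))))
```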

## Getting Started

Prepare an H800 or H20 machine with 8 GPUs and the latest vLLM. Be sure to include the [/collective_rpc API endpoint](https://github.com/vllm-project/vllm/commit/f7cf5b512ee41f36613deb2471a44de5f304f70d) commit (available on the main branch), since checkpoint-engine uses this endpoint to update weights.

```Bash
cd /opt && git clone https://github.com/vllm-project/vllm && cd vllm
uv venv --python 3.12 --seed
source .venv/bin/activate
VLLM_USE_PRECOMPILED=1 uv pip install --editable .
```

Install checkpoint-engine:

```Bash
uv pip install 'checkpoint-engine[p2p]'
```

We use `Qwen/Qwen3-235B-A22B-Instruct-2507` (BF16) as the test model:

```Bash
hf download Qwen/Qwen3-235B-A22B-Instruct-2507 --local-dir /opt/models/Qwen/Qwen3-235B-A22B-Instruct-2507/
```

Start vLLM in dev mode with `--load-format dummy`. Note that we also set `--worker-extension-cls=checkpoint_engine.worker.VllmColocateWorkerExtension`:

```Bash
VLLM_SERVER_DEV_MODE=1 python3 -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 19730 --trust-remote-code \
    --tensor-parallel-size=8 --max-model-len 4096 --load-format dummy \
    --served-model-name checkpoint-engine-demo --model /opt/models/Qwen/Qwen3-235B-A22B-Instruct-2507/ \
    --worker-extension-cls checkpoint_engine.worker.VllmColocateWorkerExtension
```

Meanwhile, use this command to update weights with checkpoint-engine. There is no need to wait for vLLM to become ready.

```Bash
torchrun --nproc-per-node 8 examples/update.py --update-method all --checkpoint-path /opt/models/Qwen/Qwen3-235B-A22B-Instruct-2507/
```
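
Once the weights have been pushed and vLLM finishes starting, a quick request against the OpenAI-compatible API is an easy sanity check. This snippet is our own example and not part of checkpoint-engine; the port matches the `--port 19730` used above, and `httpx` is already one of the package's dependencies.

```python
# Simple sanity check: list the models served by the vLLM instance started above.
import httpx

resp = httpx.get("http://localhost:19730/v1/models", timeout=10.0)
resp.raise_for_status()
print([m["id"] for m in resp.json()["data"]])  # expect: ['checkpoint-engine-demo']
```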

### Reuse weights from existing instances

New checkpoint-engine instances can join existing instances and reuse their weights. This is simple to achieve.

First, start the existing instances with `--save-metas-file global_metas.pkl` to save the global metas to a file, and use `--sleep-time 300` to make sure they stay alive.

```Bash
torchrun --nproc-per-node 8 examples/update.py --checkpoint-path $MODEL_PATH \
    --sleep-time 300 --save-metas-file global_metas.pkl
```

After a checkpoint is registered, new instances can obtain a copy of the checkpoint by setting `--load-metas-file global_metas.pkl`.

```Bash
torchrun --nproc-per-node 8 examples/update.py --load-metas-file global_metas.pkl
```

### FP8 quantization

FP8 quantization currently does not work natively in vLLM when updating weights.
We provide a simple patch in [`patches/vllm_fp8.patch`](./patches/vllm_fp8.patch) to handle the weight update correctly.
Note that this patch has only been tested with DeepSeek-V3.1 and Kimi-K2; other models may run into compatibility issues.

A [PR](https://github.com/vllm-project/vllm/pull/24488) has been opened against the vLLM project and is awaiting discussion and review.

### Test

Run a simple correctness test for checkpoint_engine:

```bash
torchrun --nproc-per-node 8 tests/test_update.py
```

## Limitations and Future Work

- This project is currently only tested with vLLM, but it should be easy to integrate with other frameworks such as SGLang.
- The perfect three-stage pipeline mentioned in our paper is not implemented yet. It could be useful for architectures where H2D and broadcast do not contend on PCIe.
- The P2P update method is not optimal yet, since data is received only on rank 0 and then broadcast to the other ranks synchronously. This is a potential future optimization.

## Acknowledgments

This open-source project uses the same vLLM interface as in https://github.com/vllm-project/vllm/pull/24295. Thanks to [youkaichao](https://github.com/youkaichao) for the comments and insights.

checkpoint_engine-0.1.2/checkpoint_engine/_version.py
@@ -0,0 +1,34 @@
# file generated by setuptools-scm
# don't change, don't track in version control

__all__ = [
    "__version__",
    "__version_tuple__",
    "version",
    "version_tuple",
    "__commit_id__",
    "commit_id",
]

TYPE_CHECKING = False
if TYPE_CHECKING:
    from typing import Tuple
    from typing import Union

    VERSION_TUPLE = Tuple[Union[int, str], ...]
    COMMIT_ID = Union[str, None]
else:
    VERSION_TUPLE = object
    COMMIT_ID = object

version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE
commit_id: COMMIT_ID
__commit_id__: COMMIT_ID

__version__ = version = '0.1.2'
__version_tuple__ = version_tuple = (0, 1, 2)

__commit_id__ = commit_id = 'g716c0dad9'