checkpoint-engine 0.1.3.tar.gz → 0.2.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. checkpoint_engine-0.2.1/.github/workflows/cpu-tests.yml +30 -0
  2. {checkpoint_engine-0.1.3 → checkpoint_engine-0.2.1}/PKG-INFO +70 -13
  3. {checkpoint_engine-0.1.3 → checkpoint_engine-0.2.1}/README.md +69 -12
  4. {checkpoint_engine-0.1.3 → checkpoint_engine-0.2.1}/checkpoint_engine/_version.py +3 -3
  5. checkpoint_engine-0.2.1/checkpoint_engine/device_utils.py +86 -0
  6. {checkpoint_engine-0.1.3 → checkpoint_engine-0.2.1}/checkpoint_engine/ps.py +368 -192
  7. checkpoint_engine-0.2.1/checkpoint_engine/worker.py +165 -0
  8. {checkpoint_engine-0.1.3 → checkpoint_engine-0.2.1}/checkpoint_engine.egg-info/PKG-INFO +70 -13
  9. {checkpoint_engine-0.1.3 → checkpoint_engine-0.2.1}/checkpoint_engine.egg-info/SOURCES.txt +5 -0
  10. checkpoint_engine-0.2.1/docs/npu_start.md +91 -0
  11. {checkpoint_engine-0.1.3 → checkpoint_engine-0.2.1}/pyproject.toml +5 -0
  12. checkpoint_engine-0.2.1/tests/test_assign_receiver_ranks.py +68 -0
  13. checkpoint_engine-0.2.1/tests/test_rdma_parser.py +197 -0
  14. checkpoint_engine-0.2.1/tests/test_update.py +234 -0
  15. checkpoint_engine-0.1.3/checkpoint_engine/worker.py +0 -109
  16. checkpoint_engine-0.1.3/tests/test_update.py +0 -90
  17. {checkpoint_engine-0.1.3 → checkpoint_engine-0.2.1}/.github/workflows/pre-commit.yaml +0 -0
  18. {checkpoint_engine-0.1.3 → checkpoint_engine-0.2.1}/.github/workflows/python-publish.yml +0 -0
  19. {checkpoint_engine-0.1.3 → checkpoint_engine-0.2.1}/.gitignore +0 -0
  20. {checkpoint_engine-0.1.3 → checkpoint_engine-0.2.1}/.pre-commit-config.yaml +0 -0
  21. {checkpoint_engine-0.1.3 → checkpoint_engine-0.2.1}/LICENCE +0 -0
  22. {checkpoint_engine-0.1.3 → checkpoint_engine-0.2.1}/checkpoint_engine/__init__.py +0 -0
  23. {checkpoint_engine-0.1.3 → checkpoint_engine-0.2.1}/checkpoint_engine.egg-info/dependency_links.txt +0 -0
  24. {checkpoint_engine-0.1.3 → checkpoint_engine-0.2.1}/checkpoint_engine.egg-info/requires.txt +0 -0
  25. {checkpoint_engine-0.1.3 → checkpoint_engine-0.2.1}/checkpoint_engine.egg-info/top_level.txt +0 -0
  26. {checkpoint_engine-0.1.3 → checkpoint_engine-0.2.1}/examples/update.py +0 -0
  27. {checkpoint_engine-0.1.3 → checkpoint_engine-0.2.1}/figures/checkpoint-engine.png +0 -0
  28. {checkpoint_engine-0.1.3 → checkpoint_engine-0.2.1}/figures/overlap-update-and-copy.png +0 -0
  29. {checkpoint_engine-0.1.3 → checkpoint_engine-0.2.1}/figures/pipeline.png +0 -0
  30. {checkpoint_engine-0.1.3 → checkpoint_engine-0.2.1}/patches/vllm_fp8.patch +0 -0
  31. {checkpoint_engine-0.1.3 → checkpoint_engine-0.2.1}/setup.cfg +0 -0

checkpoint_engine-0.2.1/.github/workflows/cpu-tests.yml

@@ -0,0 +1,30 @@
+ name: CPU Tests
+
+ on:
+   push:
+     branches: [main]
+   pull_request:
+     types: [opened, synchronize, reopened]
+
+
+ permissions:
+   contents: read
+
+ jobs:
+   build:
+     runs-on: ubuntu-latest
+     steps:
+       - name: Checkout code
+         uses: actions/checkout@v4
+       - name: Set up Python
+         uses: actions/setup-python@v3
+         with:
+           python-version: "3.10"
+       - name: Install dependencies
+         run: |
+           python -m pip install --upgrade pip
+           pip install pytest
+           pip install .[p2p]
+       - name: Do CPU tests with pytest
+         run: |
+           pytest -v -m "not gpu" tests/
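
The new workflow runs only tests that are not marked `gpu`. As a rough sketch of how a GPU-only test can opt out of the CPU job (the `gpu` marker registration and the test name below are assumptions for illustration, not taken from the repository):

```python
import pytest
import torch


@pytest.mark.gpu  # assumed marker; `pytest -m "not gpu"` deselects this test on CPU runners
def test_broadcast_update_on_gpu():
    if not torch.cuda.is_available():
        pytest.skip("requires at least one CUDA device")
    # placeholder check; the real GPU correctness tests live in tests/test_update.py
    assert torch.cuda.device_count() >= 1
```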

{checkpoint_engine-0.1.3 → checkpoint_engine-0.2.1}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: checkpoint-engine
- Version: 0.1.3
+ Version: 0.2.1
  Summary: checkpoint-engine is a lightweight, decoupling and efficient weight update middleware
  Project-URL: Homepage, https://github.com/MoonshotAI/checkpoint-engine
  Project-URL: Repository, https://github.com/MoonshotAI/checkpoint-engine

@@ -38,8 +38,8 @@ updating our [Kimi-K2](https://github.com/MoonshotAI/Kimi-K2) model (1 Trillion

  The core weight update logic is in `ParameterServer` class, a service colocated with inference engines. It provides two implementations of weight update: Broadcast and P2P.

- - **Broadcast**: Used when a large number of inference instances need to update weights in synchronous. This is the fastest implementation and should be used as the default update method. See `_update_per_bucket`.
- - **P2P**: Used when new inference instances are dynamically added (due to restarts or dynamic availability) while the existing instances are already serving requests. Under this scenario, to avoid affecting the workloads on existing instances, we use the [`mooncake-transfer-engine`](https://github.com/kvcache-ai/Mooncake?tab=readme-ov-file#use-python-package) to P2P send weights from CPUs in existing instances to GPUs in new instances. See `_update_per_bucket_p2p`.
+ - **Broadcast**: Used when a large number of inference instances need to update weights in synchronous. This is the fastest implementation and should be used as the default update method. See `_update_per_bucket` with `ranks == None or []`.
+ - **P2P**: Used when new inference instances are dynamically added (due to restarts or dynamic availability) while the existing instances are already serving requests. Under this scenario, to avoid affecting the workloads on existing instances, we use the [`mooncake-transfer-engine`](https://github.com/kvcache-ai/Mooncake?tab=readme-ov-file#use-python-package) to P2P send weights from CPUs in existing instances to GPUs in new instances. See `_update_per_bucket` with `ranks` specified.

  ### Optimized Weight Broadcast
  In the *Broadcast* implementation, the checkpoint-engine holds references to sharded weights in CPU memory, and need to efficiently broadcast them to a cluster of inference instances, often under a different sharding pattern.
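
Both update paths are now selected through the `ranks` argument of `_update_per_bucket` rather than through separate methods. A minimal caller-side sketch of that distinction, with the `ParameterServer` construction and any other required arguments treated as placeholders (see `examples/update.py` for the real invocation):

```python
from checkpoint_engine.ps import ParameterServer

ps = ParameterServer()  # hypothetical construction; see examples/update.py for actual setup

# Broadcast path: ranks left as None (or an empty list), so every colocated
# inference instance updates its weights together.
ps.update(ranks=None)

# P2P path: ranks given explicitly, so only the newly added instances (here the
# first two nodes, ranks 0-15) receive weights via mooncake-transfer-engine.
ps.update(ranks=range(0, 16))
```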

@@ -60,16 +60,22 @@ It then executes the transfer, where it controls the inference engine through a

  Pipelining naturally requires more GPU memory. When memory is not enough, checkpoint-engine will fallback to serial execution.

+ ### Optimized P2P Bucket Assignment
+ In the *P2P* implementation, checkpoint-engine needs to send weights from existing instances to new instances.
+ To minimize the overall transfer time, checkpoint-engine optimizes the bucket assignment for each sender-receiver pair.
+ The optimization goal is to make full use of the available network bandwidth for each sender and receiver.
+ See [issue #25](https://github.com/MoonshotAI/checkpoint-engine/issues/25)
+
  ## Benchmark

  | Model | Device Info | GatherMetas | Update (Broadcast) | Update (P2P) |
  | :----------------------------------- | :----------- | :---------- |:-------------------| :---------------------- |
- | GLM-4.5-Air (BF16) | 8xH800 TP8 | 0.17s | 3.94s (1.42GiB) | 8.83s (4.77GiB) |
- | Qwen3-235B-A22B-Instruct-2507 (BF16) | 8xH800 TP8 | 0.46s | 6.75s (2.69GiB) | 16.47s (4.05GiB) |
- | DeepSeek-V3.1 (FP8) | 16xH20 TP16 | 1.44s | 12.22s (2.38GiB) | 25.77s (3.61GiB) |
- | Kimi-K2-Instruct (FP8) | 16xH20 TP16 | 1.81s | 15.45s (2.93GiB) | 36.24s (4.46GiB) |
- | DeepSeek-V3.1 (FP8) | 256xH20 TP16 | 1.40s | 13.88s (2.54GiB) | 33.30s (3.86 GiB) |
- | Kimi-K2-Instruct (FP8) | 256xH20 TP16 | 1.88s | 21.50s (2.99GiB) | 34.49s (4.57 GiB) |
+ | GLM-4.5-Air (BF16) | 8xH800 TP8 | 0.12s | 3.47s (3.02GiB) | 4.12s (3.02GiB) |
+ | Qwen3-235B-A22B-Instruct-2507 (BF16) | 8xH800 TP8 | 0.33s | 6.22s (2.67GiB) | 7.10s (2.68GiB) |
+ | DeepSeek-V3.1 (FP8) | 16xH20 TP16 | 1.17s | 10.19s (5.39GiB) | 11.80s (5.41GiB) |
+ | Kimi-K2-Instruct (FP8) | 16xH20 TP16 | 1.33s | 14.36s (5.89GiB) | 17.49s (5.91GiB) |
+ | DeepSeek-V3.1 (FP8) | 256xH20 TP16 | 0.80s | 11.33s (8.00GiB) | 11.81s (8.00GiB) |
+ | Kimi-K2-Instruct (FP8) | 256xH20 TP16 | 1.22s | 16.04s (8.00GiB) | 16.75s (8.00GiB) |

  All results above are tested by [`examples/update.py`](./examples/update.py) and use [vLLM v0.10.2rc1](https://github.com/vllm-project/vllm/tree/v0.10.2rc1) as inference engine. Some notes:

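The bucket-assignment optimization itself lives in `checkpoint_engine/ps.py` and is exercised by the new `tests/test_assign_receiver_ranks.py`. The snippet below is only a hypothetical illustration of the balancing goal described above (spread receivers across senders so that no single sender's NIC becomes the bottleneck); it is not the project's actual algorithm:

```python
def assign_receivers(senders: list[int], receivers: list[int]) -> dict[int, list[int]]:
    """Round-robin pairing so each sender serves a similar number of receivers."""
    assignment: dict[int, list[int]] = {s: [] for s in senders}
    for i, r in enumerate(receivers):
        assignment[senders[i % len(senders)]].append(r)
    return assignment


# e.g. 4 existing ranks sending to 8 new ranks:
# {0: [8, 12], 1: [9, 13], 2: [10, 14], 3: [11, 15]}
print(assign_receivers([0, 1, 2, 3], list(range(8, 16))))
```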

@@ -77,6 +83,7 @@ All results above are tested by [`examples/update.py`](./examples/update.py) and
  * Device Info: we tested various combination of devices and parallelism setups. For example, a 256-GPU TP16 setup means that we deploy 16 vLLM instances, each with 16-way tensor parallelism.
  * Since update duration is related to IPC bucket size, we provide the bucket size in the table.
  * The P2P time were tested for updating no more than two nodes (16 GPUs) (`ParameterServer.update(ranks=range(0, 16))`) out of the entire cluster.
+ * We bind each GPU to its corresponding NUMA node to ensure stable H2D transfer speeds.

  ## Installation


@@ -92,7 +99,7 @@ Use the flexible P2P implementation, notice this will install `mooncake-transfer
  pip install 'checkpoint-engine[p2p]'
  ```

- If set `NCCL_IB_HCA` env, checkpoint-engine will use it to auto select net devices for different ranks. If not set, it will read all RDMA devices and try to divide them into each rank.
+ If set `NCCL_IB_HCA` env, checkpoint-engine will use it to auto select net devices for different ranks. Available patterns can be found from [NCCL documentation](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#id8). If not set, it will read all RDMA devices and try to divide them into each rank.

  ## Getting Started

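For example, the variable can be set before the parameter server starts; the `mlx5_*` device names below are placeholders for whatever `ibv_devices` reports on your hosts:

```python
import os

# Explicit include list: restrict checkpoint-engine to these HCAs (placeholder names).
os.environ["NCCL_IB_HCA"] = "mlx5_0,mlx5_1"

# Or exclude devices with the ^ prefix, following the NCCL pattern syntax:
# os.environ["NCCL_IB_HCA"] = "^mlx5_4,mlx5_5"
```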

@@ -162,14 +169,64 @@ A [PR](https://github.com/vllm-project/vllm/pull/24488) is opened to the vLLM pr
  Run a simple correctness test for checkpoint_engine

  ```bash
- torchrun --nproc-per-node 8 tests/test_update.py
+ pytest tests/test_update.py
+ ```
+
+ `test_update.py` are only designed to run with `pytest`. Please don't run it directly with `torchrun`.
+
+ Other unit tests can also be done with pytest. Only test_update.py requires GPUs, other tests can be run on CPUs. Only to run CPU tests, use:
+
+ ```bash
+ pytest tests/ -m "not gpu"
+ ```
+
+ ## SGLang Integration
+
+ Checkpoint Engine provides efficient distributed checkpoint loading for SGLang inference servers, significantly reducing model loading time for large models and multi-node setups.
+
+ ### Quick Start
+
+ **1. Install checkpoint-engine:**
+ ```bash
+ pip install 'checkpoint-engine[p2p]'
+ ```
+
+ **2. Launch SGLang server:**
+ ```bash
+ python -m sglang.launch_server \
+     --model-path $MODEL_PATH \
+     --tp 8 \
+     --load-format dummy \
+     --wait-for-initial-weights
+ ```
+
+ **3. Run checkpoint engine:**
+ ```bash
+ python -m sglang.srt.checkpoint_engine.update \
+     --update-method broadcast \
+     --checkpoint-path $MODEL_PATH \
+     --inference-parallel-size 8
  ```

+ ### Multi-Node Setup
+
+ For 2-node setup, run the same commands on both nodes with appropriate `--host` and distributed training parameters.
+
+ ### Key Options
+
+ **SGLang Server:**
+ - `--wait-for-initial-weights`: Wait for checkpoint engine before becoming ready
+ - `--load-format dummy`: Enable overlapping initialization tasks
+
+ **Checkpoint Engine:**
+ - `--update-method`: Choose `broadcast`, `p2p`, or `all`
+ - `--inference-parallel-size`: Number of parallel processes
+ - `--checkpoint-path`: Model checkpoint directory
+
  ## Limitations and Future Work

- - This project is currently only tested with vLLM. But it is easy to integrate with other frameworks like SGLang.
+ - This project is currently tested with vLLM and SGLang. Integration with other frameworks is planned for future releases.
  - The perfect three-stage pipeline mentioned in our paper is currently not implemented. This could be useful for architectures where H2D and broadcast do not conflict in PCIE.
- - The P2P update method is currently not the optimal implementation since it will receive data only in rank 0 and broadcast to others synchronizely. This is a potential optimization in the future.

  ## Acknowledgments


{checkpoint_engine-0.1.3 → checkpoint_engine-0.2.1}/README.md

@@ -14,8 +14,8 @@ updating our [Kimi-K2](https://github.com/MoonshotAI/Kimi-K2) model (1 Trillion

  The core weight update logic is in `ParameterServer` class, a service colocated with inference engines. It provides two implementations of weight update: Broadcast and P2P.

- - **Broadcast**: Used when a large number of inference instances need to update weights in synchronous. This is the fastest implementation and should be used as the default update method. See `_update_per_bucket`.
- - **P2P**: Used when new inference instances are dynamically added (due to restarts or dynamic availability) while the existing instances are already serving requests. Under this scenario, to avoid affecting the workloads on existing instances, we use the [`mooncake-transfer-engine`](https://github.com/kvcache-ai/Mooncake?tab=readme-ov-file#use-python-package) to P2P send weights from CPUs in existing instances to GPUs in new instances. See `_update_per_bucket_p2p`.
+ - **Broadcast**: Used when a large number of inference instances need to update weights in synchronous. This is the fastest implementation and should be used as the default update method. See `_update_per_bucket` with `ranks == None or []`.
+ - **P2P**: Used when new inference instances are dynamically added (due to restarts or dynamic availability) while the existing instances are already serving requests. Under this scenario, to avoid affecting the workloads on existing instances, we use the [`mooncake-transfer-engine`](https://github.com/kvcache-ai/Mooncake?tab=readme-ov-file#use-python-package) to P2P send weights from CPUs in existing instances to GPUs in new instances. See `_update_per_bucket` with `ranks` specified.

  ### Optimized Weight Broadcast
  In the *Broadcast* implementation, the checkpoint-engine holds references to sharded weights in CPU memory, and need to efficiently broadcast them to a cluster of inference instances, often under a different sharding pattern.

@@ -36,16 +36,22 @@ It then executes the transfer, where it controls the inference engine through a

  Pipelining naturally requires more GPU memory. When memory is not enough, checkpoint-engine will fallback to serial execution.

+ ### Optimized P2P Bucket Assignment
+ In the *P2P* implementation, checkpoint-engine needs to send weights from existing instances to new instances.
+ To minimize the overall transfer time, checkpoint-engine optimizes the bucket assignment for each sender-receiver pair.
+ The optimization goal is to make full use of the available network bandwidth for each sender and receiver.
+ See [issue #25](https://github.com/MoonshotAI/checkpoint-engine/issues/25)
+
  ## Benchmark

  | Model | Device Info | GatherMetas | Update (Broadcast) | Update (P2P) |
  | :----------------------------------- | :----------- | :---------- |:-------------------| :---------------------- |
- | GLM-4.5-Air (BF16) | 8xH800 TP8 | 0.17s | 3.94s (1.42GiB) | 8.83s (4.77GiB) |
- | Qwen3-235B-A22B-Instruct-2507 (BF16) | 8xH800 TP8 | 0.46s | 6.75s (2.69GiB) | 16.47s (4.05GiB) |
- | DeepSeek-V3.1 (FP8) | 16xH20 TP16 | 1.44s | 12.22s (2.38GiB) | 25.77s (3.61GiB) |
- | Kimi-K2-Instruct (FP8) | 16xH20 TP16 | 1.81s | 15.45s (2.93GiB) | 36.24s (4.46GiB) |
- | DeepSeek-V3.1 (FP8) | 256xH20 TP16 | 1.40s | 13.88s (2.54GiB) | 33.30s (3.86 GiB) |
- | Kimi-K2-Instruct (FP8) | 256xH20 TP16 | 1.88s | 21.50s (2.99GiB) | 34.49s (4.57 GiB) |
+ | GLM-4.5-Air (BF16) | 8xH800 TP8 | 0.12s | 3.47s (3.02GiB) | 4.12s (3.02GiB) |
+ | Qwen3-235B-A22B-Instruct-2507 (BF16) | 8xH800 TP8 | 0.33s | 6.22s (2.67GiB) | 7.10s (2.68GiB) |
+ | DeepSeek-V3.1 (FP8) | 16xH20 TP16 | 1.17s | 10.19s (5.39GiB) | 11.80s (5.41GiB) |
+ | Kimi-K2-Instruct (FP8) | 16xH20 TP16 | 1.33s | 14.36s (5.89GiB) | 17.49s (5.91GiB) |
+ | DeepSeek-V3.1 (FP8) | 256xH20 TP16 | 0.80s | 11.33s (8.00GiB) | 11.81s (8.00GiB) |
+ | Kimi-K2-Instruct (FP8) | 256xH20 TP16 | 1.22s | 16.04s (8.00GiB) | 16.75s (8.00GiB) |

  All results above are tested by [`examples/update.py`](./examples/update.py) and use [vLLM v0.10.2rc1](https://github.com/vllm-project/vllm/tree/v0.10.2rc1) as inference engine. Some notes:


@@ -53,6 +59,7 @@ All results above are tested by [`examples/update.py`](./examples/update.py) and
  * Device Info: we tested various combination of devices and parallelism setups. For example, a 256-GPU TP16 setup means that we deploy 16 vLLM instances, each with 16-way tensor parallelism.
  * Since update duration is related to IPC bucket size, we provide the bucket size in the table.
  * The P2P time were tested for updating no more than two nodes (16 GPUs) (`ParameterServer.update(ranks=range(0, 16))`) out of the entire cluster.
+ * We bind each GPU to its corresponding NUMA node to ensure stable H2D transfer speeds.

  ## Installation


@@ -68,7 +75,7 @@ Use the flexible P2P implementation, notice this will install `mooncake-transfer
  pip install 'checkpoint-engine[p2p]'
  ```

- If set `NCCL_IB_HCA` env, checkpoint-engine will use it to auto select net devices for different ranks. If not set, it will read all RDMA devices and try to divide them into each rank.
+ If set `NCCL_IB_HCA` env, checkpoint-engine will use it to auto select net devices for different ranks. Available patterns can be found from [NCCL documentation](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#id8). If not set, it will read all RDMA devices and try to divide them into each rank.

  ## Getting Started


@@ -138,14 +145,64 @@ A [PR](https://github.com/vllm-project/vllm/pull/24488) is opened to the vLLM pr
  Run a simple correctness test for checkpoint_engine

  ```bash
- torchrun --nproc-per-node 8 tests/test_update.py
+ pytest tests/test_update.py
+ ```
+
+ `test_update.py` are only designed to run with `pytest`. Please don't run it directly with `torchrun`.
+
+ Other unit tests can also be done with pytest. Only test_update.py requires GPUs, other tests can be run on CPUs. Only to run CPU tests, use:
+
+ ```bash
+ pytest tests/ -m "not gpu"
+ ```
+
+ ## SGLang Integration
+
+ Checkpoint Engine provides efficient distributed checkpoint loading for SGLang inference servers, significantly reducing model loading time for large models and multi-node setups.
+
+ ### Quick Start
+
+ **1. Install checkpoint-engine:**
+ ```bash
+ pip install 'checkpoint-engine[p2p]'
+ ```
+
+ **2. Launch SGLang server:**
+ ```bash
+ python -m sglang.launch_server \
+     --model-path $MODEL_PATH \
+     --tp 8 \
+     --load-format dummy \
+     --wait-for-initial-weights
+ ```
+
+ **3. Run checkpoint engine:**
+ ```bash
+ python -m sglang.srt.checkpoint_engine.update \
+     --update-method broadcast \
+     --checkpoint-path $MODEL_PATH \
+     --inference-parallel-size 8
  ```

+ ### Multi-Node Setup
+
+ For 2-node setup, run the same commands on both nodes with appropriate `--host` and distributed training parameters.
+
+ ### Key Options
+
+ **SGLang Server:**
+ - `--wait-for-initial-weights`: Wait for checkpoint engine before becoming ready
+ - `--load-format dummy`: Enable overlapping initialization tasks
+
+ **Checkpoint Engine:**
+ - `--update-method`: Choose `broadcast`, `p2p`, or `all`
+ - `--inference-parallel-size`: Number of parallel processes
+ - `--checkpoint-path`: Model checkpoint directory
+
  ## Limitations and Future Work

- - This project is currently only tested with vLLM. But it is easy to integrate with other frameworks like SGLang.
+ - This project is currently tested with vLLM and SGLang. Integration with other frameworks is planned for future releases.
  - The perfect three-stage pipeline mentioned in our paper is currently not implemented. This could be useful for architectures where H2D and broadcast do not conflict in PCIE.
- - The P2P update method is currently not the optimal implementation since it will receive data only in rank 0 and broadcast to others synchronizely. This is a potential optimization in the future.

  ## Acknowledgments


{checkpoint_engine-0.1.3 → checkpoint_engine-0.2.1}/checkpoint_engine/_version.py

@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
  commit_id: COMMIT_ID
  __commit_id__: COMMIT_ID

- __version__ = version = '0.1.3'
- __version_tuple__ = version_tuple = (0, 1, 3)
+ __version__ = version = '0.2.1'
+ __version_tuple__ = version_tuple = (0, 2, 1)

- __commit_id__ = commit_id = 'g8a60e65ba'
+ __commit_id__ = commit_id = 'g279a908a9'

checkpoint_engine-0.2.1/checkpoint_engine/device_utils.py

@@ -0,0 +1,86 @@
+ import os
+ import re
+ import socket
+ import subprocess
+ from functools import lru_cache
+
+ import torch
+ from loguru import logger
+
+
+ @lru_cache(maxsize=1)
+ def get_ip() -> str:
+     try:
+         # try to get ip from network interface
+         with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
+             s.connect(("8.8.8.8", 80))
+             return s.getsockname()[0]
+     except Exception as e:  # noqa: BLE001
+         # fallback to get ip from hostname
+         logger.warning(
+             f"fail to get ip from network interface, fallback to get ip from hostname: {e}"
+         )
+         return socket.gethostbyname(socket.gethostname())
+
+
+ def npu_generate_uuid() -> str:
+     str_pid = str(os.getpid())
+     npu_num = 8
+     try:
+         for npu_id in range(npu_num):
+             cmd = ["npu-smi", "info", "-t", "proc-mem", "-i", str(npu_id)]
+             result = subprocess.run(cmd, check=True, capture_output=True, text=True)  # noqa: S603
+             str_result = str(result.stdout)
+             if str_pid in str_result:
+                 # In A3 server, one NPU has two chips.
+                 match_chip_count = re.search(r"Chip Count[^\d]*(\d+)", str_result)
+                 chip_count = int(match_chip_count.group(1))
+                 search_after_pid = str_result[str_result.find(str_pid) + len(str_pid) :]
+                 match_chip_id = re.search(r"Chip ID[^\d]*(\d+)", search_after_pid)
+                 chip_id = int(match_chip_id.group(1))
+                 return f"{get_ip()}-{npu_id * chip_count + chip_id}"
+         raise ValueError("The current process is not running on the npu device")
+     except subprocess.CalledProcessError as e:
+         raise ValueError("The current process is not running on the npu device") from e
+
+
+ class DeviceManager:
+     def __init__(self):
+         self.device_type = self._detect_device_type()
+         self._setup_device_module()
+
+     def _is_torch_npu_available(self) -> bool:
+         try:
+             if hasattr(torch, "npu") and callable(getattr(torch.npu, "is_available", None)):
+                 return torch.npu.is_available()
+             else:
+                 return False
+         except ImportError:
+             return False
+
+     def _detect_device_type(self) -> str:
+         if self._is_torch_npu_available():
+             return "npu"
+         elif torch.cuda.is_available():
+             return "cuda"
+         else:
+             raise TypeError("The current device type is not supported")
+
+     def _setup_device_module(self):
+         if self.device_type == "npu":
+             import torch_npu
+
+             self.device_module = torch_npu.npu
+         elif self.device_type == "cuda":
+             self.device_module = torch.cuda
+         else:
+             raise TypeError("The current device type is not supported")
+
+     @property
+     def backend(self) -> str:
+         if self.device_type == "npu":
+             return "hccl"
+         elif self.device_type == "cuda":
+             return "nccl"
+         else:
+             raise TypeError("The current device type is not supported")
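
A minimal usage sketch for the new `DeviceManager` (assumed for illustration, not taken from `ps.py` or `worker.py`): resolve the device module and collective backend once, then keep the rest of the code device-agnostic across CUDA GPUs and Ascend NPUs:

```python
import torch.distributed as dist

from checkpoint_engine.device_utils import DeviceManager

dm = DeviceManager()             # raises TypeError if neither CUDA nor an NPU is available
dm.device_module.set_device(0)   # torch.cuda on GPU hosts, torch_npu.npu on Ascend hosts

# Assuming the usual launcher environment variables (RANK, WORLD_SIZE, MASTER_ADDR, ...):
dist.init_process_group(backend=dm.backend)  # "nccl" on CUDA, "hccl" on NPU
```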