gpustack-runner 0.1.22.post2__tar.gz → 0.1.22.post4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/Makefile +1 -1
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/PKG-INFO +13 -12
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/README.md +12 -11
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/gpustack_runner/_version.py +2 -2
- gpustack_runner-0.1.22.post4/gpustack_runner/_version_appendix.py +1 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/gpustack_runner/runner.py.json +132 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/cuda/Dockerfile +24 -14
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/matrix.yaml +0 -1
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/rocm/Dockerfile +20 -6
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tests/gpustack_runner/fixtures/test_list_runners_by_backend.json +132 -0
- gpustack_runner-0.1.22.post2/gpustack_runner/_version_appendix.py +0 -1
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/.codespelldict +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/.codespellrc +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/.gitattributes +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/.gitignore +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/.pre-commit-config.yaml +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/.python-version +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/LICENSE +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/docs/index.md +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/docs/modules/gpustack_runner.md +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/gpustack_runner/__init__.py +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/gpustack_runner/__main__.py +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/gpustack_runner/_version.pyi +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/gpustack_runner/cmds/__init__.py +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/gpustack_runner/cmds/__types__.py +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/gpustack_runner/cmds/images.py +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/gpustack_runner/runner.py +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/hatch.toml +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/mkdocs.yml +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251020_vllm_install_lmcache/cann/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251020_vllm_install_lmcache/cuda/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251020_vllm_install_lmcache/matrix.yaml +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251020_vllm_install_lmcache/rocm/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251022_vllm_install_ray_client/cann/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251022_vllm_install_ray_client/cuda/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251022_vllm_install_ray_client/matrix.yaml +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251022_vllm_install_ray_client/rocm/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251022_vllm_install_ray_default/cuda/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251022_vllm_install_ray_default/matrix.yaml +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251022_vllm_install_ray_default/rocm/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251024_vllm_install_nvidia_hpcx/cuda/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251024_vllm_install_nvidia_hpcx/matrix.yaml +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251024_vllm_reinstall_lmcache/cuda/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251024_vllm_reinstall_lmcache/matrix.yaml +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251029_vllm_reinstall_ray/cann/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251029_vllm_reinstall_ray/matrix.yaml +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251103_mindie_refresh_entrypoint/cann/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251103_mindie_refresh_entrypoint/matrix.yaml +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251105_vllm_polish_nvidia_hpcx/cuda/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251105_vllm_polish_nvidia_hpcx/matrix.yaml +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251106_vllm_install_ep_kernel/cuda/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251106_vllm_install_ep_kernel/matrix.yaml +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251107_vllm_reinstall_lmcache/cuda/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251107_vllm_reinstall_lmcache/matrix.yaml +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251110_sglang_install_diffusion/cuda/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251110_sglang_install_diffusion/matrix.yaml +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251110_sglang_install_flashattn/cuda/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251110_sglang_install_flashattn/matrix.yaml +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251125_mindie_install_posix_ipc/cann/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251125_mindie_install_posix_ipc/matrix.yaml +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251201_vllm_patch_qwen2_5_vl/cuda/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251201_vllm_patch_qwen2_5_vl/cuda/patches/vllm_001_disable_flashatten_in_qwen2_5_vl.patch +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251201_vllm_patch_qwen2_5_vl/matrix.yaml +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251209_mindie_install_av/cann/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251209_mindie_install_av/matrix.yaml +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251213_mindie_patch_minicpm_qwen2_v2/cann/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251213_mindie_patch_minicpm_qwen2_v2/cann/patches.zip +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251213_mindie_patch_minicpm_qwen2_v2/matrix.yaml +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251213_sglang_patch_server_args/cuda/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251213_sglang_patch_server_args/cuda/patches/sglang_001_fix_server_args.patch +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251213_sglang_patch_server_args/matrix.yaml +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251214_cuda_several_patches/cuda/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251214_cuda_several_patches/matrix.yaml +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251215_cann_several_patches/cann/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251215_cann_several_patches/matrix.yaml +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251216_sglang_uninstall_runai_model_streamer/cuda/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251216_sglang_uninstall_runai_model_streamer/matrix.yaml +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251219_rocm_install_petit_kernel/matrix.yaml +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251219_rocm_install_petit_kernel/rocm/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251219_vllm_install_audio_extra/cuda/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251219_vllm_install_audio_extra/matrix.yaml +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251219_vllm_install_audio_extra/rocm/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/README.md +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/cann/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/cann/mindie-atb-models_2.2.rc1_linux-amd64_py3.11_torch2.1.0-abi0.tar.gz +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/cann/mindie-atb-models_2.2.rc1_linux-arm64_py3.11_torch2.1.0-abi0.tar.gz +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/cann/patches/mindie.zip +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/corex/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/discard_runner.sh +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/dtk/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/expand_matrix.sh +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/maca/Dockerfile +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/merge_runner.sh +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/prune_runner.sh +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/rocm/patches/sglang_001_wrong_vram.patch +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pyproject.toml +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pytest.ini +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/ruff.toml +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tests/gpustack_runner/fixtures/__init__.py +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tests/gpustack_runner/fixtures/test_docker_image.json +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tests/gpustack_runner/fixtures/test_list_backend_runners.json +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tests/gpustack_runner/fixtures/test_list_runners_by_prefix.json +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tests/gpustack_runner/fixtures/test_list_service_runners.json +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tests/gpustack_runner/test_runner.py +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tools/activate +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tools/chat.sh +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tools/chat_tool_current_date_time.sh +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tools/chat_tool_get_temperature.sh +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tools/chat_tool_get_weather.sh +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tools/chat_tool_square_of_number.sh +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tools/chat_tool_square_root_of_number.sh +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tools/chat_tool_where_am_i.sh +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tools/run_runner.sh +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tools/run_runner_cluster.sh +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/uv.lock +0 -0
- {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/uv.toml +0 -0
|
@@ -117,7 +117,7 @@ package:
|
|
|
117
117
|
JOB_EXTRA_ARGS+=("--cache-from=type=registry,ref=gpustack/runner-build-cache:$${TAG_CACHE}"); \
|
|
118
118
|
done; \
|
|
119
119
|
fi; \
|
|
120
|
-
if [[ "$(PACKAGE_PUSH)" == "true"
|
|
120
|
+
if [[ "$(PACKAGE_PUSH)" == "true" || "$(PACKAGE_CACHE_PUSH)" == "true" ]] && [[ -z "$(PACKAGE_POST_OPERATION)" ]]; then \
|
|
121
121
|
for TAG_CACHE in $${JOB_PLATFORM_CACHE}; do \
|
|
122
122
|
JOB_EXTRA_ARGS+=("--cache-to=type=registry,ignore-error=true,mode=max,compression=gzip,ref=$(PACKAGE_NAMESPACE)/$(PACKAGE_CACHE_REPOSITORY):$${TAG_CACHE}"); \
|
|
123
123
|
done; \
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gpustack-runner
|
|
3
|
-
Version: 0.1.22.
|
|
3
|
+
Version: 0.1.22.post4
|
|
4
4
|
Summary: GPUStack Runner is library for registering runnable accelerated backends and services in GPUStack.
|
|
5
5
|
Project-URL: Homepage, https://github.com/gpustack/runner
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/gpustack/gpustack/issues
|
|
@@ -86,12 +86,12 @@ The following table lists the supported accelerated backends and their correspon
|
|
|
86
86
|
> - Applied [Qwen2.5 VL patched](https://github.com/gpustack/gpustack/issues/3606) to vLLM 0.11.2.
|
|
87
87
|
> - Applied [vLLM[audio] packages](https://github.com/vllm-project/vllm/blob/275de34170654274616082721348b7edd9741d32/setup.py#L720-L724) to vLLM 0.11.2.
|
|
88
88
|
|
|
89
|
-
| CUDA Version <br/> (Variant) | vLLM
|
|
90
|
-
|
|
91
|
-
| 12.9 | `0.12.0`,
|
|
92
|
-
| 12.8 | `0.12.0`,
|
|
93
|
-
| 12.6 | `0.12.0`,
|
|
94
|
-
| 12.4 | `0.11.0`, `0.10.2`, <br/>`0.10.1.1`, `0.10.0`
|
|
89
|
+
| CUDA Version <br/> (Variant) | vLLM | SGLang | VoxBox |
|
|
90
|
+
|------------------------------|-------------------------------------------------------------------------------------------|-----------------------------------------------------------|--------------------|
|
|
91
|
+
| 12.9 | `0.13.0`, `0.12.0`, <br/>**`0.11.2`** | `0.5.6.post2` | |
|
|
92
|
+
| 12.8 | `0.13.0`, `0.12.0`, <br/>**`0.11.2`**, `0.11.0`, <br/>`0.10.2`, `0.10.1.1`, <br/>`0.10.0` | `0.5.6.post2`, `0.5.5.post3`, <br/>`0.5.5`, `0.5.4.post3` | `0.0.21`, `0.0.20` |
|
|
93
|
+
| 12.6 | `0.13.0`, `0.12.0`, <br/>**`0.11.2`**, `0.11.0`, <br/>`0.10.2`, `0.10.1.1`, <br/>`0.10.0` | `0.5.6.post2` | `0.0.21`, `0.0.20` |
|
|
94
|
+
| 12.4 | `0.11.0`, `0.10.2`, <br/>`0.10.1.1`, `0.10.0` | | `0.0.20` |
|
|
95
95
|
|
|
96
96
|
### Hygon DTK
|
|
97
97
|
|
|
@@ -118,16 +118,17 @@ The following table lists the supported accelerated backends and their correspon
|
|
|
118
118
|
> - ROCm 7.0 vLLM `0.11.2/0.11.0` are reusing the official ROCm 6.4 PyTorch 2.9 wheel package rather than a ROCm
|
|
119
119
|
7.0 specific PyTorch build. Although supports ROCm 7.0 in vLLM `0.11.2/0.11.0`, `gfx1150/gfx1151` are not supported yet.
|
|
120
120
|
> - SGLang supports `gfx942` only.
|
|
121
|
+
> - ROCm 6.4 vLLM `0.13.0` supports `gfx903 gfx90a gfx942` only.
|
|
121
122
|
|
|
122
123
|
> [!IMPORTANT]
|
|
123
124
|
> - Applied [vLLM[audio] packages](https://github.com/vllm-project/vllm/blob/275de34170654274616082721348b7edd9741d32/setup.py#L720-L724) to vLLM 0.11.2.
|
|
124
125
|
> - Applied [petit-kernel package](https://github.com/vllm-project/vllm/blob/275de34170654274616082721348b7edd9741d32/setup.py#L728) to vLLM 0.11.2 and SGLang 0.5.5.post3.
|
|
125
126
|
|
|
126
|
-
| ROCm Version <br/> (Variant) | vLLM
|
|
127
|
-
|
|
128
|
-
| 7.0 | `0.12.0`,
|
|
129
|
-
| 6.4 | `0.12.0`,
|
|
130
|
-
| 6.3 | `0.10.1.1`, `0.10.0`
|
|
127
|
+
| ROCm Version <br/> (Variant) | vLLM | SGLang |
|
|
128
|
+
|------------------------------|-------------------------------------------------|----------------------------------|
|
|
129
|
+
| 7.0 | `0.13.0`, `0.12.0`, <br/>**`0.11.2`**, `0.11.0` | `0.5.6.post2` |
|
|
130
|
+
| 6.4 | `0.13.0`, `0.12.0`, <br/>**`0.11.2`**, `0.10.2` | `0.5.6.post2`, **`0.5.5.post3`** |
|
|
131
|
+
| 6.3 | `0.10.1.1`, `0.10.0` | |
|
|
131
132
|
|
|
132
133
|
## Directory Structure
|
|
133
134
|
|
|
@@ -66,12 +66,12 @@ The following table lists the supported accelerated backends and their correspon
|
|
|
66
66
|
> - Applied [Qwen2.5 VL patched](https://github.com/gpustack/gpustack/issues/3606) to vLLM 0.11.2.
|
|
67
67
|
> - Applied [vLLM[audio] packages](https://github.com/vllm-project/vllm/blob/275de34170654274616082721348b7edd9741d32/setup.py#L720-L724) to vLLM 0.11.2.
|
|
68
68
|
|
|
69
|
-
| CUDA Version <br/> (Variant) | vLLM
|
|
70
|
-
|
|
71
|
-
| 12.9 | `0.12.0`,
|
|
72
|
-
| 12.8 | `0.12.0`,
|
|
73
|
-
| 12.6 | `0.12.0`,
|
|
74
|
-
| 12.4 | `0.11.0`, `0.10.2`, <br/>`0.10.1.1`, `0.10.0`
|
|
69
|
+
| CUDA Version <br/> (Variant) | vLLM | SGLang | VoxBox |
|
|
70
|
+
|------------------------------|-------------------------------------------------------------------------------------------|-----------------------------------------------------------|--------------------|
|
|
71
|
+
| 12.9 | `0.13.0`, `0.12.0`, <br/>**`0.11.2`** | `0.5.6.post2` | |
|
|
72
|
+
| 12.8 | `0.13.0`, `0.12.0`, <br/>**`0.11.2`**, `0.11.0`, <br/>`0.10.2`, `0.10.1.1`, <br/>`0.10.0` | `0.5.6.post2`, `0.5.5.post3`, <br/>`0.5.5`, `0.5.4.post3` | `0.0.21`, `0.0.20` |
|
|
73
|
+
| 12.6 | `0.13.0`, `0.12.0`, <br/>**`0.11.2`**, `0.11.0`, <br/>`0.10.2`, `0.10.1.1`, <br/>`0.10.0` | `0.5.6.post2` | `0.0.21`, `0.0.20` |
|
|
74
|
+
| 12.4 | `0.11.0`, `0.10.2`, <br/>`0.10.1.1`, `0.10.0` | | `0.0.20` |
|
|
75
75
|
|
|
76
76
|
### Hygon DTK
|
|
77
77
|
|
|
@@ -98,16 +98,17 @@ The following table lists the supported accelerated backends and their correspon
|
|
|
98
98
|
> - ROCm 7.0 vLLM `0.11.2/0.11.0` are reusing the official ROCm 6.4 PyTorch 2.9 wheel package rather than a ROCm
|
|
99
99
|
7.0 specific PyTorch build. Although supports ROCm 7.0 in vLLM `0.11.2/0.11.0`, `gfx1150/gfx1151` are not supported yet.
|
|
100
100
|
> - SGLang supports `gfx942` only.
|
|
101
|
+
> - ROCm 6.4 vLLM `0.13.0` supports `gfx903 gfx90a gfx942` only.
|
|
101
102
|
|
|
102
103
|
> [!IMPORTANT]
|
|
103
104
|
> - Applied [vLLM[audio] packages](https://github.com/vllm-project/vllm/blob/275de34170654274616082721348b7edd9741d32/setup.py#L720-L724) to vLLM 0.11.2.
|
|
104
105
|
> - Applied [petit-kernel package](https://github.com/vllm-project/vllm/blob/275de34170654274616082721348b7edd9741d32/setup.py#L728) to vLLM 0.11.2 and SGLang 0.5.5.post3.
|
|
105
106
|
|
|
106
|
-
| ROCm Version <br/> (Variant) | vLLM
|
|
107
|
-
|
|
108
|
-
| 7.0 | `0.12.0`,
|
|
109
|
-
| 6.4 | `0.12.0`,
|
|
110
|
-
| 6.3 | `0.10.1.1`, `0.10.0`
|
|
107
|
+
| ROCm Version <br/> (Variant) | vLLM | SGLang |
|
|
108
|
+
|------------------------------|-------------------------------------------------|----------------------------------|
|
|
109
|
+
| 7.0 | `0.13.0`, `0.12.0`, <br/>**`0.11.2`**, `0.11.0` | `0.5.6.post2` |
|
|
110
|
+
| 6.4 | `0.13.0`, `0.12.0`, <br/>**`0.11.2`**, `0.10.2` | `0.5.6.post2`, **`0.5.5.post3`** |
|
|
111
|
+
| 6.3 | `0.10.1.1`, `0.10.0` | |
|
|
111
112
|
|
|
112
113
|
## Directory Structure
|
|
113
114
|
|
|
@@ -27,8 +27,8 @@ version_tuple: VERSION_TUPLE
|
|
|
27
27
|
__commit_id__: COMMIT_ID
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
|
|
30
|
-
__version__ = version = '0.1.22.
|
|
31
|
-
__version_tuple__ = version_tuple = (0, 1, 22, '
|
|
30
|
+
__version__ = version = '0.1.22.post4'
|
|
31
|
+
__version_tuple__ = version_tuple = (0, 1, 22, 'post4')
|
|
32
32
|
try:
|
|
33
33
|
from ._version_appendix import git_commit
|
|
34
34
|
__commit_id__ = commit_id = git_commit
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
git_commit = "f3f4d02"
|
{gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/gpustack_runner/runner.py.json
RENAMED
|
@@ -604,6 +604,28 @@
|
|
|
604
604
|
"docker_image": "gpustack/runner:cuda12.9-sglang0.5.6.post2",
|
|
605
605
|
"deprecated": false
|
|
606
606
|
},
|
|
607
|
+
{
|
|
608
|
+
"backend": "cuda",
|
|
609
|
+
"backend_version": "12.9",
|
|
610
|
+
"original_backend_version": "12.9.1",
|
|
611
|
+
"backend_variant": "",
|
|
612
|
+
"service": "vllm",
|
|
613
|
+
"service_version": "0.13.0",
|
|
614
|
+
"platform": "linux/amd64",
|
|
615
|
+
"docker_image": "gpustack/runner:cuda12.9-vllm0.13.0",
|
|
616
|
+
"deprecated": false
|
|
617
|
+
},
|
|
618
|
+
{
|
|
619
|
+
"backend": "cuda",
|
|
620
|
+
"backend_version": "12.9",
|
|
621
|
+
"original_backend_version": "12.9.1",
|
|
622
|
+
"backend_variant": "",
|
|
623
|
+
"service": "vllm",
|
|
624
|
+
"service_version": "0.13.0",
|
|
625
|
+
"platform": "linux/arm64",
|
|
626
|
+
"docker_image": "gpustack/runner:cuda12.9-vllm0.13.0",
|
|
627
|
+
"deprecated": false
|
|
628
|
+
},
|
|
607
629
|
{
|
|
608
630
|
"backend": "cuda",
|
|
609
631
|
"backend_version": "12.9",
|
|
@@ -725,6 +747,28 @@
|
|
|
725
747
|
"docker_image": "gpustack/runner:cuda12.8-sglang0.5.4.post3",
|
|
726
748
|
"deprecated": false
|
|
727
749
|
},
|
|
750
|
+
{
|
|
751
|
+
"backend": "cuda",
|
|
752
|
+
"backend_version": "12.8",
|
|
753
|
+
"original_backend_version": "12.8.1",
|
|
754
|
+
"backend_variant": "",
|
|
755
|
+
"service": "vllm",
|
|
756
|
+
"service_version": "0.13.0",
|
|
757
|
+
"platform": "linux/amd64",
|
|
758
|
+
"docker_image": "gpustack/runner:cuda12.8-vllm0.13.0",
|
|
759
|
+
"deprecated": false
|
|
760
|
+
},
|
|
761
|
+
{
|
|
762
|
+
"backend": "cuda",
|
|
763
|
+
"backend_version": "12.8",
|
|
764
|
+
"original_backend_version": "12.8.1",
|
|
765
|
+
"backend_variant": "",
|
|
766
|
+
"service": "vllm",
|
|
767
|
+
"service_version": "0.13.0",
|
|
768
|
+
"platform": "linux/arm64",
|
|
769
|
+
"docker_image": "gpustack/runner:cuda12.8-vllm0.13.0",
|
|
770
|
+
"deprecated": false
|
|
771
|
+
},
|
|
728
772
|
{
|
|
729
773
|
"backend": "cuda",
|
|
730
774
|
"backend_version": "12.8",
|
|
@@ -857,6 +901,28 @@
|
|
|
857
901
|
"docker_image": "gpustack/runner:cuda12.8-vllm0.10.0",
|
|
858
902
|
"deprecated": false
|
|
859
903
|
},
|
|
904
|
+
{
|
|
905
|
+
"backend": "cuda",
|
|
906
|
+
"backend_version": "12.8",
|
|
907
|
+
"original_backend_version": "12.8.1",
|
|
908
|
+
"backend_variant": "",
|
|
909
|
+
"service": "voxbox",
|
|
910
|
+
"service_version": "0.0.21",
|
|
911
|
+
"platform": "linux/amd64",
|
|
912
|
+
"docker_image": "gpustack/runner:cuda12.8-voxbox0.0.21",
|
|
913
|
+
"deprecated": false
|
|
914
|
+
},
|
|
915
|
+
{
|
|
916
|
+
"backend": "cuda",
|
|
917
|
+
"backend_version": "12.8",
|
|
918
|
+
"original_backend_version": "12.8.1",
|
|
919
|
+
"backend_variant": "",
|
|
920
|
+
"service": "voxbox",
|
|
921
|
+
"service_version": "0.0.21",
|
|
922
|
+
"platform": "linux/arm64",
|
|
923
|
+
"docker_image": "gpustack/runner:cuda12.8-voxbox0.0.21",
|
|
924
|
+
"deprecated": false
|
|
925
|
+
},
|
|
860
926
|
{
|
|
861
927
|
"backend": "cuda",
|
|
862
928
|
"backend_version": "12.8",
|
|
@@ -879,6 +945,28 @@
|
|
|
879
945
|
"docker_image": "gpustack/runner:cuda12.8-voxbox0.0.20",
|
|
880
946
|
"deprecated": false
|
|
881
947
|
},
|
|
948
|
+
{
|
|
949
|
+
"backend": "cuda",
|
|
950
|
+
"backend_version": "12.6",
|
|
951
|
+
"original_backend_version": "12.6.3",
|
|
952
|
+
"backend_variant": "",
|
|
953
|
+
"service": "vllm",
|
|
954
|
+
"service_version": "0.13.0",
|
|
955
|
+
"platform": "linux/amd64",
|
|
956
|
+
"docker_image": "gpustack/runner:cuda12.6-vllm0.13.0",
|
|
957
|
+
"deprecated": false
|
|
958
|
+
},
|
|
959
|
+
{
|
|
960
|
+
"backend": "cuda",
|
|
961
|
+
"backend_version": "12.6",
|
|
962
|
+
"original_backend_version": "12.6.3",
|
|
963
|
+
"backend_variant": "",
|
|
964
|
+
"service": "vllm",
|
|
965
|
+
"service_version": "0.13.0",
|
|
966
|
+
"platform": "linux/arm64",
|
|
967
|
+
"docker_image": "gpustack/runner:cuda12.6-vllm0.13.0",
|
|
968
|
+
"deprecated": false
|
|
969
|
+
},
|
|
882
970
|
{
|
|
883
971
|
"backend": "cuda",
|
|
884
972
|
"backend_version": "12.6",
|
|
@@ -1011,6 +1099,28 @@
|
|
|
1011
1099
|
"docker_image": "gpustack/runner:cuda12.6-vllm0.10.0",
|
|
1012
1100
|
"deprecated": false
|
|
1013
1101
|
},
|
|
1102
|
+
{
|
|
1103
|
+
"backend": "cuda",
|
|
1104
|
+
"backend_version": "12.6",
|
|
1105
|
+
"original_backend_version": "12.6.3",
|
|
1106
|
+
"backend_variant": "",
|
|
1107
|
+
"service": "voxbox",
|
|
1108
|
+
"service_version": "0.0.21",
|
|
1109
|
+
"platform": "linux/amd64",
|
|
1110
|
+
"docker_image": "gpustack/runner:cuda12.6-voxbox0.0.21",
|
|
1111
|
+
"deprecated": false
|
|
1112
|
+
},
|
|
1113
|
+
{
|
|
1114
|
+
"backend": "cuda",
|
|
1115
|
+
"backend_version": "12.6",
|
|
1116
|
+
"original_backend_version": "12.6.3",
|
|
1117
|
+
"backend_variant": "",
|
|
1118
|
+
"service": "voxbox",
|
|
1119
|
+
"service_version": "0.0.21",
|
|
1120
|
+
"platform": "linux/arm64",
|
|
1121
|
+
"docker_image": "gpustack/runner:cuda12.6-voxbox0.0.21",
|
|
1122
|
+
"deprecated": false
|
|
1123
|
+
},
|
|
1014
1124
|
{
|
|
1015
1125
|
"backend": "cuda",
|
|
1016
1126
|
"backend_version": "12.6",
|
|
@@ -1198,6 +1308,17 @@
|
|
|
1198
1308
|
"docker_image": "gpustack/runner:rocm7.0-sglang0.5.6.post2",
|
|
1199
1309
|
"deprecated": false
|
|
1200
1310
|
},
|
|
1311
|
+
{
|
|
1312
|
+
"backend": "rocm",
|
|
1313
|
+
"backend_version": "7.0",
|
|
1314
|
+
"original_backend_version": "7.0.2",
|
|
1315
|
+
"backend_variant": "",
|
|
1316
|
+
"service": "vllm",
|
|
1317
|
+
"service_version": "0.13.0",
|
|
1318
|
+
"platform": "linux/amd64",
|
|
1319
|
+
"docker_image": "gpustack/runner:rocm7.0-vllm0.13.0",
|
|
1320
|
+
"deprecated": false
|
|
1321
|
+
},
|
|
1201
1322
|
{
|
|
1202
1323
|
"backend": "rocm",
|
|
1203
1324
|
"backend_version": "7.0",
|
|
@@ -1253,6 +1374,17 @@
|
|
|
1253
1374
|
"docker_image": "gpustack/runner:rocm6.4-sglang0.5.5.post3",
|
|
1254
1375
|
"deprecated": false
|
|
1255
1376
|
},
|
|
1377
|
+
{
|
|
1378
|
+
"backend": "rocm",
|
|
1379
|
+
"backend_version": "6.4",
|
|
1380
|
+
"original_backend_version": "6.4.4",
|
|
1381
|
+
"backend_variant": "",
|
|
1382
|
+
"service": "vllm",
|
|
1383
|
+
"service_version": "0.13.0",
|
|
1384
|
+
"platform": "linux/amd64",
|
|
1385
|
+
"docker_image": "gpustack/runner:rocm6.4-vllm0.13.0",
|
|
1386
|
+
"deprecated": false
|
|
1387
|
+
},
|
|
1256
1388
|
{
|
|
1257
1389
|
"backend": "rocm",
|
|
1258
1390
|
"backend_version": "6.4",
|
|
@@ -95,11 +95,11 @@ ARG CMAKE_MAX_JOBS
|
|
|
95
95
|
ARG CUDA_VERSION=12.9.1
|
|
96
96
|
ARG CUDA_ARCHS
|
|
97
97
|
ARG VOXBOX_BASE_IMAGE=gpustack/runner:cuda${CUDA_VERSION}-python${PYTHON_VERSION}
|
|
98
|
-
ARG VOXBOX_VERSION=0.0.
|
|
98
|
+
ARG VOXBOX_VERSION=0.0.21
|
|
99
99
|
ARG VOXBOX_TORCH_VERSION=2.7.1
|
|
100
100
|
ARG VOXBOX_TORCH_CUDA_VERSION=${CUDA_VERSION}
|
|
101
101
|
ARG VLLM_BASE_IMAGE=gpustack/runner:cuda${CUDA_VERSION}-python${PYTHON_VERSION}
|
|
102
|
-
ARG VLLM_VERSION=0.
|
|
102
|
+
ARG VLLM_VERSION=0.13.0
|
|
103
103
|
ARG VLLM_TORCH_VERSION=2.9.0
|
|
104
104
|
ARG VLLM_TORCH_CUDA_VERSION=${CUDA_VERSION}
|
|
105
105
|
ARG VLLM_BUILD_BASE_IMAGE=gpustack/runner:cuda${VLLM_TORCH_CUDA_VERSION}-python${PYTHON_VERSION}
|
|
@@ -112,7 +112,7 @@ ARG VLLM_DEEPEP_COMMIT=b57e5e21
|
|
|
112
112
|
ARG VLLM_DEEPGEMM_COMMIT=9b680f42
|
|
113
113
|
ARG VLLM_FLASHINFER_VERSION=0.5.3
|
|
114
114
|
ARG VLLM_FLASHATTENTION_VERSION=2.8.3
|
|
115
|
-
ARG VLLM_LMCACHE_VERSION=0.3.
|
|
115
|
+
ARG VLLM_LMCACHE_VERSION=0.3.11
|
|
116
116
|
ARG VLLM_MOONCAKE_VERSION=0.3.7.post2
|
|
117
117
|
ARG SGLANG_BASE_IMAGE=vllm
|
|
118
118
|
ARG SGLANG_VERSION=0.5.6.post2
|
|
@@ -492,6 +492,7 @@ einops
|
|
|
492
492
|
cuda-python==${CUDA_MAJOR}.${CUDA_MINOR}
|
|
493
493
|
pynvml==${CUDA_MAJOR}
|
|
494
494
|
nvidia-nvshmem-cu${CUDA_MAJOR}
|
|
495
|
+
nvshmem4py-cu${CUDA_MAJOR}
|
|
495
496
|
EOT
|
|
496
497
|
uv pip install \
|
|
497
498
|
-r /tmp/requirements.txt
|
|
@@ -575,6 +576,20 @@ RUN <<EOF
|
|
|
575
576
|
|
|
576
577
|
IFS="." read -r TORCH_MAJOR TORCH_MINOR TORCH_PATCH <<< "${VLLM_TORCH_VERSION}"
|
|
577
578
|
IFS="." read -r CUDA_MAJOR CUDA_MINOR CUDA_PATCH <<< "${VLLM_TORCH_CUDA_VERSION}"
|
|
579
|
+
IFS="." read -r PYTHON_MAJOR PYTHON_MINOR <<< "${PYTHON_VERSION}"
|
|
580
|
+
|
|
581
|
+
PYTHON_MAJOR_MINOR="${PYTHON_MAJOR}${PYTHON_MINOR}"
|
|
582
|
+
for ABI in FALSE TRUE; do
|
|
583
|
+
PREBUILD_URL="https://github.com/Dao-AILab/flash-attention/releases/download/v${VLLM_FLASHATTENTION_VERSION}/flash_attn-${VLLM_FLASHATTENTION_VERSION}+cu${CUDA_MAJOR}torch${TORCH_MAJOR}.${TORCH_MINOR}cxx11abi${ABI}-cp${PYTHON_MAJOR_MINOR}-cp${PYTHON_MAJOR_MINOR}-linux_$(uname -m).whl"
|
|
584
|
+
if curl --retry 3 --retry-connrefused -fsSIL "${PREBUILD_URL}" >/dev/null 2>&1; then
|
|
585
|
+
echo "Downloading prebuilt FlashAttention wheel from ${PREBUILD_URL}..."
|
|
586
|
+
curl --retry 3 --retry-connrefused -fL "${PREBUILD_URL}" -o "/tmp/flash_attn-${VLLM_FLASHATTENTION_VERSION}+cu${CUDA_MAJOR}torch${TORCH_MAJOR}.${TORCH_MINOR}cxx11abi${ABI}-cp${PYTHON_MAJOR_MINOR}-cp${PYTHON_MAJOR_MINOR}-linux_$(uname -m).whl"
|
|
587
|
+
mkdir -p /workspace \
|
|
588
|
+
&& mv /tmp/*.whl /workspace \
|
|
589
|
+
&& tree -hs /workspace
|
|
590
|
+
exit 0
|
|
591
|
+
fi
|
|
592
|
+
done
|
|
578
593
|
|
|
579
594
|
# Support ARM64 only
|
|
580
595
|
if [[ "${TARGETARCH}" != "amd64" ]]; then
|
|
@@ -582,16 +597,6 @@ RUN <<EOF
|
|
|
582
597
|
exit 0
|
|
583
598
|
fi
|
|
584
599
|
|
|
585
|
-
PREBUILD_URL="https://github.com/Dao-AILab/flash-attention/releases/download/v${VLLM_FLASHATTENTION_VERSION}/flash_attn-${VLLM_FLASHATTENTION_VERSION}+cu${CUDA_MAJOR}torch${TORCH_MAJOR}.${TORCH_MINOR}cxx11abiFALSE-cp310-cp310-linux_$(uname -m).whl"
|
|
586
|
-
if curl --retry 3 --retry-connrefused -fsSIL "${PREBUILD_URL}" >/dev/null 2>&1; then
|
|
587
|
-
echo "Downloading prebuilt FlashAttention wheel from ${PREBUILD_URL}..."
|
|
588
|
-
curl --retry 3 --retry-connrefused -fL "${PREBUILD_URL}" -o "/tmp/flash_attn-${VLLM_FLASHATTENTION_VERSION}+cu${CUDA_MAJOR}torch${TORCH_MAJOR}.${TORCH_MINOR}cxx11abiFALSE-cp310-cp310-linux_$(uname -m).whl"
|
|
589
|
-
mkdir -p /workspace \
|
|
590
|
-
&& mv /tmp/*.whl /workspace \
|
|
591
|
-
&& tree -hs /workspace
|
|
592
|
-
exit 0
|
|
593
|
-
fi
|
|
594
|
-
|
|
595
600
|
# Download
|
|
596
601
|
git -C /tmp clone --recursive --shallow-submodules \
|
|
597
602
|
--depth 1 --branch v${VLLM_FLASHATTENTION_VERSION} --single-branch \
|
|
@@ -962,7 +967,7 @@ ARG VLLM_VERSION
|
|
|
962
967
|
|
|
963
968
|
ENV VLLM_VERSION=${VLLM_VERSION}
|
|
964
969
|
|
|
965
|
-
RUN <<EOF
|
|
970
|
+
RUN --mount=type=bind,from=vllm-build-flashattention,source=/,target=/flashattention,rw <<EOF
|
|
966
971
|
# vLLM
|
|
967
972
|
|
|
968
973
|
IFS="." read -r CUDA_MAJOR CUDA_MINOR CUDA_PATCH <<< "${VLLM_TORCH_CUDA_VERSION}"
|
|
@@ -986,6 +991,8 @@ RUN <<EOF
|
|
|
986
991
|
export TORCH_CUDA_ARCH_LIST="${VL_CUDA_ARCHS}"
|
|
987
992
|
export COMPILE_CUSTOM_KERNELS=1
|
|
988
993
|
export NVCC_THREADS=1
|
|
994
|
+
echo "Building vLLM with the following environment variables:"
|
|
995
|
+
env
|
|
989
996
|
|
|
990
997
|
# Install
|
|
991
998
|
git -C /tmp clone --recursive --shallow-submodules \
|
|
@@ -1047,6 +1054,9 @@ RUN --mount=type=bind,from=vllm-build-vllm,source=/,target=/vllm,rw <<EOF
|
|
|
1047
1054
|
export MAX_JOBS="${CMAKE_MAX_JOBS}"
|
|
1048
1055
|
export TORCH_CUDA_ARCH_LIST="${LC_CUDA_ARCHS}"
|
|
1049
1056
|
export NVCC_THREADS=1
|
|
1057
|
+
echo "Building LMCache with the following environment variables:"
|
|
1058
|
+
env
|
|
1059
|
+
|
|
1050
1060
|
git -C /tmp clone --recursive --shallow-submodules \
|
|
1051
1061
|
--depth 1 --branch v${VLLM_LMCACHE_VERSION} --single-branch \
|
|
1052
1062
|
https://github.com/LMCache/LMCache.git lmcache
|
|
@@ -69,10 +69,10 @@
|
|
|
69
69
|
# which is used to build the SGLang from source.
|
|
70
70
|
ARG PYTHON_VERSION=3.12
|
|
71
71
|
ARG CMAKE_MAX_JOBS
|
|
72
|
-
ARG ROCM_VERSION=7.
|
|
72
|
+
ARG ROCM_VERSION=7.0.2
|
|
73
73
|
ARG ROCM_ARCHS
|
|
74
74
|
ARG VLLM_BASE_IMAGE=gpustack/runner:rocm${ROCM_VERSION}-python${PYTHON_VERSION}
|
|
75
|
-
ARG VLLM_VERSION=0.
|
|
75
|
+
ARG VLLM_VERSION=0.13.0
|
|
76
76
|
ARG VLLM_TORCH_VERSION=2.9.1
|
|
77
77
|
ARG VLLM_TORCH_ROCM_VERSION=${ROCM_VERSION}
|
|
78
78
|
ARG VLLM_TORCH_SOURCE=pytorch
|
|
@@ -80,7 +80,7 @@ ARG VLLM_BUILD_BASE_IMAGE=gpustack/runner:rocm${VLLM_TORCH_ROCM_VERSION}-python$
|
|
|
80
80
|
ARG VLLM_TRITON_COMMIT=57c693b6
|
|
81
81
|
ARG VLLM_FLASHATTENTION_VERSION=2.8.3
|
|
82
82
|
ARG VLLM_AITER_VERSION=0.1.7.post5
|
|
83
|
-
ARG VLLM_LMCACHE_VERSION=0.3.
|
|
83
|
+
ARG VLLM_LMCACHE_VERSION=0.3.11
|
|
84
84
|
ARG VLLM_MOONCAKE_VERSION=0.3.7.post2
|
|
85
85
|
ARG SGLANG_BASE_IMAGE=vllm
|
|
86
86
|
ARG SGLANG_VERSION=0.5.6.post2
|
|
@@ -679,12 +679,12 @@ ARG VLLM_VERSION
|
|
|
679
679
|
|
|
680
680
|
ENV VLLM_VERSION=${VLLM_VERSION}
|
|
681
681
|
|
|
682
|
-
RUN --mount=type=bind,from=vllm-build-
|
|
683
|
-
--mount=type=bind,from=vllm-build-flashattention,source=/,target=/flashattention,rw \
|
|
682
|
+
RUN --mount=type=bind,from=vllm-build-flashattention,source=/,target=/flashattention,rw \
|
|
684
683
|
--mount=type=bind,from=vllm-build-aiter,source=/,target=/aiter,rw <<EOF
|
|
685
684
|
# vLLM
|
|
686
685
|
|
|
687
686
|
IFS="." read -r ROCM_MAJOR ROCM_MINOR ROCM_PATCH <<< "${VLLM_TORCH_ROCM_VERSION}"
|
|
687
|
+
IFS="." read -r VL_MAJOR VL_MINOR VL_PATCH <<< "${VLLM_VERSION}"
|
|
688
688
|
|
|
689
689
|
CMAKE_MAX_JOBS="${CMAKE_MAX_JOBS}"
|
|
690
690
|
if [[ -z "${CMAKE_MAX_JOBS}" ]]; then
|
|
@@ -697,6 +697,14 @@ RUN --mount=type=bind,from=vllm-build-triton,source=/,target=/triton,rw \
|
|
|
697
697
|
if [[ -z "${VL_ROCM_ARCHS}" ]]; then
|
|
698
698
|
if (( $(echo "${ROCM_MAJOR}.${ROCM_MINOR} < 7.0" | bc -l) )); then
|
|
699
699
|
VL_ROCM_ARCHS="gfx908;gfx90a;gfx942;gfx1030;gfx1100"
|
|
700
|
+
if (( $(echo "${VL_MAJOR}.${VL_MINOR} == 0.13" | bc -l) )); then
|
|
701
|
+
# TODO(thxCode): Temporarily remove gfx1030 for vLLM ROCm build due to build error in ROCm 6.4.4.
|
|
702
|
+
# #15 134.9 /tmp/vllm/build/temp.linux-x86_64-cpython-312/csrc/sampler.hip:564:63: error: local memory (66032) exceeds limit (65536) in 'void vllm::topKPerRowDecode<1024, true, false, true>(float const*, int const*, int*, int, int, int, int, float*, int, int const*)'
|
|
703
|
+
# ##15 134.9 564 | static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowDecode(
|
|
704
|
+
# ##15 134.9 | ^
|
|
705
|
+
# ##15 134.9 16 warnings and 1 error generated when compiling for gfx1030.
|
|
706
|
+
VL_ROCM_ARCHS="gfx908;gfx90a;gfx942"
|
|
707
|
+
fi
|
|
700
708
|
else
|
|
701
709
|
VL_ROCM_ARCHS="gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151"
|
|
702
710
|
fi
|
|
@@ -704,6 +712,8 @@ RUN --mount=type=bind,from=vllm-build-triton,source=/,target=/triton,rw \
|
|
|
704
712
|
export MAX_JOBS="${CMAKE_MAX_JOBS}"
|
|
705
713
|
export COMPILE_CUSTOM_KERNELS=1
|
|
706
714
|
export PYTORCH_ROCM_ARCH="${VL_ROCM_ARCHS}"
|
|
715
|
+
echo "Building vLLM with the following environment variables:"
|
|
716
|
+
env
|
|
707
717
|
|
|
708
718
|
# Build
|
|
709
719
|
git -C /tmp clone --recursive --shallow-submodules \
|
|
@@ -712,7 +722,9 @@ RUN --mount=type=bind,from=vllm-build-triton,source=/,target=/triton,rw \
|
|
|
712
722
|
pushd /tmp/vllm \
|
|
713
723
|
&& sed -i "s/\"torch ==.*\"/\"torch\"/g" /tmp/vllm/pyproject.toml \
|
|
714
724
|
&& sed -i "s/\"torch==.*\"/\"torch\"/g" /tmp/vllm/requirements/rocm-build.txt \
|
|
725
|
+
&& sed -i "s/\"torchvision==.*\"/\"torchvision\"/g" /tmp/vllm/requirements/rocm-build.txt \
|
|
715
726
|
&& sed -i "s/\"torchaudio==.*\"/\"torchaudio\"/g" /tmp/vllm/requirements/rocm-build.txt \
|
|
727
|
+
&& sed -i "s/\"triton==.*\"/\"triton\"/g" /tmp/vllm/requirements/rocm-build.txt \
|
|
716
728
|
&& VLLM_TARGET_DEVICE="rocm" python -v -m build --no-isolation --wheel \
|
|
717
729
|
&& tree -hs /tmp/vllm/dist \
|
|
718
730
|
&& mv /tmp/vllm/dist /workspace
|
|
@@ -769,6 +781,8 @@ RUN --mount=type=bind,from=vllm-build-vllm,source=/,target=/vllm,rw <<EOF
|
|
|
769
781
|
export TORCH_DONT_CHECK_COMPILER_ABI=1
|
|
770
782
|
export CXX=hipcc
|
|
771
783
|
export BUILD_WITH_HIP=1
|
|
784
|
+
echo "Building LMCache with the following environment variables:"
|
|
785
|
+
env
|
|
772
786
|
|
|
773
787
|
# Install LMCache
|
|
774
788
|
git -C /tmp clone --recursive --shallow-submodules \
|
|
@@ -1403,7 +1417,7 @@ RUN --mount=type=bind,target=/workspace,rw <<EOF
|
|
|
1403
1417
|
|
|
1404
1418
|
tree -hs /workspace/patches
|
|
1405
1419
|
pushd $(pip show sglang | grep Location: | cut -d" " -f 2) \
|
|
1406
|
-
&& patch -p1 < /workspace/patches
|
|
1420
|
+
&& patch -p1 < /workspace/patches/sglang_*.patch
|
|
1407
1421
|
EOF
|
|
1408
1422
|
|
|
1409
1423
|
## Entrypoint
|
|
@@ -626,6 +626,28 @@
|
|
|
626
626
|
"docker_image": "gpustack/runner:cuda12.9-sglang0.5.6.post2",
|
|
627
627
|
"deprecated": false
|
|
628
628
|
},
|
|
629
|
+
{
|
|
630
|
+
"backend": "cuda",
|
|
631
|
+
"backend_version": "12.9",
|
|
632
|
+
"original_backend_version": "12.9.1",
|
|
633
|
+
"backend_variant": "",
|
|
634
|
+
"service": "vllm",
|
|
635
|
+
"service_version": "0.13.0",
|
|
636
|
+
"platform": "linux/amd64",
|
|
637
|
+
"docker_image": "gpustack/runner:cuda12.9-vllm0.13.0",
|
|
638
|
+
"deprecated": false
|
|
639
|
+
},
|
|
640
|
+
{
|
|
641
|
+
"backend": "cuda",
|
|
642
|
+
"backend_version": "12.9",
|
|
643
|
+
"original_backend_version": "12.9.1",
|
|
644
|
+
"backend_variant": "",
|
|
645
|
+
"service": "vllm",
|
|
646
|
+
"service_version": "0.13.0",
|
|
647
|
+
"platform": "linux/arm64",
|
|
648
|
+
"docker_image": "gpustack/runner:cuda12.9-vllm0.13.0",
|
|
649
|
+
"deprecated": false
|
|
650
|
+
},
|
|
629
651
|
{
|
|
630
652
|
"backend": "cuda",
|
|
631
653
|
"backend_version": "12.9",
|
|
@@ -747,6 +769,28 @@
|
|
|
747
769
|
"docker_image": "gpustack/runner:cuda12.8-sglang0.5.4.post3",
|
|
748
770
|
"deprecated": false
|
|
749
771
|
},
|
|
772
|
+
{
|
|
773
|
+
"backend": "cuda",
|
|
774
|
+
"backend_version": "12.8",
|
|
775
|
+
"original_backend_version": "12.8.1",
|
|
776
|
+
"backend_variant": "",
|
|
777
|
+
"service": "vllm",
|
|
778
|
+
"service_version": "0.13.0",
|
|
779
|
+
"platform": "linux/amd64",
|
|
780
|
+
"docker_image": "gpustack/runner:cuda12.8-vllm0.13.0",
|
|
781
|
+
"deprecated": false
|
|
782
|
+
},
|
|
783
|
+
{
|
|
784
|
+
"backend": "cuda",
|
|
785
|
+
"backend_version": "12.8",
|
|
786
|
+
"original_backend_version": "12.8.1",
|
|
787
|
+
"backend_variant": "",
|
|
788
|
+
"service": "vllm",
|
|
789
|
+
"service_version": "0.13.0",
|
|
790
|
+
"platform": "linux/arm64",
|
|
791
|
+
"docker_image": "gpustack/runner:cuda12.8-vllm0.13.0",
|
|
792
|
+
"deprecated": false
|
|
793
|
+
},
|
|
750
794
|
{
|
|
751
795
|
"backend": "cuda",
|
|
752
796
|
"backend_version": "12.8",
|
|
@@ -879,6 +923,28 @@
|
|
|
879
923
|
"docker_image": "gpustack/runner:cuda12.8-vllm0.10.0",
|
|
880
924
|
"deprecated": false
|
|
881
925
|
},
|
|
926
|
+
{
|
|
927
|
+
"backend": "cuda",
|
|
928
|
+
"backend_version": "12.8",
|
|
929
|
+
"original_backend_version": "12.8.1",
|
|
930
|
+
"backend_variant": "",
|
|
931
|
+
"service": "voxbox",
|
|
932
|
+
"service_version": "0.0.21",
|
|
933
|
+
"platform": "linux/amd64",
|
|
934
|
+
"docker_image": "gpustack/runner:cuda12.8-voxbox0.0.21",
|
|
935
|
+
"deprecated": false
|
|
936
|
+
},
|
|
937
|
+
{
|
|
938
|
+
"backend": "cuda",
|
|
939
|
+
"backend_version": "12.8",
|
|
940
|
+
"original_backend_version": "12.8.1",
|
|
941
|
+
"backend_variant": "",
|
|
942
|
+
"service": "voxbox",
|
|
943
|
+
"service_version": "0.0.21",
|
|
944
|
+
"platform": "linux/arm64",
|
|
945
|
+
"docker_image": "gpustack/runner:cuda12.8-voxbox0.0.21",
|
|
946
|
+
"deprecated": false
|
|
947
|
+
},
|
|
882
948
|
{
|
|
883
949
|
"backend": "cuda",
|
|
884
950
|
"backend_version": "12.8",
|
|
@@ -901,6 +967,28 @@
|
|
|
901
967
|
"docker_image": "gpustack/runner:cuda12.8-voxbox0.0.20",
|
|
902
968
|
"deprecated": false
|
|
903
969
|
},
|
|
970
|
+
{
|
|
971
|
+
"backend": "cuda",
|
|
972
|
+
"backend_version": "12.6",
|
|
973
|
+
"original_backend_version": "12.6.3",
|
|
974
|
+
"backend_variant": "",
|
|
975
|
+
"service": "vllm",
|
|
976
|
+
"service_version": "0.13.0",
|
|
977
|
+
"platform": "linux/amd64",
|
|
978
|
+
"docker_image": "gpustack/runner:cuda12.6-vllm0.13.0",
|
|
979
|
+
"deprecated": false
|
|
980
|
+
},
|
|
981
|
+
{
|
|
982
|
+
"backend": "cuda",
|
|
983
|
+
"backend_version": "12.6",
|
|
984
|
+
"original_backend_version": "12.6.3",
|
|
985
|
+
"backend_variant": "",
|
|
986
|
+
"service": "vllm",
|
|
987
|
+
"service_version": "0.13.0",
|
|
988
|
+
"platform": "linux/arm64",
|
|
989
|
+
"docker_image": "gpustack/runner:cuda12.6-vllm0.13.0",
|
|
990
|
+
"deprecated": false
|
|
991
|
+
},
|
|
904
992
|
{
|
|
905
993
|
"backend": "cuda",
|
|
906
994
|
"backend_version": "12.6",
|
|
@@ -1033,6 +1121,28 @@
|
|
|
1033
1121
|
"docker_image": "gpustack/runner:cuda12.6-vllm0.10.0",
|
|
1034
1122
|
"deprecated": false
|
|
1035
1123
|
},
|
|
1124
|
+
{
|
|
1125
|
+
"backend": "cuda",
|
|
1126
|
+
"backend_version": "12.6",
|
|
1127
|
+
"original_backend_version": "12.6.3",
|
|
1128
|
+
"backend_variant": "",
|
|
1129
|
+
"service": "voxbox",
|
|
1130
|
+
"service_version": "0.0.21",
|
|
1131
|
+
"platform": "linux/amd64",
|
|
1132
|
+
"docker_image": "gpustack/runner:cuda12.6-voxbox0.0.21",
|
|
1133
|
+
"deprecated": false
|
|
1134
|
+
},
|
|
1135
|
+
{
|
|
1136
|
+
"backend": "cuda",
|
|
1137
|
+
"backend_version": "12.6",
|
|
1138
|
+
"original_backend_version": "12.6.3",
|
|
1139
|
+
"backend_variant": "",
|
|
1140
|
+
"service": "voxbox",
|
|
1141
|
+
"service_version": "0.0.21",
|
|
1142
|
+
"platform": "linux/arm64",
|
|
1143
|
+
"docker_image": "gpustack/runner:cuda12.6-voxbox0.0.21",
|
|
1144
|
+
"deprecated": false
|
|
1145
|
+
},
|
|
1036
1146
|
{
|
|
1037
1147
|
"backend": "cuda",
|
|
1038
1148
|
"backend_version": "12.6",
|
|
@@ -1244,6 +1354,17 @@
|
|
|
1244
1354
|
"docker_image": "gpustack/runner:rocm7.0-sglang0.5.6.post2",
|
|
1245
1355
|
"deprecated": false
|
|
1246
1356
|
},
|
|
1357
|
+
{
|
|
1358
|
+
"backend": "rocm",
|
|
1359
|
+
"backend_version": "7.0",
|
|
1360
|
+
"original_backend_version": "7.0.2",
|
|
1361
|
+
"backend_variant": "",
|
|
1362
|
+
"service": "vllm",
|
|
1363
|
+
"service_version": "0.13.0",
|
|
1364
|
+
"platform": "linux/amd64",
|
|
1365
|
+
"docker_image": "gpustack/runner:rocm7.0-vllm0.13.0",
|
|
1366
|
+
"deprecated": false
|
|
1367
|
+
},
|
|
1247
1368
|
{
|
|
1248
1369
|
"backend": "rocm",
|
|
1249
1370
|
"backend_version": "7.0",
|
|
@@ -1299,6 +1420,17 @@
|
|
|
1299
1420
|
"docker_image": "gpustack/runner:rocm6.4-sglang0.5.5.post3",
|
|
1300
1421
|
"deprecated": false
|
|
1301
1422
|
},
|
|
1423
|
+
{
|
|
1424
|
+
"backend": "rocm",
|
|
1425
|
+
"backend_version": "6.4",
|
|
1426
|
+
"original_backend_version": "6.4.4",
|
|
1427
|
+
"backend_variant": "",
|
|
1428
|
+
"service": "vllm",
|
|
1429
|
+
"service_version": "0.13.0",
|
|
1430
|
+
"platform": "linux/amd64",
|
|
1431
|
+
"docker_image": "gpustack/runner:rocm6.4-vllm0.13.0",
|
|
1432
|
+
"deprecated": false
|
|
1433
|
+
},
|
|
1302
1434
|
{
|
|
1303
1435
|
"backend": "rocm",
|
|
1304
1436
|
"backend_version": "6.4",
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
git_commit = "457b969"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/docs/modules/gpustack_runner.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/gpustack_runner/cmds/__init__.py
RENAMED
|
File without changes
|
{gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/gpustack_runner/cmds/__types__.py
RENAMED
|
File without changes
|
{gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/gpustack_runner/cmds/images.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/README.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tests/gpustack_runner/test_runner.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tools/chat_tool_current_date_time.sh
RENAMED
|
File without changes
|
{gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tools/chat_tool_get_temperature.sh
RENAMED
|
File without changes
|
{gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tools/chat_tool_get_weather.sh
RENAMED
|
File without changes
|
{gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tools/chat_tool_square_of_number.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|