gpustack-runner 0.1.22.post2__tar.gz → 0.1.22.post4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/Makefile +1 -1
  2. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/PKG-INFO +13 -12
  3. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/README.md +12 -11
  4. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/gpustack_runner/_version.py +2 -2
  5. gpustack_runner-0.1.22.post4/gpustack_runner/_version_appendix.py +1 -0
  6. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/gpustack_runner/runner.py.json +132 -0
  7. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/cuda/Dockerfile +24 -14
  8. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/matrix.yaml +0 -1
  9. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/rocm/Dockerfile +20 -6
  10. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tests/gpustack_runner/fixtures/test_list_runners_by_backend.json +132 -0
  11. gpustack_runner-0.1.22.post2/gpustack_runner/_version_appendix.py +0 -1
  12. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/.codespelldict +0 -0
  13. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/.codespellrc +0 -0
  14. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/.gitattributes +0 -0
  15. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/.gitignore +0 -0
  16. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/.pre-commit-config.yaml +0 -0
  17. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/.python-version +0 -0
  18. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/LICENSE +0 -0
  19. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/docs/index.md +0 -0
  20. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/docs/modules/gpustack_runner.md +0 -0
  21. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/gpustack_runner/__init__.py +0 -0
  22. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/gpustack_runner/__main__.py +0 -0
  23. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/gpustack_runner/_version.pyi +0 -0
  24. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/gpustack_runner/cmds/__init__.py +0 -0
  25. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/gpustack_runner/cmds/__types__.py +0 -0
  26. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/gpustack_runner/cmds/images.py +0 -0
  27. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/gpustack_runner/runner.py +0 -0
  28. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/hatch.toml +0 -0
  29. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/mkdocs.yml +0 -0
  30. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251020_vllm_install_lmcache/cann/Dockerfile +0 -0
  31. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251020_vllm_install_lmcache/cuda/Dockerfile +0 -0
  32. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251020_vllm_install_lmcache/matrix.yaml +0 -0
  33. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251020_vllm_install_lmcache/rocm/Dockerfile +0 -0
  34. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251022_vllm_install_ray_client/cann/Dockerfile +0 -0
  35. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251022_vllm_install_ray_client/cuda/Dockerfile +0 -0
  36. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251022_vllm_install_ray_client/matrix.yaml +0 -0
  37. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251022_vllm_install_ray_client/rocm/Dockerfile +0 -0
  38. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251022_vllm_install_ray_default/cuda/Dockerfile +0 -0
  39. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251022_vllm_install_ray_default/matrix.yaml +0 -0
  40. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251022_vllm_install_ray_default/rocm/Dockerfile +0 -0
  41. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251024_vllm_install_nvidia_hpcx/cuda/Dockerfile +0 -0
  42. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251024_vllm_install_nvidia_hpcx/matrix.yaml +0 -0
  43. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251024_vllm_reinstall_lmcache/cuda/Dockerfile +0 -0
  44. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251024_vllm_reinstall_lmcache/matrix.yaml +0 -0
  45. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251029_vllm_reinstall_ray/cann/Dockerfile +0 -0
  46. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251029_vllm_reinstall_ray/matrix.yaml +0 -0
  47. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251103_mindie_refresh_entrypoint/cann/Dockerfile +0 -0
  48. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251103_mindie_refresh_entrypoint/matrix.yaml +0 -0
  49. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251105_vllm_polish_nvidia_hpcx/cuda/Dockerfile +0 -0
  50. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251105_vllm_polish_nvidia_hpcx/matrix.yaml +0 -0
  51. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251106_vllm_install_ep_kernel/cuda/Dockerfile +0 -0
  52. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251106_vllm_install_ep_kernel/matrix.yaml +0 -0
  53. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251107_vllm_reinstall_lmcache/cuda/Dockerfile +0 -0
  54. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251107_vllm_reinstall_lmcache/matrix.yaml +0 -0
  55. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251110_sglang_install_diffusion/cuda/Dockerfile +0 -0
  56. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251110_sglang_install_diffusion/matrix.yaml +0 -0
  57. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251110_sglang_install_flashattn/cuda/Dockerfile +0 -0
  58. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251110_sglang_install_flashattn/matrix.yaml +0 -0
  59. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251125_mindie_install_posix_ipc/cann/Dockerfile +0 -0
  60. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251125_mindie_install_posix_ipc/matrix.yaml +0 -0
  61. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251201_vllm_patch_qwen2_5_vl/cuda/Dockerfile +0 -0
  62. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251201_vllm_patch_qwen2_5_vl/cuda/patches/vllm_001_disable_flashatten_in_qwen2_5_vl.patch +0 -0
  63. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251201_vllm_patch_qwen2_5_vl/matrix.yaml +0 -0
  64. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251209_mindie_install_av/cann/Dockerfile +0 -0
  65. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251209_mindie_install_av/matrix.yaml +0 -0
  66. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251213_mindie_patch_minicpm_qwen2_v2/cann/Dockerfile +0 -0
  67. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251213_mindie_patch_minicpm_qwen2_v2/cann/patches.zip +0 -0
  68. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251213_mindie_patch_minicpm_qwen2_v2/matrix.yaml +0 -0
  69. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251213_sglang_patch_server_args/cuda/Dockerfile +0 -0
  70. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251213_sglang_patch_server_args/cuda/patches/sglang_001_fix_server_args.patch +0 -0
  71. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251213_sglang_patch_server_args/matrix.yaml +0 -0
  72. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251214_cuda_several_patches/cuda/Dockerfile +0 -0
  73. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251214_cuda_several_patches/matrix.yaml +0 -0
  74. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251215_cann_several_patches/cann/Dockerfile +0 -0
  75. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251215_cann_several_patches/matrix.yaml +0 -0
  76. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251216_sglang_uninstall_runai_model_streamer/cuda/Dockerfile +0 -0
  77. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251216_sglang_uninstall_runai_model_streamer/matrix.yaml +0 -0
  78. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251219_rocm_install_petit_kernel/matrix.yaml +0 -0
  79. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251219_rocm_install_petit_kernel/rocm/Dockerfile +0 -0
  80. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251219_vllm_install_audio_extra/cuda/Dockerfile +0 -0
  81. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251219_vllm_install_audio_extra/matrix.yaml +0 -0
  82. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/20251219_vllm_install_audio_extra/rocm/Dockerfile +0 -0
  83. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/.post_operation/README.md +0 -0
  84. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/cann/Dockerfile +0 -0
  85. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/cann/mindie-atb-models_2.2.rc1_linux-amd64_py3.11_torch2.1.0-abi0.tar.gz +0 -0
  86. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/cann/mindie-atb-models_2.2.rc1_linux-arm64_py3.11_torch2.1.0-abi0.tar.gz +0 -0
  87. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/cann/patches/mindie.zip +0 -0
  88. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/corex/Dockerfile +0 -0
  89. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/discard_runner.sh +0 -0
  90. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/dtk/Dockerfile +0 -0
  91. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/expand_matrix.sh +0 -0
  92. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/maca/Dockerfile +0 -0
  93. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/merge_runner.sh +0 -0
  94. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/prune_runner.sh +0 -0
  95. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pack/rocm/patches/sglang_001_wrong_vram.patch +0 -0
  96. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pyproject.toml +0 -0
  97. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/pytest.ini +0 -0
  98. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/ruff.toml +0 -0
  99. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tests/gpustack_runner/fixtures/__init__.py +0 -0
  100. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tests/gpustack_runner/fixtures/test_docker_image.json +0 -0
  101. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tests/gpustack_runner/fixtures/test_list_backend_runners.json +0 -0
  102. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tests/gpustack_runner/fixtures/test_list_runners_by_prefix.json +0 -0
  103. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tests/gpustack_runner/fixtures/test_list_service_runners.json +0 -0
  104. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tests/gpustack_runner/test_runner.py +0 -0
  105. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tools/activate +0 -0
  106. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tools/chat.sh +0 -0
  107. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tools/chat_tool_current_date_time.sh +0 -0
  108. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tools/chat_tool_get_temperature.sh +0 -0
  109. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tools/chat_tool_get_weather.sh +0 -0
  110. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tools/chat_tool_square_of_number.sh +0 -0
  111. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tools/chat_tool_square_root_of_number.sh +0 -0
  112. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tools/chat_tool_where_am_i.sh +0 -0
  113. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tools/run_runner.sh +0 -0
  114. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/tools/run_runner_cluster.sh +0 -0
  115. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/uv.lock +0 -0
  116. {gpustack_runner-0.1.22.post2 → gpustack_runner-0.1.22.post4}/uv.toml +0 -0
@@ -117,7 +117,7 @@ package:
  JOB_EXTRA_ARGS+=("--cache-from=type=registry,ref=gpustack/runner-build-cache:$${TAG_CACHE}"); \
  done; \
  fi; \
- if [[ "$(PACKAGE_PUSH)" == "true" ]] || [[ "$(PACKAGE_CACHE_PUSH)" == "true" ]]; then \
+ if [[ "$(PACKAGE_PUSH)" == "true" || "$(PACKAGE_CACHE_PUSH)" == "true" ]] && [[ -z "$(PACKAGE_POST_OPERATION)" ]]; then \
  for TAG_CACHE in $${JOB_PLATFORM_CACHE}; do \
  JOB_EXTRA_ARGS+=("--cache-to=type=registry,ignore-error=true,mode=max,compression=gzip,ref=$(PACKAGE_NAMESPACE)/$(PACKAGE_CACHE_REPOSITORY):$${TAG_CACHE}"); \
  done; \
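For context, a minimal standalone sketch of the tightened condition above, with the Make variables replaced by plain shell variables: registry build caches are now pushed only when a push is requested and no post-operation rebuild (`PACKAGE_POST_OPERATION`) is in progress.

```bash
# Hedged sketch, not the Makefile itself: plain-shell equivalent of the new gate.
PACKAGE_PUSH=true
PACKAGE_CACHE_PUSH=false
PACKAGE_POST_OPERATION=""   # non-empty during post-operation rebuilds

if [[ "${PACKAGE_PUSH}" == "true" || "${PACKAGE_CACHE_PUSH}" == "true" ]] \
   && [[ -z "${PACKAGE_POST_OPERATION}" ]]; then
  echo "cache-to args would be appended to the buildx invocation here"
fi
```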
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: gpustack-runner
- Version: 0.1.22.post2
+ Version: 0.1.22.post4
  Summary: GPUStack Runner is library for registering runnable accelerated backends and services in GPUStack.
  Project-URL: Homepage, https://github.com/gpustack/runner
  Project-URL: Bug Tracker, https://github.com/gpustack/gpustack/issues
@@ -86,12 +86,12 @@ The following table lists the supported accelerated backends and their correspon
  > - Applied [Qwen2.5 VL patched](https://github.com/gpustack/gpustack/issues/3606) to vLLM 0.11.2.
  > - Applied [vLLM[audio] packages](https://github.com/vllm-project/vllm/blob/275de34170654274616082721348b7edd9741d32/setup.py#L720-L724) to vLLM 0.11.2.

- | CUDA Version <br/> (Variant) | vLLM                                                                       | SGLang                                                    | VoxBox   |
- |------------------------------|----------------------------------------------------------------------------|-----------------------------------------------------------|----------|
- | 12.9                         | `0.12.0`, **`0.11.2`**                                                     | `0.5.6.post2`                                             |          |
- | 12.8                         | `0.12.0`, **`0.11.2`**, <br/>`0.11.0`, `0.10.2`, <br/>`0.10.1.1`, `0.10.0` | `0.5.6.post2`, `0.5.5.post3`, <br/>`0.5.5`, `0.5.4.post3` | `0.0.20` |
- | 12.6                         | `0.12.0`, **`0.11.2`**, <br/>`0.11.0`, `0.10.2`, <br/>`0.10.1.1`, `0.10.0` | `0.5.6.post2`                                             | `0.0.20` |
- | 12.4                         | `0.11.0`, `0.10.2`, <br/>`0.10.1.1`, `0.10.0`                              |                                                           | `0.0.20` |
+ | CUDA Version <br/> (Variant) | vLLM                                                                                      | SGLang                                                    | VoxBox             |
+ |------------------------------|-------------------------------------------------------------------------------------------|-----------------------------------------------------------|--------------------|
+ | 12.9                         | `0.13.0`, `0.12.0`, <br/>**`0.11.2`**                                                     | `0.5.6.post2`                                             |                    |
+ | 12.8                         | `0.13.0`, `0.12.0`, <br/>**`0.11.2`**, `0.11.0`, <br/>`0.10.2`, `0.10.1.1`, <br/>`0.10.0` | `0.5.6.post2`, `0.5.5.post3`, <br/>`0.5.5`, `0.5.4.post3` | `0.0.21`, `0.0.20` |
+ | 12.6                         | `0.13.0`, `0.12.0`, <br/>**`0.11.2`**, `0.11.0`, <br/>`0.10.2`, `0.10.1.1`, <br/>`0.10.0` | `0.5.6.post2`                                             | `0.0.21`, `0.0.20` |
+ | 12.4                         | `0.11.0`, `0.10.2`, <br/>`0.10.1.1`, `0.10.0`                                             |                                                           | `0.0.20`           |

  ### Hygon DTK

@@ -118,16 +118,17 @@ The following table lists the supported accelerated backends and their correspon
  > - ROCm 7.0 vLLM `0.11.2/0.11.0` are reusing the official ROCm 6.4 PyTorch 2.9 wheel package rather than a ROCm
  7.0 specific PyTorch build. Although supports ROCm 7.0 in vLLM `0.11.2/0.11.0`, `gfx1150/gfx1151` are not supported yet.
  > - SGLang supports `gfx942` only.
+ > - ROCm 6.4 vLLM `0.13.0` supports `gfx903 gfx90a gfx942` only.

  > [!IMPORTANT]
  > - Applied [vLLM[audio] packages](https://github.com/vllm-project/vllm/blob/275de34170654274616082721348b7edd9741d32/setup.py#L720-L724) to vLLM 0.11.2.
  > - Applied [petit-kernel package](https://github.com/vllm-project/vllm/blob/275de34170654274616082721348b7edd9741d32/setup.py#L728) to vLLM 0.11.2 and SGLang 0.5.5.post3.

- | ROCm Version <br/> (Variant) | vLLM                                   | SGLang                           |
- |------------------------------|----------------------------------------|----------------------------------|
- | 7.0                          | `0.12.0`, **`0.11.2`**, <br/> `0.11.0` | `0.5.6.post2`                    |
- | 6.4                          | `0.12.0`, **`0.11.2`**, <br/> `0.10.2` | `0.5.6.post2`, **`0.5.5.post3`** |
- | 6.3                          | `0.10.1.1`, `0.10.0`                   |                                  |
+ | ROCm Version <br/> (Variant) | vLLM                                            | SGLang                           |
+ |------------------------------|-------------------------------------------------|----------------------------------|
+ | 7.0                          | `0.13.0`, `0.12.0`, <br/>**`0.11.2`**, `0.11.0` | `0.5.6.post2`                    |
+ | 6.4                          | `0.13.0`, `0.12.0`, <br/>**`0.11.2`**, `0.10.2` | `0.5.6.post2`, **`0.5.5.post3`** |
+ | 6.3                          | `0.10.1.1`, `0.10.0`                            |                                  |

  ## Directory Structure

@@ -66,12 +66,12 @@ The following table lists the supported accelerated backends and their correspon
  > - Applied [Qwen2.5 VL patched](https://github.com/gpustack/gpustack/issues/3606) to vLLM 0.11.2.
  > - Applied [vLLM[audio] packages](https://github.com/vllm-project/vllm/blob/275de34170654274616082721348b7edd9741d32/setup.py#L720-L724) to vLLM 0.11.2.

- | CUDA Version <br/> (Variant) | vLLM                                                                       | SGLang                                                    | VoxBox   |
- |------------------------------|----------------------------------------------------------------------------|-----------------------------------------------------------|----------|
- | 12.9                         | `0.12.0`, **`0.11.2`**                                                     | `0.5.6.post2`                                             |          |
- | 12.8                         | `0.12.0`, **`0.11.2`**, <br/>`0.11.0`, `0.10.2`, <br/>`0.10.1.1`, `0.10.0` | `0.5.6.post2`, `0.5.5.post3`, <br/>`0.5.5`, `0.5.4.post3` | `0.0.20` |
- | 12.6                         | `0.12.0`, **`0.11.2`**, <br/>`0.11.0`, `0.10.2`, <br/>`0.10.1.1`, `0.10.0` | `0.5.6.post2`                                             | `0.0.20` |
- | 12.4                         | `0.11.0`, `0.10.2`, <br/>`0.10.1.1`, `0.10.0`                              |                                                           | `0.0.20` |
+ | CUDA Version <br/> (Variant) | vLLM                                                                                      | SGLang                                                    | VoxBox             |
+ |------------------------------|-------------------------------------------------------------------------------------------|-----------------------------------------------------------|--------------------|
+ | 12.9                         | `0.13.0`, `0.12.0`, <br/>**`0.11.2`**                                                     | `0.5.6.post2`                                             |                    |
+ | 12.8                         | `0.13.0`, `0.12.0`, <br/>**`0.11.2`**, `0.11.0`, <br/>`0.10.2`, `0.10.1.1`, <br/>`0.10.0` | `0.5.6.post2`, `0.5.5.post3`, <br/>`0.5.5`, `0.5.4.post3` | `0.0.21`, `0.0.20` |
+ | 12.6                         | `0.13.0`, `0.12.0`, <br/>**`0.11.2`**, `0.11.0`, <br/>`0.10.2`, `0.10.1.1`, <br/>`0.10.0` | `0.5.6.post2`                                             | `0.0.21`, `0.0.20` |
+ | 12.4                         | `0.11.0`, `0.10.2`, <br/>`0.10.1.1`, `0.10.0`                                             |                                                           | `0.0.20`           |

  ### Hygon DTK

@@ -98,16 +98,17 @@ The following table lists the supported accelerated backends and their correspon
  > - ROCm 7.0 vLLM `0.11.2/0.11.0` are reusing the official ROCm 6.4 PyTorch 2.9 wheel package rather than a ROCm
  7.0 specific PyTorch build. Although supports ROCm 7.0 in vLLM `0.11.2/0.11.0`, `gfx1150/gfx1151` are not supported yet.
  > - SGLang supports `gfx942` only.
+ > - ROCm 6.4 vLLM `0.13.0` supports `gfx903 gfx90a gfx942` only.

  > [!IMPORTANT]
  > - Applied [vLLM[audio] packages](https://github.com/vllm-project/vllm/blob/275de34170654274616082721348b7edd9741d32/setup.py#L720-L724) to vLLM 0.11.2.
  > - Applied [petit-kernel package](https://github.com/vllm-project/vllm/blob/275de34170654274616082721348b7edd9741d32/setup.py#L728) to vLLM 0.11.2 and SGLang 0.5.5.post3.

- | ROCm Version <br/> (Variant) | vLLM                                   | SGLang                           |
- |------------------------------|----------------------------------------|----------------------------------|
- | 7.0                          | `0.12.0`, **`0.11.2`**, <br/> `0.11.0` | `0.5.6.post2`                    |
- | 6.4                          | `0.12.0`, **`0.11.2`**, <br/> `0.10.2` | `0.5.6.post2`, **`0.5.5.post3`** |
- | 6.3                          | `0.10.1.1`, `0.10.0`                   |                                  |
+ | ROCm Version <br/> (Variant) | vLLM                                            | SGLang                           |
+ |------------------------------|-------------------------------------------------|----------------------------------|
+ | 7.0                          | `0.13.0`, `0.12.0`, <br/>**`0.11.2`**, `0.11.0` | `0.5.6.post2`                    |
+ | 6.4                          | `0.13.0`, `0.12.0`, <br/>**`0.11.2`**, `0.10.2` | `0.5.6.post2`, **`0.5.5.post3`** |
+ | 6.3                          | `0.10.1.1`, `0.10.0`                            |                                  |

  ## Directory Structure

@@ -27,8 +27,8 @@ version_tuple: VERSION_TUPLE
  __commit_id__: COMMIT_ID
  commit_id: COMMIT_ID

- __version__ = version = '0.1.22.post2'
- __version_tuple__ = version_tuple = (0, 1, 22, 'post2')
+ __version__ = version = '0.1.22.post4'
+ __version_tuple__ = version_tuple = (0, 1, 22, 'post4')
  try:
  from ._version_appendix import git_commit
  __commit_id__ = commit_id = git_commit
@@ -0,0 +1 @@
+ git_commit = "f3f4d02"
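A quick, hedged way to verify the bump after installing the new sdist (assuming the private `_version` module keeps the names shown in the hunk above):

```bash
# Hedged check: version and commit id as defined in gpustack_runner/_version.py.
python -c 'from gpustack_runner import _version; print(_version.version, _version.commit_id)'
# expected: 0.1.22.post4 f3f4d02
```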
@@ -604,6 +604,28 @@
  "docker_image": "gpustack/runner:cuda12.9-sglang0.5.6.post2",
  "deprecated": false
  },
+ {
+ "backend": "cuda",
+ "backend_version": "12.9",
+ "original_backend_version": "12.9.1",
+ "backend_variant": "",
+ "service": "vllm",
+ "service_version": "0.13.0",
+ "platform": "linux/amd64",
+ "docker_image": "gpustack/runner:cuda12.9-vllm0.13.0",
+ "deprecated": false
+ },
+ {
+ "backend": "cuda",
+ "backend_version": "12.9",
+ "original_backend_version": "12.9.1",
+ "backend_variant": "",
+ "service": "vllm",
+ "service_version": "0.13.0",
+ "platform": "linux/arm64",
+ "docker_image": "gpustack/runner:cuda12.9-vllm0.13.0",
+ "deprecated": false
+ },
  {
  "backend": "cuda",
  "backend_version": "12.9",
@@ -725,6 +747,28 @@
  "docker_image": "gpustack/runner:cuda12.8-sglang0.5.4.post3",
  "deprecated": false
  },
+ {
+ "backend": "cuda",
+ "backend_version": "12.8",
+ "original_backend_version": "12.8.1",
+ "backend_variant": "",
+ "service": "vllm",
+ "service_version": "0.13.0",
+ "platform": "linux/amd64",
+ "docker_image": "gpustack/runner:cuda12.8-vllm0.13.0",
+ "deprecated": false
+ },
+ {
+ "backend": "cuda",
+ "backend_version": "12.8",
+ "original_backend_version": "12.8.1",
+ "backend_variant": "",
+ "service": "vllm",
+ "service_version": "0.13.0",
+ "platform": "linux/arm64",
+ "docker_image": "gpustack/runner:cuda12.8-vllm0.13.0",
+ "deprecated": false
+ },
  {
  "backend": "cuda",
  "backend_version": "12.8",
@@ -857,6 +901,28 @@
  "docker_image": "gpustack/runner:cuda12.8-vllm0.10.0",
  "deprecated": false
  },
+ {
+ "backend": "cuda",
+ "backend_version": "12.8",
+ "original_backend_version": "12.8.1",
+ "backend_variant": "",
+ "service": "voxbox",
+ "service_version": "0.0.21",
+ "platform": "linux/amd64",
+ "docker_image": "gpustack/runner:cuda12.8-voxbox0.0.21",
+ "deprecated": false
+ },
+ {
+ "backend": "cuda",
+ "backend_version": "12.8",
+ "original_backend_version": "12.8.1",
+ "backend_variant": "",
+ "service": "voxbox",
+ "service_version": "0.0.21",
+ "platform": "linux/arm64",
+ "docker_image": "gpustack/runner:cuda12.8-voxbox0.0.21",
+ "deprecated": false
+ },
  {
  "backend": "cuda",
  "backend_version": "12.8",
@@ -879,6 +945,28 @@
  "docker_image": "gpustack/runner:cuda12.8-voxbox0.0.20",
  "deprecated": false
  },
+ {
+ "backend": "cuda",
+ "backend_version": "12.6",
+ "original_backend_version": "12.6.3",
+ "backend_variant": "",
+ "service": "vllm",
+ "service_version": "0.13.0",
+ "platform": "linux/amd64",
+ "docker_image": "gpustack/runner:cuda12.6-vllm0.13.0",
+ "deprecated": false
+ },
+ {
+ "backend": "cuda",
+ "backend_version": "12.6",
+ "original_backend_version": "12.6.3",
+ "backend_variant": "",
+ "service": "vllm",
+ "service_version": "0.13.0",
+ "platform": "linux/arm64",
+ "docker_image": "gpustack/runner:cuda12.6-vllm0.13.0",
+ "deprecated": false
+ },
  {
  "backend": "cuda",
  "backend_version": "12.6",
@@ -1011,6 +1099,28 @@
  "docker_image": "gpustack/runner:cuda12.6-vllm0.10.0",
  "deprecated": false
  },
+ {
+ "backend": "cuda",
+ "backend_version": "12.6",
+ "original_backend_version": "12.6.3",
+ "backend_variant": "",
+ "service": "voxbox",
+ "service_version": "0.0.21",
+ "platform": "linux/amd64",
+ "docker_image": "gpustack/runner:cuda12.6-voxbox0.0.21",
+ "deprecated": false
+ },
+ {
+ "backend": "cuda",
+ "backend_version": "12.6",
+ "original_backend_version": "12.6.3",
+ "backend_variant": "",
+ "service": "voxbox",
+ "service_version": "0.0.21",
+ "platform": "linux/arm64",
+ "docker_image": "gpustack/runner:cuda12.6-voxbox0.0.21",
+ "deprecated": false
+ },
  {
  "backend": "cuda",
  "backend_version": "12.6",
@@ -1198,6 +1308,17 @@
  "docker_image": "gpustack/runner:rocm7.0-sglang0.5.6.post2",
  "deprecated": false
  },
+ {
+ "backend": "rocm",
+ "backend_version": "7.0",
+ "original_backend_version": "7.0.2",
+ "backend_variant": "",
+ "service": "vllm",
+ "service_version": "0.13.0",
+ "platform": "linux/amd64",
+ "docker_image": "gpustack/runner:rocm7.0-vllm0.13.0",
+ "deprecated": false
+ },
  {
  "backend": "rocm",
  "backend_version": "7.0",
@@ -1253,6 +1374,17 @@
  "docker_image": "gpustack/runner:rocm6.4-sglang0.5.5.post3",
  "deprecated": false
  },
+ {
+ "backend": "rocm",
+ "backend_version": "6.4",
+ "original_backend_version": "6.4.4",
+ "backend_variant": "",
+ "service": "vllm",
+ "service_version": "0.13.0",
+ "platform": "linux/amd64",
+ "docker_image": "gpustack/runner:rocm6.4-vllm0.13.0",
+ "deprecated": false
+ },
  {
  "backend": "rocm",
  "backend_version": "6.4",
@@ -95,11 +95,11 @@ ARG CMAKE_MAX_JOBS
  ARG CUDA_VERSION=12.9.1
  ARG CUDA_ARCHS
  ARG VOXBOX_BASE_IMAGE=gpustack/runner:cuda${CUDA_VERSION}-python${PYTHON_VERSION}
- ARG VOXBOX_VERSION=0.0.20
+ ARG VOXBOX_VERSION=0.0.21
  ARG VOXBOX_TORCH_VERSION=2.7.1
  ARG VOXBOX_TORCH_CUDA_VERSION=${CUDA_VERSION}
  ARG VLLM_BASE_IMAGE=gpustack/runner:cuda${CUDA_VERSION}-python${PYTHON_VERSION}
- ARG VLLM_VERSION=0.12.0
+ ARG VLLM_VERSION=0.13.0
  ARG VLLM_TORCH_VERSION=2.9.0
  ARG VLLM_TORCH_CUDA_VERSION=${CUDA_VERSION}
  ARG VLLM_BUILD_BASE_IMAGE=gpustack/runner:cuda${VLLM_TORCH_CUDA_VERSION}-python${PYTHON_VERSION}
@@ -112,7 +112,7 @@ ARG VLLM_DEEPEP_COMMIT=b57e5e21
  ARG VLLM_DEEPGEMM_COMMIT=9b680f42
  ARG VLLM_FLASHINFER_VERSION=0.5.3
  ARG VLLM_FLASHATTENTION_VERSION=2.8.3
- ARG VLLM_LMCACHE_VERSION=0.3.10.post1
+ ARG VLLM_LMCACHE_VERSION=0.3.11
  ARG VLLM_MOONCAKE_VERSION=0.3.7.post2
  ARG SGLANG_BASE_IMAGE=vllm
  ARG SGLANG_VERSION=0.5.6.post2
@@ -492,6 +492,7 @@ einops
  cuda-python==${CUDA_MAJOR}.${CUDA_MINOR}
  pynvml==${CUDA_MAJOR}
  nvidia-nvshmem-cu${CUDA_MAJOR}
+ nvshmem4py-cu${CUDA_MAJOR}
  EOT
  uv pip install \
  -r /tmp/requirements.txt
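The added `nvshmem4py-cu${CUDA_MAJOR}` line lands inside a heredoc-generated requirements file. A hedged illustration of how those `${CUDA_MAJOR}` pins resolve (values assumed for CUDA 12.9):

```bash
# Hedged sketch: the unquoted EOT delimiter lets the shell expand the variables,
# so for CUDA 12.9 the pins become cuda-python==12.9, pynvml==12,
# nvidia-nvshmem-cu12 and the newly added nvshmem4py-cu12.
CUDA_MAJOR=12; CUDA_MINOR=9
cat > /tmp/requirements.txt <<EOT
cuda-python==${CUDA_MAJOR}.${CUDA_MINOR}
pynvml==${CUDA_MAJOR}
nvidia-nvshmem-cu${CUDA_MAJOR}
nvshmem4py-cu${CUDA_MAJOR}
EOT
cat /tmp/requirements.txt
```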
@@ -575,6 +576,20 @@ RUN <<EOF

  IFS="." read -r TORCH_MAJOR TORCH_MINOR TORCH_PATCH <<< "${VLLM_TORCH_VERSION}"
  IFS="." read -r CUDA_MAJOR CUDA_MINOR CUDA_PATCH <<< "${VLLM_TORCH_CUDA_VERSION}"
+ IFS="." read -r PYTHON_MAJOR PYTHON_MINOR <<< "${PYTHON_VERSION}"
+
+ PYTHON_MAJOR_MINOR="${PYTHON_MAJOR}${PYTHON_MINOR}"
+ for ABI in FALSE TRUE; do
+ PREBUILD_URL="https://github.com/Dao-AILab/flash-attention/releases/download/v${VLLM_FLASHATTENTION_VERSION}/flash_attn-${VLLM_FLASHATTENTION_VERSION}+cu${CUDA_MAJOR}torch${TORCH_MAJOR}.${TORCH_MINOR}cxx11abi${ABI}-cp${PYTHON_MAJOR_MINOR}-cp${PYTHON_MAJOR_MINOR}-linux_$(uname -m).whl"
+ if curl --retry 3 --retry-connrefused -fsSIL "${PREBUILD_URL}" >/dev/null 2>&1; then
+ echo "Downloading prebuilt FlashAttention wheel from ${PREBUILD_URL}..."
+ curl --retry 3 --retry-connrefused -fL "${PREBUILD_URL}" -o "/tmp/flash_attn-${VLLM_FLASHATTENTION_VERSION}+cu${CUDA_MAJOR}torch${TORCH_MAJOR}.${TORCH_MINOR}cxx11abi${ABI}-cp${PYTHON_MAJOR_MINOR}-cp${PYTHON_MAJOR_MINOR}-linux_$(uname -m).whl"
+ mkdir -p /workspace \
+ && mv /tmp/*.whl /workspace \
+ && tree -hs /workspace
+ exit 0
+ fi
+ done

  # Support ARM64 only
@@ -582,16 +597,6 @@ RUN <<EOF
  exit 0
  fi

- PREBUILD_URL="https://github.com/Dao-AILab/flash-attention/releases/download/v${VLLM_FLASHATTENTION_VERSION}/flash_attn-${VLLM_FLASHATTENTION_VERSION}+cu${CUDA_MAJOR}torch${TORCH_MAJOR}.${TORCH_MINOR}cxx11abiFALSE-cp310-cp310-linux_$(uname -m).whl"
- if curl --retry 3 --retry-connrefused -fsSIL "${PREBUILD_URL}" >/dev/null 2>&1; then
- echo "Downloading prebuilt FlashAttention wheel from ${PREBUILD_URL}..."
- curl --retry 3 --retry-connrefused -fL "${PREBUILD_URL}" -o "/tmp/flash_attn-${VLLM_FLASHATTENTION_VERSION}+cu${CUDA_MAJOR}torch${TORCH_MAJOR}.${TORCH_MINOR}cxx11abiFALSE-cp310-cp310-linux_$(uname -m).whl"
- mkdir -p /workspace \
- && mv /tmp/*.whl /workspace \
- && tree -hs /workspace
- exit 0
- fi
-
  # Download
  git -C /tmp clone --recursive --shallow-submodules \
  --depth 1 --branch v${VLLM_FLASHATTENTION_VERSION} --single-branch \
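The relocated block now probes both C++11 ABI variants and the image's own Python tag instead of the hard-coded `cxx11abiFALSE-cp310`. A hedged standalone sketch of that probe-then-download pattern (URL shape copied from the Dockerfile; the version variables are assumed to be exported as above):

```bash
# Hedged sketch: HEAD-probe a GitHub release asset (-I sends HEAD, -f fails on
# 404), download it only when present, otherwise fall through to a source build.
for ABI in FALSE TRUE; do
  URL="https://github.com/Dao-AILab/flash-attention/releases/download/v${VLLM_FLASHATTENTION_VERSION}/flash_attn-${VLLM_FLASHATTENTION_VERSION}+cu${CUDA_MAJOR}torch${TORCH_MAJOR}.${TORCH_MINOR}cxx11abi${ABI}-cp${PYTHON_MAJOR_MINOR}-cp${PYTHON_MAJOR_MINOR}-linux_$(uname -m).whl"
  if curl --retry 3 -fsSIL "${URL}" >/dev/null 2>&1; then
    curl --retry 3 -fL "${URL}" -o "/tmp/$(basename "${URL}")"
    break   # prebuilt wheel found; skip the source build
  fi
done
```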
@@ -962,7 +967,7 @@ ARG VLLM_VERSION

  ENV VLLM_VERSION=${VLLM_VERSION}

- RUN <<EOF
+ RUN --mount=type=bind,from=vllm-build-flashattention,source=/,target=/flashattention,rw <<EOF
  # vLLM

  IFS="." read -r CUDA_MAJOR CUDA_MINOR CUDA_PATCH <<< "${VLLM_TORCH_CUDA_VERSION}"
@@ -986,6 +991,8 @@ RUN <<EOF
  export TORCH_CUDA_ARCH_LIST="${VL_CUDA_ARCHS}"
  export COMPILE_CUSTOM_KERNELS=1
  export NVCC_THREADS=1
+ echo "Building vLLM with the following environment variables:"
+ env

  # Install
  git -C /tmp clone --recursive --shallow-submodules \
@@ -1047,6 +1054,9 @@ RUN --mount=type=bind,from=vllm-build-vllm,source=/,target=/vllm,rw <<EOF
  export MAX_JOBS="${CMAKE_MAX_JOBS}"
  export TORCH_CUDA_ARCH_LIST="${LC_CUDA_ARCHS}"
  export NVCC_THREADS=1
+ echo "Building LMCache with the following environment variables:"
+ env
+
  git -C /tmp clone --recursive --shallow-submodules \
  --depth 1 --branch v${VLLM_LMCACHE_VERSION} --single-branch \
  https://github.com/LMCache/LMCache.git lmcache
@@ -102,7 +102,6 @@ rules:
  ##
  - backend: "cuda"
  services:
- - "voxbox"
  - "vllm"
  - "sglang"
  args:
@@ -69,10 +69,10 @@
  # which is used to build the SGLang from source.
  ARG PYTHON_VERSION=3.12
  ARG CMAKE_MAX_JOBS
- ARG ROCM_VERSION=7.1.1
+ ARG ROCM_VERSION=7.0.2
  ARG ROCM_ARCHS
  ARG VLLM_BASE_IMAGE=gpustack/runner:rocm${ROCM_VERSION}-python${PYTHON_VERSION}
- ARG VLLM_VERSION=0.12.0
+ ARG VLLM_VERSION=0.13.0
  ARG VLLM_TORCH_VERSION=2.9.1
  ARG VLLM_TORCH_ROCM_VERSION=${ROCM_VERSION}
  ARG VLLM_TORCH_SOURCE=pytorch
@@ -80,7 +80,7 @@ ARG VLLM_BUILD_BASE_IMAGE=gpustack/runner:rocm${VLLM_TORCH_ROCM_VERSION}-python$
  ARG VLLM_TRITON_COMMIT=57c693b6
  ARG VLLM_FLASHATTENTION_VERSION=2.8.3
  ARG VLLM_AITER_VERSION=0.1.7.post5
- ARG VLLM_LMCACHE_VERSION=0.3.10.post1
+ ARG VLLM_LMCACHE_VERSION=0.3.11
  ARG VLLM_MOONCAKE_VERSION=0.3.7.post2
  ARG SGLANG_BASE_IMAGE=vllm
  ARG SGLANG_VERSION=0.5.6.post2
@@ -679,12 +679,12 @@ ARG VLLM_VERSION

  ENV VLLM_VERSION=${VLLM_VERSION}

- RUN --mount=type=bind,from=vllm-build-triton,source=/,target=/triton,rw \
- --mount=type=bind,from=vllm-build-flashattention,source=/,target=/flashattention,rw \
+ RUN --mount=type=bind,from=vllm-build-flashattention,source=/,target=/flashattention,rw \
  --mount=type=bind,from=vllm-build-aiter,source=/,target=/aiter,rw <<EOF
  # vLLM

  IFS="." read -r ROCM_MAJOR ROCM_MINOR ROCM_PATCH <<< "${VLLM_TORCH_ROCM_VERSION}"
+ IFS="." read -r VL_MAJOR VL_MINOR VL_PATCH <<< "${VLLM_VERSION}"

  CMAKE_MAX_JOBS="${CMAKE_MAX_JOBS}"
  if [[ -z "${CMAKE_MAX_JOBS}" ]]; then
@@ -697,6 +697,14 @@ RUN --mount=type=bind,from=vllm-build-triton,source=/,target=/triton,rw \
  if [[ -z "${VL_ROCM_ARCHS}" ]]; then
  if (( $(echo "${ROCM_MAJOR}.${ROCM_MINOR} < 7.0" | bc -l) )); then
  VL_ROCM_ARCHS="gfx908;gfx90a;gfx942;gfx1030;gfx1100"
+ if (( $(echo "${VL_MAJOR}.${VL_MINOR} == 0.13" | bc -l) )); then
+ # TODO(thxCode): Temporarily remove gfx1030 for vLLM ROCm build due to build error in ROCm 6.4.4.
+ # #15 134.9 /tmp/vllm/build/temp.linux-x86_64-cpython-312/csrc/sampler.hip:564:63: error: local memory (66032) exceeds limit (65536) in 'void vllm::topKPerRowDecode<1024, true, false, true>(float const*, int const*, int*, int, int, int, int, float*, int, int const*)'
+ # ##15 134.9 564 | static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowDecode(
+ # ##15 134.9 | ^
+ # ##15 134.9 16 warnings and 1 error generated when compiling for gfx1030.
+ VL_ROCM_ARCHS="gfx908;gfx90a;gfx942"
+ fi
  else
  VL_ROCM_ARCHS="gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151"
  fi
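The new `VL_MAJOR.VL_MINOR == 0.13` guard relies on `bc -l` emitting 1 or 0 for a true or false expression. A hedged standalone sketch of that comparison idiom, with example values filled in:

```bash
# Hedged sketch: bc -l prints 1 when the expression holds, and bash's (( ))
# treats a non-zero result as true, giving a cheap float version comparison.
ROCM_MAJOR=6; ROCM_MINOR=4
VL_MAJOR=0; VL_MINOR=13
if (( $(echo "${ROCM_MAJOR}.${ROCM_MINOR} < 7.0" | bc -l) )) \
   && (( $(echo "${VL_MAJOR}.${VL_MINOR} == 0.13" | bc -l) )); then
  echo "trimming gfx1030 from the ROCm arch list"
fi
```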
@@ -704,6 +712,8 @@ RUN --mount=type=bind,from=vllm-build-triton,source=/,target=/triton,rw \
  export MAX_JOBS="${CMAKE_MAX_JOBS}"
  export COMPILE_CUSTOM_KERNELS=1
  export PYTORCH_ROCM_ARCH="${VL_ROCM_ARCHS}"
+ echo "Building vLLM with the following environment variables:"
+ env

  # Build
  git -C /tmp clone --recursive --shallow-submodules \
@@ -712,7 +722,9 @@ RUN --mount=type=bind,from=vllm-build-triton,source=/,target=/triton,rw \
  pushd /tmp/vllm \
  && sed -i "s/\"torch ==.*\"/\"torch\"/g" /tmp/vllm/pyproject.toml \
  && sed -i "s/\"torch==.*\"/\"torch\"/g" /tmp/vllm/requirements/rocm-build.txt \
+ && sed -i "s/\"torchvision==.*\"/\"torchvision\"/g" /tmp/vllm/requirements/rocm-build.txt \
  && sed -i "s/\"torchaudio==.*\"/\"torchaudio\"/g" /tmp/vllm/requirements/rocm-build.txt \
+ && sed -i "s/\"triton==.*\"/\"triton\"/g" /tmp/vllm/requirements/rocm-build.txt \
  && VLLM_TARGET_DEVICE="rocm" python -v -m build --no-isolation --wheel \
  && tree -hs /tmp/vllm/dist \
  && mv /tmp/vllm/dist /workspace
@@ -769,6 +781,8 @@ RUN --mount=type=bind,from=vllm-build-vllm,source=/,target=/vllm,rw <<EOF
  export TORCH_DONT_CHECK_COMPILER_ABI=1
  export CXX=hipcc
  export BUILD_WITH_HIP=1
+ echo "Building LMCache with the following environment variables:"
+ env

  # Install LMCache
  git -C /tmp clone --recursive --shallow-submodules \
@@ -1403,7 +1417,7 @@ RUN --mount=type=bind,target=/workspace,rw <<EOF

  tree -hs /workspace/patches
  pushd $(pip show sglang | grep Location: | cut -d" " -f 2) \
- && patch -p1 < /workspace/patches/*.patch
+ && patch -p1 < /workspace/patches/sglang_*.patch
  EOF

  ## Entrypoint
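One caveat on the hunk above: `patch -p1 < glob` only works while the glob expands to a single file; with several `sglang_*.patch` files the redirect would break. A hedged sketch of the loop form that stays robust if more patches land:

```bash
# Hedged sketch: apply each matching patch explicitly instead of redirecting a
# glob, which the shell would expand to multiple words.
cd "$(pip show sglang | grep Location: | cut -d" " -f 2)"
for p in /workspace/patches/sglang_*.patch; do
  patch -p1 < "${p}"
done
```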
@@ -626,6 +626,28 @@
  "docker_image": "gpustack/runner:cuda12.9-sglang0.5.6.post2",
  "deprecated": false
  },
+ {
+ "backend": "cuda",
+ "backend_version": "12.9",
+ "original_backend_version": "12.9.1",
+ "backend_variant": "",
+ "service": "vllm",
+ "service_version": "0.13.0",
+ "platform": "linux/amd64",
+ "docker_image": "gpustack/runner:cuda12.9-vllm0.13.0",
+ "deprecated": false
+ },
+ {
+ "backend": "cuda",
+ "backend_version": "12.9",
+ "original_backend_version": "12.9.1",
+ "backend_variant": "",
+ "service": "vllm",
+ "service_version": "0.13.0",
+ "platform": "linux/arm64",
+ "docker_image": "gpustack/runner:cuda12.9-vllm0.13.0",
+ "deprecated": false
+ },
  {
  "backend": "cuda",
  "backend_version": "12.9",
@@ -747,6 +769,28 @@
  "docker_image": "gpustack/runner:cuda12.8-sglang0.5.4.post3",
  "deprecated": false
  },
+ {
+ "backend": "cuda",
+ "backend_version": "12.8",
+ "original_backend_version": "12.8.1",
+ "backend_variant": "",
+ "service": "vllm",
+ "service_version": "0.13.0",
+ "platform": "linux/amd64",
+ "docker_image": "gpustack/runner:cuda12.8-vllm0.13.0",
+ "deprecated": false
+ },
+ {
+ "backend": "cuda",
+ "backend_version": "12.8",
+ "original_backend_version": "12.8.1",
+ "backend_variant": "",
+ "service": "vllm",
+ "service_version": "0.13.0",
+ "platform": "linux/arm64",
+ "docker_image": "gpustack/runner:cuda12.8-vllm0.13.0",
+ "deprecated": false
+ },
  {
  "backend": "cuda",
  "backend_version": "12.8",
@@ -879,6 +923,28 @@
  "docker_image": "gpustack/runner:cuda12.8-vllm0.10.0",
  "deprecated": false
  },
+ {
+ "backend": "cuda",
+ "backend_version": "12.8",
+ "original_backend_version": "12.8.1",
+ "backend_variant": "",
+ "service": "voxbox",
+ "service_version": "0.0.21",
+ "platform": "linux/amd64",
+ "docker_image": "gpustack/runner:cuda12.8-voxbox0.0.21",
+ "deprecated": false
+ },
+ {
+ "backend": "cuda",
+ "backend_version": "12.8",
+ "original_backend_version": "12.8.1",
+ "backend_variant": "",
+ "service": "voxbox",
+ "service_version": "0.0.21",
+ "platform": "linux/arm64",
+ "docker_image": "gpustack/runner:cuda12.8-voxbox0.0.21",
+ "deprecated": false
+ },
  {
  "backend": "cuda",
  "backend_version": "12.8",
@@ -901,6 +967,28 @@
  "docker_image": "gpustack/runner:cuda12.8-voxbox0.0.20",
  "deprecated": false
  },
+ {
+ "backend": "cuda",
+ "backend_version": "12.6",
+ "original_backend_version": "12.6.3",
+ "backend_variant": "",
+ "service": "vllm",
+ "service_version": "0.13.0",
+ "platform": "linux/amd64",
+ "docker_image": "gpustack/runner:cuda12.6-vllm0.13.0",
+ "deprecated": false
+ },
+ {
+ "backend": "cuda",
+ "backend_version": "12.6",
+ "original_backend_version": "12.6.3",
+ "backend_variant": "",
+ "service": "vllm",
+ "service_version": "0.13.0",
+ "platform": "linux/arm64",
+ "docker_image": "gpustack/runner:cuda12.6-vllm0.13.0",
+ "deprecated": false
+ },
  {
  "backend": "cuda",
  "backend_version": "12.6",
@@ -1033,6 +1121,28 @@
  "docker_image": "gpustack/runner:cuda12.6-vllm0.10.0",
  "deprecated": false
  },
+ {
+ "backend": "cuda",
+ "backend_version": "12.6",
+ "original_backend_version": "12.6.3",
+ "backend_variant": "",
+ "service": "voxbox",
+ "service_version": "0.0.21",
+ "platform": "linux/amd64",
+ "docker_image": "gpustack/runner:cuda12.6-voxbox0.0.21",
+ "deprecated": false
+ },
+ {
+ "backend": "cuda",
+ "backend_version": "12.6",
+ "original_backend_version": "12.6.3",
+ "backend_variant": "",
+ "service": "voxbox",
+ "service_version": "0.0.21",
+ "platform": "linux/arm64",
+ "docker_image": "gpustack/runner:cuda12.6-voxbox0.0.21",
+ "deprecated": false
+ },
  {
  "backend": "cuda",
  "backend_version": "12.6",
@@ -1244,6 +1354,17 @@
  "docker_image": "gpustack/runner:rocm7.0-sglang0.5.6.post2",
  "deprecated": false
  },
+ {
+ "backend": "rocm",
+ "backend_version": "7.0",
+ "original_backend_version": "7.0.2",
+ "backend_variant": "",
+ "service": "vllm",
+ "service_version": "0.13.0",
+ "platform": "linux/amd64",
+ "docker_image": "gpustack/runner:rocm7.0-vllm0.13.0",
+ "deprecated": false
+ },
  {
  "backend": "rocm",
  "backend_version": "7.0",
@@ -1299,6 +1420,17 @@
  "docker_image": "gpustack/runner:rocm6.4-sglang0.5.5.post3",
  "deprecated": false
  },
+ {
+ "backend": "rocm",
+ "backend_version": "6.4",
+ "original_backend_version": "6.4.4",
+ "backend_variant": "",
+ "service": "vllm",
+ "service_version": "0.13.0",
+ "platform": "linux/amd64",
+ "docker_image": "gpustack/runner:rocm6.4-vllm0.13.0",
+ "deprecated": false
+ },
  {
  "backend": "rocm",
  "backend_version": "6.4",
@@ -1 +0,0 @@
- git_commit = "457b969"