vllm-ascend 0.11.0rc1__tar.gz → 0.11.0rc2__tar.gz
This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/_accuracy_test.yaml +1 -1
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/_e2e_test.yaml +2 -2
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/accuracy_test.yaml +1 -1
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/multi_node_test.yaml +1 -1
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/nightly_benchmarks.yaml +1 -1
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/release_whl.yml +7 -1
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/vllm_ascend_dist.yaml +2 -2
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/vllm_ascend_test.yaml +1 -1
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/vllm_ascend_test_310p.yaml +1 -1
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/vllm_ascend_test_full.yaml +1 -1
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/vllm_ascend_test_full_vllm_main.yaml +1 -1
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/vllm_ascend_test_models.yaml +1 -1
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/vllm_ascend_test_pd.yaml +2 -2
- vllm_ascend-0.11.0rc1/Dockerfile.a3 → vllm_ascend-0.11.0rc2/Dockerfile +3 -3
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/Dockerfile.310p +2 -2
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/Dockerfile.310p.openEuler +2 -2
- vllm_ascend-0.11.0rc1/Dockerfile → vllm_ascend-0.11.0rc2/Dockerfile.a3 +2 -2
- vllm_ascend-0.11.0rc1/Dockerfile.openEuler → vllm_ascend-0.11.0rc2/Dockerfile.a3.openEuler +2 -2
- vllm_ascend-0.11.0rc1/Dockerfile.a3.openEuler → vllm_ascend-0.11.0rc2/Dockerfile.openEuler +3 -3
- {vllm_ascend-0.11.0rc1/vllm_ascend.egg-info → vllm_ascend-0.11.0rc2}/PKG-INFO +1 -1
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/conf.py +1 -1
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/installation.md +10 -10
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/tutorials/multi_npu_qwen3_next.md +2 -2
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/user_guide/feature_guide/eplb_swift_balancer.md +7 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/examples/disaggregated_prefill_v1/load_balance_proxy_layerwise_server_example.py +10 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py +9 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/requirements-dev.txt +1 -1
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/nightly/multi_node/scripts/lws.yaml +2 -2
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py +3 -4
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/vllm_interface/vllm_test.cfg +1 -1
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/ops/test_rotary_embedding.py +9 -2
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/_version.py +3 -3
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/attention/attention_v1.py +9 -1
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/attention/mla_v1.py +2 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/compilation/acl_graph.py +20 -21
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/distributed/mooncake/transfer_engine.py +10 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/envs.py +0 -5
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/ops/common_fused_moe.py +8 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/ops/moe/token_dispatcher.py +4 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/ops/rotary_embedding.py +5 -1
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/patch/platform/patch_mamba_config.py +1 -1
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/platform.py +14 -1
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/quantization/quant_config.py +5 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/spec_decode/ngram_proposer.py +17 -14
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/utils.py +57 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/worker/model_runner_v1.py +5 -6
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2/vllm_ascend.egg-info}/PKG-INFO +1 -1
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend.egg-info/SOURCES.txt +0 -2
- vllm_ascend-0.11.0rc1/.github/workflows/_e2e_nightly.yaml +0 -115
- vllm_ascend-0.11.0rc1/.github/workflows/vllm_ascend_test_nightly.yaml +0 -105
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.gemini/config.yaml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/Dockerfile.buildwheel +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/ISSUE_TEMPLATE/100-documentation.yml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/ISSUE_TEMPLATE/110-user-story.yml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/ISSUE_TEMPLATE/200-installation.yml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/ISSUE_TEMPLATE/300-usage.yml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/ISSUE_TEMPLATE/400-bug-report.yml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/ISSUE_TEMPLATE/500-feature-request.yml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/ISSUE_TEMPLATE/600-new-model.yml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/ISSUE_TEMPLATE/700-performance-discussion.yml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/ISSUE_TEMPLATE/750-RFC.yml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/ISSUE_TEMPLATE/800-others.yml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/ISSUE_TEMPLATE/900-release-checklist.yml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/actionlint.yaml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/dependabot.yml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/format_pr_body.sh +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/labeler.yml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/format_pr_body.yaml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/image_310p_openeuler.yml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/image_310p_ubuntu.yml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/image_a3_openeuler.yml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/image_a3_ubuntu.yml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/image_openeuler.yml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/image_ubuntu.yml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/label_merge_conflict.yml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/labeler.yml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/matchers/actionlint.json +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/matchers/mypy.json +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/matchers/ruff.json +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/pre-commit.yml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/release_code.yml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/reminder_comment.yml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/vllm_ascend_doctest.yaml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.gitignore +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.pre-commit-config.yaml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.readthedocs.yaml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/CMakeLists.txt +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/CODE_OF_CONDUCT.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/CONTRIBUTING.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/DCO +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/LICENSE +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/README.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/README.zh.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/benchmarks/README.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/benchmarks/ops/ben_vocabparallelembedding.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/benchmarks/requirements-bench.txt +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/benchmarks/scripts/convert_json_to_markdown.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/benchmarks/scripts/perf_result_template.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/benchmarks/scripts/run-performance-benchmarks.sh +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/benchmarks/tests/latency-tests.json +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/benchmarks/tests/serving-tests.json +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/benchmarks/tests/throughput-tests.json +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/cmake/utils.cmake +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/codecov.yml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/collect_env.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/camem_allocator.cpp +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/kernels/bgmv_expand.cpp +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/kernels/bgmv_shrink.cpp +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/kernels/get_masked_input_and_mask_kernel.cpp +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/kernels/pos_encoding_kernels.cpp +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/kernels/sgmv_expand.cpp +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/kernels/sgmv_shrink.cpp +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/kernels/types.h +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/kernels/utils.h +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/mla_preprocess/op_host/mla_preprocess.h +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/mla_preprocess/op_host/tiling/mla_preprocess_tiling.h +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/mla_preprocess/op_kernel/kernel/common.h +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/mla_preprocess/op_kernel/kernel/common_func.h +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/mla_preprocess/op_kernel/kernel/hardware.h +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/mla_preprocess/op_kernel/kernel/iterator.h +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/mla_preprocess/op_kernel/kernel/iterators/gm_to_l1_iterator.inc +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/mla_preprocess/op_kernel/kernel/iterators/gm_to_ub_iterator.inc +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/mla_preprocess/op_kernel/kernel/iterators/l0c_to_gm_iterator.inc +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/mla_preprocess/op_kernel/kernel/iterators/l0c_to_l1_iterator.inc +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/mla_preprocess/op_kernel/kernel/iterators/l0c_to_ub_iterator.inc +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/mla_preprocess/op_kernel/kernel/iterators/l1_to_bt_iterator.inc +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/mla_preprocess/op_kernel/kernel/iterators/l1_to_fb_iterator.inc +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/mla_preprocess/op_kernel/kernel/iterators/l1_to_l0_iterator.inc +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/mla_preprocess/op_kernel/kernel/iterators/l1_to_ub_iterator.inc +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/mla_preprocess/op_kernel/kernel/kernel_utils.h +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/mla_preprocess/op_kernel/kernel/layout.h +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/mla_preprocess/op_kernel/kernel/mem.h +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/mla_preprocess/op_kernel/kernel/mma.h +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/mla_preprocess/op_kernel/kernel/set_fpc.h +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/mla_preprocess/op_kernel/kernel/simd.h +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/mla_preprocess/op_kernel/kernel/utils.h +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/mla_preprocess/op_kernel/mla_preprocess.h +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/mla_preprocess/op_kernel/mla_preprocess_kernel.cpp +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/mla_preprocess/op_kernel/mla_preprocess_mix_bf16.hpp +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/mla_preprocess/op_kernel/mla_preprocess_mix_fp16.hpp +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/ops.h +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/torch_binding.cpp +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/torch_binding_meta.cpp +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/csrc/utils.h +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/Makefile +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/README.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/requirements-docs.txt +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/requirements-test.txt +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/_templates/sections/header.html +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/assets/multi_node_dp_deepseek.png +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/assets/multi_node_dp_kimi.png +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/community/contributors.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/community/governance.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/community/user_stories/index.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/community/user_stories/llamafactory.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/community/versioning_policy.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/developer_guide/contribution/index.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/developer_guide/contribution/testing.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/developer_guide/evaluation/accuracy_report/DeepSeek-V2-Lite.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/developer_guide/evaluation/accuracy_report/Qwen2.5-VL-7B-Instruct.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/developer_guide/evaluation/accuracy_report/Qwen3-30B-A3B.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/developer_guide/evaluation/accuracy_report/Qwen3-8B-Base.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/developer_guide/evaluation/accuracy_report/index.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/developer_guide/evaluation/index.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/developer_guide/evaluation/using_evalscope.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/developer_guide/evaluation/using_lm_eval.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/developer_guide/evaluation/using_opencompass.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/developer_guide/feature_guide/ACL_Graph.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/developer_guide/feature_guide/ModelRunner_prepare_inputs.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/developer_guide/feature_guide/index.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/developer_guide/feature_guide/patch.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/developer_guide/modeling/adding_a_new_model.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/developer_guide/modeling/adding_a_new_multimodal_model.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/developer_guide/modeling/index.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/developer_guide/performance/index.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/developer_guide/performance/optimization_and_tuning.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/developer_guide/performance/performance_benchmark.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/developer_guide/performance/profile_execute_duration.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/faqs.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/index.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/community/contributors.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/community/governance.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/community/user_stories/index.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/community/user_stories/llamafactory.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/community/versioning_policy.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/contribution/index.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/contribution/testing.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/accuracy_report/index.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/index.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/using_evalscope.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/using_lm_eval.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/using_opencompass.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/feature_guide/index.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/feature_guide/patch.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/modeling/adding_a_new_model.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/modeling/adding_a_new_multimodal_model.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/modeling/index.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/performance/index.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/performance/performance_benchmark.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/performance/profile_execute_duration.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/faqs.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/index.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/installation.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/quick_start.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/index.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_node.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu_moge.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu_quantization.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu_qwen3_moe.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_node_300i.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu_audio.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu_multimodal.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu_qwen3_embedding.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/configuration/additional_config.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/configuration/env_vars.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/configuration/index.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/graph_mode.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/index.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/lora.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/quantization.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/sleep_mode.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/structured_output.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/release_notes.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/support_matrix/index.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/support_matrix/supported_features.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/support_matrix/supported_models.po +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/logos/vllm-ascend-logo-text-dark.png +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/logos/vllm-ascend-logo-text-light.png +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/quick_start.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/tutorials/index.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/tutorials/multi-node_dsv3.2.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/tutorials/multi_node.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/tutorials/multi_node_kimi.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/tutorials/multi_node_pd_disaggregation_llmdatadist.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/tutorials/multi_node_pd_disaggregation_mooncake.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/tutorials/multi_node_qwen3vl.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/tutorials/multi_node_ray.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/tutorials/multi_npu.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/tutorials/multi_npu_moge.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/tutorials/multi_npu_quantization.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/tutorials/multi_npu_qwen3_moe.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/tutorials/single_node_300i.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/tutorials/single_npu.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/tutorials/single_npu_audio.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/tutorials/single_npu_multimodal.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/tutorials/single_npu_qwen3_embedding.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/tutorials/single_npu_qwen3_quantization.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/user_guide/configuration/additional_config.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/user_guide/configuration/env_vars.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/user_guide/configuration/index.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/user_guide/feature_guide/graph_mode.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/user_guide/feature_guide/images/eplb_img.png +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/user_guide/feature_guide/images/structured_output_1.png +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/user_guide/feature_guide/index.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/user_guide/feature_guide/lora.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/user_guide/feature_guide/quantization.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/user_guide/feature_guide/sleep_mode.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/user_guide/feature_guide/structured_output.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/user_guide/release_notes.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/user_guide/support_matrix/index.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/user_guide/support_matrix/supported_features.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/user_guide/support_matrix/supported_models.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/examples/disaggregated_prefill_v1/README.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/examples/disaggregated_prefill_v1/gen_ranktable.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/examples/disaggregated_prefill_v1/gen_ranktable.sh +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/examples/disaggregated_prefill_v1/mooncake_connector_deployment_guide.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/examples/disaggregated_prefill_v1/mooncake_connector_store_deployment_guide.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/examples/disaggregated_prefill_v1/run_server.sh +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/examples/eplb/eplb_deepseek.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/examples/eplb/eplb_strategy.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/examples/external_online_dp/README.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/examples/external_online_dp/launch_online_dp.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/examples/external_online_dp/run_dp_template.sh +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/examples/offline_data_parallel.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/examples/offline_disaggregated_prefill_npu.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/examples/offline_dualbatch_overlap_npu.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/examples/offline_embed.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/examples/offline_external_launcher.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/examples/offline_inference_audio_language.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/examples/offline_inference_npu.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/examples/offline_inference_npu_tp2.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/examples/offline_inference_sleep_mode_npu.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/examples/offline_weight_load.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/examples/prompt_embedding_inference.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/examples/run_dp_server.sh +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/format.sh +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/mypy.ini +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/packages.txt +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/pyproject.toml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/requirements-lint.txt +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/requirements.txt +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/setup.cfg +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/setup.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/310p/test_offline_inference_310p.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/310p/test_offline_inference_parallel_310p.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/common.sh +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/conftest.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/doctests/001-quickstart-test.sh +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/doctests/002-pip-binary-installation-test.sh +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/model_utils.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/models/configs/DeepSeek-V2-Lite.yaml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/models/configs/Qwen2-Audio-7B-Instruct.yaml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/models/configs/Qwen2-VL-7B-Instruct.yaml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/models/configs/Qwen2.5-VL-7B-Instruct.yaml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/models/configs/Qwen3-30B-A3B.yaml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/models/configs/Qwen3-8B-Base.yaml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/models/configs/Qwen3-8B.yaml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/models/configs/Qwen3-VL-30B-A3B-Instruct.yaml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/models/configs/accuracy.txt +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/models/conftest.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/models/report_template.md +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/models/test_lm_eval_correctness.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/multicard/test_data_parallel.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/multicard/test_expert_parallel.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/multicard/test_external_launcher.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/multicard/test_full_graph_mode.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/multicard/test_fused_moe_allgather_ep.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/multicard/test_ilama_lora_tp2.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/multicard/test_offline_inference_distributed.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/multicard/test_pipeline_parallel.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/multicard/test_prefix_caching.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/multicard/test_qwen3_moe.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/multicard/test_single_request_aclgraph.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/multicard/test_torchair_graph_mode.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/multicard/test_weight_loader.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/nightly/models/test_qwen2_5_vl_7b.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/nightly/models/test_qwen3_32b.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/nightly/models/test_qwen3_32b_int8.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/nightly/multi_node/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/nightly/multi_node/config/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/nightly/multi_node/config/models/Qwen3-235B-A3B.yaml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/nightly/multi_node/config/multi_node_config.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/nightly/multi_node/config/utils.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/nightly/multi_node/scripts/run.sh +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/nightly/multi_node/test_multi_node.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/pd_disaggreate/run_edge_case_test.sh +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/pd_disaggreate/setup_pd.sh +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/pd_disaggreate/test_edge_cases.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/pd_disaggreate/test_pd_e2e.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/prompts/example.txt +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/run_disagg_pd.sh +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/run_doctests.sh +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/singlecard/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/singlecard/ops/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/singlecard/ops/test_bgmv_expand.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/singlecard/ops/test_bgmv_shrink.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/singlecard/ops/test_fused_moe.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/singlecard/ops/test_gating_top_k_softmax.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/singlecard/ops/test_mla_preprocess.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/singlecard/ops/test_rotary_embedding.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/singlecard/ops/test_vocabparallelembedding.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_torchair_correctness.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/singlecard/test_aclgraph.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/singlecard/test_aclgraph_mem.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/singlecard/test_ascend_scheduler.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/singlecard/test_bge_model.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/singlecard/test_camem.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/singlecard/test_chunked.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/singlecard/test_embedding.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/singlecard/test_embedding_aclgraph.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/singlecard/test_guided_decoding.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/singlecard/test_ilama_lora.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/singlecard/test_multistream_overlap_shared_expert.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/singlecard/test_profile_execute_duration.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/singlecard/test_quantization.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/singlecard/test_sampler.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/singlecard/test_vlm.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/utils.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/vllm_interface/singlecard/test_sampler.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/attention/test_attention_mask.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/attention/test_attention_v1.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/attention/test_mla_v1.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/base.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/compilation/test_acl_graph.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/conftest.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/core/test_schedule_config.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/core/test_scheduler.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/device_allocator/test_camem.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/distributed/device_communicators/test_pyhccl.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/distributed/device_communicators/test_pyhccl_wrapper.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/distributed/test_communicator.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/distributed/test_determin_expert_map_all.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/distributed/test_parallel_state.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/eplb/adaptor/test_abstract_adaptor.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/eplb/core/policy/test_policy_abstract.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/eplb/core/policy/test_policy_dynamic_ep.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/eplb/core/policy/test_policy_dynamic_ep_v2.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/eplb/core/policy/test_policy_factor.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/eplb/core/test_eplb_device_transfer_loader.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/eplb/core/test_eplb_utils.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/fake_weight/config.json +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/kv_connector/test_llmdatadist_connector.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/kv_connector/test_mooncake_connector.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/kv_connector/test_mooncake_layerwise_connector.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/kv_connector/test_remote_decode_lifecycle.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/kv_connector/test_remote_prefill_lifecycle.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/kv_connector/utils.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/models/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/models/conftest.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/models/test_qwen2_5_vl.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/models/test_qwen2_5_vl_without_padding.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/models/test_qwen2_vl.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/multistream/test_base.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/multistream/test_decorator.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/multistream/test_layers.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/multistream/test_metadata.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/multistream/test_ms_split.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/ops/expert_map.json +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/ops/test_activation.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/ops/test_comm_utils.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/ops/test_common_fused_moe.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/ops/test_expert_load_balancer.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/ops/test_fused_moe_prepare_and_finalize.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/ops/test_fused_ops.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/ops/test_layernorm.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/ops/test_linear.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/ops/test_moe_comm_method.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/ops/test_token_dispatcher.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/ops/test_vocab_parallel_embedding.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/patch/worker/patch_common/test_patch_distributed.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/patch/worker/patch_common/test_patch_minicpm.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/quantization/test_quant_config.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/quantization/test_utils.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/quantization/test_w4a4_flatquant_dynamic.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/quantization/test_w4a8_dynamic.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/quantization/test_w8a8.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/quantization/test_w8a8_dynamic.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/sample/logits_processor/test_builtin.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/sample/test_rejection_sampler.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/sample/test_sampler.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/test_ascend_config.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/test_envs.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/test_platform.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/test_utils.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/torchair/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/torchair/models/test_torchair_deepseek_mtp.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/torchair/models/test_torchair_deepseek_v2.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/torchair/ops/test_torchair_fused_moe.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/torchair/ops/test_torchair_rotary_embedding.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/torchair/quantization/test_torchair_w4a8_dynamic.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/torchair/quantization/test_torchair_w8a8_dynamic.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/torchair/test_torchair_attention.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/torchair/test_torchair_mla.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/torchair/test_utils.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/worker/test_input_batch.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/worker/test_model_runner_v1.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/ut/worker/test_worker_v1.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tools/actionlint.sh +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tools/aisbench.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tools/check_python_src_init.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tools/check_repo.sh +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tools/enforce_regex_import.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tools/mypy.sh +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tools/png-lint.sh +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tools/send_mm_request.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tools/shellcheck.sh +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tools/sphinx-lint.sh +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/typos.toml +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/ascend_config.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/ascend_forward_context.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/attention/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/attention/attention_mask.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/attention/sfa_v1.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/attention/utils.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/compilation/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/core/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/core/recompute_schedule_config.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/core/recompute_scheduler.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/core/schedule_config.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/core/scheduler.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/cpu_binding.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/device_allocator/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/device_allocator/camem.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/distributed/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/distributed/communicator.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/distributed/cpu_offload_connector.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/distributed/cpu_offload_manager/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/distributed/cpu_offload_manager/cpu_kv_cache_manager.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/distributed/cpu_offload_manager/metadata.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/distributed/device_communicators/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/distributed/device_communicators/pyhccl.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/distributed/device_communicators/pyhccl_wrapper.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/distributed/llmdatadist_c_mgr_connector.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/distributed/mooncake/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/distributed/mooncake/config_data.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/distributed/mooncake/kv_transfer.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/distributed/mooncake/mooncake_engine.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/distributed/mooncake/mooncake_store.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/distributed/mooncake/mooncake_store_connector_v1.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/distributed/mooncake_connector.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/distributed/mooncake_layerwise_connector.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/distributed/parallel_state.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/distributed/utils.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/eplb/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/eplb/adaptor/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/eplb/adaptor/abstract_adaptor.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/eplb/adaptor/vllm_adaptor.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/eplb/core/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/eplb/core/eplb_device_transfer_loader.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/eplb/core/eplb_utils.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/eplb/core/eplb_worker.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/eplb/core/policy/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/eplb/core/policy/policy_abstract.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/eplb/core/policy/policy_dynamic_ep.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/eplb/core/policy/policy_dynamic_ep_v2.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/eplb/core/policy/policy_factory.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/eplb/core/policy/policy_flashlb.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/eplb/core/policy/policy_random.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/eplb/eplb_updator.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/eplb/utils.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/lora/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/lora/lora_ops.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/lora/punica_npu.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/lora/utils.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/meta_registration.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/models/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/models/deepseek_v3_2.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/models/layers/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/models/layers/mla.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/models/layers/sfa.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/models/qwen2_5_omni_thinker.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/models/qwen2_5_vl.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/models/qwen2_5_vl_without_padding.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/models/qwen2_vl.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/models/qwen3_next.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/multistream/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/multistream/base.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/multistream/context.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/multistream/decorator.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/multistream/layers.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/multistream/metadata.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/multistream/ms_split.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/ops/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/ops/activation.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/ops/attention.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/ops/casual_conv1d.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/ops/expert_load_balancer.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/ops/fla.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/ops/layernorm.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/ops/linear.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/ops/linear_op.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/ops/moe/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/ops/moe/comm_utils.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/ops/moe/experts_selector.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/ops/moe/fused_moe_prepare_and_finalize.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/ops/moe/moe_comm_method.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/ops/moe/moe_mlp.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/ops/register_custom_ops.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/ops/sigmoid_gating.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/ops/vocab_parallel_embedding.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/ops/weight_prefetch.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/patch/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/patch/platform/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/patch/platform/patch_config.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/patch/platform/patch_core.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/patch/platform/patch_distributed.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/patch/platform/patch_message_queue.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/patch/platform/patch_multiproc_executor.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/patch/platform/patch_sched_yield.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/patch/worker/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/patch/worker/patch_attention_layer.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/patch/worker/patch_deepseek_mtp.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/patch/worker/patch_distributed.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/patch/worker/patch_logits.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/patch/worker/patch_minicpm.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/patch/worker/patch_multimodal_merge.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/patch/worker/patch_roberta.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/patch/worker/patch_triton.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/patch/worker/patch_weight_loader.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/quantization/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/quantization/utils.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/quantization/w4a4_flatquant_dynamic.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/quantization/w4a8_dynamic.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/quantization/w8a8.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/quantization/w8a8_dynamic.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/sample/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/sample/logits_processor/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/sample/logits_processor/builtin.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/sample/rejection_sampler.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/sample/sampler.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/spec_decode/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/spec_decode/eagle_proposer.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/spec_decode/interface.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/spec_decode/mtp_proposer.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/torchair/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/torchair/models/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/torchair/models/qwen2.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/torchair/models/qwen3_moe.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/torchair/models/torchair_deepseek_mtp.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/torchair/models/torchair_deepseek_v2.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/torchair/models/torchair_deepseek_v3.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/torchair/models/torchair_pangu_moe.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/torchair/ops/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/torchair/ops/sequence_parallel.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/torchair/ops/shared_weight_layer.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/torchair/ops/torchair_activation.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/torchair/ops/torchair_fused_moe.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/torchair/ops/torchair_layernorm.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/torchair/ops/torchair_rotary_embedding.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/torchair/ops/torchair_vocab_parallel_embedding.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/torchair/quantization/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/torchair/quantization/torchair_w4a8_dynamic.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/torchair/quantization/torchair_w8a8_dynamic.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/torchair/torchair_attention.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/torchair/torchair_mla.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/torchair/torchair_model_runner.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/torchair/torchair_sfa.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/torchair/torchair_worker.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/torchair/utils.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/worker/__init__.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/worker/block_table.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/worker/npu_input_batch.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend/worker/worker_v1.py +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend.egg-info/dependency_links.txt +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend.egg-info/entry_points.txt +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend.egg-info/requires.txt +0 -0
- {vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/vllm_ascend.egg-info/top_level.txt +0 -0
@@ -30,7 +30,7 @@ jobs:
     runs-on: ${{ inputs.runner }}
     name: ${{ inputs.model_name }} accuracy
     container:
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
     env:
       VLLM_USE_MODELSCOPE: True
       # 1. If version specified (work_dispatch), do specified branch accuracy test
@@ -106,8 +106,8 @@ jobs:
           # ------------------------------------ v1 spec decode test ------------------------------------ #
           pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
           pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_torchair_correctness.py
-          # Fix me: OOM error
-
+          # Fix me: test_eagle_correctness OOM error
+          pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py

           pytest -sv tests/e2e/singlecard/ops/

@@ -68,5 +68,5 @@ jobs:
     with:
       vllm: v0.11.0
       runner: linux-aarch64-${{ matrix.runner }}
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
       model_name: ${{ matrix.model_name }}
@@ -23,7 +23,7 @@ jobs:
     # This is a runner with no NPU for k8s controller
     runs-on: linux-aarch64-a3-0
     container:
-      image: m.daocloud.io/quay.io/ascend/cann:8.3.
+      image: m.daocloud.io/quay.io/ascend/cann:8.3.rc2-a3-ubuntu22.04-py3.11
     env:
       KUBECONFIG: /tmp/kubeconfig
       KUBECTL: /root/.cache/.kube/kubectl
@@ -56,7 +56,7 @@ jobs:
       vllm_use_v1: 1
       max-parallel: 1
     container:
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
     volumes:
       - /usr/local/dcmi:/usr/local/dcmi
       - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
@@ -57,7 +57,13 @@ jobs:
       - name: Print
         run: |
           lscpu
-
+
+      - name: Free up disk space
+        uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
+        with:
+          tool-cache: true
+          docker-images: false
+
       - name: Build wheel
         run: |
           ls
@@ -47,7 +47,7 @@ jobs:
     name: vLLM Ascend test
     runs-on: ${{ matrix.os }}
     container:
-      image: m.daocloud.io/quay.io/ascend/cann:8.3.
+      image: m.daocloud.io/quay.io/ascend/cann:8.3.rc2-a3-ubuntu22.04-py3.11
     env:
       DEBIAN_FRONTEND: noninteractive
     steps:
@@ -97,4 +97,4 @@ jobs:
           VLLM_USE_MODELSCOPE: True
         run: |
           # TODO: enable more tests
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
+          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
@@ -145,5 +145,5 @@ jobs:
     with:
       vllm: ${{ matrix.vllm_version }}
       runner: linux-aarch64-a2
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
       type: light
{vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/vllm_ascend_test_310p.yaml
RENAMED
@@ -58,7 +58,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     container:
       # TODO(yikun): Remove m.daocloud.io prefix when infra proxy ready
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-310p-ubuntu22.04-py3.11
     env:
       VLLM_LOGGING_LEVEL: ERROR
       VLLM_USE_MODELSCOPE: True
{vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/vllm_ascend_test_full.yaml
RENAMED
@@ -76,5 +76,5 @@ jobs:
     with:
       vllm: ${{ matrix.vllm_version }}
       runner: linux-aarch64-a2
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
       type: full
@@ -41,5 +41,5 @@ jobs:
     with:
       vllm: main
       runner: linux-aarch64-a2
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
       type: full
{vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/.github/workflows/vllm_ascend_test_models.yaml
RENAMED
@@ -79,7 +79,7 @@ jobs:
     with:
       vllm: v0.11.0
       runner: linux-aarch64-${{ matrix.runner }}
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
       model_name: ${{ matrix.model_name }}
       upload: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.vllm-ascend-version == 'latest' }}

@@ -49,7 +49,7 @@ jobs:
     runs-on: linux-arm64-npu-static-8

     container:
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
       volumes:
         - /usr/local/dcmi:/usr/local/dcmi
         - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
@@ -109,4 +109,4 @@ jobs:
       - name: Run vllm-project/vllm-ascend PD Disaggregation edge test
         run: |
          git config --global --add safe.directory/__w/vllm-ascend/vllm-ascend
-          bash tests/e2e/pd_disaggreate/run_edge_case_test.sh
+          bash tests/e2e/pd_disaggreate/run_edge_case_test.sh
@@ -15,7 +15,7 @@
 # This file is a part of the vllm-ascend project.
 #

-FROM quay.io/ascend/cann:8.3.
+FROM quay.io/ascend/cann:8.3.rc2-910b-ubuntu22.04-py3.11

 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
 ARG COMPILE_CUSTOM_KERNELS=1
@@ -40,7 +40,7 @@ ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 ARG VLLM_TAG=v0.11.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
-RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
+RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip uninstall -y triton && \
     python3 -m pip cache purge

@@ -57,4 +57,4 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
 RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
     python3 -m pip cache purge

-CMD ["/bin/bash"]
+CMD ["/bin/bash"]
@@ -15,7 +15,7 @@
 # This file is a part of the vllm-ascend project.
 #

-FROM quay.io/ascend/cann:8.3.
+FROM quay.io/ascend/cann:8.3.rc2-310p-ubuntu22.04-py3.11

 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
 ARG COMPILE_CUSTOM_KERNELS=1
@@ -40,7 +40,7 @@ ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 ARG VLLM_TAG=v0.11.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
-RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
+RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip uninstall -y triton && \
     python3 -m pip cache purge

@@ -15,7 +15,7 @@
 # This file is a part of the vllm-ascend project.
 #

-FROM quay.io/ascend/cann:8.3.
+FROM quay.io/ascend/cann:8.3.rc2-310p-openeuler24.03-py3.11

 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
 ARG COMPILE_CUSTOM_KERNELS=1
@@ -38,7 +38,7 @@ ARG VLLM_TAG=v0.11.0

 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
-RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
+RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip uninstall -y triton && \
     python3 -m pip cache purge

@@ -15,7 +15,7 @@
 # This file is a part of the vllm-ascend project.
 #

-FROM quay.io/ascend/cann:8.3.
+FROM quay.io/ascend/cann:8.3.rc2-a3-ubuntu22.04-py3.11

 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
 ARG COMPILE_CUSTOM_KERNELS=1
@@ -40,7 +40,7 @@ ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 ARG VLLM_TAG=v0.11.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
-RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
+RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip uninstall -y triton && \
     python3 -m pip cache purge

@@ -15,7 +15,7 @@
 # This file is a part of the vllm-ascend project.
 #

-FROM quay.io/ascend/cann:8.3.
+FROM quay.io/ascend/cann:8.3.rc2-a3-openeuler24.03-py3.11

 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
 ARG COMPILE_CUSTOM_KERNELS=1
@@ -38,7 +38,7 @@ ARG VLLM_TAG=v0.11.0

 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
-RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
+RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip uninstall -y triton && \
     python3 -m pip cache purge

@@ -15,7 +15,7 @@
 # This file is a part of the vllm-ascend project.
 #

-FROM quay.io/ascend/cann:8.3.
+FROM quay.io/ascend/cann:8.3.rc2-910b-openeuler24.03-py3.11

 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
 ARG COMPILE_CUSTOM_KERNELS=1
@@ -38,7 +38,7 @@ ARG VLLM_TAG=v0.11.0

 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
-RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
+RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip uninstall -y triton && \
     python3 -m pip cache purge

@@ -55,4 +55,4 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
 RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
     python3 -m pip cache purge

-CMD ["/bin/bash"]
+CMD ["/bin/bash"]
@@ -75,7 +75,7 @@ myst_substitutions = {
     'pip_vllm_ascend_version': "0.11.0rc0",
     'pip_vllm_version': "0.11.0",
     # CANN image tag
-    'cann_image_tag': "8.3.
+    'cann_image_tag': "8.3.rc2-910b-ubuntu22.04-py3.11",
     # vllm version in ci
     'ci_vllm_version': 'v0.11.0rc3',
 }
@@ -79,19 +79,19 @@ source vllm-ascend-env/bin/activate
 pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple attrs 'numpy<2.0.0' decorator sympy cffi pyyaml pathlib2 psutil protobuf scipy requests absl-py wheel typing_extensions

 # Download and install the CANN package.
-wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.3.
-chmod +x ./Ascend-cann-toolkit_8.3.
-./Ascend-cann-toolkit_8.3.
-# https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Milan-ASL/Milan-ASL%20V100R001C22B800TP052/Ascend-cann-kernels-910b_8.3.
+wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.3.RC2/Ascend-cann-toolkit_8.3.RC2_linux-"$(uname -i)".run
+chmod +x ./Ascend-cann-toolkit_8.3.RC2_linux-"$(uname -i)".run
+./Ascend-cann-toolkit_8.3.RC2_linux-"$(uname -i)".run --full
+# https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Milan-ASL/Milan-ASL%20V100R001C22B800TP052/Ascend-cann-kernels-910b_8.3.rc2_linux-aarch64.run

 source /usr/local/Ascend/ascend-toolkit/set_env.sh
-wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.3.
-chmod +x ./Ascend-cann-kernels-910b_8.3.
-./Ascend-cann-kernels-910b_8.3.
+wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.3.RC2/Ascend-cann-kernels-910b_8.3.RC2_linux-"$(uname -i)".run
+chmod +x ./Ascend-cann-kernels-910b_8.3.RC2_linux-"$(uname -i)".run
+./Ascend-cann-kernels-910b_8.3.RC2_linux-"$(uname -i)".run --install

-wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.3.
-chmod +x ./Ascend-cann-nnal_8.3.
-./Ascend-cann-nnal_8.3.
+wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.3.RC2/Ascend-cann-nnal_8.3.RC2_linux-"$(uname -i)".run
+chmod +x ./Ascend-cann-nnal_8.3.RC2_linux-"$(uname -i)".run
+./Ascend-cann-nnal_8.3.RC2_linux-"$(uname -i)".run --install

 source /usr/local/Ascend/nnal/atb/set_env.sh
 ```
{vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/docs/source/tutorials/multi_npu_qwen3_next.md
RENAMED
@@ -51,7 +51,7 @@ Install the Ascend BiSheng toolkit:
 wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/Ascend-BiSheng-toolkit_aarch64.run
 chmod a+x Ascend-BiSheng-toolkit_aarch64.run
 ./Ascend-BiSheng-toolkit_aarch64.run --install
-source /usr/local/Ascend/8.3.
+source /usr/local/Ascend/8.3.RC2/bisheng_toolkit/set_env.sh
 ```

 Install Triton Ascend:
@@ -75,7 +75,7 @@ Coming soon ...
 Please make sure you have already executed the command:

 ```bash
-source /usr/local/Ascend/8.3.
+source /usr/local/Ascend/8.3.RC2/bisheng_toolkit/set_env.sh
 ```

 :::::{tab-set}
@@ -12,6 +12,13 @@ Expert balancing for MoE models in LLM serving is essential for optimal performa
 - Adaptive Scaling: Automatically adjusts to workload fluctuations while maintaining stable performance.
 - Fault Tolerance: Redundant expert placement ensures system resilience during hardware failures.

+## Support Scenarios
+
+### Models:
+DeepseekV3/V3.1/R1、Qwen3-MOE
+### MOE QuantType:
+W8A8-dynamic
+
 ## How to Use EPLB

 ### Dynamic EPLB
@@ -88,6 +88,7 @@ import argparse
 import asyncio
 import functools
 import heapq
+import ipaddress
 import os
 import sys
 import threading
@@ -116,6 +117,12 @@ class ServerState:
         self.host = host
         self.port = port
         self.url = f'http://{host}:{port}/v1'
+        try:
+            ip = ipaddress.ip_address(self.host)
+            if isinstance(ip, ipaddress.IPv6Address):
+                self.url = f'http://[{host}]:{port}/v1'
+        except Exception:
+            pass
         self.client = httpx.AsyncClient(timeout=None,
                                         base_url=self.url,
                                         limits=httpx.Limits(
@@ -356,6 +363,9 @@ async def send_request_to_service(client: httpx.AsyncClient,
     req_data = req_data.copy()
     req_data["stream"] = False
     req_data["max_tokens"] = 1
+    req_data["min_tokens"] = 1
+    if "max_completion_tokens" in req_data:
+        req_data["max_completion_tokens"] = 1
     if "stream_options" in req_data:
         del req_data["stream_options"]
     headers = {
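As context for the `ipaddress` change above: an IPv6 literal has to be wrapped in square brackets to form a valid URL authority, while hostnames and IPv4 addresses are left untouched. A minimal standalone sketch of the same check (the helper name and the `/v1` suffix are illustrative, not part of the proxy script):

import ipaddress

def base_url(host: str, port: int) -> str:
    # Bracket IPv6 literals ("::1" -> "[::1]") so the URL parses correctly.
    try:
        if isinstance(ipaddress.ip_address(host), ipaddress.IPv6Address):
            return f'http://[{host}]:{port}/v1'
    except ValueError:
        pass  # not an IP literal (e.g. a hostname); use the plain form
    return f'http://{host}:{port}/v1'

# base_url("::1", 8000)       -> 'http://[::1]:8000/v1'
# base_url("127.0.0.1", 8000) -> 'http://127.0.0.1:8000/v1'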
@@ -88,6 +88,7 @@ import argparse
 import asyncio
 import functools
 import heapq
+import ipaddress
 import json
 import os
 import sys
@@ -118,6 +119,12 @@ class ServerState:
         self.host = host
         self.port = port
         self.url = f'http://{host}:{port}/v1'
+        try:
+            ip = ipaddress.ip_address(self.host)
+            if isinstance(ip, ipaddress.IPv6Address):
+                self.url = f'http://[{host}]:{port}/v1'
+        except Exception:
+            pass
         self.client = httpx.AsyncClient(timeout=None,
                                         base_url=self.url,
                                         limits=httpx.Limits(
@@ -366,6 +373,8 @@ async def send_request_to_service(client: httpx.AsyncClient,
     req_data["stream"] = False
     req_data["max_tokens"] = 1
     req_data["min_tokens"] = 1
+    if "max_completion_tokens" in req_data:
+        req_data["max_completion_tokens"] = 1
     if "stream_options" in req_data:
         del req_data["stream_options"]
     headers = {
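Both proxy examples clamp the request that is forwarded to the prefill instance so it generates exactly one token; the new lines also cap `max_completion_tokens`, which chat-completions clients may send instead of `max_tokens`. A hedged, standalone sketch of the same mutation on a plain request dict (for illustration only, not the scripts' exact helper):

def clamp_for_prefill(req_data: dict) -> dict:
    # Copy so the decode-side request keeps its original sampling limits.
    req_data = req_data.copy()
    req_data["stream"] = False
    req_data["max_tokens"] = 1
    req_data["min_tokens"] = 1
    if "max_completion_tokens" in req_data:  # chat-completions style field
        req_data["max_completion_tokens"] = 1
    if "stream_options" in req_data:
        del req_data["stream_options"]
    return req_data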
{vllm_ascend-0.11.0rc1 → vllm_ascend-0.11.0rc2}/tests/e2e/nightly/multi_node/scripts/lws.yaml
RENAMED
@@ -15,7 +15,7 @@ spec:
     spec:
       containers:
         - name: vllm-leader
-          image: m.daocloud.io/quay.io/ascend/cann:8.3.
+          image: m.daocloud.io/quay.io/ascend/cann:8.3.rc2-a3-ubuntu22.04-py3.11
           env:
             - name: WORKSPACE
               value: "/root/workspace"
@@ -70,7 +70,7 @@ spec:
     spec:
       containers:
         - name: vllm-worker
-          image: m.daocloud.io/quay.io/ascend/cann:8.3.
+          image: m.daocloud.io/quay.io/ascend/cann:8.3.rc2-a3-ubuntu22.04-py3.11
           env:
             - name: WORKSPACE
               value: "/root/workspace"
@@ -13,7 +13,7 @@ from tests.e2e.conftest import VllmRunner
 @pytest.fixture
 def test_prompts():
     prompt_types = ["repeat", "sentence"]
-    num_prompts =
+    num_prompts = 100
     prompts = []

     random.seed(0)
@@ -70,7 +70,6 @@ def test_ngram_correctness(
     Compare the outputs of a original LLM and a speculative LLM
     should be the same when using ngram speculative decoding.
     '''
-    pytest.skip("Not current support for the test.")
     ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=False)
     ref_outputs = ref_llm.chat(test_prompts, sampling_config)
     del ref_llm
@@ -96,7 +95,7 @@ def test_ngram_correctness(

     # Heuristic: expect at least 70% of the prompts to match exactly
     # Upon failure, inspect the outputs to check for inaccuracy.
-    assert matches > int(0.
+    assert matches > int(0.66 * len(ref_outputs))


 @pytest.mark.parametrize("use_eagle3", [False, True], ids=["eagle", "eagle3"])
@@ -110,7 +109,7 @@ def test_eagle_correctness(
     Compare the outputs of a original LLM and a speculative LLM
     should be the same when using eagle speculative decoding.
     '''
-
+    pytest.skip("exist OOM error")
     ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=False)
     ref_outputs = ref_llm.chat(test_prompts, sampling_config)
     del ref_llm
@@ -1,2 +1,2 @@
 # Base docker image used to build the vllm-ascend e2e test image, which is built in the vLLM repository
-BASE_IMAGE_NAME="quay.io/ascend/cann:8.3.
+BASE_IMAGE_NAME="quay.io/ascend/cann:8.3.rc2-910b-ubuntu22.04-py3.11"
@@ -7,6 +7,7 @@ from transformers.configuration_utils import PretrainedConfig
 from vllm.config import ModelConfig, VllmConfig
 from vllm.model_executor.layers.rotary_embedding import (
     DeepseekScalingRotaryEmbedding, MRotaryEmbedding, RotaryEmbedding)
+from vllm.platforms import CpuArchEnum

 from tests.ut.base import TestBase
 from vllm_ascend.ascend_forward_context import set_ascend_forward_context
@@ -424,11 +425,14 @@ class TestAscendMRotaryEmbedding(unittest.TestCase):
         return vllm_config

     @patch('torch_npu.npu_mrope')
+    @patch('vllm_ascend.platform.NPUPlatform.get_cpu_architecture')
     @patch('vllm.config.ModelConfig.__post_init__', MagicMock())
     @patch('vllm.config.VllmConfig.__post_init__', MagicMock())
     @patch('vllm.distributed.parallel_state._DP', MagicMock(world_size=1))
     @patch('vllm.distributed.parallel_state._TP', MagicMock(world_size=1))
-    def test_forward_oot_1d_positions(self, mock_npu_mrope):
+    def test_forward_oot_1d_positions(self, mock_cpu_arc, mock_npu_mrope):
+        mock_cpu_arc.return_value = CpuArchEnum.ARM
+
         mock_npu_mrope.return_value = (torch.zeros_like(self.query),
                                        torch.zeros_like(self.key))

@@ -443,11 +447,14 @@ class TestAscendMRotaryEmbedding(unittest.TestCase):
         self.assertEqual(result_q.shape, self.query.shape)

     @patch('torch_npu.npu_mrope')
+    @patch('vllm_ascend.platform.NPUPlatform.get_cpu_architecture')
     @patch('vllm.config.ModelConfig.__post_init__', MagicMock())
     @patch('vllm.config.VllmConfig.__post_init__', MagicMock())
     @patch('vllm.distributed.parallel_state._DP', MagicMock(world_size=1))
     @patch('vllm.distributed.parallel_state._TP', MagicMock(world_size=1))
-    def test_forward_oot_2d_positions(self, mock_npu_mrope):
+    def test_forward_oot_2d_positions(self, mock_cpu_arc, mock_npu_mrope):
+        mock_cpu_arc.return_value = CpuArchEnum.ARM
+
         mock_npu_mrope.return_value = (torch.zeros_like(self.query),
                                        torch.zeros_like(self.key))

@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID

-__version__ = version = '0.11.
-__version_tuple__ = version_tuple = (0, 11, 0, '
+__version__ = version = '0.11.0rc2'
+__version_tuple__ = version_tuple = (0, 11, 0, 'rc2')

-__commit_id__ = commit_id = '
+__commit_id__ = commit_id = 'ga2e4c3fe7'
@@ -115,7 +115,7 @@ class AscendAttentionBackend(AttentionBackend):

     @staticmethod
     def get_supported_block_size() -> list[int]:
-        return [
+        return [128]


 class AscendAttentionState(Enum):
@@ -191,6 +191,14 @@ class AscendAttentionMetadataBuilder:
         self.max_num_blocks_per_req = cdiv(
             self.model_config.max_model_len,
             AscendAttentionBackend.get_supported_block_size()[0])
+        self.speculative_config = vllm_config.speculative_config
+        self.decode_threshold = 1
+        if self.speculative_config:
+            spec_token_num = self.speculative_config.num_speculative_tokens
+            self.decode_threshold += spec_token_num
+            assert self.decode_threshold <= 16, f"decode_threshold exceeded \
+                npu_fused_infer_attention_score TND layout's limit of 16, \
+                got {self.decode_threshold}"

     def reorder_batch(self, input_batch,
                       scheduler_output: "SchedulerOutput") -> bool:
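The new builder fields above tie the decode path to speculative decoding: a decode step may schedule up to 1 + num_speculative_tokens tokens per request, and the assertion guards the 16-token TND-layout limit of `npu_fused_infer_attention_score` quoted in the message. A small illustrative calculation (the concrete numbers below are assumptions, not values from the source):

# Assumed example: a speculative decoder proposing 3 draft tokens per step.
num_speculative_tokens = 3
decode_threshold = 1 + num_speculative_tokens      # 4 tokens scheduled per decode step
assert decode_threshold <= 16                      # TND layout limit cited in the diff

# Block bookkeeping from the same builder: with the single supported block
# size of 128, an assumed 32768-token max_model_len needs ceil(32768 / 128) = 256 blocks.
max_model_len, block_size = 32768, 128
max_num_blocks_per_req = -(-max_model_len // block_size)   # ceiling division -> 256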
@@ -1166,6 +1166,8 @@ class AscendMLAImpl(MLAAttentionImpl):
                 dim=-1,
             )
             q_c = self.q_a_layernorm(q_c)
+            # allgather need contiguous data
+            kv_no_split = kv_no_split.contiguous()
         else:
             q_c = hidden_states
             kv_no_split = self.kv_a_proj_with_mqa(hidden_states)[0]
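The `.contiguous()` call added above matters because the tensor being gathered appears to come from a split along the last dimension (the `dim=-1` context just above), so it is typically a strided view, and collective ops such as allgather expect a dense buffer. A minimal sketch of the pattern in plain PyTorch (shapes are arbitrary, for illustration only):

import torch

x = torch.randn(4, 8)
q_c, kv_no_split = x.split([5, 3], dim=-1)   # both halves are strided views into x
assert not kv_no_split.is_contiguous()

kv_no_split = kv_no_split.contiguous()       # densify before handing to a collective
assert kv_no_split.is_contiguous()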
@@ -213,26 +213,24 @@ def update_attn_params(update_stream, forward_context, runtime_shape):
     ) = param
     seq_lens = forward_context.attn_metadata[key].seq_lens

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    with torch.npu.stream(update_stream):
+    # When using FULL_DECODE_ONLY, there are some rare bugs for FULL_DECODE_ONLY
+    # mode with GQA. This is triggered by getting workspace for _npu_paged_attention
+    # in torch_npu. On some rare cases, _npu_paged_attention with smaller seq_lens
+    # might encounter a bigger workspace, while currently we use max_model_len to
+    # calculate max workspace in capturing. So additional get_workspace is added
+    # here to avoid such bugs.
+    # TODO(Angazenn): we will remove this once _npu_paged_attention is fully
+    # replaced by npu_fused_infer_attention_score which does not contain such bugs.
+    workspace = torch_npu._npu_paged_attention_get_workspace(
+        query=query,
+        key_cache=key_cache,
+        value_cache=value_cache,
+        num_kv_heads=num_kv_heads,
+        num_heads=num_heads,
+        scale_value=scale,
+        block_table=block_table,
+        context_lens=seq_lens,
+        out=output)
     torch.npu.graph_task_update_begin(update_stream, handle)
     torch_npu._npu_paged_attention(query=query,
                                    key_cache=key_cache,
@@ -280,7 +278,8 @@ def update_mla_attn_params(update_stream, forward_context, runtime_shape,
     else:
         seq_lens_list = seq_lens_list + [0] * (runtime_shape -
                                                len(seq_lens_list))
-
+
+    torch.npu.graph_task_update_begin(update_stream, handle)

     torch_npu.npu_fused_infer_attention_score.out(
         q_nope,