vllm-ascend 0.9.1rc1__tar.gz → 0.9.1rc2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/Dockerfile.buildwheel +5 -8
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/dependabot.yml +0 -3
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/workflows/accuracy_test.yaml +1 -3
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/workflows/image_openeuler.yml +6 -15
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/workflows/image_ubuntu.yml +4 -10
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/workflows/nightly_benchmarks.yaml +5 -10
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/workflows/release_code.yml +1 -1
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/workflows/release_whl.yml +35 -6
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/workflows/vllm_ascend_doctest.yaml +8 -13
- vllm_ascend-0.9.1rc2/.github/workflows/vllm_ascend_test.yaml +242 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/workflows/vllm_ascend_test_long_term.yaml +13 -14
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/workflows/vllm_ascend_test_pd.yaml +6 -7
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/Dockerfile +2 -3
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/Dockerfile.openEuler +2 -3
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/PKG-INFO +3 -3
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/README.md +2 -2
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/README.zh.md +2 -2
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/csrc/kernels/get_masked_input_and_mask_kernel.cpp +30 -63
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/csrc/kernels/pos_encoding_kernels.cpp +5 -15
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/csrc/kernels/utils.h +1 -3
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/docs/requirements-test.txt +1 -1
- vllm_ascend-0.9.1rc2/docs/source/assets/multi_node_dp.png +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/docs/source/community/contributors.md +18 -1
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/docs/source/community/governance.md +2 -2
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/docs/source/community/user_stories/llamafactory.md +1 -1
- {vllm_ascend-0.9.1rc1/docs/source/developer_guide → vllm_ascend-0.9.1rc2/docs/source/community}/versioning_policy.md +4 -2
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/docs/source/conf.py +5 -5
- vllm_ascend-0.9.1rc1/docs/source/developer_guide/contributing.md → vllm_ascend-0.9.1rc2/docs/source/developer_guide/contribution/index.md +44 -46
- vllm_ascend-0.9.1rc2/docs/source/developer_guide/contribution/testing.md +285 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/docs/source/developer_guide/evaluation/accuracy_report/index.md +1 -1
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/docs/source/developer_guide/evaluation/index.md +2 -9
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/docs/source/developer_guide/evaluation/using_evalscope.md +3 -1
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/docs/source/developer_guide/evaluation/using_lm_eval.md +2 -1
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/docs/source/developer_guide/evaluation/using_opencompass.md +4 -1
- vllm_ascend-0.9.1rc2/docs/source/developer_guide/feature_guide/index.md +9 -0
- vllm_ascend-0.9.1rc2/docs/source/developer_guide/feature_guide/patch.md +85 -0
- vllm_ascend-0.9.1rc2/docs/source/developer_guide/modeling/adding_a_new_model.md +259 -0
- vllm_ascend-0.9.1rc2/docs/source/developer_guide/modeling/adding_a_new_multimodal_model.md +3 -0
- vllm_ascend-0.9.1rc2/docs/source/developer_guide/modeling/index.md +10 -0
- vllm_ascend-0.9.1rc2/docs/source/developer_guide/performance/index.md +9 -0
- vllm_ascend-0.9.1rc2/docs/source/developer_guide/performance/optimization_and_tuning.md +183 -0
- {vllm_ascend-0.9.1rc1/docs/source/developer_guide/evaluation → vllm_ascend-0.9.1rc2/docs/source/developer_guide/performance}/performance_benchmark.md +7 -0
- {vllm_ascend-0.9.1rc1/docs/source/developer_guide/evaluation → vllm_ascend-0.9.1rc2/docs/source/developer_guide/performance}/profile_execute_duration.md +2 -1
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/docs/source/faqs.md +53 -9
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/docs/source/index.md +8 -8
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/docs/source/installation.md +23 -18
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/docs/source/quick_start.md +14 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/docs/source/tutorials/index.md +1 -2
- vllm_ascend-0.9.1rc2/docs/source/tutorials/multi_node.md +203 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/docs/source/tutorials/multi_npu_quantization.md +6 -3
- vllm_ascend-0.9.1rc2/docs/source/tutorials/multi_npu_qwen3_moe.md +108 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/docs/source/tutorials/single_npu.md +72 -3
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/docs/source/tutorials/single_npu_multimodal.md +4 -3
- {vllm_ascend-0.9.1rc1/docs/source/user_guide → vllm_ascend-0.9.1rc2/docs/source/user_guide/configuration}/additional_config.md +10 -11
- {vllm_ascend-0.9.1rc1/docs/source/user_guide → vllm_ascend-0.9.1rc2/docs/source/user_guide/configuration}/env_vars.md +1 -1
- vllm_ascend-0.9.1rc2/docs/source/user_guide/configuration/index.md +10 -0
- {vllm_ascend-0.9.1rc1/docs/source/user_guide → vllm_ascend-0.9.1rc2/docs/source/user_guide/feature_guide}/graph_mode.md +4 -9
- vllm_ascend-0.9.1rc2/docs/source/user_guide/feature_guide/images/structured_output_1.png +0 -0
- vllm_ascend-0.9.1rc2/docs/source/user_guide/feature_guide/index.md +13 -0
- vllm_ascend-0.9.1rc2/docs/source/user_guide/feature_guide/lora.md +8 -0
- {vllm_ascend-0.9.1rc1/docs/source/user_guide → vllm_ascend-0.9.1rc2/docs/source/user_guide/feature_guide}/quantization.md +21 -2
- vllm_ascend-0.9.1rc2/docs/source/user_guide/feature_guide/sleep_mode.md +114 -0
- vllm_ascend-0.9.1rc2/docs/source/user_guide/feature_guide/structured_output.md +163 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/docs/source/user_guide/release_notes.md +31 -7
- vllm_ascend-0.9.1rc2/docs/source/user_guide/support_matrix/index.md +10 -0
- vllm_ascend-0.9.1rc2/docs/source/user_guide/support_matrix/supported_features.md +51 -0
- {vllm_ascend-0.9.1rc1/docs/source/user_guide → vllm_ascend-0.9.1rc2/docs/source/user_guide/support_matrix}/supported_models.md +3 -2
- vllm_ascend-0.9.1rc2/examples/disaggregate_prefill_v1/README.md +246 -0
- vllm_ascend-0.9.1rc2/examples/disaggregate_prefill_v1/gen_ranktable.py +120 -0
- vllm_ascend-0.9.1rc2/examples/disaggregate_prefill_v1/gen_ranktable.sh +79 -0
- vllm_ascend-0.9.1rc2/examples/disaggregate_prefill_v1/load_balance_proxy_server_example.py +435 -0
- vllm_ascend-0.9.1rc2/examples/disaggregate_prefill_v1/run_server.sh +32 -0
- vllm_ascend-0.9.1rc2/examples/dp_offline/data_parallel.py +226 -0
- vllm_ascend-0.9.1rc2/examples/dp_offline/run_dp.sh +28 -0
- vllm_ascend-0.9.1rc2/examples/eplb_generate_map.py +77 -0
- vllm_ascend-0.9.1rc2/examples/external_online_dp/launch_dp_program.py +34 -0
- vllm_ascend-0.9.1rc2/examples/external_online_dp/run_dp_template.sh +51 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/examples/offline_dualbatch_overlap_npu.py +1 -1
- vllm_ascend-0.9.1rc2/examples/run_dp_server.sh +33 -0
- vllm_ascend-0.9.1rc1/examples/run_dp_attention_etp16.sh → vllm_ascend-0.9.1rc2/examples/run_dp_with_cached_graph_etp16.sh +18 -16
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/format.sh +1 -1
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/pyproject.toml +3 -1
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/requirements-dev.txt +1 -2
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/requirements-lint.txt +1 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/requirements.txt +6 -3
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/tests/e2e/common.sh +1 -24
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/tests/e2e/doctests/001-quickstart-test.sh +2 -0
- vllm_ascend-0.9.1rc2/tests/e2e/pd_disaggreate/run_edge_case_test.sh +141 -0
- vllm_ascend-0.9.1rc2/tests/e2e/pd_disaggreate/test_edge_cases.py +81 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/tests/e2e/run_doctests.sh +0 -6
- vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v0/e2e/test_eagle_correctness.py +344 -0
- {vllm_ascend-0.9.1rc1/tests/e2e/long_term/spec_decode → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v0}/e2e/test_medusa_correctness.py +2 -2
- {vllm_ascend-0.9.1rc1/tests/e2e/long_term/spec_decode → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v0}/e2e/test_mlp_correctness.py +2 -2
- {vllm_ascend-0.9.1rc1/tests/e2e/long_term/spec_decode → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v0}/e2e/test_ngram_correctness.py +2 -2
- {vllm_ascend-0.9.1rc1/tests/e2e/long_term/spec_decode → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v0}/test_dynamic_spec_decode.py +2 -2
- {vllm_ascend-0.9.1rc1/tests/e2e/long_term/spec_decode → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v0}/test_multi_step_worker.py +1 -1
- {vllm_ascend-0.9.1rc1/tests/e2e/long_term/spec_decode → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v0}/test_ngram_worker.py +1 -1
- {vllm_ascend-0.9.1rc1/tests/e2e/long_term/spec_decode → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v0}/test_spec_decode_worker.py +4 -4
- {vllm_ascend-0.9.1rc1/tests/e2e/long_term/spec_decode/e2e → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v1}/test_v1_mtp_correctness.py +66 -1
- {vllm_ascend-0.9.1rc1/tests/e2e/long_term/spec_decode/e2e → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v1}/test_v1_spec_decode.py +6 -12
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/tests/multicard/test_data_parallel.py +1 -1
- {vllm_ascend-0.9.1rc1/tests/e2e → vllm_ascend-0.9.1rc2/tests}/multicard/test_ilama_lora_tp2.py +2 -2
- vllm_ascend-0.9.1rc2/tests/multicard/test_model_qwen3_w4a8.py +65 -0
- vllm_ascend-0.9.1rc2/tests/multicard/test_multimodal_context_parallel.py +82 -0
- vllm_ascend-0.9.1rc2/tests/multicard/test_offline_inference_distributed.py +230 -0
- {vllm_ascend-0.9.1rc1/tests/e2e → vllm_ascend-0.9.1rc2/tests}/multicard/test_torchair_graph_mode.py +8 -5
- vllm_ascend-0.9.1rc2/tests/multicard/test_w4a8_deepseek.py +67 -0
- {vllm_ascend-0.9.1rc1/tests/e2e → vllm_ascend-0.9.1rc2/tests}/singlecard/core/test_ascend_scheduler.py +26 -50
- vllm_ascend-0.9.1rc2/tests/singlecard/ops/test_fused_moe.py +196 -0
- {vllm_ascend-0.9.1rc1/tests/e2e → vllm_ascend-0.9.1rc2/tests}/singlecard/ops/test_rotary_embedding.py +67 -0
- {vllm_ascend-0.9.1rc1/tests/e2e → vllm_ascend-0.9.1rc2/tests}/singlecard/sample/test_rejection_sampler.py +20 -11
- {vllm_ascend-0.9.1rc1/tests/e2e → vllm_ascend-0.9.1rc2/tests}/singlecard/test_aclgraph.py +25 -2
- vllm_ascend-0.9.1rc2/tests/singlecard/test_ascend_config.py +233 -0
- {vllm_ascend-0.9.1rc1/tests/e2e → vllm_ascend-0.9.1rc2/tests}/singlecard/test_offline_inference.py +4 -0
- {vllm_ascend-0.9.1rc1/tests/e2e → vllm_ascend-0.9.1rc2/tests}/singlecard/test_scheduler.py +4 -20
- vllm_ascend-0.9.1rc2/tests/ut/kv_connector/test_llmdatadist_connector.py +42 -0
- vllm_ascend-0.9.1rc2/tests/ut/kv_connector/test_remote_decode_lifecycle.py +123 -0
- vllm_ascend-0.9.1rc2/tests/ut/kv_connector/test_remote_prefill_lifecycle.py +242 -0
- vllm_ascend-0.9.1rc2/tests/ut/kv_connector/utils.py +194 -0
- vllm_ascend-0.9.1rc2/tests/ut/patch/worker/patch_common/test_patch_sampler.py +44 -0
- vllm_ascend-0.9.1rc2/tests/ut/test_distributed_tensor_parallel.py +139 -0
- vllm_ascend-0.9.1rc2/tests/ut/test_token_dispatcher.py +69 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/__init__.py +7 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/_version.py +2 -2
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/ascend_config.py +89 -15
- vllm_ascend-0.9.1rc2/vllm_ascend/ascend_forward_context.py +137 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/attention/attention.py +4 -40
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/attention/attention_v1.py +121 -82
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/attention/mla_v1.py +291 -215
- vllm_ascend-0.9.1rc2/vllm_ascend/attention/utils.py +23 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/compilation/piecewise_backend.py +57 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/core/scheduler.py +50 -21
- vllm_ascend-0.9.1rc2/vllm_ascend/cpu_binding.py +329 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/device_allocator/camem.py +1 -1
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/distributed/__init__.py +5 -0
- vllm_ascend-0.9.1rc2/vllm_ascend/distributed/context_parallel_utils.py +110 -0
- vllm_ascend-0.9.1rc2/vllm_ascend/distributed/llmdatadist_c_mgr_connector.py +923 -0
- vllm_ascend-0.9.1rc2/vllm_ascend/distributed/parallel_state.py +49 -0
- vllm_ascend-0.9.1rc2/vllm_ascend/distributed/tensor_parallel.py +248 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/envs.py +56 -3
- vllm_ascend-0.9.1rc2/vllm_ascend/eplb/adaptor/abstract_adaptor.py +44 -0
- vllm_ascend-0.9.1rc2/vllm_ascend/eplb/adaptor/vllm_adaptor.py +212 -0
- vllm_ascend-0.9.1rc2/vllm_ascend/eplb/core/eplb_device_transfer_loader.py +136 -0
- vllm_ascend-0.9.1rc2/vllm_ascend/eplb/core/eplb_utils.py +75 -0
- vllm_ascend-0.9.1rc2/vllm_ascend/eplb/core/eplb_worker.py +442 -0
- vllm_ascend-0.9.1rc2/vllm_ascend/eplb/core/policy/policy_abstract.py +41 -0
- vllm_ascend-0.9.1rc2/vllm_ascend/eplb/core/policy/policy_dynamic_ep.py +388 -0
- vllm_ascend-0.9.1rc2/vllm_ascend/eplb/core/policy/policy_dynamic_ep_v2.py +770 -0
- vllm_ascend-0.9.1rc2/vllm_ascend/eplb/core/policy/policy_factory.py +25 -0
- vllm_ascend-0.9.1rc2/vllm_ascend/eplb/core/policy/policy_random.py +29 -0
- vllm_ascend-0.9.1rc2/vllm_ascend/eplb/eplb_updator.py +222 -0
- vllm_ascend-0.9.1rc2/vllm_ascend/lora/__init__.py +0 -0
- vllm_ascend-0.9.1rc2/vllm_ascend/lora/punica_wrapper/__init__.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/models/__init__.py +21 -8
- vllm_ascend-0.9.1rc2/vllm_ascend/models/deepseek_dbo.py +1085 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/models/deepseek_mtp.py +26 -6
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/models/deepseek_v2.py +194 -52
- vllm_ascend-0.9.1rc2/vllm_ascend/models/qwen2.py +372 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/models/qwen2_5_vl.py +146 -5
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/models/qwen2_5_vl_without_padding.py +98 -0
- vllm_ascend-0.9.1rc2/vllm_ascend/models/qwen3.py +472 -0
- vllm_ascend-0.9.1rc2/vllm_ascend/models/qwen3_dbo.py +552 -0
- vllm_ascend-0.9.1rc2/vllm_ascend/models/qwen3_moe.py +268 -0
- vllm_ascend-0.9.1rc2/vllm_ascend/multistream/__init__.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/multistream/base.py +2 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/multistream/metadata.py +2 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/multistream/ms_split.py +136 -12
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/ops/activation.py +1 -6
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/ops/attention.py +19 -15
- vllm_ascend-0.9.1rc2/vllm_ascend/ops/comm_utils.py +127 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/ops/common_fused_moe.py +24 -17
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/ops/fused_moe.py +531 -289
- vllm_ascend-0.9.1rc2/vllm_ascend/ops/layernorm.py +77 -0
- vllm_ascend-0.9.1rc2/vllm_ascend/ops/moe_dispatcher/__init__.py +0 -0
- vllm_ascend-0.9.1rc2/vllm_ascend/ops/moe_dispatcher/token_dispatcher.py +578 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/ops/rotary_embedding.py +65 -39
- vllm_ascend-0.9.1rc2/vllm_ascend/ops/sequence_parallel.py +119 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/ops/vocab_parallel_embedding.py +14 -7
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/patch/__init__.py +24 -15
- vllm_ascend-0.9.1rc2/vllm_ascend/patch/platform/patch_0_9_1/__init__.py +25 -0
- vllm_ascend-0.9.1rc2/vllm_ascend/patch/platform/patch_0_9_1/patch_cache_manager.py +13 -0
- vllm_ascend-0.9.1rc2/vllm_ascend/patch/platform/patch_0_9_1/patch_configs.py +77 -0
- vllm_ascend-0.9.1rc2/vllm_ascend/patch/platform/patch_0_9_1/patch_core.py +132 -0
- vllm_ascend-0.9.1rc2/vllm_ascend/patch/platform/patch_0_9_1/patch_core_client.py +26 -0
- vllm_ascend-0.9.1rc2/vllm_ascend/patch/platform/patch_0_9_1/patch_decorator.py +154 -0
- vllm_ascend-0.9.1rc2/vllm_ascend/patch/platform/patch_common/patch_distributed.py +83 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py +0 -16
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/patch/worker/patch_common/patch_sampler.py +8 -3
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/patch/worker/patch_common/patch_spec_decode_worker.py +7 -5
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/platform.py +16 -24
- vllm_ascend-0.9.1rc2/vllm_ascend/quantization/__init__.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/quantization/func_wrapper.py +32 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/quantization/quant_config.py +40 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/quantization/quantizer.py +21 -9
- vllm_ascend-0.9.1rc2/vllm_ascend/quantization/w4a8_dynamic.py +393 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/quantization/w8a8.py +20 -6
- vllm_ascend-0.9.1rc2/vllm_ascend/quantization/w8a8_dynamic.py +1055 -0
- vllm_ascend-0.9.1rc2/vllm_ascend/sample/__init__.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/sample/rejection_sampler.py +101 -50
- vllm_ascend-0.9.1rc2/vllm_ascend/soc_info.py +14 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/utils.py +170 -123
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/worker/draft_model_runner.py +10 -9
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/worker/model_runner.py +7 -3
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/worker/model_runner_v1.py +589 -472
- vllm_ascend-0.9.1rc2/vllm_ascend/worker/mtp_proposer_v1.py +437 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/worker/pooling_model_runner.py +3 -3
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/worker/worker.py +9 -18
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/worker/worker_v1.py +51 -22
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend.egg-info/PKG-INFO +3 -3
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend.egg-info/SOURCES.txt +133 -73
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend.egg-info/requires.txt +4 -1
- vllm_ascend-0.9.1rc1/.github/workflows/doc_codespell.yaml +0 -33
- vllm_ascend-0.9.1rc1/.github/workflows/image_310p_openeuler.yml +0 -114
- vllm_ascend-0.9.1rc1/.github/workflows/image_310p_ubuntu.yml +0 -110
- vllm_ascend-0.9.1rc1/.github/workflows/vllm_ascend_test.yaml +0 -379
- vllm_ascend-0.9.1rc1/Dockerfile.310p +0 -61
- vllm_ascend-0.9.1rc1/Dockerfile.310p.openEuler +0 -58
- vllm_ascend-0.9.1rc1/codecov.yml +0 -30
- vllm_ascend-0.9.1rc1/docs/source/tutorials/multi_node.md +0 -195
- vllm_ascend-0.9.1rc1/docs/source/tutorials/multi_npu_moge.md +0 -117
- vllm_ascend-0.9.1rc1/docs/source/tutorials/single_node_300i.md +0 -304
- vllm_ascend-0.9.1rc1/docs/source/user_guide/release.template.md +0 -13
- vllm_ascend-0.9.1rc1/docs/source/user_guide/suppoted_features.md +0 -49
- vllm_ascend-0.9.1rc1/examples/dp_offline/data_parallel.py +0 -85
- vllm_ascend-0.9.1rc1/examples/dp_offline/run_dp.sh +0 -19
- vllm_ascend-0.9.1rc1/examples/run_dp_server.sh +0 -30
- vllm_ascend-0.9.1rc1/tests/e2e/doctests/002-pip-binary-installation-test.sh +0 -42
- vllm_ascend-0.9.1rc1/tests/e2e/multicard/test_offline_inference_distributed.py +0 -114
- vllm_ascend-0.9.1rc1/tests/e2e/singlecard/ops/test_fused_moe.py +0 -100
- vllm_ascend-0.9.1rc1/tests/ut/fake_weight/config.json +0 -28
- vllm_ascend-0.9.1rc1/tests/ut/test_ascend_config.py +0 -244
- vllm_ascend-0.9.1rc1/tests/ut/worker/test_worker_v1.py +0 -1
- vllm_ascend-0.9.1rc1/vllm_ascend/distributed/parallel_state.py +0 -77
- vllm_ascend-0.9.1rc1/vllm_ascend/models/deepseek_dbo.py +0 -977
- vllm_ascend-0.9.1rc1/vllm_ascend/models/pangu_moe.py +0 -639
- vllm_ascend-0.9.1rc1/vllm_ascend/models/qwen3_moe.py +0 -35
- vllm_ascend-0.9.1rc1/vllm_ascend/ops/layernorm.py +0 -49
- vllm_ascend-0.9.1rc1/vllm_ascend/patch/platform/patch_common/patch_distributed.py +0 -153
- vllm_ascend-0.9.1rc1/vllm_ascend/patch/worker/patch_0_9_1/__init__.py +0 -16
- vllm_ascend-0.9.1rc1/vllm_ascend/quantization/w8a8_dynamic.py +0 -723
- vllm_ascend-0.9.1rc1/vllm_ascend/worker/eagle_proposer_v1.py +0 -429
- vllm_ascend-0.9.1rc1/vllm_ascend/worker/mtp_proposer_v1.py +0 -225
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/ISSUE_TEMPLATE/100-documentation.yml +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/ISSUE_TEMPLATE/110-user-story.yml +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/ISSUE_TEMPLATE/200-installation.yml +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/ISSUE_TEMPLATE/300-usage.yml +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/ISSUE_TEMPLATE/400-bug-report.yml +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/ISSUE_TEMPLATE/500-feature-request.yml +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/ISSUE_TEMPLATE/600-new-model.yml +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/ISSUE_TEMPLATE/700-performance-discussion.yml +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/ISSUE_TEMPLATE/750-RFC.yml +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/ISSUE_TEMPLATE/800-others.yml +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/actionlint.yaml +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/labeler.yml +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/workflows/accuracy_report.yaml +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/workflows/label_merge_conflict.yml +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/workflows/labeler.yml +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/workflows/matchers/actionlint.json +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/workflows/matchers/mypy.json +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/workflows/matchers/ruff.json +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/workflows/shellcheck.yml +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.gitignore +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.readthedocs.yaml +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/CMakeLists.txt +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/CODE_OF_CONDUCT.md +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/DCO +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/LICENSE +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/benchmarks/README.md +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/benchmarks/ops/ben_vocabparallelembedding.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/benchmarks/requirements-bench.txt +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/benchmarks/scripts/convert_json_to_markdown.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/benchmarks/scripts/patch_benchmark_dataset.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/benchmarks/scripts/perf_result_template.md +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/benchmarks/scripts/run-performance-benchmarks.sh +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/benchmarks/scripts/run_accuracy.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/benchmarks/tests/latency-tests.json +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/benchmarks/tests/serving-tests.json +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/benchmarks/tests/throughput-tests.json +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/cmake/utils.cmake +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/collect_env.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/csrc/camem_allocator.cpp +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/csrc/kernels/advance_step.cpp +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/csrc/kernels/types.h +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/csrc/ops.h +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/csrc/torch_binding.cpp +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/csrc/utils.h +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/docs/Makefile +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/docs/README.md +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/docs/requirements-docs.txt +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/docs/source/_templates/sections/header.html +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/docs/source/community/user_stories/index.md +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/docs/source/logos/vllm-ascend-logo-text-dark.png +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/docs/source/logos/vllm-ascend-logo-text-light.png +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/docs/source/tutorials/multi_npu.md +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/examples/disaggregated_prefill/disaggregated_prefill_offline.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/examples/disaggregated_prefill/dp_proxy.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/examples/disaggregated_prefill/find_device_ips.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/examples/disaggregated_prefill/p2p_disaggrefated_prefill_proxy.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/examples/disaggregated_prefill/run_decode_server.sh +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/examples/disaggregated_prefill/run_prefill_server.sh +0 -0
- /vllm_ascend-0.9.1rc1/tests/__init__.py → /vllm_ascend-0.9.1rc2/examples/external_online_dp/README.md +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/examples/offline_disaggregated_prefill_npu.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/examples/offline_distributed_inference_npu.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/examples/offline_inference_audio_language.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/examples/offline_inference_npu.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/examples/offline_inference_npu_v1.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/examples/offline_multi_step_custom_ops.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/examples/prompt_embedding_inference.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/examples/run_dp_attention_etp16_benmark.sh +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/mypy.ini +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/packages.txt +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/pytest.ini +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/setup.cfg +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/setup.py +0 -0
- {vllm_ascend-0.9.1rc1/tests/e2e/long_term/spec_decode/e2e → vllm_ascend-0.9.1rc2/tests}/__init__.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/tests/conftest.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/tests/e2e/pd_disaggreate/setup_pd.sh +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/tests/e2e/pd_disaggreate/test_pd_e2e.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/tests/e2e/run_disagg_pd.sh +0 -0
- {vllm_ascend-0.9.1rc1/tests/e2e/long_term/spec_decode → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v0}/__init__.py +0 -0
- {vllm_ascend-0.9.1rc1/tests/e2e/long_term/spec_decode → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v0}/conftest.py +0 -0
- {vllm_ascend-0.9.1rc1/tests/e2e/singlecard → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v0/e2e}/__init__.py +0 -0
- {vllm_ascend-0.9.1rc1/tests/e2e/long_term/spec_decode → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v0}/e2e/conftest.py +0 -0
- {vllm_ascend-0.9.1rc1/tests/e2e/long_term/spec_decode → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v0}/e2e/test_mtp_correctness.py +0 -0
- {vllm_ascend-0.9.1rc1/tests/e2e/long_term/spec_decode → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v0}/test_utils.py +0 -0
- {vllm_ascend-0.9.1rc1/tests/e2e/long_term/spec_decode → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v0}/utils.py +0 -0
- {vllm_ascend-0.9.1rc1/tests/e2e → vllm_ascend-0.9.1rc2/tests}/long_term/test_accuracy.py +0 -0
- {vllm_ascend-0.9.1rc1/tests/e2e → vllm_ascend-0.9.1rc2/tests}/long_term/test_deepseek_v2_lite_tp2_accuracy.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/tests/model_utils.py +0 -0
- {vllm_ascend-0.9.1rc1/tests/e2e → vllm_ascend-0.9.1rc2/tests}/multicard/test_dynamic_npugraph_batchsize.py +0 -0
- {vllm_ascend-0.9.1rc1/tests/e2e → vllm_ascend-0.9.1rc2/tests}/multicard/test_pyhccl_distributed.py +0 -0
- {vllm_ascend-0.9.1rc1/tests/e2e/singlecard → vllm_ascend-0.9.1rc2/tests}/ops/test_vocabparallelembedding.py +0 -0
- {vllm_ascend-0.9.1rc1/tests/e2e/singlecard/compile → vllm_ascend-0.9.1rc2/tests/singlecard}/__init__.py +0 -0
- {vllm_ascend-0.9.1rc1/tests/e2e/singlecard/core → vllm_ascend-0.9.1rc2/tests/singlecard/compile}/__init__.py +0 -0
- {vllm_ascend-0.9.1rc1/tests/e2e → vllm_ascend-0.9.1rc2/tests}/singlecard/compile/test_simple.py +0 -0
- {vllm_ascend-0.9.1rc1/tests/e2e/singlecard/ops → vllm_ascend-0.9.1rc2/tests/singlecard/core}/__init__.py +0 -0
- {vllm_ascend-0.9.1rc1/tests/e2e → vllm_ascend-0.9.1rc2/tests}/singlecard/core/test_ascend_scheduler_e2e.py +0 -0
- {vllm_ascend-0.9.1rc1/tests/e2e/singlecard/sample → vllm_ascend-0.9.1rc2/tests/singlecard/ops}/__init__.py +0 -0
- {vllm_ascend-0.9.1rc1/tests/e2e → vllm_ascend-0.9.1rc2/tests}/singlecard/ops/test_multi_step.py +0 -0
- {vllm_ascend-0.9.1rc1/vllm_ascend/attention → vllm_ascend-0.9.1rc2/tests/singlecard/sample}/__init__.py +0 -0
- {vllm_ascend-0.9.1rc1/tests/e2e → vllm_ascend-0.9.1rc2/tests}/singlecard/test_camem.py +0 -0
- {vllm_ascend-0.9.1rc1/tests/e2e → vllm_ascend-0.9.1rc2/tests}/singlecard/test_chunked.py +0 -0
- {vllm_ascend-0.9.1rc1/tests/e2e → vllm_ascend-0.9.1rc2/tests}/singlecard/test_guided_decoding.py +0 -0
- {vllm_ascend-0.9.1rc1/tests/e2e → vllm_ascend-0.9.1rc2/tests}/singlecard/test_ilama_lora.py +0 -0
- {vllm_ascend-0.9.1rc1/tests/e2e → vllm_ascend-0.9.1rc2/tests}/singlecard/test_profile_execute_duration.py +0 -0
- {vllm_ascend-0.9.1rc1/tests/e2e → vllm_ascend-0.9.1rc2/tests}/singlecard/test_prompt_embedding.py +0 -0
- {vllm_ascend-0.9.1rc1/tests/e2e → vllm_ascend-0.9.1rc2/tests}/singlecard/test_pyhccl.py +0 -0
- {vllm_ascend-0.9.1rc1/tests/e2e → vllm_ascend-0.9.1rc2/tests}/singlecard/test_sampler.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/tests/ut/ops/test_expert_load_balancer.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/tests/utils.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/tools/actionlint.sh +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/tools/check_repo.sh +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/tools/mypy.sh +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/tools/png-lint.sh +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/tools/shellcheck.sh +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/tools/sphinx-lint.sh +0 -0
- {vllm_ascend-0.9.1rc1/vllm_ascend/compilation → vllm_ascend-0.9.1rc2/vllm_ascend/attention}/__init__.py +0 -0
- {vllm_ascend-0.9.1rc1/vllm_ascend/core → vllm_ascend-0.9.1rc2/vllm_ascend/compilation}/__init__.py +0 -0
- {vllm_ascend-0.9.1rc1/vllm_ascend/device_allocator → vllm_ascend-0.9.1rc2/vllm_ascend/core}/__init__.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/core/schedule_config.py +0 -0
- {vllm_ascend-0.9.1rc1/vllm_ascend/distributed/device_communicators → vllm_ascend-0.9.1rc2/vllm_ascend/device_allocator}/__init__.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/distributed/communicator.py +0 -0
- {vllm_ascend-0.9.1rc1/vllm_ascend/distributed/kv_transfer → vllm_ascend-0.9.1rc2/vllm_ascend/distributed/device_communicators}/__init__.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/distributed/device_communicators/pyhccl.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/distributed/device_communicators/pyhccl_wrapper.py +0 -0
- {vllm_ascend-0.9.1rc1/vllm_ascend/lora → vllm_ascend-0.9.1rc2/vllm_ascend/distributed/kv_transfer}/__init__.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/distributed/kv_transfer/simple_buffer.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/distributed/kv_transfer/simple_connector.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/distributed/kv_transfer/simple_pipe.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/distributed/kv_transfer/utils.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/distributed/llmdatadist_connector.py +0 -0
- {vllm_ascend-0.9.1rc1/vllm_ascend/lora/punica_wrapper → vllm_ascend-0.9.1rc2/vllm_ascend/eplb}/__init__.py +0 -0
- {vllm_ascend-0.9.1rc1/vllm_ascend/multistream → vllm_ascend-0.9.1rc2/vllm_ascend/eplb/adaptor}/__init__.py +0 -0
- {vllm_ascend-0.9.1rc1/vllm_ascend/quantization → vllm_ascend-0.9.1rc2/vllm_ascend/eplb/core}/__init__.py +0 -0
- {vllm_ascend-0.9.1rc1/vllm_ascend/sample → vllm_ascend-0.9.1rc2/vllm_ascend/eplb/core/policy}/__init__.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/lora/punica_wrapper/punica_npu.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/models/qwen2_vl.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/multistream/context.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/multistream/decorator.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/multistream/layers.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/ops/__init__.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/ops/cache.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/ops/expert_load_balancer.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/patch/platform/__init__.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/patch/platform/patch_common/__init__.py +0 -0
- {vllm_ascend-0.9.1rc1/vllm_ascend/patch/platform/patch_0_9_1 → vllm_ascend-0.9.1rc2/vllm_ascend/patch/platform/patch_main}/__init__.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/patch/worker/__init__.py +0 -0
- {vllm_ascend-0.9.1rc1/vllm_ascend/patch/platform/patch_main → vllm_ascend-0.9.1rc2/vllm_ascend/patch/worker/patch_0_9_1}/__init__.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/patch/worker/patch_common/__init__.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/patch/worker/patch_common/patch_distributed.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/patch/worker/patch_common/patch_eagle.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/patch/worker/patch_common/patch_minicpm.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/patch/worker/patch_common/patch_utils.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/patch/worker/patch_main/__init__.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/worker/__init__.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/worker/cache_engine.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/worker/multi_step_runner.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend/worker/multi_step_worker.py +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend.egg-info/dependency_links.txt +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend.egg-info/entry_points.txt +0 -0
- {vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/vllm_ascend.egg-info/top_level.txt +0 -0
{vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/Dockerfile.buildwheel
RENAMED

@@ -14,18 +14,17 @@
 # limitations under the License.
 # This file is a part of the vllm-ascend project.
 #
-ARG PY_VERSION=3.
-FROM quay.io/ascend/
+ARG PY_VERSION=3.11
+FROM quay.io/ascend/manylinux:8.0.0-910b-manylinux_2_28-py${PY_VERSION}

 ARG COMPILE_CUSTOM_KERNELS=1

 # Define environments
 ENV DEBIAN_FRONTEND=noninteractive
 ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
-RUN
-
-rm -rf /var/cache/
-rm -rf /var/lib/apt/lists/*
+RUN yum update -y && \
+yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
+rm -rf /var/cache/yum

 WORKDIR /workspace

@@ -41,8 +40,6 @@ RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
 cd vllm-ascend && \
 python3 setup.py bdist_wheel && \
-ls -l dist && \
-for f in dist/*.whl; do mv "$f" "$(echo "$f" | sed -e 's/-linux_x86_64\.whl$/-manylinux1_x86_64.whl/' -e 's/-linux_aarch64\.whl$/-manylinux2014_aarch64.whl/')"; done && \
 ls -l dist

 CMD ["/bin/bash"]
{vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/workflows/accuracy_test.yaml
RENAMED

@@ -117,7 +117,7 @@ jobs:
 fail-fast: false
 name: ${{ matrix.model_name }} accuracy V${{ matrix.vllm_use_version }}
 container:
-image:
+image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
 env:
 HF_ENDPOINT: https://hf-mirror.com
 HF_TOKEN: ${{ secrets.HF_TOKEN }}

@@ -173,8 +173,6 @@ jobs:

 - name: Install vllm-project/vllm-ascend
 working-directory: ./vllm-ascend
-env:
-PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
 run: |
 pip install -r requirements-dev.txt
 pip install -e .
{vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/workflows/image_openeuler.yml
RENAMED

@@ -1,4 +1,4 @@
-name: 'image
+name: 'image'
 # This is a docker build check and publish job:
 # 1. PR Triggered docker image build check
 # - is for image build check

@@ -39,13 +39,9 @@ on:

 jobs:
 build:
-name: vllm-ascend image
-runs-on:
-
-github.event_name == 'push' && github.repository_owner == 'vllm-project' &&
-'ubuntu-latest' ||
-'ubuntu-24.04-arm'
-}}
+name: vllm-ascend openEuler image
+runs-on: ubuntu-latest
+
 steps:
 - uses: actions/checkout@v4

@@ -94,15 +90,10 @@ jobs:
 username: ${{ vars.QUAY_USERNAME }}
 password: ${{ secrets.QUAY_PASSWORD }}

-- name: Build and push
+- name: Build and push
 uses: docker/build-push-action@v6
 with:
-platforms:
-${{
-github.event_name == 'push' && github.repository_owner == 'vllm-project' &&
-'linux/amd64,linux/arm64' ||
-'linux/arm64'
-}}
+platforms: linux/amd64,linux/arm64
 # use the current repo path as the build context, ensure .git is contained
 context: .
 # only trigger when tag, branch/main push
{vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/workflows/image_ubuntu.yml
RENAMED

@@ -1,4 +1,4 @@
-name: 'image
+name: 'image'
 # This is a docker build check and publish job:
 # 1. PR Triggered docker image build check
 # - is for image build check

@@ -39,7 +39,7 @@ on:
 jobs:

 build:
-name: vllm-ascend image
+name: vllm-ascend Ubuntu image
 runs-on: ubuntu-latest

 steps:

@@ -90,18 +90,12 @@ jobs:
 username: ${{ vars.QUAY_USERNAME }}
 password: ${{ secrets.QUAY_PASSWORD }}

-- name: Build and push
+- name: Build and push
 uses: docker/build-push-action@v6
 with:
-platforms:
-${{
-github.event_name == 'push' && github.repository_owner == 'vllm-project' &&
-'linux/amd64,linux/arm64' ||
-'linux/amd64'
-}}
+platforms: linux/amd64,linux/arm64
 # use the current repo path as the build context, ensure .git is contained
 context: .
-file: Dockerfile
 # only trigger when tag, branch/main push
 push: ${{ github.event_name == 'push' && github.repository_owner == 'vllm-project' }}
 labels: ${{ steps.meta.outputs.labels }}
{vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/workflows/nightly_benchmarks.yaml
RENAMED

@@ -18,11 +18,7 @@
 name: 'Benchmarks / Performance'
 # This workflow runs nightly benchmarks for vllm-ascend.

-on:
-schedule:
-# Run at 02:00 everyday
-- cron: '00 18 * * *'
-
+on:
 workflow_dispatch:
 # Allow manual triggering of the workflow

@@ -51,11 +47,14 @@ jobs:
 matrix:
 include:
 - vllm_branch: v0.9.1
+vllm_ascend_branch: main
+vllm_use_v1: 0
+- vllm_branch: v0.9.0
 vllm_ascend_branch: main
 vllm_use_v1: 1
 max-parallel: 1
 container:
-image:
+image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
 volumes:
 - /usr/local/dcmi:/usr/local/dcmi
 - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi

@@ -112,8 +111,6 @@ jobs:
 VLLM_TARGET_DEVICE=empty pip install -e .

 - name: Install vllm-project/vllm-ascend
-env:
-PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
 run: |
 pip install -e .
 pip install -r benchmarks/requirements-bench.txt

@@ -164,8 +161,6 @@ jobs:
 cp -r benchmarks/* /github/home/benchmarks/

 - name: Run benchmark iteration
-env:
-PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
 if: github.event_name != 'pull_request'
 run: |
 while IFS= read -r line || [[ -n "$line" ]]; do
{vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/workflows/release_whl.yml
RENAMED

@@ -71,16 +71,11 @@ jobs:
 --build-arg PY_VERSION=${{ matrix.python-version }} \
 -t wheel:v1 .
 docker run --rm \
+-u $(id -u):$(id -g) \
 -v $(pwd):/outpwd \
 wheel:v1 \
 bash -c "cp -r /workspace/vllm-ascend/dist /outpwd"
 ls dist
-
-- name: Archive wheel
-uses: actions/upload-artifact@v4
-with:
-name: vllm-ascend-${{ matrix.os }}-py${{ matrix.python-version }}-wheel
-path: dist/*

 - name: Set up Python ${{ matrix.python-version }}
 if: startsWith(github.ref, 'refs/tags/')

@@ -88,6 +83,40 @@ jobs:
 with:
 python-version: ${{ matrix.python-version }}

+- name: Repair wheels with auditwheel
+run: |
+python3 -m pip install auditwheel
+python3 -m pip install patchelf
+mkdir -p dist/repaired
+for whl in dist/*.whl; do
+auditwheel repair "$whl" -w dist/repaired/ \
+--exclude libplatform.so \
+--exclude libregister.so \
+--exclude libge_common_base.so \
+--exclude libc10.so \
+--exclude libc_sec.so \
+--exclude "libascend*.so" \
+--exclude "libtorch*.so"
+done
+rm -f dist/*.whl
+mv dist/repaired/*.whl dist/
+rmdir dist/repaired
+ls dist
+
+- name: Verify automatic platform tags
+run: |
+cd dist
+for wheel in *.whl; do
+echo "verification file: $wheel"
+auditwheel show "$wheel"
+done
+
+- name: Archive wheel
+uses: actions/upload-artifact@v4
+with:
+name: vllm-ascend-${{ matrix.os }}-py${{ matrix.python-version }}-wheel
+path: dist/*
+
 - name: Release
 if: startsWith(github.ref, 'refs/tags/')
 run: |
{vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/workflows/vllm_ascend_doctest.yaml
RENAMED

@@ -29,9 +29,6 @@ on:
 - 'tests/e2e/doctests/**'
 - 'tests/e2e/common.sh'
 - 'tests/e2e/run_doctests.sh'
-schedule:
-# Runs every 4 hours
-- cron: '0 */4 * * *'

 # Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
 # declared as "shell: bash -el {0}" on steps that need to be properly activated.

@@ -46,7 +43,7 @@ jobs:
 # Each version should be tested
 fail-fast: false
 matrix:
-vllm_verison: [v0.
+vllm_verison: [main, v0.7.3-dev, main-openeuler, v0.7.3-dev-openeuler]
 name: vLLM Ascend test
 runs-on: linux-arm64-npu-1
 container:

@@ -70,13 +67,13 @@ jobs:
 run: |
 sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
 apt-get update -y
-apt install
+apt install git curl -y

 - name: Config OS mirrors - openEuler
 if: ${{ endsWith(matrix.vllm_verison, '-openeuler') }}
 run: |
 yum update -y
-yum install
+yum install git curl -y

 - name: Config pip mirrors
 run: |

@@ -88,13 +85,11 @@ jobs:
 - name: Run vllm-ascend/tests/e2e/run_doctests.sh
 run: |
 # PWD: /__w/vllm-ascend/vllm-ascend
-
-
-
-
-
-# TODO(yikun): Remove this after conf.py merged
-cp docs/source/conf.py /vllm-workspace/vllm-ascend/docs/source/
+# Address old branch like v0.7.3:
+if [ ! -d /vllm-workspace/vllm-ascend/tests/e2e ]; then
+echo "Warning: the doctest path doesn't exists, copy now"
+cp -r tests/e2e /vllm-workspace/vllm-ascend/tests/
+fi

 # Simulate container to enter directory
 cd /workspace
@@ -0,0 +1,242 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
# This file is a part of the vllm-ascend project.
|
|
16
|
+
#
|
|
17
|
+
|
|
18
|
+
name: 'test'
|
|
19
|
+
|
|
20
|
+
on:
|
|
21
|
+
pull_request:
|
|
22
|
+
branches:
|
|
23
|
+
- 'main'
|
|
24
|
+
- '*-dev'
|
|
25
|
+
paths:
|
|
26
|
+
- '*.txt'
|
|
27
|
+
- '**/*.py'
|
|
28
|
+
- '.github/workflows/vllm_ascend_test.yaml'
|
|
29
|
+
- '!docs/**'
|
|
30
|
+
- 'pytest.ini'
|
|
31
|
+
- '!benchmarks/**'
|
|
32
|
+
- 'tools/mypy.sh'
|
|
33
|
+
- 'mypy.ini'
|
|
34
|
+
- '.github/workflows/*.ya?ml'
|
|
35
|
+
- '.github/workflows/actionlint.*'
|
|
36
|
+
- '.github/workflows/matchers/actionlint.json'
|
|
37
|
+
|
|
38
|
+
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
|
|
39
|
+
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
|
|
40
|
+
# It's used to activate ascend-toolkit environment variables.
|
|
41
|
+
defaults:
|
|
42
|
+
run:
|
|
43
|
+
shell: bash -el {0}
|
|
44
|
+
|
|
45
|
+
jobs:
|
|
46
|
+
lint:
|
|
47
|
+
runs-on: ubuntu-latest
|
|
48
|
+
strategy:
|
|
49
|
+
matrix:
|
|
50
|
+
python-version: ["3.11"]
|
|
51
|
+
vllm_version: [v0.9.1]
|
|
52
|
+
steps:
|
|
53
|
+
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
|
54
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
55
|
+
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
|
|
56
|
+
with:
|
|
57
|
+
python-version: ${{ matrix.python-version }}
|
|
58
|
+
- name: Install dependencies
|
|
59
|
+
run: |
|
|
60
|
+
python -m pip install --upgrade pip
|
|
61
|
+
pip install -r requirements-lint.txt
|
|
62
|
+
- name: Run codespell check
|
|
63
|
+
run: |
|
|
64
|
+
CODESPELL_EXCLUDES=('--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**,./vllm_ascend.egg-info/**')
|
|
65
|
+
CODESPELL_IGNORE_WORDS=('-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn')
|
|
66
|
+
|
|
67
|
+
codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}" "${CODESPELL_IGNORE_WORDS[@]}"
|
|
68
|
+
- name: Analysing the code with ruff
|
|
69
|
+
run: |
|
|
70
|
+
echo "::add-matcher::.github/workflows/matchers/ruff.json"
|
|
71
|
+
ruff check --output-format github .
|
|
72
|
+
- name: Run isort
|
|
73
|
+
run: |
|
|
74
|
+
isort . --check-only
|
|
75
|
+
- name: Running yapf
|
|
76
|
+
run: |
|
|
77
|
+
python -m pip install --upgrade pip
|
|
78
|
+
pip install toml
|
|
79
|
+
pip install yapf==0.32.0
|
|
80
|
+
yapf --diff --recursive .
|
|
81
|
+
|
|
82
|
+
- name: Install dependencies
|
|
83
|
+
run: |
|
|
84
|
+
pip install -r requirements-dev.txt --extra-index-url https://download.pytorch.org/whl/cpu
|
|
85
|
+
|
|
86
|
+
- name: Checkout vllm-project/vllm repo
|
|
87
|
+
uses: actions/checkout@v4
|
|
88
|
+
with:
|
|
89
|
+
repository: vllm-project/vllm
|
|
90
|
+
ref: ${{ matrix.vllm_version }}
|
|
91
|
+
path: vllm-empty
|
|
92
|
+
|
|
93
|
+
- name: Actionlint Check
|
|
94
|
+
env:
|
|
95
|
+
SHELLCHECK_OPTS: --exclude=SC2046,SC2006,SC2086
|
|
96
|
+
run: |
|
|
97
|
+
echo "::add-matcher::.github/workflows/matchers/actionlint.json"
|
|
98
|
+
tools/actionlint.sh -color
|
|
99
|
+
|
|
100
|
+
- name: Install vllm-project/vllm from source
|
|
101
|
+
working-directory: vllm-empty
|
|
102
|
+
run: |
|
|
103
|
+
pip install -r requirements/build.txt --extra-index-url https://download.pytorch.org/whl/cpu
|
|
104
|
+
VLLM_TARGET_DEVICE=empty pip install .
|
|
105
|
+
|
|
106
|
+
- name: Mypy Check
|
|
107
|
+
run: |
|
|
108
|
+
echo "::add-matcher::.github/workflows/matchers/mypy.json"
|
|
109
|
+
tools/mypy.sh 1 ${{ matrix.python-version }}
|
|
110
|
+
|
|
111
|
+
e2e:
|
|
112
|
+
needs: [lint]
|
|
113
|
+
if: ${{ needs.lint.result == 'success' }}
|
|
114
|
+
strategy:
|
|
115
|
+
max-parallel: 2
|
|
116
|
+
matrix:
|
|
117
|
+
os: [linux-arm64-npu-1, linux-arm64-npu-4]
|
|
118
|
+
vllm_version: [v0.9.1]
|
|
119
|
+
concurrency:
|
|
120
|
+
group: >
|
|
121
|
+
${{
|
|
122
|
+
matrix.os == 'linux-arm64-npu-4'
|
|
123
|
+
&& github.event.pull_request.number
|
|
124
|
+
&& format('pr-{0}-limit-npu-4', github.event.pull_request.number)
|
|
125
|
+
|| format('job-{0}-{1}-{2}', matrix.os, matrix.vllm_version, github.event.pull_request.number)
|
|
126
|
+
}}
|
|
127
|
+
cancel-in-progress: false
|
|
128
|
+
name: vLLM Ascend test
|
|
129
|
+
runs-on: ${{ matrix.os }}
|
|
130
|
+
container:
|
|
131
|
+
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
|
|
132
|
+
env:
|
|
133
|
+
HF_ENDPOINT: https://hf-mirror.com
|
|
134
|
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
|
135
|
+
VLLM_LOGGING_LEVEL: ERROR
|
|
136
|
+
steps:
|
|
137
|
+
- name: Check npu and CANN info
|
|
138
|
+
run: |
|
|
139
|
+
npu-smi info
|
|
140
|
+
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
|
|
141
|
+
|
|
142
|
+
- name: Config mirrors
|
|
143
|
+
run: |
|
|
144
|
+
sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
|
|
145
|
+
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
|
|
146
|
+
apt-get update -y
|
|
147
|
+
apt install git -y
|
|
148
|
+
+          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
+
+      - name: Checkout vllm-project/vllm-ascend repo
+        uses: actions/checkout@v4
+
+      - name: Install system dependencies
+        run: |
+          apt-get -y install `cat packages.txt`
+          apt-get -y install gcc g++ cmake libnuma-dev
+
+      - name: Checkout vllm-project/vllm repo
+        uses: actions/checkout@v4
+        with:
+          repository: vllm-project/vllm
+          ref: ${{ matrix.vllm_version }}
+          path: ./vllm-empty
+
+      - name: Install vllm-project/vllm from source
+        working-directory: ./vllm-empty
+        run: |
+          VLLM_TARGET_DEVICE=empty pip install -e .
+
+      - name: Install vllm-project/vllm-ascend
+        run: |
+          pip install -r requirements-dev.txt
+          pip install -v -e .
+
+      - name: Run vllm-project/vllm-ascend test for V1 Engine
+        env:
+          VLLM_USE_V1: 1
+          VLLM_WORKER_MULTIPROC_METHOD: spawn
+        run: |
+          if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/singlecard/test_offline_inference.py
+            # guided decoding doesn't work, fix it later
+            # pytest -sv tests/singlecard/test_guided_decoding.py.py
+            # test_ascend_config.py should be ran separately because it will regenerate the global config many times.
+            pytest -sv tests/singlecard/test_ascend_config.py
+            pytest -sv tests/singlecard/test_camem.py
+            pytest -sv tests/singlecard/core/test_ascend_scheduler.py
+            pytest -sv tests/singlecard/core/test_ascend_scheduler_e2e.py
+            pytest -sv tests/singlecard/ \
+              --ignore=tests/singlecard/test_offline_inference.py \
+              --ignore=tests/singlecard/test_guided_decoding.py \
+              --ignore=tests/singlecard/test_ascend_config.py \
+              --ignore=tests/singlecard/test_camem.py \
+              --ignore=tests/singlecard/core/test_ascend_scheduler.py \
+              --ignore=tests/singlecard/core/test_ascend_scheduler_e2e.py
+          else
+            pytest -sv tests/multicard/test_ilama_lora_tp2.py
+            # To avoid oom, we need to run the test in a single process.
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_w4a8_deepseek.py::test_deepseek_W4A8
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_dbo
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_w8a8_ep_dbo
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeekV3_dbo
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ_with_flashcomm_v1
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_with_flashcomm_v2
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py --ignore=tests/multicard/test_w4a8_deepseek.py
+          fi
+
+      - name: Run vllm-project/vllm-ascend test on V0 engine
+        if: ${{ github.event_name == 'schedule' }}
+        env:
+          VLLM_USE_V1: 0
+        run: |
+          if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/singlecard/test_offline_inference.py
+            # guided decoding doesn't work, fix it later
+            # pytest -sv tests/singlecard/test_guided_decoding.py.py
+            pytest -sv tests/singlecard/test_camem.py
+            # test_ascend_config.py should be ran separately because it will regenerate the global config many times.
+            pytest -sv tests/singlecard/test_ascend_config.py
+            pytest -sv tests/singlecard/test_prompt_embedding.py
+            pytest -sv tests/singlecard/ \
+              --ignore=tests/singlecard/test_offline_inference.py \
+              --ignore=tests/singlecard/test_guided_decoding.py \
+              --ignore=tests/singlecard/test_camem.py \
+              --ignore=tests/singlecard/test_ascend_config.py \
+              --ignore=tests/singlecard/test_prompt_embedding.py \
+              --ignore=tests/singlecard/core/test_ascend_scheduler.py \
+              --ignore=tests/singlecard/core/test_ascend_scheduler_e2e.py
+          else
+            pytest -sv tests/multicard/test_ilama_lora_tp2.py
+            # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py will raise error.
+            # To avoid oom, we need to run the test in a single process.
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py
+          fi
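For orientation, the single-card branch of the new V1-engine test step can be approximated outside CI. This is a minimal local sketch based only on the commands shown in the step above; it assumes the repository is checked out on a single-NPU host with a working CANN/torch-npu environment already installed, and it is not part of the packaged workflow:

    # Local sketch mirroring the "Run ... test for V1 Engine" step (single-card branch)
    export VLLM_USE_V1=1                        # select the V1 engine, as the CI step does
    export VLLM_WORKER_MULTIPROC_METHOD=spawn   # same worker start method as CI
    export VLLM_USE_MODELSCOPE=True             # fetch model weights from ModelScope
    pytest -sv tests/singlecard/test_offline_inference.py
    pytest -sv tests/singlecard/test_ascend_config.py   # run separately: it regenerates the global config
    pytest -sv tests/singlecard/test_camem.py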
{vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/workflows/vllm_ascend_test_long_term.yaml
RENAMED
@@ -17,9 +17,6 @@
 name: 'e2e test / long-term-test'
 
 on:
-  schedule:
-    # Runs at 23:00 UTC (7:00 AM Beijing) every day
-    - cron: '0 23 * * *'
   pull_request:
     types: [ labeled ]
 
@@ -43,12 +40,12 @@ jobs:
       max-parallel: 2
       matrix:
        os: [linux-arm64-npu-1, linux-arm64-npu-4]
-        vllm_version: [
+        vllm_version: [v0.9.1]
     name: vLLM Ascend long term test
     runs-on: ${{ matrix.os }}
     container:
       # TODO(yikun): Remove m.daocloud.io prefix when infra proxy ready
-      image:
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
     env:
       HF_ENDPOINT: https://hf-mirror.com
       HF_TOKEN: ${{ secrets.HF_TOKEN }}
@@ -88,8 +85,6 @@ jobs:
          VLLM_TARGET_DEVICE=empty pip install -e .
 
      - name: Install vllm-project/vllm-ascend
-        env:
-          PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
        run: |
          pip install -r requirements-dev.txt
          pip install -v -e .
@@ -97,13 +92,17 @@ jobs:
      - name: Run vllm-project/vllm-ascend long term test
        run: |
          if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
-            # spec decode test
-            VLLM_USE_MODELSCOPE=True pytest -sv tests/
+            # v0 spec decode test
+            # VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode_v0/e2e/test_mtp_correctness.py # it needs a clean process
+            # pytest -sv tests/long_term/spec_decode_v0 --ignore=tests/long_term/spec_decode_v0/e2e/test_mtp_correctness.py
+            # v1 spec decode test
+            # TODO: revert me when test_v1_mtp_correctness.py is fixed
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode_v1/test_v1_mtp_correctness.py
            # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed
-            VLLM_USE_MODELSCOPE=True pytest -sv tests/
-
-            pytest -sv tests/
-            pytest -sv tests/e2e/long_term/test_accuracy.py
+            # VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode_v1/test_v1_spec_decode.py
+            # accuracy test single card
+            pytest -sv tests/long_term/test_accuracy.py
          else
-
+            # accuracy test multi card
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/test_deepseek_v2_lite_tp2_accuracy.py
          fi
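With the nightly schedule trigger removed, this long-term suite now runs only when a pull request receives the relevant test labels. A rough local equivalent of the retained checks, taken directly from the run block above (sketch only; it assumes an installed vllm/vllm-ascend build and NPU hosts matching the CI matrix):

    # Single-card host (linux-arm64-npu-1 branch of the step above)
    VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode_v1/test_v1_mtp_correctness.py
    pytest -sv tests/long_term/test_accuracy.py
    # Multi-card host (else branch of the step above)
    VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/test_deepseek_v2_lite_tp2_accuracy.py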
{vllm_ascend-0.9.1rc1 → vllm_ascend-0.9.1rc2}/.github/workflows/vllm_ascend_test_pd.yaml
RENAMED
@@ -17,9 +17,6 @@
 name: 'e2e test / pd-disaggregation'
 
 on:
-  schedule:
-    # Runs at 23:00 UTC (7:00 AM Beijing) every day
-    - cron: '0 23 * * *'
   pull_request:
     types: [ labeled ]
 
@@ -41,12 +38,12 @@
     if: ${{ contains(github.event.pull_request.labels.*.name, 'pd-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' }}
     strategy:
       matrix:
-        vllm_verison: [
+        vllm_verison: [v0.9.1]
     name: vLLM Ascend prefilling decoding disaggregation test
     runs-on: linux-arm64-npu-static-8
 
     container:
-      image:
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
      volumes:
        - /usr/local/dcmi:/usr/local/dcmi
        - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
@@ -97,8 +94,6 @@
          VLLM_TARGET_DEVICE=empty pip install -e .
 
      - name: Install vllm-project/vllm-ascend
-        env:
-          PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
        run: |
          pip install -r requirements-dev.txt
          pip install -v -e .
@@ -106,3 +101,7 @@
      - name: Run vllm-project/vllm-ascend PD Disaggregation test
        run: |
          pytest -sv tests/e2e/pd_disaggreate/test_pd_e2e.py
+
+      - name: Run vllm-project/vllm-ascend PD Disaggregation edge test
+        run: |
+          bash tests/e2e/pd_disaggreate/run_edge_case_test.sh