vllm-ascend 0.9.0rc2.tar.gz → 0.9.1rc2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (389)
  1. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/Dockerfile.buildwheel +5 -8
  2. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/dependabot.yml +0 -3
  3. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/workflows/accuracy_test.yaml +3 -4
  4. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/workflows/image_openeuler.yml +6 -0
  5. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/workflows/image_ubuntu.yml +6 -0
  6. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/workflows/nightly_benchmarks.yaml +21 -19
  7. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/workflows/release_code.yml +1 -1
  8. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/workflows/release_whl.yml +35 -6
  9. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/workflows/vllm_ascend_doctest.yaml +0 -3
  10. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/workflows/vllm_ascend_test.yaml +33 -13
  11. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/workflows/vllm_ascend_test_long_term.yaml +12 -10
  12. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/workflows/vllm_ascend_test_pd.yaml +6 -5
  13. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/CMakeLists.txt +0 -2
  14. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/Dockerfile +2 -2
  15. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/Dockerfile.openEuler +2 -2
  16. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/PKG-INFO +3 -3
  17. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/README.md +2 -2
  18. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/README.zh.md +2 -2
  19. vllm_ascend-0.9.1rc2/benchmarks/ops/ben_vocabparallelembedding.py +144 -0
  20. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/benchmarks/tests/latency-tests.json +10 -0
  21. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/benchmarks/tests/serving-tests.json +24 -0
  22. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/benchmarks/tests/throughput-tests.json +11 -0
  23. vllm_ascend-0.9.1rc2/csrc/kernels/get_masked_input_and_mask_kernel.cpp +345 -0
  24. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/csrc/ops.h +14 -0
  25. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/csrc/torch_binding.cpp +116 -0
  26. vllm_ascend-0.9.1rc2/docs/source/assets/multi_node_dp.png +0 -0
  27. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/docs/source/community/contributors.md +19 -1
  28. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/docs/source/community/governance.md +2 -2
  29. vllm_ascend-0.9.1rc2/docs/source/community/user_stories/index.md +19 -0
  30. vllm_ascend-0.9.1rc2/docs/source/community/user_stories/llamafactory.md +19 -0
  31. {vllm_ascend-0.9.0rc2/docs/source/developer_guide → vllm_ascend-0.9.1rc2/docs/source/community}/versioning_policy.md +6 -2
  32. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/docs/source/conf.py +6 -1
  33. vllm_ascend-0.9.0rc2/docs/source/developer_guide/contributing.md → vllm_ascend-0.9.1rc2/docs/source/developer_guide/contribution/index.md +44 -46
  34. vllm_ascend-0.9.1rc2/docs/source/developer_guide/contribution/testing.md +285 -0
  35. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/docs/source/developer_guide/evaluation/accuracy_report/index.md +1 -1
  36. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/docs/source/developer_guide/evaluation/index.md +2 -9
  37. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/docs/source/developer_guide/evaluation/using_evalscope.md +3 -1
  38. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/docs/source/developer_guide/evaluation/using_lm_eval.md +2 -1
  39. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/docs/source/developer_guide/evaluation/using_opencompass.md +4 -1
  40. vllm_ascend-0.9.1rc2/docs/source/developer_guide/feature_guide/index.md +9 -0
  41. vllm_ascend-0.9.1rc2/docs/source/developer_guide/feature_guide/patch.md +85 -0
  42. vllm_ascend-0.9.1rc2/docs/source/developer_guide/modeling/adding_a_new_model.md +259 -0
  43. vllm_ascend-0.9.1rc2/docs/source/developer_guide/modeling/adding_a_new_multimodal_model.md +3 -0
  44. vllm_ascend-0.9.1rc2/docs/source/developer_guide/modeling/index.md +10 -0
  45. vllm_ascend-0.9.1rc2/docs/source/developer_guide/performance/index.md +9 -0
  46. vllm_ascend-0.9.1rc2/docs/source/developer_guide/performance/optimization_and_tuning.md +183 -0
  47. {vllm_ascend-0.9.0rc2/docs/source/developer_guide/evaluation → vllm_ascend-0.9.1rc2/docs/source/developer_guide/performance}/performance_benchmark.md +7 -0
  48. {vllm_ascend-0.9.0rc2/docs/source/developer_guide/evaluation → vllm_ascend-0.9.1rc2/docs/source/developer_guide/performance}/profile_execute_duration.md +7 -1
  49. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/docs/source/faqs.md +52 -8
  50. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/docs/source/index.md +9 -14
  51. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/docs/source/installation.md +28 -21
  52. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/docs/source/quick_start.md +14 -0
  53. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/docs/source/tutorials/index.md +1 -0
  54. vllm_ascend-0.9.1rc2/docs/source/tutorials/multi_node.md +203 -0
  55. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/docs/source/tutorials/multi_npu_quantization.md +6 -3
  56. vllm_ascend-0.9.1rc2/docs/source/tutorials/multi_npu_qwen3_moe.md +108 -0
  57. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/docs/source/tutorials/single_npu.md +72 -3
  58. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/docs/source/tutorials/single_npu_multimodal.md +4 -3
  59. {vllm_ascend-0.9.0rc2/docs/source/user_guide → vllm_ascend-0.9.1rc2/docs/source/user_guide/configuration}/additional_config.md +19 -16
  60. {vllm_ascend-0.9.0rc2/docs/source/user_guide → vllm_ascend-0.9.1rc2/docs/source/user_guide/configuration}/env_vars.md +1 -1
  61. vllm_ascend-0.9.1rc2/docs/source/user_guide/configuration/index.md +10 -0
  62. {vllm_ascend-0.9.0rc2/docs/source/user_guide → vllm_ascend-0.9.1rc2/docs/source/user_guide/feature_guide}/graph_mode.md +10 -14
  63. vllm_ascend-0.9.1rc2/docs/source/user_guide/feature_guide/images/structured_output_1.png +0 -0
  64. vllm_ascend-0.9.1rc2/docs/source/user_guide/feature_guide/index.md +13 -0
  65. vllm_ascend-0.9.1rc2/docs/source/user_guide/feature_guide/lora.md +8 -0
  66. vllm_ascend-0.9.1rc2/docs/source/user_guide/feature_guide/quantization.md +125 -0
  67. vllm_ascend-0.9.1rc2/docs/source/user_guide/feature_guide/sleep_mode.md +114 -0
  68. vllm_ascend-0.9.1rc2/docs/source/user_guide/feature_guide/structured_output.md +163 -0
  69. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/docs/source/user_guide/release_notes.md +68 -5
  70. vllm_ascend-0.9.1rc2/docs/source/user_guide/support_matrix/index.md +10 -0
  71. vllm_ascend-0.9.1rc2/docs/source/user_guide/support_matrix/supported_features.md +51 -0
  72. {vllm_ascend-0.9.0rc2/docs/source/user_guide → vllm_ascend-0.9.1rc2/docs/source/user_guide/support_matrix}/supported_models.md +3 -2
  73. vllm_ascend-0.9.1rc2/examples/disaggregate_prefill_v1/README.md +246 -0
  74. vllm_ascend-0.9.1rc2/examples/disaggregate_prefill_v1/gen_ranktable.py +120 -0
  75. vllm_ascend-0.9.1rc2/examples/disaggregate_prefill_v1/gen_ranktable.sh +79 -0
  76. vllm_ascend-0.9.1rc2/examples/disaggregate_prefill_v1/load_balance_proxy_server_example.py +435 -0
  77. vllm_ascend-0.9.1rc2/examples/disaggregate_prefill_v1/run_server.sh +32 -0
  78. vllm_ascend-0.9.1rc2/examples/disaggregated_prefill/find_device_ips.py +69 -0
  79. vllm_ascend-0.9.1rc2/examples/dp_offline/data_parallel.py +226 -0
  80. vllm_ascend-0.9.1rc2/examples/dp_offline/run_dp.sh +28 -0
  81. vllm_ascend-0.9.1rc2/examples/eplb_generate_map.py +77 -0
  82. vllm_ascend-0.9.1rc2/examples/external_online_dp/launch_dp_program.py +34 -0
  83. vllm_ascend-0.9.1rc2/examples/external_online_dp/run_dp_template.sh +51 -0
  84. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/examples/offline_dualbatch_overlap_npu.py +1 -1
  85. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/examples/offline_multi_step_custom_ops.py +0 -3
  86. vllm_ascend-0.9.1rc2/examples/run_dp_attention_etp16_benmark.sh +56 -0
  87. vllm_ascend-0.9.1rc2/examples/run_dp_server.sh +33 -0
  88. vllm_ascend-0.9.1rc2/examples/run_dp_with_cached_graph_etp16.sh +25 -0
  89. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/mypy.ini +3 -0
  90. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/pyproject.toml +3 -1
  91. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/requirements-dev.txt +1 -0
  92. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/requirements-lint.txt +1 -0
  93. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/requirements.txt +8 -1
  94. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/setup.py +1 -1
  95. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tests/conftest.py +8 -1
  96. vllm_ascend-0.9.1rc2/tests/e2e/pd_disaggreate/run_edge_case_test.sh +141 -0
  97. vllm_ascend-0.9.1rc2/tests/e2e/pd_disaggreate/test_edge_cases.py +81 -0
  98. vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v0/e2e/test_eagle_correctness.py +344 -0
  99. {vllm_ascend-0.9.0rc2/tests/long_term/spec_decode → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v0}/e2e/test_medusa_correctness.py +2 -2
  100. {vllm_ascend-0.9.0rc2/tests/long_term/spec_decode → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v0}/e2e/test_mlp_correctness.py +2 -2
  101. {vllm_ascend-0.9.0rc2/tests/long_term/spec_decode → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v0}/e2e/test_ngram_correctness.py +2 -2
  102. {vllm_ascend-0.9.0rc2/tests/long_term/spec_decode → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v0}/test_dynamic_spec_decode.py +2 -2
  103. {vllm_ascend-0.9.0rc2/tests/long_term/spec_decode → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v0}/test_multi_step_worker.py +1 -1
  104. {vllm_ascend-0.9.0rc2/tests/long_term/spec_decode → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v0}/test_ngram_worker.py +1 -1
  105. {vllm_ascend-0.9.0rc2/tests/long_term/spec_decode → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v0}/test_spec_decode_worker.py +4 -4
  106. {vllm_ascend-0.9.0rc2/tests/long_term/spec_decode/e2e → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v1}/test_v1_mtp_correctness.py +66 -1
  107. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tests/long_term/test_deepseek_v2_lite_tp2_accuracy.py +1 -1
  108. vllm_ascend-0.9.1rc2/tests/multicard/test_data_parallel.py +66 -0
  109. vllm_ascend-0.9.1rc2/tests/multicard/test_model_qwen3_w4a8.py +65 -0
  110. vllm_ascend-0.9.1rc2/tests/multicard/test_multimodal_context_parallel.py +82 -0
  111. vllm_ascend-0.9.1rc2/tests/multicard/test_offline_inference_distributed.py +230 -0
  112. vllm_ascend-0.9.1rc2/tests/multicard/test_torchair_graph_mode.py +83 -0
  113. vllm_ascend-0.9.1rc2/tests/multicard/test_w4a8_deepseek.py +67 -0
  114. vllm_ascend-0.9.1rc2/tests/ops/test_vocabparallelembedding.py +91 -0
  115. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tests/singlecard/compile/test_simple.py +8 -24
  116. vllm_ascend-0.9.1rc2/tests/singlecard/core/test_ascend_scheduler.py +792 -0
  117. vllm_ascend-0.9.1rc2/tests/singlecard/core/test_ascend_scheduler_e2e.py +40 -0
  118. vllm_ascend-0.9.1rc2/tests/singlecard/ops/test_fused_moe.py +196 -0
  119. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tests/singlecard/ops/test_rotary_embedding.py +70 -1
  120. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tests/singlecard/sample/test_rejection_sampler.py +20 -11
  121. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tests/singlecard/test_aclgraph.py +25 -2
  122. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tests/singlecard/test_ascend_config.py +64 -20
  123. vllm_ascend-0.9.1rc2/tests/singlecard/test_chunked.py +74 -0
  124. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tests/singlecard/test_offline_inference.py +5 -1
  125. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tests/singlecard/test_scheduler.py +18 -43
  126. vllm_ascend-0.9.1rc2/tests/ut/kv_connector/test_llmdatadist_connector.py +42 -0
  127. vllm_ascend-0.9.1rc2/tests/ut/kv_connector/test_remote_decode_lifecycle.py +123 -0
  128. vllm_ascend-0.9.1rc2/tests/ut/kv_connector/test_remote_prefill_lifecycle.py +242 -0
  129. vllm_ascend-0.9.1rc2/tests/ut/kv_connector/utils.py +194 -0
  130. vllm_ascend-0.9.1rc2/tests/ut/ops/test_expert_load_balancer.py +147 -0
  131. vllm_ascend-0.9.1rc2/tests/ut/patch/worker/patch_common/test_patch_sampler.py +44 -0
  132. vllm_ascend-0.9.1rc2/tests/ut/test_distributed_tensor_parallel.py +139 -0
  133. vllm_ascend-0.9.1rc2/tests/ut/test_token_dispatcher.py +69 -0
  134. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/__init__.py +7 -0
  135. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/_version.py +2 -2
  136. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/ascend_config.py +86 -13
  137. vllm_ascend-0.9.1rc2/vllm_ascend/ascend_forward_context.py +137 -0
  138. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/attention/attention.py +4 -13
  139. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/attention/attention_v1.py +121 -38
  140. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/attention/mla_v1.py +535 -189
  141. vllm_ascend-0.9.1rc2/vllm_ascend/attention/utils.py +23 -0
  142. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/compilation/piecewise_backend.py +57 -6
  143. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/core/scheduler.py +192 -59
  144. vllm_ascend-0.9.1rc2/vllm_ascend/cpu_binding.py +329 -0
  145. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/distributed/__init__.py +5 -0
  146. vllm_ascend-0.9.1rc2/vllm_ascend/distributed/context_parallel_utils.py +110 -0
  147. vllm_ascend-0.9.1rc2/vllm_ascend/distributed/llmdatadist_c_mgr_connector.py +923 -0
  148. vllm_ascend-0.9.1rc2/vllm_ascend/distributed/parallel_state.py +49 -0
  149. vllm_ascend-0.9.1rc2/vllm_ascend/distributed/tensor_parallel.py +248 -0
  150. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/envs.py +56 -20
  151. vllm_ascend-0.9.1rc2/vllm_ascend/eplb/adaptor/abstract_adaptor.py +44 -0
  152. vllm_ascend-0.9.1rc2/vllm_ascend/eplb/adaptor/vllm_adaptor.py +212 -0
  153. vllm_ascend-0.9.1rc2/vllm_ascend/eplb/core/eplb_device_transfer_loader.py +136 -0
  154. vllm_ascend-0.9.1rc2/vllm_ascend/eplb/core/eplb_utils.py +75 -0
  155. vllm_ascend-0.9.1rc2/vllm_ascend/eplb/core/eplb_worker.py +442 -0
  156. vllm_ascend-0.9.1rc2/vllm_ascend/eplb/core/policy/__init__.py +0 -0
  157. vllm_ascend-0.9.1rc2/vllm_ascend/eplb/core/policy/policy_abstract.py +41 -0
  158. vllm_ascend-0.9.1rc2/vllm_ascend/eplb/core/policy/policy_dynamic_ep.py +388 -0
  159. vllm_ascend-0.9.1rc2/vllm_ascend/eplb/core/policy/policy_dynamic_ep_v2.py +770 -0
  160. vllm_ascend-0.9.1rc2/vllm_ascend/eplb/core/policy/policy_factory.py +25 -0
  161. vllm_ascend-0.9.1rc2/vllm_ascend/eplb/core/policy/policy_random.py +29 -0
  162. vllm_ascend-0.9.1rc2/vllm_ascend/eplb/eplb_updator.py +222 -0
  163. vllm_ascend-0.9.1rc2/vllm_ascend/lora/__init__.py +0 -0
  164. vllm_ascend-0.9.1rc2/vllm_ascend/lora/punica_wrapper/__init__.py +0 -0
  165. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/models/__init__.py +22 -5
  166. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/models/deepseek_dbo.py +507 -540
  167. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/models/deepseek_mtp.py +26 -6
  168. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/models/deepseek_v2.py +321 -147
  169. vllm_ascend-0.9.1rc2/vllm_ascend/models/qwen2.py +372 -0
  170. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/models/qwen2_5_vl.py +146 -5
  171. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/models/qwen2_5_vl_without_padding.py +98 -0
  172. vllm_ascend-0.9.1rc2/vllm_ascend/models/qwen3.py +472 -0
  173. vllm_ascend-0.9.1rc2/vllm_ascend/models/qwen3_dbo.py +552 -0
  174. vllm_ascend-0.9.1rc2/vllm_ascend/models/qwen3_moe.py +268 -0
  175. vllm_ascend-0.9.1rc2/vllm_ascend/multistream/__init__.py +0 -0
  176. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/multistream/base.py +2 -0
  177. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/multistream/metadata.py +2 -0
  178. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/multistream/ms_split.py +136 -12
  179. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/ops/attention.py +19 -15
  180. vllm_ascend-0.9.1rc2/vllm_ascend/ops/comm_utils.py +127 -0
  181. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/ops/common_fused_moe.py +24 -1
  182. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/ops/fused_moe.py +599 -226
  183. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/ops/layernorm.py +37 -0
  184. vllm_ascend-0.9.1rc2/vllm_ascend/ops/moe_dispatcher/__init__.py +0 -0
  185. vllm_ascend-0.9.1rc2/vllm_ascend/ops/moe_dispatcher/token_dispatcher.py +578 -0
  186. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/ops/rotary_embedding.py +66 -36
  187. vllm_ascend-0.9.1rc2/vllm_ascend/ops/sequence_parallel.py +119 -0
  188. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/ops/vocab_parallel_embedding.py +14 -7
  189. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/patch/__init__.py +15 -26
  190. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/patch/platform/__init__.py +2 -2
  191. {vllm_ascend-0.9.0rc2/vllm_ascend/patch/platform/patch_0_9_0 → vllm_ascend-0.9.1rc2/vllm_ascend/patch/platform/patch_0_9_1}/__init__.py +9 -1
  192. vllm_ascend-0.9.1rc2/vllm_ascend/patch/platform/patch_0_9_1/patch_cache_manager.py +13 -0
  193. vllm_ascend-0.9.1rc2/vllm_ascend/patch/platform/patch_0_9_1/patch_configs.py +77 -0
  194. vllm_ascend-0.9.1rc2/vllm_ascend/patch/platform/patch_0_9_1/patch_core.py +132 -0
  195. vllm_ascend-0.9.1rc2/vllm_ascend/patch/platform/patch_0_9_1/patch_core_client.py +26 -0
  196. vllm_ascend-0.9.1rc2/vllm_ascend/patch/platform/patch_0_9_1/patch_decorator.py +154 -0
  197. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/patch/platform/patch_common/patch_distributed.py +7 -23
  198. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/patch/worker/__init__.py +2 -2
  199. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py +0 -16
  200. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/patch/worker/patch_common/patch_sampler.py +8 -3
  201. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/patch/worker/patch_common/patch_spec_decode_worker.py +7 -5
  202. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/platform.py +16 -26
  203. vllm_ascend-0.9.1rc2/vllm_ascend/quantization/__init__.py +0 -0
  204. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/quantization/func_wrapper.py +32 -0
  205. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/quantization/quant_config.py +40 -0
  206. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/quantization/quantizer.py +21 -9
  207. vllm_ascend-0.9.1rc2/vllm_ascend/quantization/w4a8_dynamic.py +393 -0
  208. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/quantization/w8a8.py +20 -6
  209. vllm_ascend-0.9.1rc2/vllm_ascend/quantization/w8a8_dynamic.py +1055 -0
  210. vllm_ascend-0.9.1rc2/vllm_ascend/sample/__init__.py +0 -0
  211. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/sample/rejection_sampler.py +101 -50
  212. vllm_ascend-0.9.1rc2/vllm_ascend/soc_info.py +14 -0
  213. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/utils.py +227 -2
  214. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/worker/draft_model_runner.py +10 -9
  215. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/worker/model_runner.py +7 -3
  216. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/worker/model_runner_v1.py +602 -199
  217. vllm_ascend-0.9.1rc2/vllm_ascend/worker/mtp_proposer_v1.py +437 -0
  218. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/worker/pooling_model_runner.py +3 -3
  219. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/worker/worker.py +3 -6
  220. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/worker/worker_v1.py +54 -27
  221. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend.egg-info/PKG-INFO +3 -3
  222. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend.egg-info/SOURCES.txt +116 -32
  223. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend.egg-info/requires.txt +4 -1
  224. vllm_ascend-0.9.0rc2/.github/workflows/actionlint.yml +0 -53
  225. vllm_ascend-0.9.0rc2/docs/source/tutorials/multi_node.md +0 -195
  226. vllm_ascend-0.9.0rc2/docs/source/user_guide/release.template.md +0 -13
  227. vllm_ascend-0.9.0rc2/docs/source/user_guide/suppoted_features.md +0 -49
  228. vllm_ascend-0.9.0rc2/docs/source/user_stories/example.md +0 -15
  229. vllm_ascend-0.9.0rc2/docs/source/user_stories/index.md +0 -22
  230. vllm_ascend-0.9.0rc2/examples/disaggregated_prefill/find_device_ips.py +0 -67
  231. vllm_ascend-0.9.0rc2/examples/dp_offline/data_parallel.py +0 -85
  232. vllm_ascend-0.9.0rc2/examples/dp_offline/run_dp.sh +0 -19
  233. vllm_ascend-0.9.0rc2/examples/run_dp_server.sh +0 -30
  234. vllm_ascend-0.9.0rc2/tests/multicard/test_offline_inference_distributed.py +0 -97
  235. vllm_ascend-0.9.0rc2/tests/singlecard/ops/test_fused_moe.py +0 -100
  236. vllm_ascend-0.9.0rc2/vllm_ascend/distributed/parallel_state.py +0 -77
  237. vllm_ascend-0.9.0rc2/vllm_ascend/models/qwen3_moe.py +0 -35
  238. vllm_ascend-0.9.0rc2/vllm_ascend/patch/platform/patch_0_9_0/patch_distributed.py +0 -116
  239. vllm_ascend-0.9.0rc2/vllm_ascend/quantization/w8a8_dynamic.py +0 -753
  240. vllm_ascend-0.9.0rc2/vllm_ascend/worker/mtp_proposer_v1.py +0 -225
  241. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/ISSUE_TEMPLATE/100-documentation.yml +0 -0
  242. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/ISSUE_TEMPLATE/110-user-story.yml +0 -0
  243. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/ISSUE_TEMPLATE/200-installation.yml +0 -0
  244. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/ISSUE_TEMPLATE/300-usage.yml +0 -0
  245. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/ISSUE_TEMPLATE/400-bug-report.yml +0 -0
  246. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/ISSUE_TEMPLATE/500-feature-request.yml +0 -0
  247. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/ISSUE_TEMPLATE/600-new-model.yml +0 -0
  248. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/ISSUE_TEMPLATE/700-performance-discussion.yml +0 -0
  249. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/ISSUE_TEMPLATE/750-RFC.yml +0 -0
  250. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/ISSUE_TEMPLATE/800-others.yml +0 -0
  251. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  252. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  253. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/actionlint.yaml +0 -0
  254. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/labeler.yml +0 -0
  255. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/workflows/accuracy_report.yaml +0 -0
  256. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/workflows/label_merge_conflict.yml +0 -0
  257. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/workflows/labeler.yml +0 -0
  258. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/workflows/matchers/actionlint.json +0 -0
  259. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/workflows/matchers/mypy.json +0 -0
  260. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/workflows/matchers/ruff.json +0 -0
  261. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.github/workflows/shellcheck.yml +0 -0
  262. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.gitignore +0 -0
  263. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/.readthedocs.yaml +0 -0
  264. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/CODE_OF_CONDUCT.md +0 -0
  265. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/DCO +0 -0
  266. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/LICENSE +0 -0
  267. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/benchmarks/README.md +0 -0
  268. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/benchmarks/requirements-bench.txt +0 -0
  269. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/benchmarks/scripts/convert_json_to_markdown.py +0 -0
  270. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/benchmarks/scripts/patch_benchmark_dataset.py +0 -0
  271. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/benchmarks/scripts/perf_result_template.md +0 -0
  272. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/benchmarks/scripts/run-performance-benchmarks.sh +0 -0
  273. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/benchmarks/scripts/run_accuracy.py +0 -0
  274. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/cmake/utils.cmake +0 -0
  275. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/collect_env.py +0 -0
  276. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/csrc/camem_allocator.cpp +0 -0
  277. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/csrc/kernels/advance_step.cpp +0 -0
  278. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/csrc/kernels/pos_encoding_kernels.cpp +0 -0
  279. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/csrc/kernels/types.h +0 -0
  280. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/csrc/kernels/utils.h +0 -0
  281. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/csrc/utils.h +0 -0
  282. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/docs/Makefile +0 -0
  283. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/docs/README.md +0 -0
  284. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/docs/requirements-docs.txt +0 -0
  285. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/docs/requirements-test.txt +0 -0
  286. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/docs/source/_templates/sections/header.html +0 -0
  287. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/docs/source/logos/vllm-ascend-logo-text-dark.png +0 -0
  288. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/docs/source/logos/vllm-ascend-logo-text-light.png +0 -0
  289. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/docs/source/tutorials/multi_npu.md +0 -0
  290. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/examples/disaggregated_prefill/disaggregated_prefill_offline.py +0 -0
  291. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/examples/disaggregated_prefill/dp_proxy.py +0 -0
  292. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/examples/disaggregated_prefill/p2p_disaggrefated_prefill_proxy.py +0 -0
  293. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/examples/disaggregated_prefill/run_decode_server.sh +0 -0
  294. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/examples/disaggregated_prefill/run_prefill_server.sh +0 -0
  295. /vllm_ascend-0.9.0rc2/tests/__init__.py → /vllm_ascend-0.9.1rc2/examples/external_online_dp/README.md +0 -0
  296. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/examples/offline_disaggregated_prefill_npu.py +0 -0
  297. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/examples/offline_distributed_inference_npu.py +0 -0
  298. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/examples/offline_inference_audio_language.py +0 -0
  299. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/examples/offline_inference_npu.py +0 -0
  300. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/examples/offline_inference_npu_v1.py +0 -0
  301. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/examples/prompt_embedding_inference.py +0 -0
  302. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/format.sh +0 -0
  303. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/packages.txt +0 -0
  304. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/pytest.ini +0 -0
  305. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/setup.cfg +0 -0
  306. {vllm_ascend-0.9.0rc2/tests/long_term/spec_decode/e2e → vllm_ascend-0.9.1rc2/tests}/__init__.py +0 -0
  307. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tests/e2e/common.sh +0 -0
  308. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tests/e2e/doctests/001-quickstart-test.sh +0 -0
  309. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tests/e2e/pd_disaggreate/setup_pd.sh +0 -0
  310. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tests/e2e/pd_disaggreate/test_pd_e2e.py +0 -0
  311. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tests/e2e/run_disagg_pd.sh +0 -0
  312. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tests/e2e/run_doctests.sh +0 -0
  313. {vllm_ascend-0.9.0rc2/tests/long_term/spec_decode → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v0}/__init__.py +0 -0
  314. {vllm_ascend-0.9.0rc2/tests/long_term/spec_decode → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v0}/conftest.py +0 -0
  315. {vllm_ascend-0.9.0rc2/tests/singlecard → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v0/e2e}/__init__.py +0 -0
  316. {vllm_ascend-0.9.0rc2/tests/long_term/spec_decode → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v0}/e2e/conftest.py +0 -0
  317. {vllm_ascend-0.9.0rc2/tests/long_term/spec_decode → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v0}/e2e/test_mtp_correctness.py +0 -0
  318. {vllm_ascend-0.9.0rc2/tests/long_term/spec_decode → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v0}/test_utils.py +0 -0
  319. {vllm_ascend-0.9.0rc2/tests/long_term/spec_decode → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v0}/utils.py +0 -0
  320. {vllm_ascend-0.9.0rc2/tests/long_term/spec_decode/e2e → vllm_ascend-0.9.1rc2/tests/long_term/spec_decode_v1}/test_v1_spec_decode.py +0 -0
  321. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tests/long_term/test_accuracy.py +0 -0
  322. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tests/model_utils.py +0 -0
  323. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tests/multicard/test_dynamic_npugraph_batchsize.py +0 -0
  324. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tests/multicard/test_ilama_lora_tp2.py +0 -0
  325. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tests/multicard/test_pyhccl_distributed.py +0 -0
  326. {vllm_ascend-0.9.0rc2/tests/singlecard/compile → vllm_ascend-0.9.1rc2/tests/singlecard}/__init__.py +0 -0
  327. {vllm_ascend-0.9.0rc2/tests/singlecard/ops → vllm_ascend-0.9.1rc2/tests/singlecard/compile}/__init__.py +0 -0
  328. {vllm_ascend-0.9.0rc2/tests/singlecard/sample → vllm_ascend-0.9.1rc2/tests/singlecard/core}/__init__.py +0 -0
  329. {vllm_ascend-0.9.0rc2/vllm_ascend/attention → vllm_ascend-0.9.1rc2/tests/singlecard/ops}/__init__.py +0 -0
  330. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tests/singlecard/ops/test_multi_step.py +0 -0
  331. {vllm_ascend-0.9.0rc2/vllm_ascend/compilation → vllm_ascend-0.9.1rc2/tests/singlecard/sample}/__init__.py +0 -0
  332. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tests/singlecard/test_camem.py +0 -0
  333. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tests/singlecard/test_guided_decoding.py +0 -0
  334. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tests/singlecard/test_ilama_lora.py +0 -0
  335. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tests/singlecard/test_profile_execute_duration.py +0 -0
  336. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tests/singlecard/test_prompt_embedding.py +0 -0
  337. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tests/singlecard/test_pyhccl.py +0 -0
  338. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tests/singlecard/test_sampler.py +0 -0
  339. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tests/utils.py +0 -0
  340. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tools/actionlint.sh +0 -0
  341. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tools/check_repo.sh +0 -0
  342. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tools/mypy.sh +0 -0
  343. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tools/png-lint.sh +0 -0
  344. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tools/shellcheck.sh +0 -0
  345. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/tools/sphinx-lint.sh +0 -0
  346. {vllm_ascend-0.9.0rc2/vllm_ascend/core → vllm_ascend-0.9.1rc2/vllm_ascend/attention}/__init__.py +0 -0
  347. {vllm_ascend-0.9.0rc2/vllm_ascend/device_allocator → vllm_ascend-0.9.1rc2/vllm_ascend/compilation}/__init__.py +0 -0
  348. {vllm_ascend-0.9.0rc2/vllm_ascend/distributed/device_communicators → vllm_ascend-0.9.1rc2/vllm_ascend/core}/__init__.py +0 -0
  349. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/core/schedule_config.py +0 -0
  350. {vllm_ascend-0.9.0rc2/vllm_ascend/distributed/kv_transfer → vllm_ascend-0.9.1rc2/vllm_ascend/device_allocator}/__init__.py +0 -0
  351. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/device_allocator/camem.py +0 -0
  352. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/distributed/communicator.py +0 -0
  353. {vllm_ascend-0.9.0rc2/vllm_ascend/lora → vllm_ascend-0.9.1rc2/vllm_ascend/distributed/device_communicators}/__init__.py +0 -0
  354. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/distributed/device_communicators/pyhccl.py +0 -0
  355. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/distributed/device_communicators/pyhccl_wrapper.py +0 -0
  356. {vllm_ascend-0.9.0rc2/vllm_ascend/lora/punica_wrapper → vllm_ascend-0.9.1rc2/vllm_ascend/distributed/kv_transfer}/__init__.py +0 -0
  357. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/distributed/kv_transfer/simple_buffer.py +0 -0
  358. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/distributed/kv_transfer/simple_connector.py +0 -0
  359. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/distributed/kv_transfer/simple_pipe.py +0 -0
  360. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/distributed/kv_transfer/utils.py +0 -0
  361. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/distributed/llmdatadist_connector.py +0 -0
  362. {vllm_ascend-0.9.0rc2/vllm_ascend/multistream → vllm_ascend-0.9.1rc2/vllm_ascend/eplb}/__init__.py +0 -0
  363. {vllm_ascend-0.9.0rc2/vllm_ascend/quantization → vllm_ascend-0.9.1rc2/vllm_ascend/eplb/adaptor}/__init__.py +0 -0
  364. {vllm_ascend-0.9.0rc2/vllm_ascend/sample → vllm_ascend-0.9.1rc2/vllm_ascend/eplb/core}/__init__.py +0 -0
  365. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/lora/punica_wrapper/punica_npu.py +0 -0
  366. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/models/qwen2_vl.py +0 -0
  367. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/multistream/context.py +0 -0
  368. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/multistream/decorator.py +0 -0
  369. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/multistream/layers.py +0 -0
  370. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/ops/__init__.py +0 -0
  371. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/ops/activation.py +0 -0
  372. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/ops/cache.py +0 -0
  373. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/ops/expert_load_balancer.py +0 -0
  374. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/patch/platform/patch_common/__init__.py +0 -0
  375. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/patch/platform/patch_main/__init__.py +0 -0
  376. {vllm_ascend-0.9.0rc2/vllm_ascend/patch/worker/patch_0_9_0 → vllm_ascend-0.9.1rc2/vllm_ascend/patch/worker/patch_0_9_1}/__init__.py +0 -0
  377. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/patch/worker/patch_common/__init__.py +0 -0
  378. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/patch/worker/patch_common/patch_distributed.py +0 -0
  379. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/patch/worker/patch_common/patch_eagle.py +0 -0
  380. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/patch/worker/patch_common/patch_minicpm.py +0 -0
  381. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/patch/worker/patch_common/patch_utils.py +0 -0
  382. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/patch/worker/patch_main/__init__.py +0 -0
  383. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/worker/__init__.py +0 -0
  384. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/worker/cache_engine.py +0 -0
  385. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/worker/multi_step_runner.py +0 -0
  386. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend/worker/multi_step_worker.py +0 -0
  387. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend.egg-info/dependency_links.txt +0 -0
  388. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend.egg-info/entry_points.txt +0 -0
  389. {vllm_ascend-0.9.0rc2 → vllm_ascend-0.9.1rc2}/vllm_ascend.egg-info/top_level.txt +0 -0

.github/Dockerfile.buildwheel

@@ -14,18 +14,17 @@
  # limitations under the License.
  # This file is a part of the vllm-ascend project.
  #
- ARG PY_VERSION=3.10
- FROM quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py${PY_VERSION}
+ ARG PY_VERSION=3.11
+ FROM quay.io/ascend/manylinux:8.0.0-910b-manylinux_2_28-py${PY_VERSION}

  ARG COMPILE_CUSTOM_KERNELS=1

  # Define environments
  ENV DEBIAN_FRONTEND=noninteractive
  ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
- RUN apt-get update -y && \
- apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
- rm -rf /var/cache/apt/* && \
- rm -rf /var/lib/apt/lists/*
+ RUN yum update -y && \
+ yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
+ rm -rf /var/cache/yum

  WORKDIR /workspace

@@ -41,8 +40,6 @@ RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
  export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
  cd vllm-ascend && \
  python3 setup.py bdist_wheel && \
- ls -l dist && \
- for f in dist/*.whl; do mv "$f" "$(echo "$f" | sed -e 's/-linux_x86_64\.whl$/-manylinux1_x86_64.whl/' -e 's/-linux_aarch64\.whl$/-manylinux2014_aarch64.whl/')"; done && \
  ls -l dist

  CMD ["/bin/bash"]
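
For context, a minimal local sketch of driving this rebuilt wheel image; the image tag wheel:v1 and the dist copy step mirror the docker commands in .github/workflows/release_whl.yml later in this diff, while the local paths are assumptions:

    # Build the wheel-builder image from the updated manylinux-based Dockerfile.
    docker build -f .github/Dockerfile.buildwheel --build-arg PY_VERSION=3.11 -t wheel:v1 .
    # Copy the built wheels out of the container, as the release workflow does.
    docker run --rm -u $(id -u):$(id -g) -v $(pwd):/outpwd wheel:v1 bash -c "cp -r /workspace/vllm-ascend/dist /outpwd"
    ls dist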

.github/dependabot.yml

@@ -2,9 +2,6 @@ version: 2
  updates:
  - package-ecosystem: "github-actions"
  directory: "/"
- schedule:
- # Check for updates to GitHub Actions every week
- interval: "weekly"
  open-pull-requests-limit: 2
  reviewers:
  - "Yikun"

.github/workflows/accuracy_test.yaml

@@ -34,8 +34,7 @@ on:
  # Current supported vLLM versions
  options:
  - main
- - v0.9.0.1
- - v0.9.0
+ - v0.9.1
  - v0.7.3
  vllm-ascend-version:
  description: 'vllm-ascend version:'
@@ -118,7 +117,7 @@ jobs:
  fail-fast: false
  name: ${{ matrix.model_name }} accuracy V${{ matrix.vllm_use_version }}
  container:
- image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10
+ image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
  env:
  HF_ENDPOINT: https://hf-mirror.com
  HF_TOKEN: ${{ secrets.HF_TOKEN }}
@@ -159,7 +158,7 @@ jobs:
  repository: vllm-project/vllm
  path: ./vllm-empty
  # Please also update this when bump matched version
- ref: ${{ github.event.inputs.vllm-version || 'v0.9.0' }}
+ ref: ${{ github.event.inputs.vllm-version || 'v0.9.1' }}

  - name: Install vllm-project/vllm from source
  working-directory: ./vllm-empty

.github/workflows/image_openeuler.yml

@@ -19,6 +19,12 @@ on:
  - '.github/workflows/image_openeuler.yml'
  - 'Dockerfile.openEuler'
  - 'vllm_ascend/**'
+ - 'setup.py'
+ - 'pyproject.toml'
+ - 'requirements.txt'
+ - 'cmake/**'
+ - 'CMakeLists.txt'
+ - 'csrc/**'
  push:
  # Publish image when tagging, the Dockerfile in tag will be build as tag image
  branches:

.github/workflows/image_ubuntu.yml

@@ -19,6 +19,12 @@ on:
  - '.github/workflows/image_ubuntu.yml'
  - 'Dockerfile'
  - 'vllm_ascend/**'
+ - 'setup.py'
+ - 'pyproject.toml'
+ - 'requirements.txt'
+ - 'cmake/**'
+ - 'CMakeLists.txt'
+ - 'csrc/**'
  push:
  # Publish image when tagging, the Dockerfile in tag will be build as tag image
  branches:

.github/workflows/nightly_benchmarks.yaml

@@ -18,11 +18,7 @@
  name: 'Benchmarks / Performance'
  # This workflow runs nightly benchmarks for vllm-ascend.

- on:
- schedule:
- # Run at 02:00 everyday
- - cron: '00 18 * * *'
-
+ on:
  workflow_dispatch:
  # Allow manual triggering of the workflow

@@ -45,15 +41,20 @@ jobs:
  test:
  if: ${{ contains(github.event.pull_request.labels.*.name, 'performance-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}

- name: Benchmarks/vLLM=${{ matrix.vllm_branch }}, vLLM-Ascend=${{ matrix.vllm_ascend_branch }}
+ name: Benchmarks/vLLM=${{ matrix.vllm_branch }}, vLLM-Ascend=${{ matrix.vllm_ascend_branch }}, use_v1=${{ matrix.vllm_use_v1 }}
  runs-on: 'linux-arm64-npu-static-8'
  strategy:
  matrix:
  include:
+ - vllm_branch: v0.9.1
+ vllm_ascend_branch: main
+ vllm_use_v1: 0
  - vllm_branch: v0.9.0
  vllm_ascend_branch: main
+ vllm_use_v1: 1
+ max-parallel: 1
  container:
- image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10
+ image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
  volumes:
  - /usr/local/dcmi:/usr/local/dcmi
  - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
@@ -71,6 +72,7 @@ jobs:
  HF_TOKEN: ${{ secrets.HF_TOKEN }}
  ES_OM_DOMAIN: ${{ secrets.ES_OM_DOMAIN }}
  ES_OM_AUTHORIZATION: ${{ secrets.ES_OM_AUTHORIZATION }}
+ VLLM_USE_V1: ${{ matrix.vllm_use_v1 }}
  steps:
  - name: Check npu and CANN info
  run: |
@@ -140,7 +142,7 @@ jobs:
  - name: Install elastic_tool
  if: github.event_name != 'pull_request'
  run: |
- pip install escli-tool==0.2.1
+ pip install escli-tool==0.2.2

  - name: Collect pr info from vllm-project/vllm-ascend
  if: github.event_name != 'pull_request'
@@ -164,10 +166,10 @@ jobs:
  while IFS= read -r line || [[ -n "$line" ]]; do
  commit_id=${line%% *}
  commit_title=${line#* }
- commit_time=$(git show -s --format=%cd $commit_hash --date=iso-strict)
- commit_time_no_tz=${commit_time::19}

  git checkout $commit_id
+ commit_time=$(git show -s --format=%cd $commit_hash --date=iso-strict)
+ commit_time_no_tz=${commit_time::19}
  pip install -e .

  echo "------------------------"
@@ -177,17 +179,17 @@ jobs:
  echo "vllm branch: ${{ matrix.vllm_branch }}"
  echo "vllm-ascend branch: ${{ matrix.vllm_ascend_branch }}"
  echo "------------------------"
+
  cd /github/home
  bash benchmarks/scripts/run-performance-benchmarks.sh
  # send the result to es
- if [[ "${{ github.event_name }}" != "pull request" ]]; then
- escli add --vllm_branch ${{ matrix.vllm_branch }} \
- --vllm_ascend_branch ${{ matrix.vllm_ascend_branch }} \
- --commit_id $commit_id \
- --commit_title "$commit_title" \
- --created_at "$commit_time_no_tz" \
- --res_dir ./benchmarks/results
- rm -rf ./benchmarks/results
- fi
+ escli add --vllm_branch ${{ matrix.vllm_branch }} \
+ --vllm_ascend_branch ${{ matrix.vllm_ascend_branch }} \
+ --commit_id $commit_id \
+ --commit_title "$commit_title" \
+ --created_at "$commit_time_no_tz" \
+ --res_dir ./benchmarks/results \
+ --extra_feat '{"VLLM_USE_V1": "${{ matrix.vllm_use_v1 }}"}'
+ rm -rf ./benchmarks/results
  cd -
  done < commit_log.txt

.github/workflows/release_code.yml

@@ -53,7 +53,7 @@ jobs:
  runs-on: ubuntu-latest
  strategy:
  matrix:
- python-version: ["3.10"]
+ python-version: ["3.11"]
  steps:
  - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2


.github/workflows/release_whl.yml

@@ -71,16 +71,11 @@ jobs:
  --build-arg PY_VERSION=${{ matrix.python-version }} \
  -t wheel:v1 .
  docker run --rm \
+ -u $(id -u):$(id -g) \
  -v $(pwd):/outpwd \
  wheel:v1 \
  bash -c "cp -r /workspace/vllm-ascend/dist /outpwd"
  ls dist
-
- - name: Archive wheel
- uses: actions/upload-artifact@v4
- with:
- name: vllm-ascend-${{ matrix.os }}-py${{ matrix.python-version }}-wheel
- path: dist/*

  - name: Set up Python ${{ matrix.python-version }}
  if: startsWith(github.ref, 'refs/tags/')
@@ -88,6 +83,40 @@ jobs:
  with:
  python-version: ${{ matrix.python-version }}

+ - name: Repair wheels with auditwheel
+ run: |
+ python3 -m pip install auditwheel
+ python3 -m pip install patchelf
+ mkdir -p dist/repaired
+ for whl in dist/*.whl; do
+ auditwheel repair "$whl" -w dist/repaired/ \
+ --exclude libplatform.so \
+ --exclude libregister.so \
+ --exclude libge_common_base.so \
+ --exclude libc10.so \
+ --exclude libc_sec.so \
+ --exclude "libascend*.so" \
+ --exclude "libtorch*.so"
+ done
+ rm -f dist/*.whl
+ mv dist/repaired/*.whl dist/
+ rmdir dist/repaired
+ ls dist
+
+ - name: Verify automatic platform tags
+ run: |
+ cd dist
+ for wheel in *.whl; do
+ echo "verification file: $wheel"
+ auditwheel show "$wheel"
+ done
+
+ - name: Archive wheel
+ uses: actions/upload-artifact@v4
+ with:
+ name: vllm-ascend-${{ matrix.os }}-py${{ matrix.python-version }}-wheel
+ path: dist/*
+
  - name: Release
  if: startsWith(github.ref, 'refs/tags/')
  run: |
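
A hedged aside, not part of the diff: the workflow now relies on auditwheel to assign manylinux platform tags instead of renaming wheels by hand, so one quick local sanity check is to compare the tag auditwheel reports with the tags pip on the target host accepts:

    # Show the platform tag auditwheel assigned to each repaired wheel.
    for whl in dist/*.whl; do auditwheel show "$whl"; done
    # List the tags pip on this machine accepts; the wheel's manylinux tag should appear among them.
    python3 -m pip debug --verbose | grep -i manylinux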

.github/workflows/vllm_ascend_doctest.yaml

@@ -29,9 +29,6 @@ on:
  - 'tests/e2e/doctests/**'
  - 'tests/e2e/common.sh'
  - 'tests/e2e/run_doctests.sh'
- schedule:
- # Runs every 4 hours
- - cron: '0 */4 * * *'

  # Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
  # declared as "shell: bash -el {0}" on steps that need to be properly activated.

.github/workflows/vllm_ascend_test.yaml

@@ -18,8 +18,6 @@
  name: 'test'

  on:
- schedule:
- - cron: '0 23 * * *'
  pull_request:
  branches:
  - 'main'
@@ -33,6 +31,9 @@ on:
  - '!benchmarks/**'
  - 'tools/mypy.sh'
  - 'mypy.ini'
+ - '.github/workflows/*.ya?ml'
+ - '.github/workflows/actionlint.*'
+ - '.github/workflows/matchers/actionlint.json'

  # Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
  # declared as "shell: bash -el {0}" on steps that need to be properly activated.
@@ -46,7 +47,8 @@ jobs:
  runs-on: ubuntu-latest
  strategy:
  matrix:
- python-version: ["3.10"]
+ python-version: ["3.11"]
+ vllm_version: [v0.9.1]
  steps:
  - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
  - name: Set up Python ${{ matrix.python-version }}
@@ -85,8 +87,16 @@ jobs:
  uses: actions/checkout@v4
  with:
  repository: vllm-project/vllm
+ ref: ${{ matrix.vllm_version }}
  path: vllm-empty

+ - name: Actionlint Check
+ env:
+ SHELLCHECK_OPTS: --exclude=SC2046,SC2006,SC2086
+ run: |
+ echo "::add-matcher::.github/workflows/matchers/actionlint.json"
+ tools/actionlint.sh -color
+
  - name: Install vllm-project/vllm from source
  working-directory: vllm-empty
  run: |
@@ -105,7 +115,7 @@ jobs:
  max-parallel: 2
  matrix:
  os: [linux-arm64-npu-1, linux-arm64-npu-4]
- vllm_version: [main, v0.9.0]
+ vllm_version: [v0.9.1]
  concurrency:
  group: >
  ${{
@@ -118,8 +128,7 @@ jobs:
  name: vLLM Ascend test
  runs-on: ${{ matrix.os }}
  container:
- # TODO(yikun): Remove m.daocloud.io prefix when infra proxy ready
- image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10
+ image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
  env:
  HF_ENDPOINT: https://hf-mirror.com
  HF_TOKEN: ${{ secrets.HF_TOKEN }}
@@ -170,34 +179,43 @@ jobs:
  run: |
  if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
  VLLM_USE_MODELSCOPE=True pytest -sv tests/singlecard/test_offline_inference.py
- pytest -sv tests/singlecard/test_scheduler.py
  # guided decoding doesn't work, fix it later
  # pytest -sv tests/singlecard/test_guided_decoding.py.py
  # test_ascend_config.py should be ran separately because it will regenerate the global config many times.
  pytest -sv tests/singlecard/test_ascend_config.py
  pytest -sv tests/singlecard/test_camem.py
+ pytest -sv tests/singlecard/core/test_ascend_scheduler.py
+ pytest -sv tests/singlecard/core/test_ascend_scheduler_e2e.py
  pytest -sv tests/singlecard/ \
  --ignore=tests/singlecard/test_offline_inference.py \
- --ignore=tests/singlecard/test_scheduler.py \
  --ignore=tests/singlecard/test_guided_decoding.py \
  --ignore=tests/singlecard/test_ascend_config.py \
- --ignore=tests/singlecard/test_camem.py
+ --ignore=tests/singlecard/test_camem.py \
+ --ignore=tests/singlecard/core/test_ascend_scheduler.py \
+ --ignore=tests/singlecard/core/test_ascend_scheduler_e2e.py
  else
  pytest -sv tests/multicard/test_ilama_lora_tp2.py
  # To avoid oom, we need to run the test in a single process.
+ VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_w4a8_deepseek.py::test_deepseek_W4A8
  VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
  VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek
  VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk
- VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py
+ VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8
+ VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_dbo
+ VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_w8a8_ep_dbo
+ VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeekV3_dbo
+ VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ_with_flashcomm_v1
+ VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_with_flashcomm_v2
+ VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py --ignore=tests/multicard/test_w4a8_deepseek.py
  fi

  - name: Run vllm-project/vllm-ascend test on V0 engine
+ if: ${{ github.event_name == 'schedule' }}
  env:
  VLLM_USE_V1: 0
  run: |
  if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
  VLLM_USE_MODELSCOPE=True pytest -sv tests/singlecard/test_offline_inference.py
- pytest -sv tests/singlecard/test_scheduler.py
  # guided decoding doesn't work, fix it later
  # pytest -sv tests/singlecard/test_guided_decoding.py.py
  pytest -sv tests/singlecard/test_camem.py
@@ -206,11 +224,12 @@ jobs:
  pytest -sv tests/singlecard/test_prompt_embedding.py
  pytest -sv tests/singlecard/ \
  --ignore=tests/singlecard/test_offline_inference.py \
- --ignore=tests/singlecard/test_scheduler.py \
  --ignore=tests/singlecard/test_guided_decoding.py \
  --ignore=tests/singlecard/test_camem.py \
  --ignore=tests/singlecard/test_ascend_config.py \
- --ignore=tests/singlecard/test_prompt_embedding.py
+ --ignore=tests/singlecard/test_prompt_embedding.py \
+ --ignore=tests/singlecard/core/test_ascend_scheduler.py \
+ --ignore=tests/singlecard/core/test_ascend_scheduler_e2e.py
  else
  pytest -sv tests/multicard/test_ilama_lora_tp2.py
  # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py will raise error.
@@ -218,5 +237,6 @@ jobs:
  VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
  VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek
  VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk
+ VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8
  VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py
  fi

.github/workflows/vllm_ascend_test_long_term.yaml

@@ -17,9 +17,6 @@
  name: 'e2e test / long-term-test'

  on:
- schedule:
- # Runs at 23:00 UTC (7:00 AM Beijing) every day
- - cron: '0 23 * * *'
  pull_request:
  types: [ labeled ]

@@ -43,12 +40,12 @@ jobs:
  max-parallel: 2
  matrix:
  os: [linux-arm64-npu-1, linux-arm64-npu-4]
- vllm_version: [main, v0.9.0]
+ vllm_version: [v0.9.1]
  name: vLLM Ascend long term test
  runs-on: ${{ matrix.os }}
  container:
  # TODO(yikun): Remove m.daocloud.io prefix when infra proxy ready
- image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10
+ image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
  env:
  HF_ENDPOINT: https://hf-mirror.com
  HF_TOKEN: ${{ secrets.HF_TOKEN }}
@@ -95,12 +92,17 @@ jobs:
  - name: Run vllm-project/vllm-ascend long term test
  run: |
  if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
- # spec decode test
- VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode/e2e/test_v1_mtp_correctness.py
- VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode/e2e/test_v1_spec_decode.py
- VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode/e2e/test_mtp_correctness.py # it needs a clean process
- pytest -sv tests/long_term/spec_decode --ignore=tests/long_term/spec_decode/e2e/test_mtp_correctness.py --ignore=tests/long_term/spec_decode/e2e/test_v1_spec_decode.py --ignore=tests/long_term/spec_decode/e2e/test_v1_mtp_correctness.py
+ # v0 spec decode test
+ # VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode_v0/e2e/test_mtp_correctness.py # it needs a clean process
+ # pytest -sv tests/long_term/spec_decode_v0 --ignore=tests/long_term/spec_decode_v0/e2e/test_mtp_correctness.py
+ # v1 spec decode test
+ # TODO: revert me when test_v1_mtp_correctness.py is fixed
+ VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode_v1/test_v1_mtp_correctness.py
+ # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed
+ # VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode_v1/test_v1_spec_decode.py
+ # accuracy test single card
  pytest -sv tests/long_term/test_accuracy.py
  else
+ # accuracy test multi card
  VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/test_deepseek_v2_lite_tp2_accuracy.py
  fi

.github/workflows/vllm_ascend_test_pd.yaml

@@ -17,9 +17,6 @@
  name: 'e2e test / pd-disaggregation'

  on:
- schedule:
- # Runs at 23:00 UTC (7:00 AM Beijing) every day
- - cron: '0 23 * * *'
  pull_request:
  types: [ labeled ]

@@ -41,12 +38,12 @@ jobs:
  if: ${{ contains(github.event.pull_request.labels.*.name, 'pd-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' }}
  strategy:
  matrix:
- vllm_verison: [main, v0.9.0]
+ vllm_verison: [v0.9.1]
  name: vLLM Ascend prefilling decoding disaggregation test
  runs-on: linux-arm64-npu-static-8

  container:
- image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10
+ image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
  volumes:
  - /usr/local/dcmi:/usr/local/dcmi
  - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
@@ -104,3 +101,7 @@ jobs:
  - name: Run vllm-project/vllm-ascend PD Disaggregation test
  run: |
  pytest -sv tests/e2e/pd_disaggreate/test_pd_e2e.py
+
+ - name: Run vllm-project/vllm-ascend PD Disaggregation edge test
+ run: |
+ bash tests/e2e/pd_disaggreate/run_edge_case_test.sh

CMakeLists.txt

@@ -96,5 +96,3 @@ target_link_libraries(
  target_link_options(vllm_ascend_C PRIVATE "-Wl,-rpath,$ORIGIN:$ORIGIN/lib")

  install(TARGETS vllm_ascend_C vllm_ascend_kernels DESTINATION ${VLLM_ASCEND_INSTALL_PATH})
-
-

Dockerfile

@@ -15,7 +15,7 @@
  # This file is a part of the vllm-ascend project.
  #

- FROM quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10
+ FROM quay.io/ascend/cann:8.2.rc1-910b-ubuntu22.04-py3.11

  ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
  ARG COMPILE_CUSTOM_KERNELS=1
@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}

  # Install vLLM
  ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
- ARG VLLM_TAG=v0.9.0
+ ARG VLLM_TAG=v0.9.1
  RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
  # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
  RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \

Dockerfile.openEuler

@@ -15,7 +15,7 @@
  # This file is a part of the vllm-ascend project.
  #

- FROM quay.io/ascend/cann:8.1.rc1-910b-openeuler22.03-py3.10
+ FROM quay.io/ascend/cann:8.2.rc1-910b-openeuler22.03-py3.11

  ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
  ARG COMPILE_CUSTOM_KERNELS=1
@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/

  # Install vLLM
  ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
- ARG VLLM_TAG=v0.9.0
+ ARG VLLM_TAG=v0.9.1

  RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
  # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
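
A hedged aside on the two Dockerfiles above: with VLLM_TAG now defaulting to v0.9.1, a local image build would look roughly like the following; the image names are illustrative, and the tag can still be overridden through the existing build argument:

    # Build the Ubuntu-based image from the repository root (image name is an assumption).
    docker build -t vllm-ascend:0.9.1rc2 -f Dockerfile .
    # openEuler variant, overriding the vLLM tag via the build arg defined in the Dockerfile.
    docker build -t vllm-ascend:0.9.1rc2-openeuler --build-arg VLLM_TAG=v0.9.1 -f Dockerfile.openEuler .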

PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vllm_ascend
- Version: 0.9.0rc2
+ Version: 0.9.1rc2
  Summary: vLLM Ascend backend plugin
  Home-page: https://github.com/vllm-project/vllm-ascend
  Author: vLLM-Ascend team
@@ -58,8 +58,8 @@ By using vLLM Ascend plugin, popular open-source models, including Transformer-l
  - OS: Linux
  - Software:
  * Python >= 3.9, < 3.12
- * CANN >= 8.1.RC1
- * PyTorch >= 2.5.1, torch-npu >= 2.5.1
+ * CANN >= 8.2.RC1
+ * PyTorch >= 2.5.1, torch-npu >= 2.5.1.post1
  * vLLM (the same version as vllm-ascend)

  ## Getting Started

README.md

@@ -37,8 +37,8 @@ By using vLLM Ascend plugin, popular open-source models, including Transformer-l
  - OS: Linux
  - Software:
  * Python >= 3.9, < 3.12
- * CANN >= 8.1.RC1
- * PyTorch >= 2.5.1, torch-npu >= 2.5.1
+ * CANN >= 8.2.RC1
+ * PyTorch >= 2.5.1, torch-npu >= 2.5.1.post1
  * vLLM (the same version as vllm-ascend)

  ## Getting Started

README.zh.md

@@ -38,8 +38,8 @@ vLLM 昇腾插件 (`vllm-ascend`) 是一个由社区维护的让vLLM在Ascend NP
  - 操作系统:Linux
  - 软件:
  * Python >= 3.9, < 3.12
- * CANN >= 8.1.RC1
- * PyTorch >= 2.5.1, torch-npu >= 2.5.1
+ * CANN >= 8.2.RC1
+ * PyTorch >= 2.5.1, torch-npu >= 2.5.1.post1
  * vLLM (与vllm-ascend版本一致)

  ## 开始使用
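
Finally, a hedged note rather than part of the packaged files: the requirement bumps above (CANN >= 8.2.RC1, torch-npu >= 2.5.1.post1, vLLM matching the vllm-ascend version) suggest an upgrade path roughly like the following on a host where CANN is already set up; exact commands depend on your environment, and the Dockerfiles above instead build vLLM from source with VLLM_TARGET_DEVICE="empty":

    # Match the vLLM release to the plugin release candidate, per the "same version" requirement.
    pip install vllm==0.9.1
    pip install vllm-ascend==0.9.1rc2
    # torch-npu must satisfy the new floor from this diff.
    pip install "torch-npu>=2.5.1.post1"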