vllm-ascend 0.9.1rc2__tar.gz → 0.9.1rc3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (379)
  1. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/Dockerfile +1 -1
  2. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/Dockerfile.openEuler +1 -1
  3. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/PKG-INFO +16 -5
  4. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/README.md +15 -4
  5. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/README.zh.md +14 -4
  6. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/csrc/kernels/pos_encoding_kernels.cpp +0 -5
  7. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/community/versioning_policy.md +4 -0
  8. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/conf.py +4 -4
  9. vllm_ascend-0.9.1rc3/docs/source/developer_guide/performance/distributed_dp_server_with_large_ep.md +253 -0
  10. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/developer_guide/performance/index.md +1 -0
  11. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/faqs.md +22 -5
  12. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/user_guide/configuration/additional_config.md +0 -1
  13. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/user_guide/release_notes.md +135 -1
  14. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/examples/disaggregate_prefill_v1/README.md +0 -4
  15. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/requirements-dev.txt +2 -0
  16. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/long_term/spec_decode_v1/test_v1_mtp_correctness.py +6 -1
  17. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/multicard/test_offline_inference_distributed.py +27 -12
  18. vllm_ascend-0.9.1rc3/tests/multicard/test_qwen3_moe.py +55 -0
  19. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/multicard/test_torchair_graph_mode.py +3 -3
  20. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/singlecard/core/test_ascend_scheduler_e2e.py +1 -0
  21. vllm_ascend-0.9.1rc3/vllm_ascend/_version.py +34 -0
  22. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/ascend_config.py +13 -4
  23. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/attention/attention.py +26 -45
  24. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/attention/attention_v1.py +60 -36
  25. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/attention/mla_v1.py +162 -213
  26. vllm_ascend-0.9.1rc3/vllm_ascend/attention/utils.py +92 -0
  27. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/core/schedule_config.py +10 -0
  28. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/core/scheduler.py +1 -1
  29. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/distributed/parallel_state.py +22 -0
  30. vllm_ascend-0.9.1rc3/vllm_ascend/distributed/utils.py +37 -0
  31. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/models/deepseek_mtp.py +10 -8
  32. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/models/deepseek_v2.py +10 -9
  33. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/models/qwen3_moe.py +15 -5
  34. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/multistream/ms_split.py +9 -10
  35. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/ops/attention.py +1 -134
  36. vllm_ascend-0.9.1rc3/vllm_ascend/ops/lmhead.py +150 -0
  37. vllm_ascend-0.9.1rc3/vllm_ascend/ops/logits_processor.py +64 -0
  38. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/platform.py +42 -1
  39. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/worker/model_runner_v1.py +150 -103
  40. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/worker/mtp_proposer_v1.py +37 -25
  41. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/worker/worker_v1.py +5 -2
  42. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend.egg-info/PKG-INFO +16 -5
  43. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend.egg-info/SOURCES.txt +5 -0
  44. vllm_ascend-0.9.1rc2/vllm_ascend/_version.py +0 -21
  45. vllm_ascend-0.9.1rc2/vllm_ascend/attention/utils.py +0 -23
  46. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/Dockerfile.buildwheel +0 -0
  47. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/ISSUE_TEMPLATE/100-documentation.yml +0 -0
  48. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/ISSUE_TEMPLATE/110-user-story.yml +0 -0
  49. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/ISSUE_TEMPLATE/200-installation.yml +0 -0
  50. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/ISSUE_TEMPLATE/300-usage.yml +0 -0
  51. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/ISSUE_TEMPLATE/400-bug-report.yml +0 -0
  52. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/ISSUE_TEMPLATE/500-feature-request.yml +0 -0
  53. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/ISSUE_TEMPLATE/600-new-model.yml +0 -0
  54. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/ISSUE_TEMPLATE/700-performance-discussion.yml +0 -0
  55. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/ISSUE_TEMPLATE/750-RFC.yml +0 -0
  56. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/ISSUE_TEMPLATE/800-others.yml +0 -0
  57. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  58. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  59. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/actionlint.yaml +0 -0
  60. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/dependabot.yml +0 -0
  61. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/labeler.yml +0 -0
  62. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/workflows/accuracy_report.yaml +0 -0
  63. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/workflows/accuracy_test.yaml +0 -0
  64. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/workflows/image_openeuler.yml +0 -0
  65. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/workflows/image_ubuntu.yml +0 -0
  66. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/workflows/label_merge_conflict.yml +0 -0
  67. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/workflows/labeler.yml +0 -0
  68. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/workflows/matchers/actionlint.json +0 -0
  69. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/workflows/matchers/mypy.json +0 -0
  70. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/workflows/matchers/ruff.json +0 -0
  71. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/workflows/nightly_benchmarks.yaml +0 -0
  72. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/workflows/release_code.yml +0 -0
  73. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/workflows/release_whl.yml +0 -0
  74. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/workflows/shellcheck.yml +0 -0
  75. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/workflows/vllm_ascend_doctest.yaml +0 -0
  76. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/workflows/vllm_ascend_test.yaml +0 -0
  77. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/workflows/vllm_ascend_test_long_term.yaml +0 -0
  78. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.github/workflows/vllm_ascend_test_pd.yaml +0 -0
  79. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.gitignore +0 -0
  80. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/.readthedocs.yaml +0 -0
  81. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/CMakeLists.txt +0 -0
  82. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/CODE_OF_CONDUCT.md +0 -0
  83. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/DCO +0 -0
  84. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/LICENSE +0 -0
  85. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/benchmarks/README.md +0 -0
  86. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/benchmarks/ops/ben_vocabparallelembedding.py +0 -0
  87. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/benchmarks/requirements-bench.txt +0 -0
  88. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/benchmarks/scripts/convert_json_to_markdown.py +0 -0
  89. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/benchmarks/scripts/patch_benchmark_dataset.py +0 -0
  90. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/benchmarks/scripts/perf_result_template.md +0 -0
  91. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/benchmarks/scripts/run-performance-benchmarks.sh +0 -0
  92. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/benchmarks/scripts/run_accuracy.py +0 -0
  93. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/benchmarks/tests/latency-tests.json +0 -0
  94. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/benchmarks/tests/serving-tests.json +0 -0
  95. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/benchmarks/tests/throughput-tests.json +0 -0
  96. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/cmake/utils.cmake +0 -0
  97. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/collect_env.py +0 -0
  98. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/csrc/camem_allocator.cpp +0 -0
  99. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/csrc/kernels/advance_step.cpp +0 -0
  100. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/csrc/kernels/get_masked_input_and_mask_kernel.cpp +0 -0
  101. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/csrc/kernels/types.h +0 -0
  102. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/csrc/kernels/utils.h +0 -0
  103. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/csrc/ops.h +0 -0
  104. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/csrc/torch_binding.cpp +0 -0
  105. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/csrc/utils.h +0 -0
  106. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/Makefile +0 -0
  107. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/README.md +0 -0
  108. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/requirements-docs.txt +0 -0
  109. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/requirements-test.txt +0 -0
  110. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/_templates/sections/header.html +0 -0
  111. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/assets/multi_node_dp.png +0 -0
  112. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/community/contributors.md +0 -0
  113. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/community/governance.md +0 -0
  114. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/community/user_stories/index.md +0 -0
  115. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/community/user_stories/llamafactory.md +0 -0
  116. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/developer_guide/contribution/index.md +0 -0
  117. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/developer_guide/contribution/testing.md +0 -0
  118. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/developer_guide/evaluation/accuracy_report/index.md +0 -0
  119. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/developer_guide/evaluation/index.md +0 -0
  120. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/developer_guide/evaluation/using_evalscope.md +0 -0
  121. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/developer_guide/evaluation/using_lm_eval.md +0 -0
  122. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/developer_guide/evaluation/using_opencompass.md +0 -0
  123. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/developer_guide/feature_guide/index.md +0 -0
  124. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/developer_guide/feature_guide/patch.md +0 -0
  125. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/developer_guide/modeling/adding_a_new_model.md +0 -0
  126. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/developer_guide/modeling/adding_a_new_multimodal_model.md +0 -0
  127. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/developer_guide/modeling/index.md +0 -0
  128. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/developer_guide/performance/optimization_and_tuning.md +0 -0
  129. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/developer_guide/performance/performance_benchmark.md +0 -0
  130. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/developer_guide/performance/profile_execute_duration.md +0 -0
  131. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/index.md +0 -0
  132. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/installation.md +0 -0
  133. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/logos/vllm-ascend-logo-text-dark.png +0 -0
  134. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/logos/vllm-ascend-logo-text-light.png +0 -0
  135. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/quick_start.md +0 -0
  136. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/tutorials/index.md +0 -0
  137. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/tutorials/multi_node.md +0 -0
  138. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/tutorials/multi_npu.md +0 -0
  139. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/tutorials/multi_npu_quantization.md +0 -0
  140. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/tutorials/multi_npu_qwen3_moe.md +0 -0
  141. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/tutorials/single_npu.md +0 -0
  142. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/tutorials/single_npu_multimodal.md +0 -0
  143. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/user_guide/configuration/env_vars.md +0 -0
  144. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/user_guide/configuration/index.md +0 -0
  145. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/user_guide/feature_guide/graph_mode.md +0 -0
  146. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/user_guide/feature_guide/images/structured_output_1.png +0 -0
  147. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/user_guide/feature_guide/index.md +0 -0
  148. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/user_guide/feature_guide/lora.md +0 -0
  149. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/user_guide/feature_guide/quantization.md +0 -0
  150. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/user_guide/feature_guide/sleep_mode.md +0 -0
  151. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/user_guide/feature_guide/structured_output.md +0 -0
  152. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/user_guide/support_matrix/index.md +0 -0
  153. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/user_guide/support_matrix/supported_features.md +0 -0
  154. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/docs/source/user_guide/support_matrix/supported_models.md +0 -0
  155. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/examples/disaggregate_prefill_v1/gen_ranktable.py +0 -0
  156. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/examples/disaggregate_prefill_v1/gen_ranktable.sh +0 -0
  157. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/examples/disaggregate_prefill_v1/load_balance_proxy_server_example.py +0 -0
  158. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/examples/disaggregate_prefill_v1/run_server.sh +0 -0
  159. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/examples/disaggregated_prefill/disaggregated_prefill_offline.py +0 -0
  160. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/examples/disaggregated_prefill/dp_proxy.py +0 -0
  161. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/examples/disaggregated_prefill/find_device_ips.py +0 -0
  162. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/examples/disaggregated_prefill/p2p_disaggrefated_prefill_proxy.py +0 -0
  163. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/examples/disaggregated_prefill/run_decode_server.sh +0 -0
  164. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/examples/disaggregated_prefill/run_prefill_server.sh +0 -0
  165. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/examples/dp_offline/data_parallel.py +0 -0
  166. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/examples/dp_offline/run_dp.sh +0 -0
  167. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/examples/eplb_generate_map.py +0 -0
  168. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/examples/external_online_dp/README.md +0 -0
  169. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/examples/external_online_dp/launch_dp_program.py +0 -0
  170. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/examples/external_online_dp/run_dp_template.sh +0 -0
  171. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/examples/offline_disaggregated_prefill_npu.py +0 -0
  172. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/examples/offline_distributed_inference_npu.py +0 -0
  173. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/examples/offline_dualbatch_overlap_npu.py +0 -0
  174. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/examples/offline_inference_audio_language.py +0 -0
  175. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/examples/offline_inference_npu.py +0 -0
  176. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/examples/offline_inference_npu_v1.py +0 -0
  177. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/examples/offline_multi_step_custom_ops.py +0 -0
  178. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/examples/prompt_embedding_inference.py +0 -0
  179. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/examples/run_dp_attention_etp16_benmark.sh +0 -0
  180. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/examples/run_dp_server.sh +0 -0
  181. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/examples/run_dp_with_cached_graph_etp16.sh +0 -0
  182. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/format.sh +0 -0
  183. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/mypy.ini +0 -0
  184. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/packages.txt +0 -0
  185. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/pyproject.toml +0 -0
  186. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/pytest.ini +0 -0
  187. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/requirements-lint.txt +0 -0
  188. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/requirements.txt +0 -0
  189. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/setup.cfg +0 -0
  190. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/setup.py +0 -0
  191. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/__init__.py +0 -0
  192. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/conftest.py +0 -0
  193. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/e2e/common.sh +0 -0
  194. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/e2e/doctests/001-quickstart-test.sh +0 -0
  195. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/e2e/pd_disaggreate/run_edge_case_test.sh +0 -0
  196. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/e2e/pd_disaggreate/setup_pd.sh +0 -0
  197. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/e2e/pd_disaggreate/test_edge_cases.py +0 -0
  198. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/e2e/pd_disaggreate/test_pd_e2e.py +0 -0
  199. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/e2e/run_disagg_pd.sh +0 -0
  200. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/e2e/run_doctests.sh +0 -0
  201. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/long_term/spec_decode_v0/__init__.py +0 -0
  202. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/long_term/spec_decode_v0/conftest.py +0 -0
  203. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/long_term/spec_decode_v0/e2e/__init__.py +0 -0
  204. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/long_term/spec_decode_v0/e2e/conftest.py +0 -0
  205. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/long_term/spec_decode_v0/e2e/test_eagle_correctness.py +0 -0
  206. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/long_term/spec_decode_v0/e2e/test_medusa_correctness.py +0 -0
  207. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/long_term/spec_decode_v0/e2e/test_mlp_correctness.py +0 -0
  208. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/long_term/spec_decode_v0/e2e/test_mtp_correctness.py +0 -0
  209. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/long_term/spec_decode_v0/e2e/test_ngram_correctness.py +0 -0
  210. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/long_term/spec_decode_v0/test_dynamic_spec_decode.py +0 -0
  211. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/long_term/spec_decode_v0/test_multi_step_worker.py +0 -0
  212. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/long_term/spec_decode_v0/test_ngram_worker.py +0 -0
  213. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/long_term/spec_decode_v0/test_spec_decode_worker.py +0 -0
  214. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/long_term/spec_decode_v0/test_utils.py +0 -0
  215. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/long_term/spec_decode_v0/utils.py +0 -0
  216. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/long_term/spec_decode_v1/test_v1_spec_decode.py +0 -0
  217. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/long_term/test_accuracy.py +0 -0
  218. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/long_term/test_deepseek_v2_lite_tp2_accuracy.py +0 -0
  219. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/model_utils.py +0 -0
  220. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/multicard/test_data_parallel.py +0 -0
  221. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/multicard/test_dynamic_npugraph_batchsize.py +0 -0
  222. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/multicard/test_ilama_lora_tp2.py +0 -0
  223. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/multicard/test_model_qwen3_w4a8.py +0 -0
  224. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/multicard/test_multimodal_context_parallel.py +0 -0
  225. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/multicard/test_pyhccl_distributed.py +0 -0
  226. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/multicard/test_w4a8_deepseek.py +0 -0
  227. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/ops/test_vocabparallelembedding.py +0 -0
  228. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/singlecard/__init__.py +0 -0
  229. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/singlecard/compile/__init__.py +0 -0
  230. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/singlecard/compile/test_simple.py +0 -0
  231. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/singlecard/core/__init__.py +0 -0
  232. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/singlecard/core/test_ascend_scheduler.py +0 -0
  233. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/singlecard/ops/__init__.py +0 -0
  234. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/singlecard/ops/test_fused_moe.py +0 -0
  235. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/singlecard/ops/test_multi_step.py +0 -0
  236. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/singlecard/ops/test_rotary_embedding.py +0 -0
  237. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/singlecard/sample/__init__.py +0 -0
  238. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/singlecard/sample/test_rejection_sampler.py +0 -0
  239. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/singlecard/test_aclgraph.py +0 -0
  240. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/singlecard/test_ascend_config.py +0 -0
  241. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/singlecard/test_camem.py +0 -0
  242. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/singlecard/test_chunked.py +0 -0
  243. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/singlecard/test_guided_decoding.py +0 -0
  244. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/singlecard/test_ilama_lora.py +0 -0
  245. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/singlecard/test_offline_inference.py +0 -0
  246. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/singlecard/test_profile_execute_duration.py +0 -0
  247. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/singlecard/test_prompt_embedding.py +0 -0
  248. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/singlecard/test_pyhccl.py +0 -0
  249. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/singlecard/test_sampler.py +0 -0
  250. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/singlecard/test_scheduler.py +0 -0
  251. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/ut/kv_connector/test_llmdatadist_connector.py +0 -0
  252. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/ut/kv_connector/test_remote_decode_lifecycle.py +0 -0
  253. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/ut/kv_connector/test_remote_prefill_lifecycle.py +0 -0
  254. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/ut/kv_connector/utils.py +0 -0
  255. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/ut/ops/test_expert_load_balancer.py +0 -0
  256. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/ut/patch/worker/patch_common/test_patch_sampler.py +0 -0
  257. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/ut/test_distributed_tensor_parallel.py +0 -0
  258. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/ut/test_token_dispatcher.py +0 -0
  259. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tests/utils.py +0 -0
  260. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tools/actionlint.sh +0 -0
  261. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tools/check_repo.sh +0 -0
  262. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tools/mypy.sh +0 -0
  263. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tools/png-lint.sh +0 -0
  264. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tools/shellcheck.sh +0 -0
  265. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/tools/sphinx-lint.sh +0 -0
  266. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/__init__.py +0 -0
  267. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/ascend_forward_context.py +0 -0
  268. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/attention/__init__.py +0 -0
  269. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/compilation/__init__.py +0 -0
  270. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/compilation/piecewise_backend.py +0 -0
  271. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/core/__init__.py +0 -0
  272. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/cpu_binding.py +0 -0
  273. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/device_allocator/__init__.py +0 -0
  274. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/device_allocator/camem.py +0 -0
  275. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/distributed/__init__.py +0 -0
  276. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/distributed/communicator.py +0 -0
  277. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/distributed/context_parallel_utils.py +0 -0
  278. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/distributed/device_communicators/__init__.py +0 -0
  279. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/distributed/device_communicators/pyhccl.py +0 -0
  280. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/distributed/device_communicators/pyhccl_wrapper.py +0 -0
  281. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/distributed/kv_transfer/__init__.py +0 -0
  282. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/distributed/kv_transfer/simple_buffer.py +0 -0
  283. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/distributed/kv_transfer/simple_connector.py +0 -0
  284. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/distributed/kv_transfer/simple_pipe.py +0 -0
  285. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/distributed/kv_transfer/utils.py +0 -0
  286. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/distributed/llmdatadist_c_mgr_connector.py +0 -0
  287. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/distributed/llmdatadist_connector.py +0 -0
  288. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/distributed/tensor_parallel.py +0 -0
  289. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/envs.py +0 -0
  290. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/eplb/__init__.py +0 -0
  291. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/eplb/adaptor/__init__.py +0 -0
  292. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/eplb/adaptor/abstract_adaptor.py +0 -0
  293. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/eplb/adaptor/vllm_adaptor.py +0 -0
  294. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/eplb/core/__init__.py +0 -0
  295. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/eplb/core/eplb_device_transfer_loader.py +0 -0
  296. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/eplb/core/eplb_utils.py +0 -0
  297. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/eplb/core/eplb_worker.py +0 -0
  298. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/eplb/core/policy/__init__.py +0 -0
  299. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/eplb/core/policy/policy_abstract.py +0 -0
  300. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/eplb/core/policy/policy_dynamic_ep.py +0 -0
  301. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/eplb/core/policy/policy_dynamic_ep_v2.py +0 -0
  302. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/eplb/core/policy/policy_factory.py +0 -0
  303. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/eplb/core/policy/policy_random.py +0 -0
  304. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/eplb/eplb_updator.py +0 -0
  305. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/lora/__init__.py +0 -0
  306. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/lora/punica_wrapper/__init__.py +0 -0
  307. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/lora/punica_wrapper/punica_npu.py +0 -0
  308. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/models/__init__.py +0 -0
  309. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/models/deepseek_dbo.py +0 -0
  310. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/models/qwen2.py +0 -0
  311. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/models/qwen2_5_vl.py +0 -0
  312. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/models/qwen2_5_vl_without_padding.py +0 -0
  313. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/models/qwen2_vl.py +0 -0
  314. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/models/qwen3.py +0 -0
  315. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/models/qwen3_dbo.py +0 -0
  316. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/multistream/__init__.py +0 -0
  317. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/multistream/base.py +0 -0
  318. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/multistream/context.py +0 -0
  319. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/multistream/decorator.py +0 -0
  320. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/multistream/layers.py +0 -0
  321. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/multistream/metadata.py +0 -0
  322. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/ops/__init__.py +0 -0
  323. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/ops/activation.py +0 -0
  324. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/ops/cache.py +0 -0
  325. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/ops/comm_utils.py +0 -0
  326. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/ops/common_fused_moe.py +0 -0
  327. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/ops/expert_load_balancer.py +0 -0
  328. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/ops/fused_moe.py +0 -0
  329. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/ops/layernorm.py +0 -0
  330. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/ops/moe_dispatcher/__init__.py +0 -0
  331. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/ops/moe_dispatcher/token_dispatcher.py +0 -0
  332. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/ops/rotary_embedding.py +0 -0
  333. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/ops/sequence_parallel.py +0 -0
  334. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/ops/vocab_parallel_embedding.py +0 -0
  335. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/patch/__init__.py +0 -0
  336. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/patch/platform/__init__.py +0 -0
  337. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/patch/platform/patch_0_9_1/__init__.py +0 -0
  338. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/patch/platform/patch_0_9_1/patch_cache_manager.py +0 -0
  339. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/patch/platform/patch_0_9_1/patch_configs.py +0 -0
  340. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/patch/platform/patch_0_9_1/patch_core.py +0 -0
  341. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/patch/platform/patch_0_9_1/patch_core_client.py +0 -0
  342. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/patch/platform/patch_0_9_1/patch_decorator.py +0 -0
  343. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/patch/platform/patch_common/__init__.py +0 -0
  344. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/patch/platform/patch_common/patch_distributed.py +0 -0
  345. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/patch/platform/patch_main/__init__.py +0 -0
  346. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/patch/worker/__init__.py +0 -0
  347. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/patch/worker/patch_0_9_1/__init__.py +0 -0
  348. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/patch/worker/patch_common/__init__.py +0 -0
  349. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/patch/worker/patch_common/patch_distributed.py +0 -0
  350. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/patch/worker/patch_common/patch_eagle.py +0 -0
  351. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/patch/worker/patch_common/patch_minicpm.py +0 -0
  352. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py +0 -0
  353. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/patch/worker/patch_common/patch_sampler.py +0 -0
  354. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/patch/worker/patch_common/patch_spec_decode_worker.py +0 -0
  355. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/patch/worker/patch_common/patch_utils.py +0 -0
  356. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/patch/worker/patch_main/__init__.py +0 -0
  357. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/quantization/__init__.py +0 -0
  358. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/quantization/func_wrapper.py +0 -0
  359. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/quantization/quant_config.py +0 -0
  360. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/quantization/quantizer.py +0 -0
  361. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/quantization/w4a8_dynamic.py +0 -0
  362. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/quantization/w8a8.py +0 -0
  363. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/quantization/w8a8_dynamic.py +0 -0
  364. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/sample/__init__.py +0 -0
  365. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/sample/rejection_sampler.py +0 -0
  366. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/soc_info.py +0 -0
  367. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/utils.py +0 -0
  368. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/worker/__init__.py +0 -0
  369. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/worker/cache_engine.py +0 -0
  370. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/worker/draft_model_runner.py +0 -0
  371. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/worker/model_runner.py +0 -0
  372. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/worker/multi_step_runner.py +0 -0
  373. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/worker/multi_step_worker.py +0 -0
  374. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/worker/pooling_model_runner.py +0 -0
  375. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend/worker/worker.py +0 -0
  376. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend.egg-info/dependency_links.txt +0 -0
  377. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend.egg-info/entry_points.txt +0 -0
  378. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend.egg-info/requires.txt +0 -0
  379. {vllm_ascend-0.9.1rc2 → vllm_ascend-0.9.1rc3}/vllm_ascend.egg-info/top_level.txt +0 -0
@@ -53,7 +53,7 @@ RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
53
53
  python3 -m pip cache purge
54
54
 
55
55
  # Install modelscope (for fast download) and ray (for multinode)
56
- RUN python3 -m pip install modelscope ray && \
56
+ RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
57
57
  python3 -m pip cache purge
58
58
 
59
59
  CMD ["/bin/bash"]
@@ -50,7 +50,7 @@ RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
50
50
  python3 -m pip cache purge
51
51
 
52
52
  # Install modelscope (for fast download) and ray (for multinode)
53
- RUN python3 -m pip install modelscope ray && \
53
+ RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
54
54
  python3 -m pip cache purge
55
55
 
56
56
  CMD ["/bin/bash"]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vllm_ascend
3
- Version: 0.9.1rc2
3
+ Version: 0.9.1rc3
4
4
  Summary: vLLM Ascend backend plugin
5
5
  Home-page: https://github.com/vllm-project/vllm-ascend
6
6
  Author: vLLM-Ascend team
@@ -40,6 +40,10 @@ vLLM Ascend Plugin
40
40
 
41
41
  ---
42
42
  *Latest News* 🔥
43
+
44
+ - [2025/06] [User stories](https://vllm-ascend.readthedocs.io/en/latest/community/user_stories/index.html) page is now live! It kicks off with LLaMA-Factory/verl/TRL/GPUStack to demonstrate how vLLM Ascend assists Ascend users in enhancing their experience across fine-tuning, evaluation, reinforcement learning (RL), and deployment scenarios.
45
+ - [2025/06] [Contributors](https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html) page is now live! All contributions deserve to be recorded, thanks for all contributors.
46
+ - [2025/05] We've released first official version [v0.7.3](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.3)! We collaborated with the vLLM community to publish a blog post sharing our practice: [Introducing vLLM Hardware Plugin, Best Practice from Ascend NPU](https://blog.vllm.ai/2025/05/12/hardware-plugin.html).
43
47
  - [2025/03] We hosted the [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/VtxO9WXa5fC-mKqlxNUJUQ) with vLLM team! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF).
44
48
  - [2025/02] vLLM community officially created [vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-ascend) repo for running vLLM seamlessly on the Ascend NPU.
45
49
  - [2024/12] We are working with the vLLM community to support [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162).
@@ -64,10 +68,16 @@ By using vLLM Ascend plugin, popular open-source models, including Transformer-l
64
68
 
65
69
  ## Getting Started
66
70
 
67
- Please refer to [QuickStart](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more details.
71
+ Please use the following recommended versions to get started quickly:
72
+
73
+ | Version | Release type | Doc |
74
+ |------------|--------------|--------------------------------------|
75
+ |v0.9.2rc1|Latest release candidate|[QuickStart](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more details|
76
+ |v0.9.1rc3|Next stable release|[QuickStart](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/installation.html) for more details|
77
+ |v0.7.3.post1|Latest stable version|[QuickStart](https://vllm-ascend.readthedocs.io/en/stable/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/stable/installation.html) for more details|
68
78
 
69
79
  ## Contributing
70
- See [CONTRIBUTING](https://vllm-ascend.readthedocs.io/en/main/developer_guide/contributing.html) for more details, which is a step-by-step guide to help you set up development environment, build and test.
80
+ See [CONTRIBUTING](https://vllm-ascend.readthedocs.io/en/latest/developer_guide/contribution/index.html) for more details, which is a step-by-step guide to help you set up development environment, build and test.
71
81
 
72
82
  We welcome and value any contributions and collaborations:
73
83
  - Please let us know if you encounter a bug by [filing an issue](https://github.com/vllm-project/vllm-ascend/issues)
@@ -86,9 +96,10 @@ Below is maintained branches:
86
96
  |------------|--------------|--------------------------------------|
87
97
  | main | Maintained | CI commitment for vLLM main branch and vLLM 0.9.x branch |
88
98
  | v0.7.1-dev | Unmaintained | Only doc fixed is allowed |
89
- | v0.7.3-dev | Maintained | CI commitment for vLLM 0.7.3 version |
99
+ | v0.7.3-dev | Maintained | CI commitment for vLLM 0.7.3 version, only bug fix is allowed and no new release tag any more. |
100
+ | v0.9.1-dev | Maintained | CI commitment for vLLM 0.9.1 version |
90
101
 
91
- Please refer to [Versioning policy](https://vllm-ascend.readthedocs.io/en/main/developer_guide/versioning_policy.html) for more details.
102
+ Please refer to [Versioning policy](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html) for more details.
92
103
 
93
104
  ## Weekly Meeting
94
105
 
@@ -19,6 +19,10 @@ vLLM Ascend Plugin
19
19
 
20
20
  ---
21
21
  *Latest News* 🔥
22
+
23
+ - [2025/06] [User stories](https://vllm-ascend.readthedocs.io/en/latest/community/user_stories/index.html) page is now live! It kicks off with LLaMA-Factory/verl/TRL/GPUStack to demonstrate how vLLM Ascend assists Ascend users in enhancing their experience across fine-tuning, evaluation, reinforcement learning (RL), and deployment scenarios.
24
+ - [2025/06] [Contributors](https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html) page is now live! All contributions deserve to be recorded, thanks for all contributors.
25
+ - [2025/05] We've released first official version [v0.7.3](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.3)! We collaborated with the vLLM community to publish a blog post sharing our practice: [Introducing vLLM Hardware Plugin, Best Practice from Ascend NPU](https://blog.vllm.ai/2025/05/12/hardware-plugin.html).
22
26
  - [2025/03] We hosted the [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/VtxO9WXa5fC-mKqlxNUJUQ) with vLLM team! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF).
23
27
  - [2025/02] vLLM community officially created [vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-ascend) repo for running vLLM seamlessly on the Ascend NPU.
24
28
  - [2024/12] We are working with the vLLM community to support [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162).
@@ -43,10 +47,16 @@ By using vLLM Ascend plugin, popular open-source models, including Transformer-l
43
47
 
44
48
  ## Getting Started
45
49
 
46
- Please refer to [QuickStart](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more details.
50
+ Please use the following recommended versions to get started quickly:
51
+
52
+ | Version | Release type | Doc |
53
+ |------------|--------------|--------------------------------------|
54
+ |v0.9.2rc1|Latest release candidate|[QuickStart](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more details|
55
+ |v0.9.1rc3|Next stable release|[QuickStart](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/installation.html) for more details|
56
+ |v0.7.3.post1|Latest stable version|[QuickStart](https://vllm-ascend.readthedocs.io/en/stable/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/stable/installation.html) for more details|
47
57
 
48
58
  ## Contributing
49
- See [CONTRIBUTING](https://vllm-ascend.readthedocs.io/en/main/developer_guide/contributing.html) for more details, which is a step-by-step guide to help you set up development environment, build and test.
59
+ See [CONTRIBUTING](https://vllm-ascend.readthedocs.io/en/latest/developer_guide/contribution/index.html) for more details, which is a step-by-step guide to help you set up development environment, build and test.
50
60
 
51
61
  We welcome and value any contributions and collaborations:
52
62
  - Please let us know if you encounter a bug by [filing an issue](https://github.com/vllm-project/vllm-ascend/issues)
@@ -65,9 +75,10 @@ Below is maintained branches:
65
75
  |------------|--------------|--------------------------------------|
66
76
  | main | Maintained | CI commitment for vLLM main branch and vLLM 0.9.x branch |
67
77
  | v0.7.1-dev | Unmaintained | Only doc fixed is allowed |
68
- | v0.7.3-dev | Maintained | CI commitment for vLLM 0.7.3 version |
78
+ | v0.7.3-dev | Maintained | CI commitment for vLLM 0.7.3 version, only bug fix is allowed and no new release tag any more. |
79
+ | v0.9.1-dev | Maintained | CI commitment for vLLM 0.9.1 version |
69
80
 
70
- Please refer to [Versioning policy](https://vllm-ascend.readthedocs.io/en/main/developer_guide/versioning_policy.html) for more details.
81
+ Please refer to [Versioning policy](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html) for more details.
71
82
 
72
83
  ## Weekly Meeting
73
84
 
@@ -20,6 +20,9 @@ vLLM Ascend Plugin
20
20
  ---
21
21
  *最新消息* 🔥
22
22
 
23
+ - [2025/06] [用户案例](https://vllm-ascend.readthedocs.io/en/latest/community/user_stories/index.html)现已上线!展示了LLaMA-Factory/verl/TRL/GPUStack等用户案例,展示了vLLM Ascend如何帮助昇腾用户在模型微调、评估、强化学习 (RL) 以及部署等场景中提升体验。
24
+ - [2025/06] [贡献者](https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html)页面现已上线!所有的贡献都值得被记录,感谢所有的贡献者。
25
+ - [2025/05] 我们发布了首个正式版本 [v0.7.3](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.3)!我们与 vLLM 社区合作发布了一篇博客文章,分享了我们的实践:[Introducing vLLM Hardware Plugin, Best Practice from Ascend NPU](https://blog.vllm.ai/2025/05/12/hardware-plugin.html)。
23
26
  - [2025/03] 我们和vLLM团队举办了[vLLM Beijing Meetup](https://mp.weixin.qq.com/s/CGDuMoB301Uytnrkc2oyjg)! 你可以在[这里](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF)找到演讲材料.
24
27
  - [2025/02] vLLM社区正式创建了[vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-ascend)仓库,让vLLM可以无缝运行在Ascend NPU。
25
28
  - [2024/12] 我们正在与 vLLM 社区合作,以支持 [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162).
@@ -44,10 +47,16 @@ vLLM 昇腾插件 (`vllm-ascend`) 是一个由社区维护的让vLLM在Ascend NP
44
47
 
45
48
  ## 开始使用
46
49
 
47
- 请查看[快速开始](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html)和[安装指南](https://vllm-ascend.readthedocs.io/en/latest/installation.html)了解更多.
50
+ 推荐您使用以下版本快速开始使用:
51
+
52
+ | Version | Release type | Doc |
53
+ |------------|--------------|--------------------------------------|
54
+ |v0.9.2rc1| 最新RC版本 |请查看[快速开始](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html)和[安装指南](https://vllm-ascend.readthedocs.io/en/latest/installation.html)了解更多|
55
+ |v0.9.1rc3| 下一个正式/稳定版 |请查看[快速开始](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/quick_start.html)和[安装指南](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/installation.html)了解更多|
56
+ |v0.7.3.post1| 最新正式/稳定版本 |请查看[快速开始](https://vllm-ascend.readthedocs.io/en/stable/quick_start.html)和[安装指南](https://vllm-ascend.readthedocs.io/en/stable/installation.html)了解更多|
48
57
 
49
58
  ## 贡献
50
- 请参考 [CONTRIBUTING]((https://vllm-ascend.readthedocs.io/en/main/developer_guide/contributing.html)) 文档了解更多关于开发环境搭建、功能测试以及 PR 提交规范的信息。
59
+ 请参考 [CONTRIBUTING](https://vllm-ascend.readthedocs.io/en/latest/developer_guide/contribution/index.html) 文档了解更多关于开发环境搭建、功能测试以及 PR 提交规范的信息。
51
60
 
52
61
  我们欢迎并重视任何形式的贡献与合作:
53
62
  - 请通过[Issue](https://github.com/vllm-project/vllm-ascend/issues)来告知我们您遇到的任何Bug。
@@ -65,9 +74,10 @@ vllm-ascend有主干分支和开发分支。
65
74
  |------------|------------|---------------------|
66
75
  | main | Maintained | 基于vLLM main分支CI看护 |
67
76
  | v0.7.1-dev | Unmaintained | 只允许文档修复 |
68
- | v0.7.3-dev | Maintained | 基于vLLM v0.7.3版本CI看护 |
77
+ | v0.7.3-dev | Maintained | 基于vLLM v0.7.3版本CI看护, 只允许Bug修复,不会再发布新版本 |
78
+ | v0.9.1-dev | Maintained | 基于vLLM v0.9.1版本CI看护 |
69
79
 
70
- 请参阅[版本策略](https://vllm-ascend.readthedocs.io/en/main/developer_guide/versioning_policy.html)了解更多详细信息。
80
+ 请参阅[版本策略](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html)了解更多详细信息。
71
81
 
72
82
  ## 社区例会
73
83
 
@@ -15,11 +15,6 @@
15
15
  */
16
16
 
17
17
  #include "kernel_operator.h"
18
- #include "kernel_tpipe_impl.h"
19
- #include "kernel_tensor_impl.h"
20
- #include "kernel_type.h"
21
- #include "kernel_operator_intf.h"
22
- #include "inner_interface/inner_kernel_operator_intf.h"
23
18
  #include <stdio.h>
24
19
  #include "types.h"
25
20
  #include "utils.h"
@@ -23,6 +23,8 @@ Following is the Release Compatibility Matrix for vLLM Ascend Plugin:
23
23
  | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | MindIE Turbo |
24
24
  |-------------|--------------|------------------|-------------|--------------------|--------------|
25
25
  | v0.9.2rc1 | v0.9.2 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1.post1.dev20250619 | |
26
+ | v0.9.1rc3 | v0.9.1 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1.post1 | |
27
+ | v0.9.1rc2 | v0.9.1 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1.post1| |
26
28
  | v0.9.1rc1 | v0.9.1 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1.post1.dev20250528 | |
27
29
  | v0.9.0rc2 | v0.9.0 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | |
28
30
  | v0.9.0rc1 | v0.9.0 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | |
@@ -37,6 +39,8 @@ Following is the Release Compatibility Matrix for vLLM Ascend Plugin:
37
39
 
38
40
  | Date | Event |
39
41
  |------------|-------------------------------------------|
42
+ | 2025.08.22 | Release candidates, v0.9.1rc3 |
43
+ | 2025.08.06 | Release candidates, v0.9.1rc2 |
40
44
  | 2025.07.11 | Release candidates, v0.9.2rc1 |
41
45
  | 2025.06.22 | Release candidates, v0.9.1rc1 |
42
46
  | 2025.06.10 | Release candidates, v0.9.0rc2 |
@@ -65,15 +65,15 @@ myst_substitutions = {
65
65
  # the branch of vllm, used in vllm clone
66
66
  # - main branch: 'main'
67
67
  # - vX.Y.Z branch: 'vX.Y.Z'
68
- 'vllm_version': 'v0.9.0',
68
+ 'vllm_version': 'v0.9.1',
69
69
  # the branch of vllm-ascend, used in vllm-ascend clone and image tag
70
70
  # - main branch: 'main'
71
71
  # - vX.Y.Z branch: latest vllm-ascend release tag
72
- 'vllm_ascend_version': 'v0.9.0rc2',
72
+ 'vllm_ascend_version': 'v0.9.1rc3',
73
73
  # the newest release version of vllm-ascend and matched vLLM, used in pip install.
74
74
  # This value should be updated when cut down release.
75
- 'pip_vllm_ascend_version': "0.9.0rc2",
76
- 'pip_vllm_version': "0.9.0",
75
+ 'pip_vllm_ascend_version': "0.9.1rc3",
76
+ 'pip_vllm_version': "0.9.1",
77
77
  # CANN image tag
78
78
  'cann_image_tag': "8.2.rc1-910b-ubuntu22.04-py3.11",
79
79
  }
@@ -0,0 +1,253 @@
1
+ # Distributed DP Server With Large EP (DeepSeek)
2
+
3
+ ## Getting Start
4
+
5
+ vLLM-Ascend now supports prefill-decode (PD) disaggregation in the large **Expert Parallelism (EP)** scenario. To achieve better performance, the distributed DP server is applied in vLLM-Ascend. In the PD separation scenario, different optimization strategies can be implemented based on the distinct characteristics of PD nodes, thereby enabling more flexible model deployment.
6
+
7
+ ## Verify Multi-Node Communication Environment
8
+
9
+ ### Physical Layer Requirements:
10
+
11
+ - The physical machines must be located on the same WLAN, with network connectivity.
12
+ - All NPUs are connected with optical modules, and the connection status must be normal.
13
+
14
+ ### Verification Process:
15
+
16
+ Execute the following commands on each node in sequence. The results must all be `success` and the status must be `UP`:
17
+
18
+ ```bash
19
+ # Check the remote switch ports
20
+ for i in {0..15}; do hccn_tool -i $i -lldp -g | grep Ifname; done
21
+ # Get the link status of the Ethernet ports (UP or DOWN)
22
+ for i in {0..15}; do hccn_tool -i $i -link -g ; done
23
+ # Check the network health status
24
+ for i in {0..15}; do hccn_tool -i $i -net_health -g ; done
25
+ # View the network detected IP configuration
26
+ for i in {0..15}; do hccn_tool -i $i -netdetect -g ; done
27
+ # View gateway configuration
28
+ for i in {0..15}; do hccn_tool -i $i -gateway -g ; done
29
+ # View NPU network configuration
30
+ cat /etc/hccn.conf
31
+ ```
32
+
33
+ ### NPU Interconnect Verification:
34
+
35
+ #### 1. Get NPU IP Addresses
36
+
37
+ ```bash
38
+ for i in {0..15}; do hccn_tool -i $i -vnic -g;done
39
+ ```
40
+
41
+ #### 2. Get superpodid and SDID
42
+
43
+ ```bash
44
+ for i in {0..7}; do npu-smi info -t spod-info -i $i -c 0;npu-smi info -t spod-info -i $i -c 1;done
45
+ ```
46
+
47
+ #### 3. Cross-Node PING Test
48
+
49
+ ```bash
50
+ # Execute on the target node (replace with actual IP)
51
+ for i in {0..15}; do hccn_tool -i $i -hccs_ping -g address x.x.x.x;done
52
+ ```
53
+
54
+ ## Generate Ranktable
55
+
56
+ You need to generate a ranktable to enable multiple nodes to communicate with each other. For more details please refer to the [vllm-ascend examples](https://github.com/vllm-project/vllm-ascend/blob/v0.9.1-dev/examples/disaggregate_prefill_v1/README.md). Execute the following commands for reference.
57
+
58
+ ```shell
59
+ cd vllm-ascend/examples/disaggregate_prefill_v1/
60
+ bash gen_ranktable.sh --ips prefiller_node1_local_ip prefiller_node2_local_ip decoder_node1_local_ip decoder_node2_local_ip \
61
+ --npus-per-node npu_clips --network-card-name nic_name --prefill-device-cnt prefiller_npu_clips --decode-device-cnt decode_npu_clips
62
+ ```
63
+
64
+ |Parameter | meaning |
65
+ | --- | --- |
66
+ | --ips | Each node's local ip (prefiller nodes should be front of decoder nodes) |
67
+ | --npus-per-node | Each node's npu clips |
68
+ | --network-card-name | The physical machines' NIC |
69
+ |--prefill-device-cnt | Npu clips used for prefill |
70
+ |--decode-device-cnt |Npu clips used for decode |
71
+
72
+ ## Use the Distributed DP Server
73
+
74
+ Execute the following commands to use the distributed DP server. (We recommend using this feature on the v0.9.1-dev branch)
75
+
76
+ ```python
77
+ import multiprocessing
78
+ import os
79
+ import sys
80
+ dp_size = "total number of DP workers for decode/prefill"
81
+ dp_size_local = "number of DP workers on the current node"
82
+ dp_rank_start = "starting DP rank for the current node"
83
+ dp_ip = "master node ip"
84
+ dp_port = "port used for communication"
85
+ engine_port = "the starting port for all DP groups on the current node"
86
+ template_path = "./run_dp_template.sh"
87
+ if not os.path.exists(template_path):
88
+ print(f"Template file {template_path} does not exist.")
89
+ sys.exit(1)
90
+ def run_command(dp_rank_local, dp_rank, engine_port_):
91
+ command = f"bash ./run_dp_template.sh {dp_size} {dp_ip} {dp_port} {dp_rank_local} {dp_rank} {engine_port_} {dp_size_local}"
92
+ os.system(command)
93
+ processes = []
94
+ for i in range(dp_size_local):
95
+ dp_rank = dp_rank_start + i
96
+ dp_rank_local = i
97
+ engine_port_ = engine_port + i
98
+ process = multiprocessing.Process(target=run_command, args=(dp_rank_local, dp_rank, engine_port_))
99
+ processes.append(process)
100
+ process.start()
101
+ for process in processes:
102
+ process.join()
103
+ ```
104
+
105
+ Note that the prefiller nodes and the decoder nodes may have different configurations. You can use the following shell script for configuring the prefiller and decoder nodes respectively.
106
+
107
+ ```shell
108
+ # run_dp_template.sh
109
+ #!/bin/sh
110
+
111
+ # this obtained through ifconfig
112
+ # nic_name is the network interface name corresponding to local_ip
113
+ nic_name="xxxx"
114
+ local_ip="xxxx"
115
+
116
+ # basic configuration for HCCL and connection
117
+ export HCCL_IF_IP=$local_ip
118
+ export GLOO_SOCKET_IFNAME=$nic_name
119
+ export TP_SOCKET_IFNAME=$nic_name
120
+ export HCCL_SOCKET_IFNAME=$nic_name
121
+ export OMP_PROC_BIND=false
122
+ export OMP_NUM_THREADS=10
123
+ export HCCL_BUFFSIZE=256
124
+ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH='ranktable you generate'
125
+
126
+ # obtain parameters from distributed DP server
127
+ export VLLM_DP_SIZE=$1
128
+ export VLLM_DP_MASTER_IP=$2
129
+ export VLLM_DP_MASTER_PORT=$3
130
+ export VLLM_DP_RANK_LOCAL=$4
131
+ export VLLM_DP_RANK=$5
132
+ export VLLM_DP_SIZE_LOCAL=$7
133
+
134
+ #pytorch_npu settings and vllm settings
135
+ export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
136
+ export TASK_QUEUE_ENABLE=1
137
+ export VLLM_USE_V1=1
138
+
139
+ # enable the distributed DP server
140
+ export VLLM_WORKER_MULTIPROC_METHOD="fork"
141
+ export VLLM_ASCEND_EXTERNAL_DP_LB_ENABLED=1
142
+
143
+ # The w8a8 weight can obtained from https://www.modelscope.cn/models/vllm-ascend/DeepSeek-R1-W8A8
144
+ # "--additional-config" is used to enable characteristics from vllm-ascend
145
+ vllm serve /root/.cache/ds_r1 \
146
+ --host 0.0.0.0 \
147
+ --port $6 \
148
+ --tensor-parallel-size 8 \
149
+ --enable-expert-parallel \
150
+ --seed 1024 \
151
+ --served-model-name deepseek_r1 \
152
+ --max-model-len 17000 \
153
+ --max-num-batched-tokens 16384 \
154
+ --trust-remote-code \
155
+ --max-num-seqs 4 \
156
+ --gpu-memory-utilization 0.9 \
157
+ --quantization ascend \
158
+ --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
159
+ --kv-transfer-config \
160
+ '{"kv_connector": "LLMDataDistCMgrConnector",
161
+ "kv_buffer_device": "npu",
162
+ "kv_role": "kv_consumer",
163
+ "kv_parallel_size": "1",
164
+ "kv_port": "20001",
165
+ "engine_id": "0",
166
+ "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
167
+ }' \
168
+ --additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}'
169
+ ```
170
+
171
+ In the PD separation scenario, we provide a recommended optimized configuration.
172
+
173
+ - **prefiller node**
174
+
175
+ 1. set HCCL_BUFFSIZE=256
176
+ 2. add '--enforce-eager' command to 'vllm serve'
177
+ 3. Take '--additional-config' as follow
178
+
179
+ ```shell
180
+ --additional-config '{"ascend_scheduler_config":{"enabled":false}, "torchair_graph_config":{"enabled":false},"enable_weight_nz_layout":true,"enable_prefill_optimizations":true}'
181
+ ```
182
+
183
+ - **decoder node**
184
+
185
+ 1. set HCCL_BUFFSIZE=1024
186
+ 2. Take '--additional-config' as follow
187
+
188
+ ```shell
189
+ --additional-config '{"ascend_scheduler_config":{"enabled":false}, "torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"enable_multistream_moe":true,"graph_batch_sizes":[28], "enable_super_kernel":true, "use_cached_graph":true},"enable_weight_nz_layout":true}'
190
+ ```
191
+
192
+ <br>
193
+
194
+ '--additional-config' Parameter Introduction:
195
+
196
+ - **"torchair_graph_config":** The config options for torchair graph mode.
197
+ - **"ascend_scheduler_config":** The config options for ascend scheduler.
198
+ - **"enable_weight_nz_layout":** Whether to convert quantized weights to NZ format to accelerate matrix multiplication.
199
+ - **"enable_prefill_optimizations":** Whether to enable DeepSeek models' prefill optimizations.
200
+ <br>
201
+
202
+ "torchair_graph_config" Parameter Introduction:
203
+
204
+ - **"enable_multistream_mla":** Whether to put vector ops of MLA to another stream. This option only takes effects on models using MLA.
205
+ - **"enable_multistream_moe":** Whether to enable multistream shared expert. This option only takes effects on DeepSeek moe models.
206
+ - **"graph_batch_sizes":** The batch size for torchair graph cache.
207
+ - **"enable_super_kernel":** Whether to enable super kernel.
208
+ - **"use_cached_graph":** Whether to use the cached graph.
209
+
210
+ ## Toy proxy for Distributed DP Server
211
+
212
+ In the PD separation scenario, we need a proxy to distribute requests. Execute the following commands to enable the toy proxy:
213
+
214
+ ```shell
215
+ python load_balance_proxy_server_example.py \
216
+ --port "proxy port" \
217
+ --host 0.0.0.0 \
218
+ --prefiller-hosts \
219
+ prefiller node1 local ip \
220
+ prefiller node2 local ip \
221
+ --prefiller-ports \
222
+ engine_port engine_port \
223
+ --decoder-hosts \
224
+ decoder node1 local ip \
225
+ decoder node1 local ip \
226
+ decoder node2 local ip \
227
+ decoder node2 local ip \
228
+ --decoder-ports \
229
+ engine_port ... \ # Increase by dp_size_local e.g. 9000 9001
230
+ engine_port ... \ # Increase by dp_size_local e.g. 9000 9001
231
+ ```
232
+
233
+ :::{note}
234
+ Each node's local IP should be repeated the same number of times as its '**dp_size_local**'. Likewise, each node uses as many ports as its '**dp_size_local**', and these ports increase sequentially starting from '**engine_port**'.
235
+ :::
236
+
237
+ You can get the proxy program in the repository's examples, [load\_balance\_proxy\_server\_example.py](https://github.com/vllm-project/vllm-ascend/blob/v0.9.1-dev/examples/disaggregate_prefill_v1/load_balance_proxy_server_example.py)
238
+
239
+ ## Recommended Configuration
240
+
241
+ For example, if the average input length is 3.5k, the output length is 1.1k, the context length is 16k, and the max length of the input dataset is 7k, we give the following recommended configuration for a distributed DP server with large EP. Here we use 4 nodes for prefill and 4 nodes for decode.
242
+ <br>
243
+
244
+ | node | DP | TP | EP | max-model-len | max-num-batched-tokens | max-num-seqs | gpu-memory-utilization |
245
+ |----------|----|----|----|---------------|------------------------|--------------|-----------|
246
+ | prefill | 2 | 8 | 16 | 17000 | 16384 | 4 | 0.9 |
247
+ | decode | 64 | 1 | 64 | 17000 | 256 | 28 | 0.9 |
248
+
249
+ ## FAQ
250
+
251
+ ### 1. Prefiller nodes need to warmup
252
+
253
+ Since the computation of some NPU operators requires several rounds of warm-up to achieve best performance, we recommend preheating the service with some requests before conducting performance tests to achieve the best end-to-end throughput.
@@ -6,4 +6,5 @@
6
6
  performance_benchmark
7
7
  profile_execute_duration
8
8
  optimization_and_tuning
9
+ distributed_dp_server_with_large_ep
9
10
  :::
@@ -3,7 +3,7 @@
3
3
  ## Version Specific FAQs
4
4
 
5
5
  - [[v0.7.3.post1] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/1007)
6
- - [[v0.9.0rc2] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/1115)
6
+ - [[v0.9.1rc3] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/2410)
7
7
 
8
8
  ## General FAQs
9
9
 
@@ -158,12 +158,29 @@ for output in outputs:
158
158
  2. Set the following environment variables:
159
159
 
160
160
  ```bash
161
- export LCCL_DETERMINISTIC = 1
162
- export HCCL_DETERMINISTIC = 1
163
- export ATB_MATMUL_SHUFFLE_K_ENABLE = 0
164
- export ATB_LLM_LCOC_ENABLE = 0
161
+ export LCCL_DETERMINISTIC=1
162
+ export HCCL_DETERMINISTIC=true
163
+ export ATB_MATMUL_SHUFFLE_K_ENABLE=0
164
+ export ATB_LLM_LCOC_ENABLE=0
165
165
  ```
166
166
 
167
167
  ### 19. How to fix the error "ImportError: Please install vllm[audio] for audio support" for Qwen2.5-Omni model?
168
168
  The `Qwen2.5-Omni` model requires the `librosa` package to be installed, you need to install the `qwen-omni-utils` package to ensure all dependencies are met `pip install qwen-omni-utils`,
169
169
  this package will install `librosa` and its related dependencies, resolving the `ImportError: No module named 'librosa'` issue and ensuring audio processing functionality works correctly.
170
+
171
+ ### 20. Failed to run with `ray` distributed backend?
172
+ You might face the following errors when running with the ray backend in distributed scenarios:
173
+
174
+ ```
175
+ TypeError: can't convert npu:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.
176
+ ```
177
+
178
+ ```
179
+ AttributeError: 'str' object has no attribute 'DESCRIPTOR' when packaging message to dict
180
+ ```
181
+
182
+ This has been fixed in `ray>=2.47.1`, so you can resolve it as follows:
183
+
184
+ ```
185
+ python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0'
186
+ ```
@@ -30,7 +30,6 @@ The following table lists the additional configuration options available in vLLM
30
30
  | `ascend_scheduler_config` | dict | `{}` | The config options for ascend scheduler |
31
31
  | `refresh` | bool | `false` | Whether to refresh global ascend config content. This value is usually used by rlhf or ut/e2e test case. |
32
32
  | `expert_map_path` | str | `None` | When using expert load balancing for the MOE model, an expert map path needs to be passed in. |
33
- | `chunked_prefill_for_mla` | bool | `False` | Whether to enable the fused operator-like chunked_prefill. |
34
33
  | `kv_cache_dtype` | str | `None` | When using the kv cache quantization method, kv cache dtype needs to be set, currently only int8 is supported. |
35
34
 
36
35
  The details of each config option are as follows: