sendnn-inference 2.2.0__tar.gz → 2.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207) hide show
  1. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/workflows/test.yml +10 -1
  2. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/PKG-INFO +2 -2
  3. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/user_guide/configuration.md +9 -0
  4. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/examples/offline_inference/long_context.py +15 -2
  5. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/examples/offline_inference/text_inference.py +14 -1
  6. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/pyproject.toml +3 -3
  7. sendnn_inference-2.2.2/sendnn_inference/_version.py +34 -0
  8. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/argparse_utils.py +1 -1
  9. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/config/model_configs.yaml +31 -3
  10. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/envs.py +10 -0
  11. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/model_executor/model_loader/spyre.py +26 -10
  12. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/platform.py +16 -3
  13. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/v1/core/scheduler.py +43 -33
  14. sendnn_inference-2.2.2/sendnn_inference/v1/worker/mm_shared_memory.py +130 -0
  15. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/v1/worker/spyre_model_runner.py +160 -30
  16. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/v1/worker/spyre_worker.py +29 -3
  17. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference.egg-info/PKG-INFO +2 -2
  18. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference.egg-info/SOURCES.txt +3 -0
  19. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference.egg-info/requires.txt +1 -1
  20. sendnn_inference-2.2.2/tests/e2e/test_load_format_dummy.py +39 -0
  21. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_basic.py +3 -0
  22. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_pc_scheduler_steps.py +27 -27
  23. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/llm_cache.py +5 -0
  24. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/utils/test_upstream_compatibility.py +17 -0
  25. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/v1/core/test_scheduler_structured_outputs.py +6 -0
  26. sendnn_inference-2.2.2/tests/v1/worker/test_mm_shared_memory.py +328 -0
  27. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/uv.lock +16 -118
  28. sendnn_inference-2.2.0/sendnn_inference/_version.py +0 -24
  29. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.bob/skills/update-vllm/SKILL.md +0 -0
  30. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.claude/skills/update-vllm/SKILL.md +0 -0
  31. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/CODEOWNERS +0 -0
  32. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/ISSUE_TEMPLATE/bug-report.yml +0 -0
  33. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  34. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/ISSUE_TEMPLATE/feature-request.yml +0 -0
  35. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/ISSUE_TEMPLATE/rfc.yml +0 -0
  36. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/actions/free-up-disk-space/action.yml +0 -0
  37. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/ci_model_cache.yaml +0 -0
  38. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/pull_request_template.md +0 -0
  39. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/workflows/add_label_automerge.yml +0 -0
  40. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/workflows/build_and_publish.yaml +0 -0
  41. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/workflows/build_docker.yml +0 -0
  42. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/workflows/check_uv_lock.yml +0 -0
  43. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/workflows/lint_scripts.yml +0 -0
  44. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/workflows/matchers/ruff.json +0 -0
  45. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/workflows/model_smoke.yml +0 -0
  46. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/workflows/pre-commit.yml +0 -0
  47. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/workflows/publish_to_test_pypi.yaml +0 -0
  48. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/workflows/reminder_comment.yml +0 -0
  49. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/workflows/stale.yml +0 -0
  50. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.gitignore +0 -0
  51. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.pre-commit-config.yaml +0 -0
  52. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.readthedocs.yaml +0 -0
  53. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.shellcheckrc +0 -0
  54. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.yapfignore +0 -0
  55. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/CLAUDE.md +0 -0
  56. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/CODE_OF_CONDUCT.md +0 -0
  57. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/CONTRIBUTING.md +0 -0
  58. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/DCO +0 -0
  59. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/LICENSE +0 -0
  60. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/README.md +0 -0
  61. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/RELEASING.md +0 -0
  62. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/_local_envs_for_test.sh +0 -0
  63. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docker/.senlib.json +0 -0
  64. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docker/Dockerfile.amd64 +0 -0
  65. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docker/simple_vllm_serve.sh +0 -0
  66. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/.nav.yml +0 -0
  67. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/README.md +0 -0
  68. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/contributing/README.md +0 -0
  69. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/contributing/architecture.md +0 -0
  70. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/contributing/images/vllm_v1.svg +0 -0
  71. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/contributing/images/vllm_v1_spyre.svg +0 -0
  72. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/contributing/maintaining.md +0 -0
  73. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/contributing/multimodal/adding_new_models.md +0 -0
  74. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/contributing/scheduler.md +0 -0
  75. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/contributing/vllm-update-procedure.md +0 -0
  76. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/deploying/docker.md +0 -0
  77. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/deploying/k8s.md +0 -0
  78. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/deploying/rhoai.md +0 -0
  79. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/getting_started/installation.md +0 -0
  80. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/data/generate_example.py +0 -0
  81. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/data/prefill_single_chunks.json +0 -0
  82. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/data/prefill_three_chunks.json +0 -0
  83. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/data/prefix_caching_1.json +0 -0
  84. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/data/prefix_caching_2.json +0 -0
  85. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/data/prefix_caching_3.json +0 -0
  86. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/data/scheduling_admission_constraints.json +0 -0
  87. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/data/scheduling_padding_tkv_jump.json +0 -0
  88. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/data/timeline_admission_constraints.json +0 -0
  89. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/hooks/generate_examples.py +0 -0
  90. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/hooks/generate_model_tables.py +0 -0
  91. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/hooks/generate_prefill_only_plots.py +0 -0
  92. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/hooks/generate_prefix_caching_plots.py +0 -0
  93. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/hooks/generate_scheduling_plots.py +0 -0
  94. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/hooks/generate_timeline.py +0 -0
  95. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/hooks/url_schemes.py +0 -0
  96. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/overrides/main.html +0 -0
  97. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/requirements-docs.txt +0 -0
  98. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/roadmaps/q3-2025.md +0 -0
  99. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/user_guide/env_vars.md +0 -0
  100. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/user_guide/performance.md +0 -0
  101. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/user_guide/supported_features.md +0 -0
  102. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/user_guide/supported_models.md +0 -0
  103. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/examples/offline_inference/vision_inference.py +0 -0
  104. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/examples/online_inference/openai_spyre_text.py +0 -0
  105. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/examples/online_inference/openai_spyre_vision.py +0 -0
  106. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/examples/online_inference/spyre_vllm_benchmark.py +0 -0
  107. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/examples/online_inference/spyre_vllm_setup_container.sh +0 -0
  108. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/format.sh +0 -0
  109. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/mkdocs.yaml +0 -0
  110. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/__init__.py +0 -0
  111. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/compat_utils.py +0 -0
  112. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/compilation_utils.py +0 -0
  113. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/config/README.md +0 -0
  114. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/config/__init__.py +0 -0
  115. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/config/configurators/__init__.py +0 -0
  116. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/config/configurators/model_configurator.py +0 -0
  117. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/config/model_config.py +0 -0
  118. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/config/model_matcher.py +0 -0
  119. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/config/model_registry.py +0 -0
  120. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/model_executor/__init__.py +0 -0
  121. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/model_executor/model_loader/__init__.py +0 -0
  122. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/model_executor/model_loader/spyre_setup.py +0 -0
  123. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/multimodal/__init__.py +0 -0
  124. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/multimodal/mm_mappings/__init__.py +0 -0
  125. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/multimodal/mm_mappings/base.py +0 -0
  126. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/multimodal/mm_mappings/llava_next.py +0 -0
  127. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/multimodal/mm_mappings/mistral3.py +0 -0
  128. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/perf_metrics.py +0 -0
  129. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/utils.py +0 -0
  130. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/v1/__init__.py +0 -0
  131. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/v1/core/__init__.py +0 -0
  132. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/v1/metrics/__init__.py +0 -0
  133. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/v1/metrics/stats_logger.py +0 -0
  134. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/v1/sample/golden_token_injector.py +0 -0
  135. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/v1/sample/spyre_logits_processor.py +0 -0
  136. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/v1/worker/__init__.py +0 -0
  137. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/v1/worker/spyre_input_batch.py +0 -0
  138. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference.egg-info/dependency_links.txt +0 -0
  139. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference.egg-info/entry_points.txt +0 -0
  140. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference.egg-info/top_level.txt +0 -0
  141. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/setup.cfg +0 -0
  142. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/aftu/graph_compare_utils.py +0 -0
  143. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/aftu/test_compare_graphs.py +0 -0
  144. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/config/__init__.py +0 -0
  145. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/config/conftest.py +0 -0
  146. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/config/fixtures/test_error_handling_models.yaml +0 -0
  147. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/config/test_env_config_path.py +0 -0
  148. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/config/test_error_handling.py +0 -0
  149. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/config/test_integration.py +0 -0
  150. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/config/test_model_config.py +0 -0
  151. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/config/test_model_configurator.py +0 -0
  152. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/config/test_model_matcher.py +0 -0
  153. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/config/test_model_registry.py +0 -0
  154. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/conftest.py +0 -0
  155. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/download_model_configs.py +0 -0
  156. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_chunked_prefill.py +0 -0
  157. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_chunked_prefill_tkv_steps.py +0 -0
  158. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_logits_processors.py +0 -0
  159. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_model_smoke.py +0 -0
  160. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_profiler.py +0 -0
  161. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_sampling_params.py +0 -0
  162. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_async_llm.py +0 -0
  163. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_cp_scheduler_steps.py +0 -0
  164. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_embeddings.py +0 -0
  165. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_mm.py +0 -0
  166. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_online.py +0 -0
  167. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_scoring.py +0 -0
  168. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_seed.py +0 -0
  169. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_stagger_basic.py +0 -0
  170. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_stats_logger.py +0 -0
  171. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_structured_outputs.py +0 -0
  172. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/BAAI/bge-reranker-large/config.json +0 -0
  173. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/BAAI/bge-reranker-v2-m3/config.json +0 -0
  174. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/ibm-ai-platform/micro-g3.3-8b-instruct-1b/config.json +0 -0
  175. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/ibm-granite/granite-3.3-8b-instruct/config.json +0 -0
  176. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/ibm-granite/granite-3.3-8b-instruct-FP8/config.json +0 -0
  177. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/ibm-granite/granite-4-8b-dense/config.json +0 -0
  178. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/ibm-granite/granite-4-8b-dense-hybrid/config.json +0 -0
  179. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/ibm-granite/granite-embedding-125m-english/config.json +0 -0
  180. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/ibm-granite/granite-embedding-278m-multilingual/config.json +0 -0
  181. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/intfloat/multilingual-e5-large/config.json +0 -0
  182. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/sentence-transformers/all-roberta-large-v1/config.json +0 -0
  183. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/hf_cache.json +0 -0
  184. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/hf_result_cache.py +0 -0
  185. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/llm_cache_util.py +0 -0
  186. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/model_loader/test_spyre.py +0 -0
  187. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/multimodal/test_llava_next.py +0 -0
  188. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/multimodal/test_mistral3.py +0 -0
  189. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/output_util.py +0 -0
  190. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/precompilation/test_disable_compilation.py +0 -0
  191. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/scheduling_utils.py +0 -0
  192. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/spyre_util.py +0 -0
  193. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/utils/bge_copy/config.json +0 -0
  194. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/utils/test_cli_args.py +0 -0
  195. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/utils/test_envs.py +0 -0
  196. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/utils/test_golden_token_injector.py +0 -0
  197. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/utils/test_platform_validation.py +0 -0
  198. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/utils/test_spyre_model_list.py +0 -0
  199. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/v1/worker/mock_model.py +0 -0
  200. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/v1/worker/test_prefix_caching_worker.py +0 -0
  201. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/v1/worker/test_scheduler_tkv_limits.py +0 -0
  202. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/v1/worker/test_spyre_input_batch.py +0 -0
  203. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/v1/worker/test_spyre_worker_profile.py +0 -0
  204. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tools/check_aiu.sh +0 -0
  205. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tools/check_repo.sh +0 -0
  206. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tools/download_model.py +0 -0
  207. {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tools/lint_scripts.sh +0 -0
@@ -27,7 +27,7 @@ concurrency:
27
27
 
28
28
  jobs:
29
29
  test:
30
- timeout-minutes: 20
30
+ timeout-minutes: 25
31
31
  runs-on: ${{ matrix.os }}
32
32
  strategy:
33
33
  fail-fast: false
@@ -108,6 +108,15 @@ jobs:
108
108
  flags: "--timeout=300"
109
109
  os: "ubuntu-latest"
110
110
  python_version: "3.12"
111
+ - vllm_version:
112
+ name: "vLLM:0.22.0"
113
+ repo: "git+https://github.com/vllm-project/vllm --tag v0.22.0"
114
+ test_suite:
115
+ name: "backward compat"
116
+ markers: "compat or (cpu and basic and not quantized)"
117
+ flags: "--timeout=300"
118
+ os: "ubuntu-latest"
119
+ python_version: "3.12"
111
120
 
112
121
 
113
122
  # Only run vllm:main jobs on PRs with `vllm:main` label
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sendnn-inference
3
- Version: 2.2.0
3
+ Version: 2.2.2
4
4
  Summary: vLLM plugin for Spyre hardware support
5
5
  License: Apache 2
6
6
  Requires-Python: >=3.11
@@ -8,7 +8,7 @@ Description-Content-Type: text/markdown
8
8
  License-File: LICENSE
9
9
  Requires-Dist: fms-model-optimizer[fp8-infer]<0.9,>=0.8.3
10
10
  Requires-Dist: ibm-fms<2,>=1.11.1
11
- Requires-Dist: vllm<0.22.1,>=0.19.1
11
+ Requires-Dist: vllm<0.23.1,>=0.19.1
12
12
  Requires-Dist: torch
13
13
  Requires-Dist: torchvision
14
14
  Dynamic: license-file
@@ -121,6 +121,15 @@ Prefix caching mirrors upstream vLLM, though the requirement for fixed-size pref
121
121
 
122
122
  When prefix caching is enabled, the `vllm:prefix_cache_queries` and `vllm:prefix_cache_hits` metrics correctly report prefix cache stats in tokens.
123
123
 
124
+ ### Multimodal Models
125
+
126
+ For multimodal models, vision encoding is offloaded to the CPU. To avoid expensive duplication of vision encoding, prefill for multimodal models works slightly differently from prefill for text-only models. Vision encoding is done once per request instead of once per worker, so the threading configuration for multimodal models is also slightly different to improve performance.
127
+
128
+ Text-only models set the number of threads per worker by dividing the number of available CPUs by the number of workers.
129
+ Multimodal models currently set the number of threads to the total number of available CPUs, ignoring the number of workers. This may change in the future.
130
+
131
+ The maximum number of available CPUs can also be set using `SENDNN_INFERENCE_NUM_CPUS`.
132
+
124
133
  ## Pooling Models
125
134
 
126
135
  For the embedding, scoring, and reranking tasks, vLLM supports running Pooling Models. More information on Pooling Models can be found in the [vLLM official documentation](https://docs.vllm.ai/en/latest/models/pooling_models/).
@@ -45,6 +45,18 @@ if __name__ == "__main__":
45
45
  )
46
46
  parser.add_argument("--max-num-batched-tokens", type=int, default=1024)
47
47
  parser.add_argument("--backend", type=str, default="sendnn", choices=["eager", "sendnn"])
48
+ parser.add_argument(
49
+ "--tokenizer",
50
+ type=str,
51
+ default=None,
52
+ help="HF tokenizer id or path. Defaults to --model.",
53
+ )
54
+ parser.add_argument(
55
+ "--load-format",
56
+ type=str,
57
+ default="auto",
58
+ help="vLLM load format: auto, dummy, safetensors, pt, ... `dummy` random-inits weights.",
59
+ )
48
60
 
49
61
  args = parser.parse_args()
50
62
 
@@ -95,7 +107,7 @@ if __name__ == "__main__":
95
107
  prompts = prompts * (args.num_prompts // len(prompts) + 1)
96
108
  prompts = prompts[0 : args.num_prompts]
97
109
 
98
- tokenizer = AutoTokenizer.from_pretrained(args.model)
110
+ tokenizer = AutoTokenizer.from_pretrained(args.tokenizer or args.model)
99
111
 
100
112
  tokenized_prompts = tokenizer(prompts)["input_ids"]
101
113
  tokenized_prompts = [p[: args.max_prompt_len] for p in tokenized_prompts]
@@ -124,7 +136,8 @@ if __name__ == "__main__":
124
136
  # Create an LLM.
125
137
  llm = LLM(
126
138
  model=args.model,
127
- tokenizer=args.model,
139
+ tokenizer=args.tokenizer or args.model,
140
+ load_format=args.load_format,
128
141
  max_model_len=args.max_model_len,
129
142
  max_num_seqs=args.max_num_seqs,
130
143
  tensor_parallel_size=args.tp,
@@ -29,6 +29,18 @@ if __name__ == "__main__":
29
29
  )
30
30
  parser.add_argument("--max-num-batched-tokens", type=int, default=1024)
31
31
  parser.add_argument("--backend", type=str, default="eager", choices=["eager", "sendnn"])
32
+ parser.add_argument(
33
+ "--tokenizer",
34
+ type=str,
35
+ default=None,
36
+ help="HF tokenizer id or path. Defaults to --model.",
37
+ )
38
+ parser.add_argument(
39
+ "--load-format",
40
+ type=str,
41
+ default="auto",
42
+ help="vLLM load format: auto, dummy, safetensors, pt, ... `dummy` random-inits weights.",
43
+ )
32
44
 
33
45
  args = parser.parse_args()
34
46
 
@@ -84,7 +96,8 @@ if __name__ == "__main__":
84
96
  # Create an LLM.
85
97
  llm = LLM(
86
98
  model=args.model,
87
- tokenizer=args.model,
99
+ tokenizer=args.tokenizer or args.model,
100
+ load_format=args.load_format,
88
101
  max_model_len=args.max_model_len,
89
102
  max_num_seqs=args.max_num_seqs,
90
103
  tensor_parallel_size=args.tp,
@@ -1,7 +1,7 @@
1
1
  [build-system]
2
2
  requires = [
3
3
  "setuptools>=82",
4
- "setuptools_scm>=8"
4
+ "setuptools_scm>=8,<10"
5
5
  ]
6
6
  build-backend = "setuptools.build_meta"
7
7
 
@@ -15,7 +15,7 @@ dependencies = [
15
15
  "ibm-fms>=1.11.1,<2",
16
16
  # NB: use strict < with the next patch version to not exclude versions with
17
17
  # build metadata suffixes
18
- "vllm>=0.19.1,<0.22.1",
18
+ "vllm>=0.19.1,<0.23.1",
19
19
 
20
20
  # Specific torch version overrides handled by uv
21
21
  "torch",
@@ -90,7 +90,7 @@ build-constraint-dependencies = []
90
90
  extra-build-variables = { vllm = { VLLM_TARGET_DEVICE = "empty" } }
91
91
 
92
92
  [tool.uv.sources]
93
- vllm = { git = "https://github.com/vllm-project/vllm", rev = "v0.22.0" }
93
+ vllm = { git = "https://github.com/vllm-project/vllm", rev = "v0.23.0" }
94
94
  torch = [
95
95
  { index = "pytorch-cpu" },
96
96
  ]
@@ -0,0 +1,34 @@
1
+ # file generated by setuptools-scm
2
+ # don't change, don't track in version control
3
+
4
+ __all__ = [
5
+ "__version__",
6
+ "__version_tuple__",
7
+ "version",
8
+ "version_tuple",
9
+ "__commit_id__",
10
+ "commit_id",
11
+ ]
12
+
13
+ TYPE_CHECKING = False
14
+ if TYPE_CHECKING:
15
+ from typing import Tuple
16
+ from typing import Union
17
+
18
+ VERSION_TUPLE = Tuple[Union[int, str], ...]
19
+ COMMIT_ID = Union[str, None]
20
+ else:
21
+ VERSION_TUPLE = object
22
+ COMMIT_ID = object
23
+
24
+ version: str
25
+ __version__: str
26
+ __version_tuple__: VERSION_TUPLE
27
+ version_tuple: VERSION_TUPLE
28
+ commit_id: COMMIT_ID
29
+ __commit_id__: COMMIT_ID
30
+
31
+ __version__ = version = '2.2.2'
32
+ __version_tuple__ = version_tuple = (2, 2, 2)
33
+
34
+ __commit_id__ = commit_id = 'gd054d78'
@@ -162,7 +162,7 @@ class ConditionalDefaultManager:
162
162
  namespace: argparse.Namespace | None = None,
163
163
  ) -> argparse.Namespace:
164
164
  result = original_parse_args(self, args, namespace)
165
- assert result is not None # type: ignore[redundant-expr]
165
+ assert result is not None
166
166
 
167
167
  if args is None or len(args) == 0:
168
168
  # Don't override anything if there were no args parsed
@@ -9,6 +9,22 @@
9
9
 
10
10
  # templates for reuse via YAML anchors
11
11
  _templates:
12
+ granite_41_30b_architecture: &granite_41_30b_architecture
13
+ model_type: granite
14
+ num_hidden_layers: 64
15
+ max_position_embeddings: 131072
16
+ hidden_size: 4096
17
+ vocab_size: 100352
18
+ num_key_value_heads: 8
19
+ num_attention_heads: 32
20
+
21
+ # device config for TP=4 Granite 4.1 30b models
22
+ granite_41_30b_tp4_device_config: &granite_41_30b_tp4_device_config
23
+ env_vars:
24
+ VLLM_DT_MAX_BATCH_TKV_LIMIT: 131072 # 128k
25
+ FLEX_HDMA_P2PSIZE: 268435456 # 256MB
26
+ FLEX_HDMA_COLLSIZE: 33554432 # 32MB
27
+ num_gpu_blocks_override: 2080
12
28
 
13
29
  granite_4_8b_architecture: &granite_4_8b_architecture
14
30
  model_type: granite
@@ -35,7 +51,7 @@ _templates:
35
51
  FLEX_HDMA_P2PSIZE: 268435456 # 256MB
36
52
  FLEX_HDMA_COLLSIZE: 33554432 # 32MB
37
53
  num_gpu_blocks_override: 8192
38
-
54
+
39
55
  granite_vision_33_2b_architecture: &granite_vision_33_2b_architecture
40
56
  model_type: llava_next
41
57
  text_config:
@@ -166,7 +182,7 @@ models:
166
182
  max_model_len: 32768
167
183
  max_num_seqs: 32
168
184
  device_config: *granite_8b_tp4_device_config
169
-
185
+
170
186
  # Llama 3.1 8B Instruct
171
187
  meta-llama/Llama-3.1-8B-Instruct:
172
188
  architecture: *llama3_8b_architecture
@@ -247,6 +263,15 @@ models:
247
263
  max_num_seqs: 32
248
264
  device_config: *granite_8b_tp4_device_config
249
265
 
266
+ # Granite 4.1 30B
267
+ ibm-granite/granite-4.1-30b:
268
+ architecture: *granite_41_30b_architecture
269
+ continuous_batching_configs:
270
+ - tp_size: 4
271
+ max_model_len: 32768
272
+ max_num_seqs: 32
273
+ device_config: *granite_41_30b_tp4_device_config
274
+
250
275
  # Granite Vision 3.3 2B
251
276
  ibm-granite/granite-vision-3.3-2b:
252
277
  architecture: *granite_vision_33_2b_architecture
@@ -262,7 +287,7 @@ models:
262
287
  max_model_len: 32768
263
288
  max_num_seqs: 32
264
289
  device_config: *granite_vision_2b_tp4_device_config
265
-
290
+
266
291
  # Mistral Small 3.2 24B Instruct
267
292
  mistralai/Mistral-Small-3.2-24B-Instruct-2506:
268
293
  architecture: *mistral3_24b_architecture
@@ -279,6 +304,9 @@ models:
279
304
  mistralai/Ministral-3-14B-Instruct-2512-BF16:
280
305
  architecture: *ministral3_14b_architecture
281
306
  continuous_batching_configs:
307
+ - tp_size: 1
308
+ max_model_len: 4096
309
+ max_num_seqs: 32
282
310
  - tp_size: 4
283
311
  max_model_len: 32768
284
312
  max_num_seqs: 32
@@ -26,6 +26,7 @@ if TYPE_CHECKING:
26
26
  SENDNN_INFERENCE_MODEL_CONFIG_FILE: str | None = None
27
27
  SENDNN_INFERENCE_CPU_MM_DTYPE: torch.dtype = torch.float16
28
28
  SENDNN_INFERENCE_MM_DEVICE: str = "auto"
29
+ SENDNN_INFERENCE_TP_MM_SHARING: bool = True
29
30
 
30
31
  logger = init_logger(__name__)
31
32
 
@@ -92,6 +93,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
92
93
  ),
93
94
  # Allow sendnn-inference to update env vars related to multi-threading (eg. OMP)
94
95
  # based on the detected CPU cores and server configuration
96
+ # Multimodal models will not take into account the number of workers for configuration.
95
97
  "SENDNN_INFERENCE_UPDATE_THREAD_CONFIG": lambda: bool(
96
98
  int(os.getenv("SENDNN_INFERENCE_UPDATE_THREAD_CONFIG", "1"))
97
99
  ),
@@ -171,6 +173,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
171
173
  "SENDNN_INFERENCE_MM_DEVICE": lambda: parse_mm_device(
172
174
  os.getenv("SENDNN_INFERENCE_MM_DEVICE", "auto")
173
175
  ),
176
+ # When "1" (default), rank 0 runs the vision encoder and shares the result
177
+ # with other TP ranks via POSIX shared memory (one encoder call instead of
178
+ # world_size calls). Set to "0" to fall back to every TP rank running the
179
+ # vision encoder independently — the original behaviour, which avoids any
180
+ # SHM-related failure modes at the cost of redundant CPU work.
181
+ "SENDNN_INFERENCE_TP_MM_SHARING": lambda: bool(
182
+ int(os.getenv("SENDNN_INFERENCE_TP_MM_SHARING", "1"))
183
+ ),
174
184
  }
175
185
  # --8<-- [end:env-vars-definition]
176
186
 
@@ -72,6 +72,7 @@ class SpyreCausalLM(nn.Module):
72
72
  self.parallel_config = vllm_config.parallel_config
73
73
  self.cache_config = vllm_config.cache_config
74
74
  self.scheduler_config = vllm_config.scheduler_config
75
+ self.load_config = vllm_config.load_config
75
76
  self.dtype = self.get_dtype()
76
77
 
77
78
  # Wrappers for utils for multimodal
@@ -171,16 +172,30 @@ class SpyreCausalLM(nn.Module):
171
172
  self.dtype,
172
173
  )
173
174
 
174
- is_local = os.path.isdir(model_config.model)
175
- model_path = model_config.model
176
- # Get location of model from HF cache.
177
- if not is_local:
178
- model_path = download_weights_from_hf(
179
- model_name_or_path=model_path,
180
- cache_dir=None,
181
- allow_patterns=["*.safetensors", "*.bin", "*.pt"],
182
- revision=model_config.revision,
175
+ # `--load-format dummy` skips the checkpoint download and routes through
176
+ # FMS's `hf_configured` path, which fetches only config.json and then
177
+ # random-inits the model via `reset_parameters()`.
178
+ variant: str | None = None
179
+ if self.load_config.load_format == "dummy":
180
+ logger.info(
181
+ "Loading model %s with random weights.",
182
+ model_config.model,
183
183
  )
184
+ architecture = "hf_configured"
185
+ variant = model_config.model
186
+ model_path: str | None = None
187
+ else:
188
+ architecture = "hf_pretrained"
189
+ is_local = os.path.isdir(model_config.model)
190
+ model_path = model_config.model
191
+ # Get location of model from HF cache.
192
+ if not is_local:
193
+ model_path = download_weights_from_hf(
194
+ model_name_or_path=model_path,
195
+ cache_dir=None,
196
+ allow_patterns=["*.safetensors", "*.bin", "*.pt"],
197
+ revision=model_config.revision,
198
+ )
184
199
 
185
200
  # Get any fixes needed that must be patched into the kwargs;
186
201
  # currently this is only used for multimodal models / llava next
@@ -192,7 +207,8 @@ class SpyreCausalLM(nn.Module):
192
207
  kwargs["rank"],
193
208
  ):
194
209
  self.fms_model = get_model(
195
- architecture="hf_pretrained",
210
+ architecture=architecture,
211
+ variant=variant,
196
212
  model_path=model_path,
197
213
  distributed_strategy=distributed_strategy,
198
214
  group=dist.group.WORLD,
@@ -242,6 +242,14 @@ class SpyrePlatform(Platform):
242
242
  if not is_decoder and not is_pooling:
243
243
  raise ValueError("Only the 'generate' and 'pooling' runners are supported")
244
244
 
245
+ if vllm_config.load_config.load_format == "dummy" and (
246
+ model_config.is_multimodal_model or is_pooling
247
+ ):
248
+ raise ValueError(
249
+ "--load-format dummy is only supported for text generation models; "
250
+ "random-weight init is not implemented for multimodal or pooling models."
251
+ )
252
+
245
253
  if parallel_config.worker_cls == "auto":
246
254
  parallel_config.worker_cls = "sendnn_inference.v1.worker.spyre_worker.SpyreWorker"
247
255
 
@@ -345,7 +353,7 @@ class SpyrePlatform(Platform):
345
353
  scheduler_config.max_num_batched_tokens = (
346
354
  model_config.max_model_len * scheduler_config.max_num_seqs
347
355
  )
348
- cache_config.block_size = model_config.max_model_len # ty: ignore[invalid-assignment]
356
+ cache_config.block_size = model_config.max_model_len
349
357
  vllm_config.cache_config.enable_prefix_caching = False
350
358
 
351
359
  else:
@@ -635,7 +643,12 @@ class SpyrePlatform(Platform):
635
643
 
636
644
  # NOTE: math.ceil can output a number for each worker that sums
637
645
  # to a total greater than cpu_count.
638
- cpus_per_worker = math.ceil(cpu_count / worker_count) if cpu_count is not None else None
646
+ thread_factor = worker_count
647
+ if cls._config.model_config.is_multimodal_model:
648
+ # thread_factor value/formula subject to further tuning
649
+ thread_factor = 1
650
+
651
+ cpus_per_worker = math.ceil(cpu_count / thread_factor) if cpu_count is not None else None
639
652
 
640
653
  thread_warning = (
641
654
  "Excessive threads may result in CPU contention. "
@@ -821,7 +834,7 @@ class SpyrePlatform(Platform):
821
834
  @classmethod
822
835
  def _set_batch_tkv_limit_from_env(cls) -> None:
823
836
  try:
824
- cls._max_batch_tkv_limit = int(os.getenv("VLLM_DT_MAX_BATCH_TKV_LIMIT", "-1")) # ty: ignore
837
+ cls._max_batch_tkv_limit = int(os.getenv("VLLM_DT_MAX_BATCH_TKV_LIMIT", "-1"))
825
838
  except ValueError as e:
826
839
  raise ValueError("VLLM_DT_MAX_BATCH_TKV_LIMIT must be an integer") from e
827
840
 
@@ -216,23 +216,6 @@ class ChunkedPrefillSpyreScheduler(SpyreScheduler):
216
216
  "Expecting an instance of CPSpyreModelRunnerOutput when doing chunked prefill."
217
217
  )
218
218
 
219
- # Update the correct num_computed_tokens value given left-padding and
220
- # prefix cache hit info
221
- for req in self.ongoing_prefills:
222
- # The number of computed tokens only need to be adapted when it is
223
- # the first chunk of a multi-chunk prefill
224
- is_first_chunk = req.num_computed_tokens <= self.chunk_size
225
- is_last_chunk = req.num_computed_tokens == req.num_prompt_tokens
226
- if is_first_chunk and not is_last_chunk:
227
- left_padding = model_runner_output.left_padding.get(req.request_id, 0)
228
- prefix_cache_len = model_runner_output.prefix_cache_hit_len.get(req.request_id, 0)
229
-
230
- req.num_computed_tokens = self.adjust_computed_tokens(
231
- computed_tokens=req.num_computed_tokens,
232
- left_padding=left_padding,
233
- prefix_cache_len=prefix_cache_len,
234
- )
235
-
236
219
  # Remove completed prefills
237
220
  self.ongoing_prefills = [
238
221
  req for req in self.ongoing_prefills if req.num_computed_tokens < req.num_prompt_tokens
@@ -250,21 +233,39 @@ class ChunkedPrefillSpyreScheduler(SpyreScheduler):
250
233
 
251
234
  return result
252
235
 
253
- def adjust_computed_tokens(
254
- self, computed_tokens: int, left_padding: int, prefix_cache_len: int
255
- ) -> int:
256
- """
257
- Returns an adjusted `num_computed_tokens` given left padding and prefix
258
- cache hit info.
259
- """
260
- # The prefix cache length is already adjusted for left padding.
261
- # If it's bigger than the number of computed tokens, then we hit more
262
- # prefix cache than we scheduled.
263
- if prefix_cache_len > computed_tokens:
264
- assert (prefix_cache_len + left_padding) % self.chunk_size == 0
265
- return prefix_cache_len
266
- # Otherwise just account for the left padding
267
- return computed_tokens - left_padding
236
+ def _current_chunk_token_threshold(self, new_prefill_candidates: list[Request]) -> int:
237
+ """Returns the `long_prefill_token_threshold` to use for this step.
238
+
239
+ For the chunk-0 step, cap it to `chunk_size - left_padding` so the base
240
+ scheduler is aware of the padding blocks.
241
+ Otherwise return `chunk_size`: the natural chunk boundary."""
242
+
243
+ # If there are no new prefill candidates, no cap is needed.
244
+ if not new_prefill_candidates:
245
+ return self.chunk_size
246
+
247
+ new_prefill = new_prefill_candidates[0]
248
+
249
+ # Calculate left-padding tokens for this prompt.
250
+ prompt_len = new_prefill.num_prompt_tokens
251
+ n_chunks = math.ceil(prompt_len / self.chunk_size)
252
+ padded_prompt_len = math.ceil(prompt_len / self.block_size) * self.block_size
253
+ left_padding = n_chunks * self.chunk_size - padded_prompt_len
254
+
255
+ # If the prefix cache already covers chunk 0's real content, no cap is
256
+ # needed: the base scheduler will start from chunk i>=1, which has no
257
+ # padding. `get_computed_blocks` records into `prefix_cache_stats` as
258
+ # a side effect; the base scheduler calls it again, so toggle
259
+ # log_stats off here to avoid double-counting.
260
+ prev_log_stats = self.kv_cache_manager.log_stats
261
+ self.kv_cache_manager.log_stats = False
262
+ _, prefix_token_len = self.kv_cache_manager.get_computed_blocks(new_prefill)
263
+ self.kv_cache_manager.log_stats = prev_log_stats
264
+ if prefix_token_len >= self.chunk_size - left_padding:
265
+ return self.chunk_size
266
+
267
+ # Adjust the token threshold to account for left padding
268
+ return self.chunk_size - left_padding
268
269
 
269
270
  def _get_required_blocks(self, request: Request, max_output: bool = False) -> tuple[int, int]:
270
271
  """
@@ -404,7 +405,7 @@ class ChunkedPrefillSpyreScheduler(SpyreScheduler):
404
405
  ready_to_prefill = [
405
406
  r
406
407
  for r in self.waiting
407
- if r.status != RequestStatus.WAITING_FOR_STRUCTURED_OUTPUT_GRAMMAR # type: ignore[attr-defined]
408
+ if r.status != RequestStatus.WAITING_FOR_STRUCTURED_OUTPUT_GRAMMAR
408
409
  ]
409
410
  if ready_to_prefill:
410
411
  new_prefill_candidates = list(self.waiting)
@@ -424,6 +425,15 @@ class ChunkedPrefillSpyreScheduler(SpyreScheduler):
424
425
  self.previous_step_was_prefill = False
425
426
  running_holdback = []
426
427
 
428
+ # Cap chunk-0 token count to chunk_size - left_padding so the upstream KV
429
+ # cache manager doesn't allocate real blocks for the left-padding region.
430
+ # Only matters at chunk 0; later chunks land on natural chunk boundaries.
431
+ # Mutating scheduler_config is safe: the SpyreScheduler is the only
432
+ # scheduler in this engine and at most one prefill is in flight per step.
433
+ self.scheduler_config.long_prefill_token_threshold = self._current_chunk_token_threshold(
434
+ new_prefill_candidates
435
+ )
436
+
427
437
  # delegate to super of SpyreScheduler: base V1 Scheduler
428
438
  outputs = super(SpyreScheduler, self).schedule()
429
439
 
@@ -0,0 +1,130 @@
1
+ """Utilities for sharing multimodal embeddings across TP ranks via POSIX shared memory.
2
+
3
+ During chunked prefill, rank 0 computes the full vision-encoder embeddings and writes
4
+ them here; non-zero ranks read after synchronisation in the model runner.
5
+ This avoids running the (CPU-bound) vision encoder world_size times per request.
6
+ """
7
+
8
+ import hashlib
9
+ from multiprocessing.shared_memory import SharedMemory
10
+
11
+ import torch
12
+
13
+ from vllm.logger import init_logger
14
+
15
+ logger = init_logger(__name__)
16
+
17
+ # Stable mapping between torch dtypes and compact integer identifiers used in
18
+ # the broadcast metadata tensor. torch.frombuffer handles all dtypes natively.
19
+ _DTYPE_TO_IDX: dict[torch.dtype, int] = {
20
+ torch.float16: 0,
21
+ torch.float32: 1,
22
+ torch.bfloat16: 2,
23
+ }
24
+ _IDX_TO_DTYPE: dict[int, torch.dtype] = {v: k for k, v in _DTYPE_TO_IDX.items()}
25
+
26
+
27
+ def dtype_to_idx(dtype: torch.dtype) -> int:
28
+ """Encode a torch dtype as a compact integer for the broadcast metadata tensor."""
29
+ if dtype not in _DTYPE_TO_IDX:
30
+ raise ValueError(f"Unsupported dtype for SHM transfer: {dtype}")
31
+ return _DTYPE_TO_IDX[dtype]
32
+
33
+
34
+ def idx_to_dtype(idx: int) -> torch.dtype:
35
+ """Decode a compact integer back to the corresponding torch dtype."""
36
+ if idx not in _IDX_TO_DTYPE:
37
+ raise ValueError(f"Unknown dtype index: {idx}")
38
+ return _IDX_TO_DTYPE[idx]
39
+
40
+
41
+ def _shm_name(req_id: str) -> str:
42
+ """Generate a short, deterministic POSIX SHM name for a request.
43
+
44
+ Uses an MD5 hash of the *full* req_id so that requests which share a
45
+ common prefix (e.g. all benchmark requests in a run share the
46
+ ``chatcmpl-bench-<uuid>-`` prefix) still get distinct SHM names.
47
+
48
+ Truncating the req_id (the previous approach) caused silent collisions:
49
+ ``chatcmpl-bench-34e3ed2d-1-…`` and ``chatcmpl-bench-34e3ed2d-39-…``
50
+ both truncate to the same 20-char prefix, so every request in the benchmark
51
+ wrote to the same SHM segment — corrupting each other's embeddings.
52
+
53
+ Linux NAME_MAX is 255; macOS requires ≤ 30 chars for the name itself
54
+ (the kernel prefixes it with ``/``). 'sm' + 16 hex chars = 18 chars,
55
+ safely within every platform's limit.
56
+ """
57
+ digest = hashlib.md5(req_id.encode(), usedforsecurity=False).hexdigest()[:16]
58
+ return f"sm{digest}"
59
+
60
+
61
+ def write_embeddings(tensor: torch.Tensor, req_id: str) -> SharedMemory:
62
+ """Write *tensor* to a shared-memory block keyed by *req_id*.
63
+
64
+ Returns the ``SharedMemory`` handle — the caller must keep it and pass it
65
+ to :func:`cleanup_embeddings` after all ranks have read.
66
+
67
+ Shape and dtype are NOT stored in SHM; the caller broadcasts them via a
68
+ tiny ``torch.distributed.broadcast`` so readers already have that info
69
+ before calling :func:`read_embeddings`.
70
+ """
71
+ if tensor.device.type != "cpu":
72
+ tensor = tensor.cpu()
73
+ tensor = tensor.contiguous()
74
+
75
+ assert tensor.ndim == 3, f"Expected 3-D embedding tensor, got shape {tensor.shape}"
76
+ assert tensor.dtype in _DTYPE_TO_IDX, f"Unsupported dtype for SHM transfer: {tensor.dtype}"
77
+
78
+ data_shm = SharedMemory(create=True, size=tensor.nbytes, name=_shm_name(req_id))
79
+ torch.frombuffer(data_shm.buf, dtype=tensor.dtype).reshape(tensor.shape).copy_(tensor)
80
+
81
+ logger.debug(
82
+ "Wrote MM embeddings to SHM for req '%s': shape=%s dtype=%s bytes=%d",
83
+ req_id,
84
+ tuple(tensor.shape),
85
+ tensor.dtype,
86
+ tensor.nbytes,
87
+ )
88
+ return data_shm
89
+
90
+
91
+ def read_embeddings(
92
+ req_id: str,
93
+ shape: tuple[int, int, int],
94
+ dtype: torch.dtype,
95
+ ) -> torch.Tensor:
96
+ """Read embeddings from shared memory and return a detached CPU tensor.
97
+
98
+ *shape* and *dtype* must be provided by the caller (obtained from the
99
+ broadcast metadata tensor) — they are not re-read from SHM.
100
+
101
+ Opens and closes the shared-memory handle internally.
102
+ """
103
+ data_shm = SharedMemory(name=_shm_name(req_id))
104
+ # .clone() detaches the tensor from the SHM buffer so the handle can be closed.
105
+ result = torch.frombuffer(data_shm.buf, dtype=dtype).reshape(shape).clone()
106
+ data_shm.close()
107
+
108
+ logger.debug(
109
+ "Read MM embeddings from SHM for req '%s': shape=%s dtype=%s",
110
+ req_id,
111
+ shape,
112
+ dtype,
113
+ )
114
+ return result
115
+
116
+
117
+ def cleanup_embeddings(data_shm: SharedMemory) -> None:
118
+ """Unlink and close the shared-memory block.
119
+
120
+ Safe to call even if the block was already cleaned up — exceptions are
121
+ logged but not re-raised.
122
+ """
123
+ try:
124
+ data_shm.unlink()
125
+ except Exception as exc:
126
+ logger.debug("SHM unlink skipped (%s): %s", data_shm.name, exc)
127
+ try:
128
+ data_shm.close()
129
+ except Exception as exc:
130
+ logger.debug("SHM close skipped (%s): %s", data_shm.name, exc)