sendnn-inference 2.1.4__tar.gz → 2.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (209) hide show
  1. sendnn_inference-2.2.2/.github/ci_model_cache.yaml +17 -0
  2. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/workflows/model_smoke.yml +6 -2
  3. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/workflows/test.yml +46 -117
  4. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/PKG-INFO +3 -3
  5. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/user_guide/configuration.md +9 -0
  6. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/examples/offline_inference/long_context.py +15 -2
  7. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/examples/offline_inference/text_inference.py +14 -1
  8. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/pyproject.toml +6 -5
  9. sendnn_inference-2.2.2/sendnn_inference/_version.py +34 -0
  10. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/argparse_utils.py +1 -1
  11. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/config/model_configs.yaml +56 -4
  12. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/envs.py +10 -0
  13. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/model_executor/model_loader/spyre.py +31 -15
  14. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/platform.py +16 -3
  15. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/v1/core/scheduler.py +139 -34
  16. sendnn_inference-2.2.2/sendnn_inference/v1/worker/mm_shared_memory.py +130 -0
  17. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/v1/worker/spyre_model_runner.py +208 -40
  18. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/v1/worker/spyre_worker.py +29 -3
  19. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference.egg-info/PKG-INFO +3 -3
  20. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference.egg-info/SOURCES.txt +4 -0
  21. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference.egg-info/requires.txt +2 -2
  22. sendnn_inference-2.2.2/tests/e2e/test_load_format_dummy.py +39 -0
  23. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_basic.py +3 -0
  24. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_cp_scheduler_steps.py +184 -0
  25. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_pc_scheduler_steps.py +351 -18
  26. sendnn_inference-2.2.2/tests/hf_cache.json +594 -0
  27. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/llm_cache.py +5 -0
  28. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/multimodal/test_llava_next.py +1 -1
  29. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/multimodal/test_mistral3.py +1 -1
  30. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/scheduling_utils.py +12 -1
  31. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/spyre_util.py +12 -3
  32. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/utils/test_upstream_compatibility.py +17 -0
  33. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/v1/core/test_scheduler_structured_outputs.py +16 -0
  34. sendnn_inference-2.2.2/tests/v1/worker/test_mm_shared_memory.py +328 -0
  35. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/v1/worker/test_prefix_caching_worker.py +1 -1
  36. sendnn_inference-2.2.2/tools/download_model.py +87 -0
  37. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/uv.lock +1375 -1414
  38. sendnn_inference-2.1.4/sendnn_inference/_version.py +0 -24
  39. sendnn_inference-2.1.4/tests/hf_cache.json +0 -1239
  40. sendnn_inference-2.1.4/tools/download_model.py +0 -42
  41. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.bob/skills/update-vllm/SKILL.md +0 -0
  42. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.claude/skills/update-vllm/SKILL.md +0 -0
  43. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/CODEOWNERS +0 -0
  44. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/ISSUE_TEMPLATE/bug-report.yml +0 -0
  45. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  46. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/ISSUE_TEMPLATE/feature-request.yml +0 -0
  47. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/ISSUE_TEMPLATE/rfc.yml +0 -0
  48. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/actions/free-up-disk-space/action.yml +0 -0
  49. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/pull_request_template.md +0 -0
  50. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/workflows/add_label_automerge.yml +0 -0
  51. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/workflows/build_and_publish.yaml +0 -0
  52. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/workflows/build_docker.yml +0 -0
  53. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/workflows/check_uv_lock.yml +0 -0
  54. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/workflows/lint_scripts.yml +0 -0
  55. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/workflows/matchers/ruff.json +0 -0
  56. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/workflows/pre-commit.yml +0 -0
  57. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/workflows/publish_to_test_pypi.yaml +0 -0
  58. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/workflows/reminder_comment.yml +0 -0
  59. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/workflows/stale.yml +0 -0
  60. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.gitignore +0 -0
  61. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.pre-commit-config.yaml +0 -0
  62. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.readthedocs.yaml +0 -0
  63. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.shellcheckrc +0 -0
  64. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.yapfignore +0 -0
  65. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/CLAUDE.md +0 -0
  66. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/CODE_OF_CONDUCT.md +0 -0
  67. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/CONTRIBUTING.md +0 -0
  68. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/DCO +0 -0
  69. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/LICENSE +0 -0
  70. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/README.md +0 -0
  71. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/RELEASING.md +0 -0
  72. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/_local_envs_for_test.sh +0 -0
  73. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docker/.senlib.json +0 -0
  74. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docker/Dockerfile.amd64 +0 -0
  75. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docker/simple_vllm_serve.sh +0 -0
  76. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/.nav.yml +0 -0
  77. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/README.md +0 -0
  78. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/contributing/README.md +0 -0
  79. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/contributing/architecture.md +0 -0
  80. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/contributing/images/vllm_v1.svg +0 -0
  81. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/contributing/images/vllm_v1_spyre.svg +0 -0
  82. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/contributing/maintaining.md +0 -0
  83. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/contributing/multimodal/adding_new_models.md +0 -0
  84. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/contributing/scheduler.md +0 -0
  85. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/contributing/vllm-update-procedure.md +0 -0
  86. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/deploying/docker.md +0 -0
  87. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/deploying/k8s.md +0 -0
  88. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/deploying/rhoai.md +0 -0
  89. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/getting_started/installation.md +0 -0
  90. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/data/generate_example.py +0 -0
  91. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/data/prefill_single_chunks.json +0 -0
  92. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/data/prefill_three_chunks.json +0 -0
  93. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/data/prefix_caching_1.json +0 -0
  94. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/data/prefix_caching_2.json +0 -0
  95. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/data/prefix_caching_3.json +0 -0
  96. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/data/scheduling_admission_constraints.json +0 -0
  97. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/data/scheduling_padding_tkv_jump.json +0 -0
  98. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/data/timeline_admission_constraints.json +0 -0
  99. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/hooks/generate_examples.py +0 -0
  100. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/hooks/generate_model_tables.py +0 -0
  101. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/hooks/generate_prefill_only_plots.py +0 -0
  102. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/hooks/generate_prefix_caching_plots.py +0 -0
  103. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/hooks/generate_scheduling_plots.py +0 -0
  104. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/hooks/generate_timeline.py +0 -0
  105. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/hooks/url_schemes.py +0 -0
  106. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/overrides/main.html +0 -0
  107. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/requirements-docs.txt +0 -0
  108. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/roadmaps/q3-2025.md +0 -0
  109. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/user_guide/env_vars.md +0 -0
  110. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/user_guide/performance.md +0 -0
  111. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/user_guide/supported_features.md +0 -0
  112. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/user_guide/supported_models.md +0 -0
  113. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/examples/offline_inference/vision_inference.py +0 -0
  114. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/examples/online_inference/openai_spyre_text.py +0 -0
  115. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/examples/online_inference/openai_spyre_vision.py +0 -0
  116. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/examples/online_inference/spyre_vllm_benchmark.py +0 -0
  117. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/examples/online_inference/spyre_vllm_setup_container.sh +0 -0
  118. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/format.sh +0 -0
  119. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/mkdocs.yaml +0 -0
  120. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/__init__.py +0 -0
  121. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/compat_utils.py +0 -0
  122. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/compilation_utils.py +0 -0
  123. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/config/README.md +0 -0
  124. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/config/__init__.py +0 -0
  125. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/config/configurators/__init__.py +0 -0
  126. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/config/configurators/model_configurator.py +0 -0
  127. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/config/model_config.py +0 -0
  128. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/config/model_matcher.py +0 -0
  129. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/config/model_registry.py +0 -0
  130. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/model_executor/__init__.py +0 -0
  131. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/model_executor/model_loader/__init__.py +0 -0
  132. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/model_executor/model_loader/spyre_setup.py +0 -0
  133. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/multimodal/__init__.py +0 -0
  134. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/multimodal/mm_mappings/__init__.py +0 -0
  135. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/multimodal/mm_mappings/base.py +0 -0
  136. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/multimodal/mm_mappings/llava_next.py +0 -0
  137. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/multimodal/mm_mappings/mistral3.py +0 -0
  138. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/perf_metrics.py +0 -0
  139. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/utils.py +0 -0
  140. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/v1/__init__.py +0 -0
  141. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/v1/core/__init__.py +0 -0
  142. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/v1/metrics/__init__.py +0 -0
  143. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/v1/metrics/stats_logger.py +0 -0
  144. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/v1/sample/golden_token_injector.py +0 -0
  145. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/v1/sample/spyre_logits_processor.py +0 -0
  146. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/v1/worker/__init__.py +0 -0
  147. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/v1/worker/spyre_input_batch.py +0 -0
  148. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference.egg-info/dependency_links.txt +0 -0
  149. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference.egg-info/entry_points.txt +0 -0
  150. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference.egg-info/top_level.txt +0 -0
  151. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/setup.cfg +0 -0
  152. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/aftu/graph_compare_utils.py +0 -0
  153. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/aftu/test_compare_graphs.py +0 -0
  154. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/config/__init__.py +0 -0
  155. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/config/conftest.py +0 -0
  156. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/config/fixtures/test_error_handling_models.yaml +0 -0
  157. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/config/test_env_config_path.py +0 -0
  158. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/config/test_error_handling.py +0 -0
  159. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/config/test_integration.py +0 -0
  160. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/config/test_model_config.py +0 -0
  161. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/config/test_model_configurator.py +0 -0
  162. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/config/test_model_matcher.py +0 -0
  163. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/config/test_model_registry.py +0 -0
  164. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/conftest.py +0 -0
  165. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/download_model_configs.py +0 -0
  166. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_chunked_prefill.py +0 -0
  167. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_chunked_prefill_tkv_steps.py +0 -0
  168. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_logits_processors.py +0 -0
  169. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_model_smoke.py +0 -0
  170. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_profiler.py +0 -0
  171. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_sampling_params.py +0 -0
  172. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_async_llm.py +0 -0
  173. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_embeddings.py +0 -0
  174. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_mm.py +0 -0
  175. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_online.py +0 -0
  176. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_scoring.py +0 -0
  177. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_seed.py +0 -0
  178. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_stagger_basic.py +0 -0
  179. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_stats_logger.py +0 -0
  180. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_structured_outputs.py +0 -0
  181. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/BAAI/bge-reranker-large/config.json +0 -0
  182. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/BAAI/bge-reranker-v2-m3/config.json +0 -0
  183. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/ibm-ai-platform/micro-g3.3-8b-instruct-1b/config.json +0 -0
  184. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/ibm-granite/granite-3.3-8b-instruct/config.json +0 -0
  185. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/ibm-granite/granite-3.3-8b-instruct-FP8/config.json +0 -0
  186. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/ibm-granite/granite-4-8b-dense/config.json +0 -0
  187. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/ibm-granite/granite-4-8b-dense-hybrid/config.json +0 -0
  188. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/ibm-granite/granite-embedding-125m-english/config.json +0 -0
  189. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/ibm-granite/granite-embedding-278m-multilingual/config.json +0 -0
  190. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/intfloat/multilingual-e5-large/config.json +0 -0
  191. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/sentence-transformers/all-roberta-large-v1/config.json +0 -0
  192. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/hf_result_cache.py +0 -0
  193. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/llm_cache_util.py +0 -0
  194. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/model_loader/test_spyre.py +0 -0
  195. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/output_util.py +0 -0
  196. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/precompilation/test_disable_compilation.py +0 -0
  197. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/utils/bge_copy/config.json +0 -0
  198. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/utils/test_cli_args.py +0 -0
  199. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/utils/test_envs.py +0 -0
  200. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/utils/test_golden_token_injector.py +0 -0
  201. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/utils/test_platform_validation.py +0 -0
  202. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/utils/test_spyre_model_list.py +0 -0
  203. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/v1/worker/mock_model.py +0 -0
  204. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/v1/worker/test_scheduler_tkv_limits.py +0 -0
  205. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/v1/worker/test_spyre_input_batch.py +0 -0
  206. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/v1/worker/test_spyre_worker_profile.py +0 -0
  207. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tools/check_aiu.sh +0 -0
  208. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tools/check_repo.sh +0 -0
  209. {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tools/lint_scripts.sh +0 -0
@@ -0,0 +1,17 @@
1
+ # Models pre-fetched into the GHA HuggingFace cache for the Test workflow.
2
+ #
3
+ # Adding/removing/changing an entry here changes the cache key and will trigger
4
+ # a fresh download + cache save on the next push to main. Total uncompressed
5
+ # size of all entries must stay under 8 GiB (GHA cache limit is 10 GiB).
6
+
7
+ models:
8
+ - repo: ibm-ai-platform/micro-g3.3-8b-instruct-1b
9
+ revision: 6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f
10
+ - repo: ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8
11
+ revision: 0dff8bacb968836dbbc7c2895c6d9ead0a05dc9e
12
+ - repo: sentence-transformers/all-roberta-large-v1
13
+ revision: cf74d8acd4f198de950bf004b262e6accfed5d2c
14
+ - repo: cross-encoder/stsb-roberta-large
15
+ revision: 2b12c2c0088918e76151fd5937b7bba986ef1f98
16
+ - repo: Qwen/Qwen3-Embedding-0.6B
17
+ revision: 97b0c614be4d77ee51c0cef4e5f07c00f9eb65b3
@@ -62,9 +62,13 @@ jobs:
62
62
 
63
63
  steps:
64
64
  - name: "Lightweight disk cleanup"
65
+ # all rm -rf, no apt/docker — completes in < 1s and frees ~20 GB
65
66
  run: |
66
- rm -rf /usr/share/swift
67
- rm -rf /user/local/share/chromium
67
+ sudo rm -rf /usr/local/lib/android
68
+ sudo rm -rf /opt/ghc /usr/local/.ghcup
69
+ sudo rm -rf /usr/share/dotnet
70
+ sudo rm -rf /usr/share/swift
71
+ sudo rm -rf /usr/local/share/chromium
68
72
  sudo rm -rf /usr/local/share/powershell
69
73
 
70
74
  - name: "Checkout"
@@ -20,8 +20,6 @@ env:
20
20
  VLLM_TARGET_DEVICE: "empty"
21
21
  VLLM_PLUGINS: "sendnn_inference"
22
22
  HF_HUB_CACHE: "${{ github.workspace }}/.cache/huggingface/hub"
23
- DEFAULT_HF_MODEL: "ibm-ai-platform/micro-g3.3-8b-instruct-1b"
24
- DEFAULT_HF_MODEL_REV: "6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f"
25
23
 
26
24
  concurrency:
27
25
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -29,7 +27,7 @@ concurrency:
29
27
 
30
28
  jobs:
31
29
  test:
32
- timeout-minutes: 20
30
+ timeout-minutes: 25
33
31
  runs-on: ${{ matrix.os }}
34
32
  strategy:
35
33
  fail-fast: false
@@ -49,26 +47,18 @@ jobs:
49
47
  - name: "fp8"
50
48
  markers: "cpu and quantized and multi"
51
49
  flags: "--timeout=600 -k 'basic and test_output' --durations=0"
52
- hf_model: "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"
53
- hf_model_rev: "0dff8bacb968836dbbc7c2895c6d9ead0a05dc9e"
54
50
  - name: "embedding"
55
51
  markers: "cpu and embedding and not quantized"
56
52
  flags: "--timeout=300"
57
- hf_model: "sentence-transformers/all-roberta-large-v1"
58
- hf_model_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
59
53
  - name: "scoring"
60
54
  markers: "cpu and scoring"
61
55
  flags: "--timeout=300"
62
- hf_model: "cross-encoder/stsb-roberta-large"
63
- hf_model_rev: "2b12c2c0088918e76151fd5937b7bba986ef1f98"
64
56
  - name: "worker and utils"
65
57
  markers: "not e2e and not quantized and not spyre and not multimodal"
66
58
  flags: "--timeout=300"
67
59
  - name: "multimodal"
68
60
  markers: "cpu and multimodal"
69
61
  flags: "--timeout=300 -sv"
70
- # hf_model: "ibm-granite/granite-vision-3.2-2b"
71
- # hf_model_rev: "2818ae5b93cb750b099df1b65f7864e4a0401271"
72
62
  env_overrides: "HF_HUB_OFFLINE=0"
73
63
  include:
74
64
  # Lower bound support
@@ -79,8 +69,6 @@ jobs:
79
69
  name: "backward compat"
80
70
  markers: "compat or (cpu and basic and not quantized)"
81
71
  flags: "--timeout=300"
82
- hf_model_2: "sentence-transformers/all-roberta-large-v1"
83
- hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
84
72
  os: "ubuntu-latest"
85
73
  python_version: "3.12"
86
74
  # Intermediate versions of vllm to check basic support for as well
@@ -91,8 +79,6 @@ jobs:
91
79
  name: "backward compat"
92
80
  markers: "compat or (cpu and basic and not quantized)"
93
81
  flags: "--timeout=300"
94
- hf_model_2: "sentence-transformers/all-roberta-large-v1"
95
- hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
96
82
  os: "ubuntu-latest"
97
83
  python_version: "3.12"
98
84
  - vllm_version:
@@ -102,8 +88,6 @@ jobs:
102
88
  name: "backward compat"
103
89
  markers: "compat or (cpu and basic and not quantized)"
104
90
  flags: "--timeout=300"
105
- hf_model_2: "sentence-transformers/all-roberta-large-v1"
106
- hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
107
91
  os: "ubuntu-latest"
108
92
  python_version: "3.12"
109
93
  - vllm_version:
@@ -113,8 +97,6 @@ jobs:
113
97
  name: "backward compat"
114
98
  markers: "compat or (cpu and basic and not quantized)"
115
99
  flags: "--timeout=300"
116
- hf_model_2: "sentence-transformers/all-roberta-large-v1"
117
- hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
118
100
  os: "ubuntu-latest"
119
101
  python_version: "3.12"
120
102
  - vllm_version:
@@ -124,8 +106,15 @@ jobs:
124
106
  name: "backward compat"
125
107
  markers: "compat or (cpu and basic and not quantized)"
126
108
  flags: "--timeout=300"
127
- hf_model_2: "sentence-transformers/all-roberta-large-v1"
128
- hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
109
+ os: "ubuntu-latest"
110
+ python_version: "3.12"
111
+ - vllm_version:
112
+ name: "vLLM:0.22.0"
113
+ repo: "git+https://github.com/vllm-project/vllm --tag v0.22.0"
114
+ test_suite:
115
+ name: "backward compat"
116
+ markers: "compat or (cpu and basic and not quantized)"
117
+ flags: "--timeout=300"
129
118
  os: "ubuntu-latest"
130
119
  python_version: "3.12"
131
120
 
@@ -146,10 +135,14 @@ jobs:
146
135
  steps:
147
136
  - name: "Lightweight disk cleanup"
148
137
  # super lightweight cleanup, not nearly as much as actions/free-up-disk-space
138
+ # all rm -rf, no apt/docker — completes in < 1s and frees ~20 GB
149
139
  shell: bash
150
140
  run: |
151
- rm -rf /usr/share/swift
152
- rm -rf /user/local/share/chromium
141
+ sudo rm -rf /usr/local/lib/android
142
+ sudo rm -rf /opt/ghc /usr/local/.ghcup
143
+ sudo rm -rf /usr/share/dotnet
144
+ sudo rm -rf /usr/share/swift
145
+ sudo rm -rf /usr/local/share/chromium
153
146
  sudo rm -rf /usr/local/share/powershell
154
147
 
155
148
  - name: "Checkout"
@@ -199,117 +192,53 @@ jobs:
199
192
  # overwritten.
200
193
  uv pip install -v .
201
194
 
202
- - name: "Standardize HF model names for caching"
203
- id: standardize-names
204
- if: steps.changed-src-files.outputs.any_changed == 'true'
205
- run: |
206
- # replace '/' characters in HF_MODEL with '--' for GHA cache keys and
207
- # in model file names in local HF hub cache
208
-
209
- # don't use in-line default values for variable expansion here to not
210
- # use the default model revision with a non-default model like this:
211
- # model="${{ matrix.test_suite.hf_model || env.DEFAULT_HF_MODEL }}"
212
- # revision="${{ matrix.test_suite.hf_model_rev || env.DEFAULT_HF_MODEL_REV }}"
213
-
214
- if [[ -n "${{ matrix.test_suite.hf_model }}" ]]; then
215
- model="${{ matrix.test_suite.hf_model }}"
216
- revision="${{ matrix.test_suite.hf_model_rev }}"
217
- else
218
- model="${{ env.DEFAULT_HF_MODEL }}"
219
- revision="${{ env.DEFAULT_HF_MODEL_REV }}"
220
- fi
221
- safe_name="${model//\//--}"
222
- echo "model_key=${safe_name}_${revision}" >> "$GITHUB_ENV"
223
- echo "model_path=${HF_HUB_CACHE}/models--${safe_name}" >> "$GITHUB_ENV"
224
-
225
- if [[ -n "${{ matrix.test_suite.hf_model_2 }}" ]]; then
226
- model_2="${{ matrix.test_suite.hf_model_2 }}"
227
- revision_2="${{ matrix.test_suite.hf_model_2_rev}}"
228
- safe_name_2="${model_2//\//--}"
229
- echo "model_2_key=${safe_name_2}_${revision_2}" >> "$GITHUB_ENV"
230
- echo "model_2_path=${HF_HUB_CACHE}/models--${safe_name_2}" >> "$GITHUB_ENV"
231
- fi
232
-
233
195
  - name: "Restore HF models cache"
234
196
  id: cache_restore
235
197
  if: steps.changed-src-files.outputs.any_changed == 'true'
236
198
  uses: actions/cache/restore@v4
237
199
  with:
238
- path: ${{ env.model_path }}
239
- key: ${{ runner.os }}-hf-model-${{ env.model_key }}
240
-
241
- - name: "Restore HF models cache for additional model"
242
- id: cache_restore_2
243
- if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.test_suite.hf_model_2 )
244
- uses: actions/cache/restore@v4
245
- with:
246
- path: ${{ env.model_2_path }}
247
- key: ${{ runner.os }}-hf-model-${{ env.model_2_key }}
200
+ path: ${{ env.HF_HUB_CACHE }}
201
+ key: ${{ runner.os }}-hf-cache-${{ hashFiles('.github/ci_model_cache.yaml') }}
248
202
 
249
203
  - name: "Download HF models"
250
- if: ( steps.changed-src-files.outputs.any_changed == 'true' && (steps.cache_restore.outputs.cache-hit != 'true' || steps.cache_restore_2.outputs.cache-hit != 'true'))
204
+ if: ( steps.changed-src-files.outputs.any_changed == 'true' && steps.cache_restore.outputs.cache-hit != 'true' )
251
205
  run: |
252
- # We are caching HF models (HF_HUB_CACHE) for reliability rather than
253
- # speed, since HF downloads are flaky for concurrent jobs.
254
- # Be careful when adding models to the cache here, as the GHA cache is
255
- # limited to 10 GB.
256
- # If a new model is added here, a new hash key is generated. The
257
- # previous cache blob can then be removed by an admin or can be left
258
- # to expire after 7 days.
259
-
260
- if [[ -n "${{ matrix.test_suite.hf_model }}" ]]; then
261
- model="${{ matrix.test_suite.hf_model }}"
262
- revision="${{ matrix.test_suite.hf_model_rev }}"
263
- else
264
- model="${{ env.DEFAULT_HF_MODEL }}"
265
- revision="${{ env.DEFAULT_HF_MODEL_REV }}"
266
- fi
267
- model_2="${{ matrix.test_suite.hf_model_2 }}"
268
- revision_2="${{ matrix.test_suite.hf_model_2_rev }}"
269
-
270
- python3 tools/download_model.py -m "$model" -r "${revision:-main}" &
271
-
272
- if [[ -n "$model_2" ]]; then
273
- python3 tools/download_model.py -m "$model_2" -r "${revision_2:-main}" &
274
- fi
275
-
276
- wait
206
+ # The full HF_HUB_CACHE is cached as a single GHA entry keyed on the
207
+ # hash of .github/ci_model_cache.yaml. Edit that file to add/remove
208
+ # models; a fresh download + cache save will run on the next push to
209
+ # main. Stale cache blobs expire after 7 days (or can be deleted by
210
+ # an admin).
211
+ #
212
+ # We cache for reliability, not speed: HF downloads are flaky under
213
+ # concurrent jobs.
214
+ source .venv/bin/activate
215
+ python3 tools/download_model.py --config .github/ci_model_cache.yaml
277
216
 
278
- - name: "Check HF model cache sizes"
217
+ - name: "Check HF model cache size"
279
218
  if: steps.changed-src-files.outputs.any_changed == 'true'
280
219
  run: |
281
- # Guard against accidentally caching multi-GB model artifacts (e.g.
282
- # onnx/, openvino/, or duplicate framework weights). The GHA cache
283
- # has a 10 GB total limit; keep individual model caches well under
284
- # that so multiple matrix entries can coexist.
285
- MAX_BYTES=$((3 * 1024 * 1024 * 1024)) # 3 GiB (uncompressed; caches are compressed before upload)
286
- status=0
287
- for path in "${{ env.model_path }}" "${{ env.model_2_path }}"; do
288
- if [[ -n "$path" && -d "$path" ]]; then
289
- size=$(du -sb "$path" | cut -f1)
290
- human=$(du -sh "$path" | cut -f1)
291
- echo "Model cache $path: $human ($size bytes)"
292
- if (( size > MAX_BYTES )); then
293
- echo "::error::Model cache at $path is $human, exceeding the 3 GiB limit. Update tools/download_model.py to exclude unused artifacts (onnx/, openvino/, duplicate framework weights, etc)."
294
- status=1
295
- fi
220
+ # GHA cache has a 10 GB total limit. Keep the combined HF cache well
221
+ # under that so other caches (uv, etc) can coexist. If this fails,
222
+ # either prune entries from .github/ci_model_cache.yaml or update
223
+ # tools/download_model.py to exclude unused artifacts (onnx/,
224
+ # openvino/, duplicate framework weights, etc).
225
+ MAX_BYTES=$((8 * 1024 * 1024 * 1024)) # 8 GiB uncompressed
226
+ if [[ -d "${HF_HUB_CACHE}" ]]; then
227
+ size=$(du -sb "${HF_HUB_CACHE}" | cut -f1)
228
+ human=$(du -sh "${HF_HUB_CACHE}" | cut -f1)
229
+ echo "HF cache ${HF_HUB_CACHE}: $human ($size bytes)"
230
+ if (( size > MAX_BYTES )); then
231
+ echo "::error::HF cache is $human, exceeding the 8 GiB limit."
232
+ exit 1
296
233
  fi
297
- done
298
- exit $status
234
+ fi
299
235
 
300
236
  - name: "Save HF models cache"
301
237
  if: ( steps.changed-src-files.outputs.any_changed == 'true' && github.event_name != 'pull_request' && steps.cache_restore.outputs.cache-hit != 'true' )
302
238
  uses: actions/cache/save@v4
303
239
  with:
304
- path: ${{ env.model_path }}
305
- key: ${{ runner.os }}-hf-model-${{ env.model_key }}
306
-
307
- - name: "Save HF models cache for additional model"
308
- if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.test_suite.hf_model_2 && github.event_name != 'pull_request' && steps.cache_restore_2.outputs.cache-hit != 'true' )
309
- uses: actions/cache/save@v4
310
- with:
311
- path: ${{ env.model_2_path }}
312
- key: ${{ runner.os }}-hf-model-${{ env.model_2_key }}
240
+ path: ${{ env.HF_HUB_CACHE }}
241
+ key: ${{ runner.os }}-hf-cache-${{ hashFiles('.github/ci_model_cache.yaml') }}
313
242
 
314
243
  - name: "Run tests"
315
244
  if: steps.changed-src-files.outputs.any_changed == 'true'
@@ -1,14 +1,14 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sendnn-inference
3
- Version: 2.1.4
3
+ Version: 2.2.2
4
4
  Summary: vLLM plugin for Spyre hardware support
5
5
  License: Apache 2
6
6
  Requires-Python: >=3.11
7
7
  Description-Content-Type: text/markdown
8
8
  License-File: LICENSE
9
9
  Requires-Dist: fms-model-optimizer[fp8-infer]<0.9,>=0.8.3
10
- Requires-Dist: ibm-fms<2,>=1.9.0
11
- Requires-Dist: vllm<0.22.1,>=0.19.1
10
+ Requires-Dist: ibm-fms<2,>=1.11.1
11
+ Requires-Dist: vllm<0.23.1,>=0.19.1
12
12
  Requires-Dist: torch
13
13
  Requires-Dist: torchvision
14
14
  Dynamic: license-file
@@ -121,6 +121,15 @@ Prefix caching mirrors upstream vLLM, though the requirement for fixed-size pref
121
121
 
122
122
  When prefix caching is enabled, the `vllm:prefix_cache_queries` and `vllm:prefix_cache_hits` metrics correctly report prefix cache stats in tokens.
123
123
 
124
+ ### Multimodal Models
125
+
126
+ For multimodal models, vision encoding is offloaded to the CPU. To prevent expensive duplication of vision encoding, prefill for multimodal models is slightly different from that of text-only models. Vision encoding is done once per request instead of once per worker, so the threading configuration for multimodal models is also slightly different to improve performance.
127
+
128
+ Text-only models divide the number of available CPUs by the number of workers and assign only that many threads to each worker.
129
+ Multimodal models currently set the number of available threads to the number of available CPUs, ignoring the number of workers. This may change in the future.
130
+
131
+ The maximum number of available CPUs can also be set using `SENDNN_INFERENCE_NUM_CPUS`.
132
+
124
133
  ## Pooling Models
125
134
 
126
135
  For the embedding, scoring, and reranking tasks, vLLM supports running Pooling Models. More information on Pooling Models can be found in the [vLLM official documentation](https://docs.vllm.ai/en/latest/models/pooling_models/).
@@ -45,6 +45,18 @@ if __name__ == "__main__":
45
45
  )
46
46
  parser.add_argument("--max-num-batched-tokens", type=int, default=1024)
47
47
  parser.add_argument("--backend", type=str, default="sendnn", choices=["eager", "sendnn"])
48
+ parser.add_argument(
49
+ "--tokenizer",
50
+ type=str,
51
+ default=None,
52
+ help="HF tokenizer id or path. Defaults to --model.",
53
+ )
54
+ parser.add_argument(
55
+ "--load-format",
56
+ type=str,
57
+ default="auto",
58
+ help="vLLM load format: auto, dummy, safetensors, pt, ... `dummy` random-inits weights.",
59
+ )
48
60
 
49
61
  args = parser.parse_args()
50
62
 
@@ -95,7 +107,7 @@ if __name__ == "__main__":
95
107
  prompts = prompts * (args.num_prompts // len(prompts) + 1)
96
108
  prompts = prompts[0 : args.num_prompts]
97
109
 
98
- tokenizer = AutoTokenizer.from_pretrained(args.model)
110
+ tokenizer = AutoTokenizer.from_pretrained(args.tokenizer or args.model)
99
111
 
100
112
  tokenized_prompts = tokenizer(prompts)["input_ids"]
101
113
  tokenized_prompts = [p[: args.max_prompt_len] for p in tokenized_prompts]
@@ -124,7 +136,8 @@ if __name__ == "__main__":
124
136
  # Create an LLM.
125
137
  llm = LLM(
126
138
  model=args.model,
127
- tokenizer=args.model,
139
+ tokenizer=args.tokenizer or args.model,
140
+ load_format=args.load_format,
128
141
  max_model_len=args.max_model_len,
129
142
  max_num_seqs=args.max_num_seqs,
130
143
  tensor_parallel_size=args.tp,
@@ -29,6 +29,18 @@ if __name__ == "__main__":
29
29
  )
30
30
  parser.add_argument("--max-num-batched-tokens", type=int, default=1024)
31
31
  parser.add_argument("--backend", type=str, default="eager", choices=["eager", "sendnn"])
32
+ parser.add_argument(
33
+ "--tokenizer",
34
+ type=str,
35
+ default=None,
36
+ help="HF tokenizer id or path. Defaults to --model.",
37
+ )
38
+ parser.add_argument(
39
+ "--load-format",
40
+ type=str,
41
+ default="auto",
42
+ help="vLLM load format: auto, dummy, safetensors, pt, ... `dummy` random-inits weights.",
43
+ )
32
44
 
33
45
  args = parser.parse_args()
34
46
 
@@ -84,7 +96,8 @@ if __name__ == "__main__":
84
96
  # Create an LLM.
85
97
  llm = LLM(
86
98
  model=args.model,
87
- tokenizer=args.model,
99
+ tokenizer=args.tokenizer or args.model,
100
+ load_format=args.load_format,
88
101
  max_model_len=args.max_model_len,
89
102
  max_num_seqs=args.max_num_seqs,
90
103
  tensor_parallel_size=args.tp,
@@ -1,7 +1,7 @@
1
1
  [build-system]
2
2
  requires = [
3
3
  "setuptools>=82",
4
- "setuptools_scm>=8"
4
+ "setuptools_scm>=8,<10"
5
5
  ]
6
6
  build-backend = "setuptools.build_meta"
7
7
 
@@ -12,10 +12,10 @@ readme = "README.md"
12
12
  license = {text = "Apache 2"}
13
13
  dependencies = [
14
14
  "fms-model-optimizer[fp8-infer]>=0.8.3,<0.9",
15
- "ibm-fms>=1.9.0,<2",
15
+ "ibm-fms>=1.11.1,<2",
16
16
  # NB: use strict < with the next patch version to not exclude versions with
17
17
  # build metadata suffixes
18
- "vllm>=0.19.1,<0.22.1",
18
+ "vllm>=0.19.1,<0.23.1",
19
19
 
20
20
  # Specific torch version overrides handled by uv
21
21
  "torch",
@@ -54,6 +54,7 @@ git_describe_command = "git describe --dirty --tags --long --match 'v*'"
54
54
  # by accident
55
55
  override-dependencies = [
56
56
  "torch==2.11.0",
57
+ "torchvision==0.26.0",
57
58
  "triton; sys_platform == 'never'",
58
59
  "intel-extension-for-pytorch; sys_platform == 'never'",
59
60
 
@@ -89,7 +90,7 @@ build-constraint-dependencies = []
89
90
  extra-build-variables = { vllm = { VLLM_TARGET_DEVICE = "empty" } }
90
91
 
91
92
  [tool.uv.sources]
92
- vllm = { git = "https://github.com/vllm-project/vllm", rev = "v0.22.0" }
93
+ vllm = { git = "https://github.com/vllm-project/vllm", rev = "v0.23.0" }
93
94
  torch = [
94
95
  { index = "pytorch-cpu" },
95
96
  ]
@@ -243,7 +244,7 @@ dev = [
243
244
  "pytest-forked>=1.6.0",
244
245
  "pytest-timeout==2.3.1",
245
246
  "requests==2.32.3",
246
- "sentence-transformers==3.4.1",
247
+ "sentence-transformers>=3.4.1",
247
248
  "aiu-fms-testing-utils>=0.8.2",
248
249
  "pytest-mock>=3.15.0",
249
250
  ]
@@ -0,0 +1,34 @@
1
+ # file generated by setuptools-scm
2
+ # don't change, don't track in version control
3
+
4
+ __all__ = [
5
+ "__version__",
6
+ "__version_tuple__",
7
+ "version",
8
+ "version_tuple",
9
+ "__commit_id__",
10
+ "commit_id",
11
+ ]
12
+
13
+ TYPE_CHECKING = False
14
+ if TYPE_CHECKING:
15
+ from typing import Tuple
16
+ from typing import Union
17
+
18
+ VERSION_TUPLE = Tuple[Union[int, str], ...]
19
+ COMMIT_ID = Union[str, None]
20
+ else:
21
+ VERSION_TUPLE = object
22
+ COMMIT_ID = object
23
+
24
+ version: str
25
+ __version__: str
26
+ __version_tuple__: VERSION_TUPLE
27
+ version_tuple: VERSION_TUPLE
28
+ commit_id: COMMIT_ID
29
+ __commit_id__: COMMIT_ID
30
+
31
+ __version__ = version = '2.2.2'
32
+ __version_tuple__ = version_tuple = (2, 2, 2)
33
+
34
+ __commit_id__ = commit_id = 'gd054d78'
@@ -162,7 +162,7 @@ class ConditionalDefaultManager:
162
162
  namespace: argparse.Namespace | None = None,
163
163
  ) -> argparse.Namespace:
164
164
  result = original_parse_args(self, args, namespace)
165
- assert result is not None # type: ignore[redundant-expr]
165
+ assert result is not None
166
166
 
167
167
  if args is None or len(args) == 0:
168
168
  # Don't override anything if there were no args parsed
@@ -9,6 +9,22 @@
9
9
 
10
10
  # templates for reuse via YAML anchors
11
11
  _templates:
12
+ granite_41_30b_architecture: &granite_41_30b_architecture
13
+ model_type: granite
14
+ num_hidden_layers: 64
15
+ max_position_embeddings: 131072
16
+ hidden_size: 4096
17
+ vocab_size: 100352
18
+ num_key_value_heads: 8
19
+ num_attention_heads: 32
20
+
21
+ # device config for TP=4 Granite 4.1 30b models
22
+ granite_41_30b_tp4_device_config: &granite_41_30b_tp4_device_config
23
+ env_vars:
24
+ VLLM_DT_MAX_BATCH_TKV_LIMIT: 131072 # 128k
25
+ FLEX_HDMA_P2PSIZE: 268435456 # 256MB
26
+ FLEX_HDMA_COLLSIZE: 33554432 # 32MB
27
+ num_gpu_blocks_override: 2080
12
28
 
13
29
  granite_4_8b_architecture: &granite_4_8b_architecture
14
30
  model_type: granite
@@ -35,7 +51,7 @@ _templates:
35
51
  FLEX_HDMA_P2PSIZE: 268435456 # 256MB
36
52
  FLEX_HDMA_COLLSIZE: 33554432 # 32MB
37
53
  num_gpu_blocks_override: 8192
38
-
54
+
39
55
  granite_vision_33_2b_architecture: &granite_vision_33_2b_architecture
40
56
  model_type: llava_next
41
57
  text_config:
@@ -166,7 +182,7 @@ models:
166
182
  max_model_len: 32768
167
183
  max_num_seqs: 32
168
184
  device_config: *granite_8b_tp4_device_config
169
-
185
+
170
186
  # Llama 3.1 8B Instruct
171
187
  meta-llama/Llama-3.1-8B-Instruct:
172
188
  architecture: *llama3_8b_architecture
@@ -247,6 +263,15 @@ models:
247
263
  max_num_seqs: 32
248
264
  device_config: *granite_8b_tp4_device_config
249
265
 
266
+ # Granite 4.1 30B
267
+ ibm-granite/granite-4.1-30b:
268
+ architecture: *granite_41_30b_architecture
269
+ continuous_batching_configs:
270
+ - tp_size: 4
271
+ max_model_len: 32768
272
+ max_num_seqs: 32
273
+ device_config: *granite_41_30b_tp4_device_config
274
+
250
275
  # Granite Vision 3.3 2B
251
276
  ibm-granite/granite-vision-3.3-2b:
252
277
  architecture: *granite_vision_33_2b_architecture
@@ -255,14 +280,14 @@ models:
255
280
  max_model_len: 8192
256
281
  max_num_seqs: 16
257
282
  - tp_size: 2
258
- max_model_len: 16382
283
+ max_model_len: 16384
259
284
  max_num_seqs: 16
260
285
  device_config: *granite_vision_2b_tp2_device_config
261
286
  - tp_size: 4
262
287
  max_model_len: 32768
263
288
  max_num_seqs: 32
264
289
  device_config: *granite_vision_2b_tp4_device_config
265
-
290
+
266
291
  # Mistral Small 3.2 24B Instruct
267
292
  mistralai/Mistral-Small-3.2-24B-Instruct-2506:
268
293
  architecture: *mistral3_24b_architecture
@@ -279,6 +304,9 @@ models:
279
304
  mistralai/Ministral-3-14B-Instruct-2512-BF16:
280
305
  architecture: *ministral3_14b_architecture
281
306
  continuous_batching_configs:
307
+ - tp_size: 1
308
+ max_model_len: 4096
309
+ max_num_seqs: 32
282
310
  - tp_size: 4
283
311
  max_model_len: 32768
284
312
  max_num_seqs: 32
@@ -310,6 +338,30 @@ models:
310
338
  - prompt_len: 512
311
339
  batch_size: 64
312
340
 
341
+ Qwen/Qwen3-Embedding-0.6B:
342
+ architecture:
343
+ model_type: qwen3
344
+ num_hidden_layers: 28
345
+ vocab_size: 151669
346
+
347
+ static_batching_configs:
348
+ - tp_size: 1
349
+ warmup_shapes:
350
+ - prompt_len: 512
351
+ batch_size: 64
352
+
353
+ Qwen/Qwen3-Embedding-4B:
354
+ architecture:
355
+ model_type: qwen3
356
+ num_hidden_layers: 36
357
+ vocab_size: 151665
358
+
359
+ static_batching_configs:
360
+ - tp_size: 1
361
+ warmup_shapes:
362
+ - prompt_len: 512
363
+ batch_size: 64
364
+
313
365
  # Other supported models (static batching only)
314
366
  intfloat/multilingual-e5-large:
315
367
  architecture:
@@ -26,6 +26,7 @@ if TYPE_CHECKING:
26
26
  SENDNN_INFERENCE_MODEL_CONFIG_FILE: str | None = None
27
27
  SENDNN_INFERENCE_CPU_MM_DTYPE: torch.dtype = torch.float16
28
28
  SENDNN_INFERENCE_MM_DEVICE: str = "auto"
29
+ SENDNN_INFERENCE_TP_MM_SHARING: bool = True
29
30
 
30
31
  logger = init_logger(__name__)
31
32
 
@@ -92,6 +93,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
92
93
  ),
93
94
  # Allow sendnn-inference to update env vars related to multi-threading (eg. OMP)
94
95
  # based on the detected CPU cores and server configuration
96
+ # Multimodal models will not take into account the number of workers for configuration.
95
97
  "SENDNN_INFERENCE_UPDATE_THREAD_CONFIG": lambda: bool(
96
98
  int(os.getenv("SENDNN_INFERENCE_UPDATE_THREAD_CONFIG", "1"))
97
99
  ),
@@ -171,6 +173,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
171
173
  "SENDNN_INFERENCE_MM_DEVICE": lambda: parse_mm_device(
172
174
  os.getenv("SENDNN_INFERENCE_MM_DEVICE", "auto")
173
175
  ),
176
+ # When "1" (default), rank 0 runs the vision encoder and shares the result
177
+ # with other TP ranks via POSIX shared memory (one encoder call instead of
178
+ # world_size calls). Set to "0" to fall back to every TP rank running the
179
+ # vision encoder independently — the original behaviour, which avoids any
180
+ # SHM-related failure modes at the cost of redundant CPU work.
181
+ "SENDNN_INFERENCE_TP_MM_SHARING": lambda: bool(
182
+ int(os.getenv("SENDNN_INFERENCE_TP_MM_SHARING", "1"))
183
+ ),
174
184
  }
175
185
  # --8<-- [end:env-vars-definition]
176
186