sendnn-inference 2.2.0__tar.gz → 2.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/workflows/test.yml +10 -1
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/PKG-INFO +2 -2
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/user_guide/configuration.md +9 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/examples/offline_inference/long_context.py +15 -2
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/examples/offline_inference/text_inference.py +14 -1
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/pyproject.toml +3 -3
- sendnn_inference-2.2.2/sendnn_inference/_version.py +34 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/argparse_utils.py +1 -1
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/config/model_configs.yaml +31 -3
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/envs.py +10 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/model_executor/model_loader/spyre.py +26 -10
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/platform.py +16 -3
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/v1/core/scheduler.py +43 -33
- sendnn_inference-2.2.2/sendnn_inference/v1/worker/mm_shared_memory.py +130 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/v1/worker/spyre_model_runner.py +160 -30
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/v1/worker/spyre_worker.py +29 -3
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference.egg-info/PKG-INFO +2 -2
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference.egg-info/SOURCES.txt +3 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference.egg-info/requires.txt +1 -1
- sendnn_inference-2.2.2/tests/e2e/test_load_format_dummy.py +39 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_basic.py +3 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_pc_scheduler_steps.py +27 -27
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/llm_cache.py +5 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/utils/test_upstream_compatibility.py +17 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/v1/core/test_scheduler_structured_outputs.py +6 -0
- sendnn_inference-2.2.2/tests/v1/worker/test_mm_shared_memory.py +328 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/uv.lock +16 -118
- sendnn_inference-2.2.0/sendnn_inference/_version.py +0 -24
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.bob/skills/update-vllm/SKILL.md +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.claude/skills/update-vllm/SKILL.md +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/CODEOWNERS +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/ISSUE_TEMPLATE/bug-report.yml +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/ISSUE_TEMPLATE/feature-request.yml +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/ISSUE_TEMPLATE/rfc.yml +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/actions/free-up-disk-space/action.yml +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/ci_model_cache.yaml +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/pull_request_template.md +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/workflows/add_label_automerge.yml +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/workflows/build_and_publish.yaml +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/workflows/build_docker.yml +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/workflows/check_uv_lock.yml +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/workflows/lint_scripts.yml +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/workflows/matchers/ruff.json +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/workflows/model_smoke.yml +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/workflows/pre-commit.yml +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/workflows/publish_to_test_pypi.yaml +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/workflows/reminder_comment.yml +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/workflows/stale.yml +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.gitignore +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.pre-commit-config.yaml +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.readthedocs.yaml +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.shellcheckrc +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.yapfignore +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/CLAUDE.md +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/CODE_OF_CONDUCT.md +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/CONTRIBUTING.md +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/DCO +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/LICENSE +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/README.md +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/RELEASING.md +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/_local_envs_for_test.sh +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docker/.senlib.json +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docker/Dockerfile.amd64 +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docker/simple_vllm_serve.sh +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/.nav.yml +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/README.md +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/contributing/README.md +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/contributing/architecture.md +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/contributing/images/vllm_v1.svg +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/contributing/images/vllm_v1_spyre.svg +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/contributing/maintaining.md +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/contributing/multimodal/adding_new_models.md +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/contributing/scheduler.md +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/contributing/vllm-update-procedure.md +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/deploying/docker.md +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/deploying/k8s.md +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/deploying/rhoai.md +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/getting_started/installation.md +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/data/generate_example.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/data/prefill_single_chunks.json +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/data/prefill_three_chunks.json +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/data/prefix_caching_1.json +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/data/prefix_caching_2.json +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/data/prefix_caching_3.json +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/data/scheduling_admission_constraints.json +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/data/scheduling_padding_tkv_jump.json +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/data/timeline_admission_constraints.json +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/hooks/generate_examples.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/hooks/generate_model_tables.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/hooks/generate_prefill_only_plots.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/hooks/generate_prefix_caching_plots.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/hooks/generate_scheduling_plots.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/hooks/generate_timeline.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/hooks/url_schemes.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/mkdocs/overrides/main.html +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/requirements-docs.txt +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/roadmaps/q3-2025.md +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/user_guide/env_vars.md +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/user_guide/performance.md +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/user_guide/supported_features.md +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/user_guide/supported_models.md +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/examples/offline_inference/vision_inference.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/examples/online_inference/openai_spyre_text.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/examples/online_inference/openai_spyre_vision.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/examples/online_inference/spyre_vllm_benchmark.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/examples/online_inference/spyre_vllm_setup_container.sh +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/format.sh +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/mkdocs.yaml +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/__init__.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/compat_utils.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/compilation_utils.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/config/README.md +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/config/__init__.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/config/configurators/__init__.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/config/configurators/model_configurator.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/config/model_config.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/config/model_matcher.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/config/model_registry.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/model_executor/__init__.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/model_executor/model_loader/__init__.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/model_executor/model_loader/spyre_setup.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/multimodal/__init__.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/multimodal/mm_mappings/__init__.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/multimodal/mm_mappings/base.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/multimodal/mm_mappings/llava_next.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/multimodal/mm_mappings/mistral3.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/perf_metrics.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/utils.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/v1/__init__.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/v1/core/__init__.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/v1/metrics/__init__.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/v1/metrics/stats_logger.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/v1/sample/golden_token_injector.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/v1/sample/spyre_logits_processor.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/v1/worker/__init__.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/v1/worker/spyre_input_batch.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference.egg-info/dependency_links.txt +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference.egg-info/entry_points.txt +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference.egg-info/top_level.txt +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/setup.cfg +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/aftu/graph_compare_utils.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/aftu/test_compare_graphs.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/config/__init__.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/config/conftest.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/config/fixtures/test_error_handling_models.yaml +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/config/test_env_config_path.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/config/test_error_handling.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/config/test_integration.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/config/test_model_config.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/config/test_model_configurator.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/config/test_model_matcher.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/config/test_model_registry.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/conftest.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/download_model_configs.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_chunked_prefill.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_chunked_prefill_tkv_steps.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_logits_processors.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_model_smoke.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_profiler.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_sampling_params.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_async_llm.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_cp_scheduler_steps.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_embeddings.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_mm.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_online.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_scoring.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_seed.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_stagger_basic.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_stats_logger.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/e2e/test_structured_outputs.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/BAAI/bge-reranker-large/config.json +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/BAAI/bge-reranker-v2-m3/config.json +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/ibm-ai-platform/micro-g3.3-8b-instruct-1b/config.json +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/ibm-granite/granite-3.3-8b-instruct/config.json +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/ibm-granite/granite-3.3-8b-instruct-FP8/config.json +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/ibm-granite/granite-4-8b-dense/config.json +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/ibm-granite/granite-4-8b-dense-hybrid/config.json +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/ibm-granite/granite-embedding-125m-english/config.json +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/ibm-granite/granite-embedding-278m-multilingual/config.json +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/intfloat/multilingual-e5-large/config.json +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/sentence-transformers/all-roberta-large-v1/config.json +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/hf_cache.json +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/hf_result_cache.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/llm_cache_util.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/model_loader/test_spyre.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/multimodal/test_llava_next.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/multimodal/test_mistral3.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/output_util.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/precompilation/test_disable_compilation.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/scheduling_utils.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/spyre_util.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/utils/bge_copy/config.json +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/utils/test_cli_args.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/utils/test_envs.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/utils/test_golden_token_injector.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/utils/test_platform_validation.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/utils/test_spyre_model_list.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/v1/worker/mock_model.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/v1/worker/test_prefix_caching_worker.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/v1/worker/test_scheduler_tkv_limits.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/v1/worker/test_spyre_input_batch.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tests/v1/worker/test_spyre_worker_profile.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tools/check_aiu.sh +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tools/check_repo.sh +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tools/download_model.py +0 -0
- {sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/tools/lint_scripts.sh +0 -0
|
@@ -27,7 +27,7 @@ concurrency:
|
|
|
27
27
|
|
|
28
28
|
jobs:
|
|
29
29
|
test:
|
|
30
|
-
timeout-minutes:
|
|
30
|
+
timeout-minutes: 25
|
|
31
31
|
runs-on: ${{ matrix.os }}
|
|
32
32
|
strategy:
|
|
33
33
|
fail-fast: false
|
|
@@ -108,6 +108,15 @@ jobs:
|
|
|
108
108
|
flags: "--timeout=300"
|
|
109
109
|
os: "ubuntu-latest"
|
|
110
110
|
python_version: "3.12"
|
|
111
|
+
- vllm_version:
|
|
112
|
+
name: "vLLM:0.22.0"
|
|
113
|
+
repo: "git+https://github.com/vllm-project/vllm --tag v0.22.0"
|
|
114
|
+
test_suite:
|
|
115
|
+
name: "backward compat"
|
|
116
|
+
markers: "compat or (cpu and basic and not quantized)"
|
|
117
|
+
flags: "--timeout=300"
|
|
118
|
+
os: "ubuntu-latest"
|
|
119
|
+
python_version: "3.12"
|
|
111
120
|
|
|
112
121
|
|
|
113
122
|
# Only run vllm:main jobs on PRs with `vllm:main` label
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sendnn-inference
|
|
3
|
-
Version: 2.2.0
|
|
3
|
+
Version: 2.2.2
|
|
4
4
|
Summary: vLLM plugin for Spyre hardware support
|
|
5
5
|
License: Apache 2
|
|
6
6
|
Requires-Python: >=3.11
|
|
@@ -8,7 +8,7 @@ Description-Content-Type: text/markdown
|
|
|
8
8
|
License-File: LICENSE
|
|
9
9
|
Requires-Dist: fms-model-optimizer[fp8-infer]<0.9,>=0.8.3
|
|
10
10
|
Requires-Dist: ibm-fms<2,>=1.11.1
|
|
11
|
-
Requires-Dist: vllm<0.
|
|
11
|
+
Requires-Dist: vllm<0.23.1,>=0.19.1
|
|
12
12
|
Requires-Dist: torch
|
|
13
13
|
Requires-Dist: torchvision
|
|
14
14
|
Dynamic: license-file
|
|
@@ -121,6 +121,15 @@ Prefix caching mirrors upstream vLLM, though the requirement for fixed-size pref
|
|
|
121
121
|
|
|
122
122
|
When prefix caching is enabled, the `vllm:prefix_cache_queries` and `vllm:prefix_cache_hits` metrics correctly report prefix cache stats in tokens.
|
|
123
123
|
|
|
124
|
+
### Multimodal Models
|
|
125
|
+
|
|
126
|
+
For multimodal models, vision encoding is offloaded to the CPU. To prevent expensive duplication of vision encoding, prefill for multimodal models differs slightly from that of text-only models. Vision encoding is done once per request instead of once per worker, so the threading configuration for multimodal models is also slightly different to improve performance.
|
|
127
|
+
|
|
128
|
+
Text-only models set the number of threads per worker by dividing the number of available CPUs by the number of workers.
|
|
129
|
+
Multimodal models currently set the number of available threads to the number of available CPUs, ignoring the number of workers. This may change in the future.
|
|
130
|
+
|
|
131
|
+
The maximum number of available CPUs can also be set using `SENDNN_INFERENCE_NUM_CPUS`.
|
|
132
|
+
|
|
124
133
|
## Pooling Models
|
|
125
134
|
|
|
126
135
|
For the embedding, scoring, and reranking tasks, vLLM supports running Pooling Models. More information on Pooling Models can be found in the [vLLM official documentation](https://docs.vllm.ai/en/latest/models/pooling_models/).
|
{sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/examples/offline_inference/long_context.py
RENAMED
|
@@ -45,6 +45,18 @@ if __name__ == "__main__":
|
|
|
45
45
|
)
|
|
46
46
|
parser.add_argument("--max-num-batched-tokens", type=int, default=1024)
|
|
47
47
|
parser.add_argument("--backend", type=str, default="sendnn", choices=["eager", "sendnn"])
|
|
48
|
+
parser.add_argument(
|
|
49
|
+
"--tokenizer",
|
|
50
|
+
type=str,
|
|
51
|
+
default=None,
|
|
52
|
+
help="HF tokenizer id or path. Defaults to --model.",
|
|
53
|
+
)
|
|
54
|
+
parser.add_argument(
|
|
55
|
+
"--load-format",
|
|
56
|
+
type=str,
|
|
57
|
+
default="auto",
|
|
58
|
+
help="vLLM load format: auto, dummy, safetensors, pt, ... `dummy` random-inits weights.",
|
|
59
|
+
)
|
|
48
60
|
|
|
49
61
|
args = parser.parse_args()
|
|
50
62
|
|
|
@@ -95,7 +107,7 @@ if __name__ == "__main__":
|
|
|
95
107
|
prompts = prompts * (args.num_prompts // len(prompts) + 1)
|
|
96
108
|
prompts = prompts[0 : args.num_prompts]
|
|
97
109
|
|
|
98
|
-
tokenizer = AutoTokenizer.from_pretrained(args.model)
|
|
110
|
+
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer or args.model)
|
|
99
111
|
|
|
100
112
|
tokenized_prompts = tokenizer(prompts)["input_ids"]
|
|
101
113
|
tokenized_prompts = [p[: args.max_prompt_len] for p in tokenized_prompts]
|
|
@@ -124,7 +136,8 @@ if __name__ == "__main__":
|
|
|
124
136
|
# Create an LLM.
|
|
125
137
|
llm = LLM(
|
|
126
138
|
model=args.model,
|
|
127
|
-
tokenizer=args.model,
|
|
139
|
+
tokenizer=args.tokenizer or args.model,
|
|
140
|
+
load_format=args.load_format,
|
|
128
141
|
max_model_len=args.max_model_len,
|
|
129
142
|
max_num_seqs=args.max_num_seqs,
|
|
130
143
|
tensor_parallel_size=args.tp,
|
{sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/examples/offline_inference/text_inference.py
RENAMED
|
@@ -29,6 +29,18 @@ if __name__ == "__main__":
|
|
|
29
29
|
)
|
|
30
30
|
parser.add_argument("--max-num-batched-tokens", type=int, default=1024)
|
|
31
31
|
parser.add_argument("--backend", type=str, default="eager", choices=["eager", "sendnn"])
|
|
32
|
+
parser.add_argument(
|
|
33
|
+
"--tokenizer",
|
|
34
|
+
type=str,
|
|
35
|
+
default=None,
|
|
36
|
+
help="HF tokenizer id or path. Defaults to --model.",
|
|
37
|
+
)
|
|
38
|
+
parser.add_argument(
|
|
39
|
+
"--load-format",
|
|
40
|
+
type=str,
|
|
41
|
+
default="auto",
|
|
42
|
+
help="vLLM load format: auto, dummy, safetensors, pt, ... `dummy` random-inits weights.",
|
|
43
|
+
)
|
|
32
44
|
|
|
33
45
|
args = parser.parse_args()
|
|
34
46
|
|
|
@@ -84,7 +96,8 @@ if __name__ == "__main__":
|
|
|
84
96
|
# Create an LLM.
|
|
85
97
|
llm = LLM(
|
|
86
98
|
model=args.model,
|
|
87
|
-
tokenizer=args.model,
|
|
99
|
+
tokenizer=args.tokenizer or args.model,
|
|
100
|
+
load_format=args.load_format,
|
|
88
101
|
max_model_len=args.max_model_len,
|
|
89
102
|
max_num_seqs=args.max_num_seqs,
|
|
90
103
|
tensor_parallel_size=args.tp,
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
[build-system]
|
|
2
2
|
requires = [
|
|
3
3
|
"setuptools>=82",
|
|
4
|
-
"setuptools_scm>=8"
|
|
4
|
+
"setuptools_scm>=8,<10"
|
|
5
5
|
]
|
|
6
6
|
build-backend = "setuptools.build_meta"
|
|
7
7
|
|
|
@@ -15,7 +15,7 @@ dependencies = [
|
|
|
15
15
|
"ibm-fms>=1.11.1,<2",
|
|
16
16
|
# NB: use strict < with the next patch version to not exclude versions with
|
|
17
17
|
# build metadata suffixes
|
|
18
|
-
"vllm>=0.19.1,<0.
|
|
18
|
+
"vllm>=0.19.1,<0.23.1",
|
|
19
19
|
|
|
20
20
|
# Specific torch version overrides handled by uv
|
|
21
21
|
"torch",
|
|
@@ -90,7 +90,7 @@ build-constraint-dependencies = []
|
|
|
90
90
|
extra-build-variables = { vllm = { VLLM_TARGET_DEVICE = "empty" } }
|
|
91
91
|
|
|
92
92
|
[tool.uv.sources]
|
|
93
|
-
vllm = { git = "https://github.com/vllm-project/vllm", rev = "v0.
|
|
93
|
+
vllm = { git = "https://github.com/vllm-project/vllm", rev = "v0.23.0" }
|
|
94
94
|
torch = [
|
|
95
95
|
{ index = "pytorch-cpu" },
|
|
96
96
|
]
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# file generated by setuptools-scm
|
|
2
|
+
# don't change, don't track in version control
|
|
3
|
+
|
|
4
|
+
__all__ = [
|
|
5
|
+
"__version__",
|
|
6
|
+
"__version_tuple__",
|
|
7
|
+
"version",
|
|
8
|
+
"version_tuple",
|
|
9
|
+
"__commit_id__",
|
|
10
|
+
"commit_id",
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
TYPE_CHECKING = False
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from typing import Tuple
|
|
16
|
+
from typing import Union
|
|
17
|
+
|
|
18
|
+
VERSION_TUPLE = Tuple[Union[int, str], ...]
|
|
19
|
+
COMMIT_ID = Union[str, None]
|
|
20
|
+
else:
|
|
21
|
+
VERSION_TUPLE = object
|
|
22
|
+
COMMIT_ID = object
|
|
23
|
+
|
|
24
|
+
version: str
|
|
25
|
+
__version__: str
|
|
26
|
+
__version_tuple__: VERSION_TUPLE
|
|
27
|
+
version_tuple: VERSION_TUPLE
|
|
28
|
+
commit_id: COMMIT_ID
|
|
29
|
+
__commit_id__: COMMIT_ID
|
|
30
|
+
|
|
31
|
+
__version__ = version = '2.2.2'
|
|
32
|
+
__version_tuple__ = version_tuple = (2, 2, 2)
|
|
33
|
+
|
|
34
|
+
__commit_id__ = commit_id = 'gd054d78'
|
|
@@ -162,7 +162,7 @@ class ConditionalDefaultManager:
|
|
|
162
162
|
namespace: argparse.Namespace | None = None,
|
|
163
163
|
) -> argparse.Namespace:
|
|
164
164
|
result = original_parse_args(self, args, namespace)
|
|
165
|
-
assert result is not None
|
|
165
|
+
assert result is not None
|
|
166
166
|
|
|
167
167
|
if args is None or len(args) == 0:
|
|
168
168
|
# Don't override anything if there were no args parsed
|
{sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/config/model_configs.yaml
RENAMED
|
@@ -9,6 +9,22 @@
|
|
|
9
9
|
|
|
10
10
|
# templates for reuse via YAML anchors
|
|
11
11
|
_templates:
|
|
12
|
+
granite_41_30b_architecture: &granite_41_30b_architecture
|
|
13
|
+
model_type: granite
|
|
14
|
+
num_hidden_layers: 64
|
|
15
|
+
max_position_embeddings: 131072
|
|
16
|
+
hidden_size: 4096
|
|
17
|
+
vocab_size: 100352
|
|
18
|
+
num_key_value_heads: 8
|
|
19
|
+
num_attention_heads: 32
|
|
20
|
+
|
|
21
|
+
# device config for TP=4 Granite 4.1 30b models
|
|
22
|
+
granite_41_30b_tp4_device_config: &granite_41_30b_tp4_device_config
|
|
23
|
+
env_vars:
|
|
24
|
+
VLLM_DT_MAX_BATCH_TKV_LIMIT: 131072 # 128k
|
|
25
|
+
FLEX_HDMA_P2PSIZE: 268435456 # 256MB
|
|
26
|
+
FLEX_HDMA_COLLSIZE: 33554432 # 32MB
|
|
27
|
+
num_gpu_blocks_override: 2080
|
|
12
28
|
|
|
13
29
|
granite_4_8b_architecture: &granite_4_8b_architecture
|
|
14
30
|
model_type: granite
|
|
@@ -35,7 +51,7 @@ _templates:
|
|
|
35
51
|
FLEX_HDMA_P2PSIZE: 268435456 # 256MB
|
|
36
52
|
FLEX_HDMA_COLLSIZE: 33554432 # 32MB
|
|
37
53
|
num_gpu_blocks_override: 8192
|
|
38
|
-
|
|
54
|
+
|
|
39
55
|
granite_vision_33_2b_architecture: &granite_vision_33_2b_architecture
|
|
40
56
|
model_type: llava_next
|
|
41
57
|
text_config:
|
|
@@ -166,7 +182,7 @@ models:
|
|
|
166
182
|
max_model_len: 32768
|
|
167
183
|
max_num_seqs: 32
|
|
168
184
|
device_config: *granite_8b_tp4_device_config
|
|
169
|
-
|
|
185
|
+
|
|
170
186
|
# Llama 3.1 8B Instruct
|
|
171
187
|
meta-llama/Llama-3.1-8B-Instruct:
|
|
172
188
|
architecture: *llama3_8b_architecture
|
|
@@ -247,6 +263,15 @@ models:
|
|
|
247
263
|
max_num_seqs: 32
|
|
248
264
|
device_config: *granite_8b_tp4_device_config
|
|
249
265
|
|
|
266
|
+
# Granite 4.1 30B
|
|
267
|
+
ibm-granite/granite-4.1-30b:
|
|
268
|
+
architecture: *granite_41_30b_architecture
|
|
269
|
+
continuous_batching_configs:
|
|
270
|
+
- tp_size: 4
|
|
271
|
+
max_model_len: 32768
|
|
272
|
+
max_num_seqs: 32
|
|
273
|
+
device_config: *granite_41_30b_tp4_device_config
|
|
274
|
+
|
|
250
275
|
# Granite Vision 3.3 2B
|
|
251
276
|
ibm-granite/granite-vision-3.3-2b:
|
|
252
277
|
architecture: *granite_vision_33_2b_architecture
|
|
@@ -262,7 +287,7 @@ models:
|
|
|
262
287
|
max_model_len: 32768
|
|
263
288
|
max_num_seqs: 32
|
|
264
289
|
device_config: *granite_vision_2b_tp4_device_config
|
|
265
|
-
|
|
290
|
+
|
|
266
291
|
# Mistral Small 3.2 24B Instruct
|
|
267
292
|
mistralai/Mistral-Small-3.2-24B-Instruct-2506:
|
|
268
293
|
architecture: *mistral3_24b_architecture
|
|
@@ -279,6 +304,9 @@ models:
|
|
|
279
304
|
mistralai/Ministral-3-14B-Instruct-2512-BF16:
|
|
280
305
|
architecture: *ministral3_14b_architecture
|
|
281
306
|
continuous_batching_configs:
|
|
307
|
+
- tp_size: 1
|
|
308
|
+
max_model_len: 4096
|
|
309
|
+
max_num_seqs: 32
|
|
282
310
|
- tp_size: 4
|
|
283
311
|
max_model_len: 32768
|
|
284
312
|
max_num_seqs: 32
|
|
@@ -26,6 +26,7 @@ if TYPE_CHECKING:
|
|
|
26
26
|
SENDNN_INFERENCE_MODEL_CONFIG_FILE: str | None = None
|
|
27
27
|
SENDNN_INFERENCE_CPU_MM_DTYPE: torch.dtype = torch.float16
|
|
28
28
|
SENDNN_INFERENCE_MM_DEVICE: str = "auto"
|
|
29
|
+
SENDNN_INFERENCE_TP_MM_SHARING: bool = True
|
|
29
30
|
|
|
30
31
|
logger = init_logger(__name__)
|
|
31
32
|
|
|
@@ -92,6 +93,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|
|
92
93
|
),
|
|
93
94
|
# Allow sendnn-inference to update env vars related to multi-threading (eg. OMP)
|
|
94
95
|
# based on the detected CPU cores and server configuration
|
|
96
|
+
# Multimodal models will not take into account the number of workers for configuration.
|
|
95
97
|
"SENDNN_INFERENCE_UPDATE_THREAD_CONFIG": lambda: bool(
|
|
96
98
|
int(os.getenv("SENDNN_INFERENCE_UPDATE_THREAD_CONFIG", "1"))
|
|
97
99
|
),
|
|
@@ -171,6 +173,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|
|
171
173
|
"SENDNN_INFERENCE_MM_DEVICE": lambda: parse_mm_device(
|
|
172
174
|
os.getenv("SENDNN_INFERENCE_MM_DEVICE", "auto")
|
|
173
175
|
),
|
|
176
|
+
# When "1" (default), rank 0 runs the vision encoder and shares the result
|
|
177
|
+
# with other TP ranks via POSIX shared memory (one encoder call instead of
|
|
178
|
+
# world_size calls). Set to "0" to fall back to every TP rank running the
|
|
179
|
+
# vision encoder independently — the original behaviour, which avoids any
|
|
180
|
+
# SHM-related failure modes at the cost of redundant CPU work.
|
|
181
|
+
"SENDNN_INFERENCE_TP_MM_SHARING": lambda: bool(
|
|
182
|
+
int(os.getenv("SENDNN_INFERENCE_TP_MM_SHARING", "1"))
|
|
183
|
+
),
|
|
174
184
|
}
|
|
175
185
|
# --8<-- [end:env-vars-definition]
|
|
176
186
|
|
|
@@ -72,6 +72,7 @@ class SpyreCausalLM(nn.Module):
|
|
|
72
72
|
self.parallel_config = vllm_config.parallel_config
|
|
73
73
|
self.cache_config = vllm_config.cache_config
|
|
74
74
|
self.scheduler_config = vllm_config.scheduler_config
|
|
75
|
+
self.load_config = vllm_config.load_config
|
|
75
76
|
self.dtype = self.get_dtype()
|
|
76
77
|
|
|
77
78
|
# Wrappers for utils for multimodal
|
|
@@ -171,16 +172,30 @@ class SpyreCausalLM(nn.Module):
|
|
|
171
172
|
self.dtype,
|
|
172
173
|
)
|
|
173
174
|
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
#
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
revision=model_config.revision,
|
|
175
|
+
# `--load-format dummy` skips the checkpoint download and routes through
|
|
176
|
+
# FMS's `hf_configured` path, which fetches only config.json and then
|
|
177
|
+
# random-inits the model via `reset_parameters()`.
|
|
178
|
+
variant: str | None = None
|
|
179
|
+
if self.load_config.load_format == "dummy":
|
|
180
|
+
logger.info(
|
|
181
|
+
"Loading model %s with random weights.",
|
|
182
|
+
model_config.model,
|
|
183
183
|
)
|
|
184
|
+
architecture = "hf_configured"
|
|
185
|
+
variant = model_config.model
|
|
186
|
+
model_path: str | None = None
|
|
187
|
+
else:
|
|
188
|
+
architecture = "hf_pretrained"
|
|
189
|
+
is_local = os.path.isdir(model_config.model)
|
|
190
|
+
model_path = model_config.model
|
|
191
|
+
# Get location of model from HF cache.
|
|
192
|
+
if not is_local:
|
|
193
|
+
model_path = download_weights_from_hf(
|
|
194
|
+
model_name_or_path=model_path,
|
|
195
|
+
cache_dir=None,
|
|
196
|
+
allow_patterns=["*.safetensors", "*.bin", "*.pt"],
|
|
197
|
+
revision=model_config.revision,
|
|
198
|
+
)
|
|
184
199
|
|
|
185
200
|
# Get any fixes needed that must be patched into the kwargs;
|
|
186
201
|
# currently this is only used for multimodal models / llava next
|
|
@@ -192,7 +207,8 @@ class SpyreCausalLM(nn.Module):
|
|
|
192
207
|
kwargs["rank"],
|
|
193
208
|
):
|
|
194
209
|
self.fms_model = get_model(
|
|
195
|
-
architecture=
|
|
210
|
+
architecture=architecture,
|
|
211
|
+
variant=variant,
|
|
196
212
|
model_path=model_path,
|
|
197
213
|
distributed_strategy=distributed_strategy,
|
|
198
214
|
group=dist.group.WORLD,
|
|
@@ -242,6 +242,14 @@ class SpyrePlatform(Platform):
|
|
|
242
242
|
if not is_decoder and not is_pooling:
|
|
243
243
|
raise ValueError("Only the 'generate' and 'pooling' runners are supported")
|
|
244
244
|
|
|
245
|
+
if vllm_config.load_config.load_format == "dummy" and (
|
|
246
|
+
model_config.is_multimodal_model or is_pooling
|
|
247
|
+
):
|
|
248
|
+
raise ValueError(
|
|
249
|
+
"--load-format dummy is only supported for text generation models; "
|
|
250
|
+
"random-weight init is not implemented for multimodal or pooling models."
|
|
251
|
+
)
|
|
252
|
+
|
|
245
253
|
if parallel_config.worker_cls == "auto":
|
|
246
254
|
parallel_config.worker_cls = "sendnn_inference.v1.worker.spyre_worker.SpyreWorker"
|
|
247
255
|
|
|
@@ -345,7 +353,7 @@ class SpyrePlatform(Platform):
|
|
|
345
353
|
scheduler_config.max_num_batched_tokens = (
|
|
346
354
|
model_config.max_model_len * scheduler_config.max_num_seqs
|
|
347
355
|
)
|
|
348
|
-
cache_config.block_size = model_config.max_model_len
|
|
356
|
+
cache_config.block_size = model_config.max_model_len
|
|
349
357
|
vllm_config.cache_config.enable_prefix_caching = False
|
|
350
358
|
|
|
351
359
|
else:
|
|
@@ -635,7 +643,12 @@ class SpyrePlatform(Platform):
|
|
|
635
643
|
|
|
636
644
|
# NOTE: math.ceil can output a number for each worker that sums
|
|
637
645
|
# to a total greater than cpu_count.
|
|
638
|
-
|
|
646
|
+
thread_factor = worker_count
|
|
647
|
+
if cls._config.model_config.is_multimodal_model:
|
|
648
|
+
# thread_factor value/formula subject to further tuning
|
|
649
|
+
thread_factor = 1
|
|
650
|
+
|
|
651
|
+
cpus_per_worker = math.ceil(cpu_count / thread_factor) if cpu_count is not None else None
|
|
639
652
|
|
|
640
653
|
thread_warning = (
|
|
641
654
|
"Excessive threads may result in CPU contention. "
|
|
@@ -821,7 +834,7 @@ class SpyrePlatform(Platform):
|
|
|
821
834
|
@classmethod
|
|
822
835
|
def _set_batch_tkv_limit_from_env(cls) -> None:
|
|
823
836
|
try:
|
|
824
|
-
cls._max_batch_tkv_limit = int(os.getenv("VLLM_DT_MAX_BATCH_TKV_LIMIT", "-1"))
|
|
837
|
+
cls._max_batch_tkv_limit = int(os.getenv("VLLM_DT_MAX_BATCH_TKV_LIMIT", "-1"))
|
|
825
838
|
except ValueError as e:
|
|
826
839
|
raise ValueError("VLLM_DT_MAX_BATCH_TKV_LIMIT must be an integer") from e
|
|
827
840
|
|
|
@@ -216,23 +216,6 @@ class ChunkedPrefillSpyreScheduler(SpyreScheduler):
|
|
|
216
216
|
"Expecting an instance of CPSpyreModelRunnerOutput when doing chunked prefill."
|
|
217
217
|
)
|
|
218
218
|
|
|
219
|
-
# Update the correct num_computed_tokens value given left-padding and
|
|
220
|
-
# prefix cache hit info
|
|
221
|
-
for req in self.ongoing_prefills:
|
|
222
|
-
# The number of computed tokens only need to be adapted when it is
|
|
223
|
-
# the first chunk of a multi-chunk prefill
|
|
224
|
-
is_first_chunk = req.num_computed_tokens <= self.chunk_size
|
|
225
|
-
is_last_chunk = req.num_computed_tokens == req.num_prompt_tokens
|
|
226
|
-
if is_first_chunk and not is_last_chunk:
|
|
227
|
-
left_padding = model_runner_output.left_padding.get(req.request_id, 0)
|
|
228
|
-
prefix_cache_len = model_runner_output.prefix_cache_hit_len.get(req.request_id, 0)
|
|
229
|
-
|
|
230
|
-
req.num_computed_tokens = self.adjust_computed_tokens(
|
|
231
|
-
computed_tokens=req.num_computed_tokens,
|
|
232
|
-
left_padding=left_padding,
|
|
233
|
-
prefix_cache_len=prefix_cache_len,
|
|
234
|
-
)
|
|
235
|
-
|
|
236
219
|
# Remove completed prefills
|
|
237
220
|
self.ongoing_prefills = [
|
|
238
221
|
req for req in self.ongoing_prefills if req.num_computed_tokens < req.num_prompt_tokens
|
|
@@ -250,21 +233,39 @@ class ChunkedPrefillSpyreScheduler(SpyreScheduler):
|
|
|
250
233
|
|
|
251
234
|
return result
|
|
252
235
|
|
|
253
|
-
def
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
#
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
#
|
|
267
|
-
|
|
236
|
+
def _current_chunk_token_threshold(self, new_prefill_candidates: list[Request]) -> int:
|
|
237
|
+
"""Returns the `long_prefill_token_threshold` to use for this step.
|
|
238
|
+
|
|
239
|
+
For the chunk-0 step, cap to `chunk_size - left_padding` so the base
|
|
240
|
+
scheduler is aware of the padding blocks.
|
|
241
|
+
Otherwise return `chunk_size`: the natural chunk boundary."""
|
|
242
|
+
|
|
243
|
+
# If there are no new prefill candidates, no cap is needed.
|
|
244
|
+
if not new_prefill_candidates:
|
|
245
|
+
return self.chunk_size
|
|
246
|
+
|
|
247
|
+
new_prefill = new_prefill_candidates[0]
|
|
248
|
+
|
|
249
|
+
# Calculate left-padding tokens for this prompt.
|
|
250
|
+
prompt_len = new_prefill.num_prompt_tokens
|
|
251
|
+
n_chunks = math.ceil(prompt_len / self.chunk_size)
|
|
252
|
+
padded_prompt_len = math.ceil(prompt_len / self.block_size) * self.block_size
|
|
253
|
+
left_padding = n_chunks * self.chunk_size - padded_prompt_len
|
|
254
|
+
|
|
255
|
+
# If the prefix cache already covers chunk 0's real content, no cap is
|
|
256
|
+
# needed: the base scheduler will start from chunk i>=1, which has no
|
|
257
|
+
# padding. `get_computed_blocks` records into `prefix_cache_stats` as
|
|
258
|
+
# a side effect; the base scheduler calls it again, so toggle
|
|
259
|
+
# log_stats off here to avoid double-counting.
|
|
260
|
+
prev_log_stats = self.kv_cache_manager.log_stats
|
|
261
|
+
self.kv_cache_manager.log_stats = False
|
|
262
|
+
_, prefix_token_len = self.kv_cache_manager.get_computed_blocks(new_prefill)
|
|
263
|
+
self.kv_cache_manager.log_stats = prev_log_stats
|
|
264
|
+
if prefix_token_len >= self.chunk_size - left_padding:
|
|
265
|
+
return self.chunk_size
|
|
266
|
+
|
|
267
|
+
# Adjust the token threshold to account for left padding
|
|
268
|
+
return self.chunk_size - left_padding
|
|
268
269
|
|
|
269
270
|
def _get_required_blocks(self, request: Request, max_output: bool = False) -> tuple[int, int]:
|
|
270
271
|
"""
|
|
@@ -404,7 +405,7 @@ class ChunkedPrefillSpyreScheduler(SpyreScheduler):
|
|
|
404
405
|
ready_to_prefill = [
|
|
405
406
|
r
|
|
406
407
|
for r in self.waiting
|
|
407
|
-
if r.status != RequestStatus.WAITING_FOR_STRUCTURED_OUTPUT_GRAMMAR
|
|
408
|
+
if r.status != RequestStatus.WAITING_FOR_STRUCTURED_OUTPUT_GRAMMAR
|
|
408
409
|
]
|
|
409
410
|
if ready_to_prefill:
|
|
410
411
|
new_prefill_candidates = list(self.waiting)
|
|
@@ -424,6 +425,15 @@ class ChunkedPrefillSpyreScheduler(SpyreScheduler):
|
|
|
424
425
|
self.previous_step_was_prefill = False
|
|
425
426
|
running_holdback = []
|
|
426
427
|
|
|
428
|
+
# Cap chunk-0 token count to chunk_size - left_padding so the upstream KV
|
|
429
|
+
# cache manager doesn't allocate real blocks for the left-padding region.
|
|
430
|
+
# Only matters at chunk 0; later chunks land on natural chunk boundaries.
|
|
431
|
+
# Mutating scheduler_config is safe: the SpyreScheduler is the only
|
|
432
|
+
# scheduler in this engine and at most one prefill is in flight per step.
|
|
433
|
+
self.scheduler_config.long_prefill_token_threshold = self._current_chunk_token_threshold(
|
|
434
|
+
new_prefill_candidates
|
|
435
|
+
)
|
|
436
|
+
|
|
427
437
|
# delegate to super of SpyreScheduler: base V1 Scheduler
|
|
428
438
|
outputs = super(SpyreScheduler, self).schedule()
|
|
429
439
|
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""Utilities for sharing multimodal embeddings across TP ranks via POSIX shared memory.
|
|
2
|
+
|
|
3
|
+
During chunked prefill rank 0 computes the full vision-encoder embeddings and writes
|
|
4
|
+
them here; non-zero ranks read after synchronisation in the model runner.
|
|
5
|
+
This avoids running the (CPU-bound) vision encoder world_size times per request.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import hashlib
|
|
9
|
+
from multiprocessing.shared_memory import SharedMemory
|
|
10
|
+
|
|
11
|
+
import torch
|
|
12
|
+
|
|
13
|
+
from vllm.logger import init_logger
|
|
14
|
+
|
|
15
|
+
logger = init_logger(__name__)
|
|
16
|
+
|
|
17
|
+
# Stable mapping between torch dtypes and compact integer identifiers used in
|
|
18
|
+
# the broadcast metadata tensor. torch.frombuffer handles all dtypes natively.
|
|
19
|
+
_DTYPE_TO_IDX: dict[torch.dtype, int] = {
|
|
20
|
+
torch.float16: 0,
|
|
21
|
+
torch.float32: 1,
|
|
22
|
+
torch.bfloat16: 2,
|
|
23
|
+
}
|
|
24
|
+
_IDX_TO_DTYPE: dict[int, torch.dtype] = {v: k for k, v in _DTYPE_TO_IDX.items()}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def dtype_to_idx(dtype: torch.dtype) -> int:
|
|
28
|
+
"""Encode a torch dtype as a compact integer for the broadcast metadata tensor."""
|
|
29
|
+
if dtype not in _DTYPE_TO_IDX:
|
|
30
|
+
raise ValueError(f"Unsupported dtype for SHM transfer: {dtype}")
|
|
31
|
+
return _DTYPE_TO_IDX[dtype]
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def idx_to_dtype(idx: int) -> torch.dtype:
|
|
35
|
+
"""Decode a compact integer back to the corresponding torch dtype."""
|
|
36
|
+
if idx not in _IDX_TO_DTYPE:
|
|
37
|
+
raise ValueError(f"Unknown dtype index: {idx}")
|
|
38
|
+
return _IDX_TO_DTYPE[idx]
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _shm_name(req_id: str) -> str:
|
|
42
|
+
"""Generate a short, deterministic POSIX SHM name for a request.
|
|
43
|
+
|
|
44
|
+
Uses an MD5 hash of the *full* req_id so that requests which share a
|
|
45
|
+
common prefix (e.g. all benchmark requests in a run share the
|
|
46
|
+
``chatcmpl-bench-<uuid>-`` prefix) still get distinct SHM names.
|
|
47
|
+
|
|
48
|
+
Truncating the req_id (the previous approach) caused silent collisions:
|
|
49
|
+
``chatcmpl-bench-34e3ed2d-1-…`` and ``chatcmpl-bench-34e3ed2d-39-…``
|
|
50
|
+
both truncate to the same 20-char prefix, so every request in the benchmark
|
|
51
|
+
wrote to the same SHM segment — corrupting each other's embeddings.
|
|
52
|
+
|
|
53
|
+
Linux NAME_MAX is 255; macOS requires ≤ 30 chars for the name itself
|
|
54
|
+
(the kernel prefixes it with ``/``). 'sm' + 16 hex chars = 18 chars,
|
|
55
|
+
safely within every platform's limit.
|
|
56
|
+
"""
|
|
57
|
+
digest = hashlib.md5(req_id.encode(), usedforsecurity=False).hexdigest()[:16]
|
|
58
|
+
return f"sm{digest}"
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def write_embeddings(tensor: torch.Tensor, req_id: str) -> SharedMemory:
|
|
62
|
+
"""Write *tensor* to a shared-memory block keyed by *req_id*.
|
|
63
|
+
|
|
64
|
+
Returns the ``SharedMemory`` handle — the caller must keep it and pass it
|
|
65
|
+
to :func:`cleanup_embeddings` after all ranks have read.
|
|
66
|
+
|
|
67
|
+
Shape and dtype are NOT stored in SHM; the caller broadcasts them via a
|
|
68
|
+
tiny ``torch.distributed.broadcast`` so readers already have that info
|
|
69
|
+
before calling :func:`read_embeddings`.
|
|
70
|
+
"""
|
|
71
|
+
if tensor.device.type != "cpu":
|
|
72
|
+
tensor = tensor.cpu()
|
|
73
|
+
tensor = tensor.contiguous()
|
|
74
|
+
|
|
75
|
+
assert tensor.ndim == 3, f"Expected 3-D embedding tensor, got shape {tensor.shape}"
|
|
76
|
+
assert tensor.dtype in _DTYPE_TO_IDX, f"Unsupported dtype for SHM transfer: {tensor.dtype}"
|
|
77
|
+
|
|
78
|
+
data_shm = SharedMemory(create=True, size=tensor.nbytes, name=_shm_name(req_id))
|
|
79
|
+
torch.frombuffer(data_shm.buf, dtype=tensor.dtype).reshape(tensor.shape).copy_(tensor)
|
|
80
|
+
|
|
81
|
+
logger.debug(
|
|
82
|
+
"Wrote MM embeddings to SHM for req '%s': shape=%s dtype=%s bytes=%d",
|
|
83
|
+
req_id,
|
|
84
|
+
tuple(tensor.shape),
|
|
85
|
+
tensor.dtype,
|
|
86
|
+
tensor.nbytes,
|
|
87
|
+
)
|
|
88
|
+
return data_shm
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def read_embeddings(
|
|
92
|
+
req_id: str,
|
|
93
|
+
shape: tuple[int, int, int],
|
|
94
|
+
dtype: torch.dtype,
|
|
95
|
+
) -> torch.Tensor:
|
|
96
|
+
"""Read embeddings from shared memory and return a detached CPU tensor.
|
|
97
|
+
|
|
98
|
+
*shape* and *dtype* must be provided by the caller (obtained from the
|
|
99
|
+
broadcast metadata tensor) — they are not re-read from SHM.
|
|
100
|
+
|
|
101
|
+
Opens and closes the shared-memory handle internally.
|
|
102
|
+
"""
|
|
103
|
+
data_shm = SharedMemory(name=_shm_name(req_id))
|
|
104
|
+
# .clone() detaches the tensor from the SHM buffer so the handle can be closed.
|
|
105
|
+
result = torch.frombuffer(data_shm.buf, dtype=dtype).reshape(shape).clone()
|
|
106
|
+
data_shm.close()
|
|
107
|
+
|
|
108
|
+
logger.debug(
|
|
109
|
+
"Read MM embeddings from SHM for req '%s': shape=%s dtype=%s",
|
|
110
|
+
req_id,
|
|
111
|
+
shape,
|
|
112
|
+
dtype,
|
|
113
|
+
)
|
|
114
|
+
return result
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def cleanup_embeddings(data_shm: SharedMemory) -> None:
|
|
118
|
+
"""Unlink and close the shared-memory block.
|
|
119
|
+
|
|
120
|
+
Safe to call even if the block was already cleaned up — exceptions are
|
|
121
|
+
logged but not re-raised.
|
|
122
|
+
"""
|
|
123
|
+
try:
|
|
124
|
+
data_shm.unlink()
|
|
125
|
+
except Exception as exc:
|
|
126
|
+
logger.debug("SHM unlink skipped (%s): %s", data_shm.name, exc)
|
|
127
|
+
try:
|
|
128
|
+
data_shm.close()
|
|
129
|
+
except Exception as exc:
|
|
130
|
+
logger.debug("SHM close skipped (%s): %s", data_shm.name, exc)
|