sendnn-inference 2.1.4__tar.gz → 2.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sendnn_inference-2.2.2/.github/ci_model_cache.yaml +17 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/workflows/model_smoke.yml +6 -2
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/workflows/test.yml +46 -117
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/PKG-INFO +3 -3
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/user_guide/configuration.md +9 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/examples/offline_inference/long_context.py +15 -2
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/examples/offline_inference/text_inference.py +14 -1
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/pyproject.toml +6 -5
- sendnn_inference-2.2.2/sendnn_inference/_version.py +34 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/argparse_utils.py +1 -1
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/config/model_configs.yaml +56 -4
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/envs.py +10 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/model_executor/model_loader/spyre.py +31 -15
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/platform.py +16 -3
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/v1/core/scheduler.py +139 -34
- sendnn_inference-2.2.2/sendnn_inference/v1/worker/mm_shared_memory.py +130 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/v1/worker/spyre_model_runner.py +208 -40
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/v1/worker/spyre_worker.py +29 -3
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference.egg-info/PKG-INFO +3 -3
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference.egg-info/SOURCES.txt +4 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference.egg-info/requires.txt +2 -2
- sendnn_inference-2.2.2/tests/e2e/test_load_format_dummy.py +39 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_basic.py +3 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_cp_scheduler_steps.py +184 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_pc_scheduler_steps.py +351 -18
- sendnn_inference-2.2.2/tests/hf_cache.json +594 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/llm_cache.py +5 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/multimodal/test_llava_next.py +1 -1
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/multimodal/test_mistral3.py +1 -1
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/scheduling_utils.py +12 -1
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/spyre_util.py +12 -3
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/utils/test_upstream_compatibility.py +17 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/v1/core/test_scheduler_structured_outputs.py +16 -0
- sendnn_inference-2.2.2/tests/v1/worker/test_mm_shared_memory.py +328 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/v1/worker/test_prefix_caching_worker.py +1 -1
- sendnn_inference-2.2.2/tools/download_model.py +87 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/uv.lock +1375 -1414
- sendnn_inference-2.1.4/sendnn_inference/_version.py +0 -24
- sendnn_inference-2.1.4/tests/hf_cache.json +0 -1239
- sendnn_inference-2.1.4/tools/download_model.py +0 -42
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.bob/skills/update-vllm/SKILL.md +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.claude/skills/update-vllm/SKILL.md +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/CODEOWNERS +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/ISSUE_TEMPLATE/bug-report.yml +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/ISSUE_TEMPLATE/feature-request.yml +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/ISSUE_TEMPLATE/rfc.yml +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/actions/free-up-disk-space/action.yml +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/pull_request_template.md +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/workflows/add_label_automerge.yml +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/workflows/build_and_publish.yaml +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/workflows/build_docker.yml +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/workflows/check_uv_lock.yml +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/workflows/lint_scripts.yml +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/workflows/matchers/ruff.json +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/workflows/pre-commit.yml +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/workflows/publish_to_test_pypi.yaml +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/workflows/reminder_comment.yml +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/workflows/stale.yml +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.gitignore +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.pre-commit-config.yaml +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.readthedocs.yaml +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.shellcheckrc +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.yapfignore +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/CLAUDE.md +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/CODE_OF_CONDUCT.md +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/CONTRIBUTING.md +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/DCO +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/LICENSE +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/README.md +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/RELEASING.md +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/_local_envs_for_test.sh +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docker/.senlib.json +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docker/Dockerfile.amd64 +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docker/simple_vllm_serve.sh +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/.nav.yml +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/README.md +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/contributing/README.md +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/contributing/architecture.md +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/contributing/images/vllm_v1.svg +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/contributing/images/vllm_v1_spyre.svg +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/contributing/maintaining.md +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/contributing/multimodal/adding_new_models.md +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/contributing/scheduler.md +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/contributing/vllm-update-procedure.md +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/deploying/docker.md +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/deploying/k8s.md +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/deploying/rhoai.md +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/getting_started/installation.md +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/data/generate_example.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/data/prefill_single_chunks.json +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/data/prefill_three_chunks.json +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/data/prefix_caching_1.json +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/data/prefix_caching_2.json +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/data/prefix_caching_3.json +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/data/scheduling_admission_constraints.json +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/data/scheduling_padding_tkv_jump.json +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/data/timeline_admission_constraints.json +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/hooks/generate_examples.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/hooks/generate_model_tables.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/hooks/generate_prefill_only_plots.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/hooks/generate_prefix_caching_plots.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/hooks/generate_scheduling_plots.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/hooks/generate_timeline.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/hooks/url_schemes.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/mkdocs/overrides/main.html +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/requirements-docs.txt +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/roadmaps/q3-2025.md +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/user_guide/env_vars.md +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/user_guide/performance.md +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/user_guide/supported_features.md +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/user_guide/supported_models.md +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/examples/offline_inference/vision_inference.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/examples/online_inference/openai_spyre_text.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/examples/online_inference/openai_spyre_vision.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/examples/online_inference/spyre_vllm_benchmark.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/examples/online_inference/spyre_vllm_setup_container.sh +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/format.sh +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/mkdocs.yaml +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/__init__.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/compat_utils.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/compilation_utils.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/config/README.md +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/config/__init__.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/config/configurators/__init__.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/config/configurators/model_configurator.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/config/model_config.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/config/model_matcher.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/config/model_registry.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/model_executor/__init__.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/model_executor/model_loader/__init__.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/model_executor/model_loader/spyre_setup.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/multimodal/__init__.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/multimodal/mm_mappings/__init__.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/multimodal/mm_mappings/base.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/multimodal/mm_mappings/llava_next.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/multimodal/mm_mappings/mistral3.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/perf_metrics.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/utils.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/v1/__init__.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/v1/core/__init__.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/v1/metrics/__init__.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/v1/metrics/stats_logger.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/v1/sample/golden_token_injector.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/v1/sample/spyre_logits_processor.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/v1/worker/__init__.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/v1/worker/spyre_input_batch.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference.egg-info/dependency_links.txt +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference.egg-info/entry_points.txt +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference.egg-info/top_level.txt +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/setup.cfg +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/aftu/graph_compare_utils.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/aftu/test_compare_graphs.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/config/__init__.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/config/conftest.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/config/fixtures/test_error_handling_models.yaml +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/config/test_env_config_path.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/config/test_error_handling.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/config/test_integration.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/config/test_model_config.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/config/test_model_configurator.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/config/test_model_matcher.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/config/test_model_registry.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/conftest.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/download_model_configs.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_chunked_prefill.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_chunked_prefill_tkv_steps.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_logits_processors.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_model_smoke.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_profiler.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_sampling_params.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_async_llm.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_embeddings.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_mm.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_online.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_scoring.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_seed.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_spyre_stagger_basic.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_stats_logger.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/e2e/test_structured_outputs.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/BAAI/bge-reranker-large/config.json +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/BAAI/bge-reranker-v2-m3/config.json +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/ibm-ai-platform/micro-g3.3-8b-instruct-1b/config.json +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/ibm-granite/granite-3.3-8b-instruct/config.json +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/ibm-granite/granite-3.3-8b-instruct-FP8/config.json +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/ibm-granite/granite-4-8b-dense/config.json +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/ibm-granite/granite-4-8b-dense-hybrid/config.json +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/ibm-granite/granite-embedding-125m-english/config.json +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/ibm-granite/granite-embedding-278m-multilingual/config.json +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/intfloat/multilingual-e5-large/config.json +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/fixtures/model_configs/sentence-transformers/all-roberta-large-v1/config.json +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/hf_result_cache.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/llm_cache_util.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/model_loader/test_spyre.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/output_util.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/precompilation/test_disable_compilation.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/utils/bge_copy/config.json +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/utils/test_cli_args.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/utils/test_envs.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/utils/test_golden_token_injector.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/utils/test_platform_validation.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/utils/test_spyre_model_list.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/v1/worker/mock_model.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/v1/worker/test_scheduler_tkv_limits.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/v1/worker/test_spyre_input_batch.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tests/v1/worker/test_spyre_worker_profile.py +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tools/check_aiu.sh +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tools/check_repo.sh +0 -0
- {sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/tools/lint_scripts.sh +0 -0
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# Models pre-fetched into the GHA HuggingFace cache for the Test workflow.
|
|
2
|
+
#
|
|
3
|
+
# Adding/removing/changing an entry here changes the cache key and will trigger
|
|
4
|
+
# a fresh download + cache save on the next push to main. Total uncompressed
|
|
5
|
+
# size of all entries must stay under 8 GiB (GHA cache limit is 10 GiB).
|
|
6
|
+
|
|
7
|
+
models:
|
|
8
|
+
- repo: ibm-ai-platform/micro-g3.3-8b-instruct-1b
|
|
9
|
+
revision: 6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f
|
|
10
|
+
- repo: ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8
|
|
11
|
+
revision: 0dff8bacb968836dbbc7c2895c6d9ead0a05dc9e
|
|
12
|
+
- repo: sentence-transformers/all-roberta-large-v1
|
|
13
|
+
revision: cf74d8acd4f198de950bf004b262e6accfed5d2c
|
|
14
|
+
- repo: cross-encoder/stsb-roberta-large
|
|
15
|
+
revision: 2b12c2c0088918e76151fd5937b7bba986ef1f98
|
|
16
|
+
- repo: Qwen/Qwen3-Embedding-0.6B
|
|
17
|
+
revision: 97b0c614be4d77ee51c0cef4e5f07c00f9eb65b3
|
|
@@ -62,9 +62,13 @@ jobs:
|
|
|
62
62
|
|
|
63
63
|
steps:
|
|
64
64
|
- name: "Lightweight disk cleanup"
|
|
65
|
+
# all rm -rf, no apt/docker — completes in < 1s and frees ~20 GB
|
|
65
66
|
run: |
|
|
66
|
-
rm -rf /usr/
|
|
67
|
-
rm -rf /
|
|
67
|
+
sudo rm -rf /usr/local/lib/android
|
|
68
|
+
sudo rm -rf /opt/ghc /usr/local/.ghcup
|
|
69
|
+
sudo rm -rf /usr/share/dotnet
|
|
70
|
+
sudo rm -rf /usr/share/swift
|
|
71
|
+
sudo rm -rf /usr/local/share/chromium
|
|
68
72
|
sudo rm -rf /usr/local/share/powershell
|
|
69
73
|
|
|
70
74
|
- name: "Checkout"
|
|
@@ -20,8 +20,6 @@ env:
|
|
|
20
20
|
VLLM_TARGET_DEVICE: "empty"
|
|
21
21
|
VLLM_PLUGINS: "sendnn_inference"
|
|
22
22
|
HF_HUB_CACHE: "${{ github.workspace }}/.cache/huggingface/hub"
|
|
23
|
-
DEFAULT_HF_MODEL: "ibm-ai-platform/micro-g3.3-8b-instruct-1b"
|
|
24
|
-
DEFAULT_HF_MODEL_REV: "6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f"
|
|
25
23
|
|
|
26
24
|
concurrency:
|
|
27
25
|
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
|
@@ -29,7 +27,7 @@ concurrency:
|
|
|
29
27
|
|
|
30
28
|
jobs:
|
|
31
29
|
test:
|
|
32
|
-
timeout-minutes:
|
|
30
|
+
timeout-minutes: 25
|
|
33
31
|
runs-on: ${{ matrix.os }}
|
|
34
32
|
strategy:
|
|
35
33
|
fail-fast: false
|
|
@@ -49,26 +47,18 @@ jobs:
|
|
|
49
47
|
- name: "fp8"
|
|
50
48
|
markers: "cpu and quantized and multi"
|
|
51
49
|
flags: "--timeout=600 -k 'basic and test_output' --durations=0"
|
|
52
|
-
hf_model: "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"
|
|
53
|
-
hf_model_rev: "0dff8bacb968836dbbc7c2895c6d9ead0a05dc9e"
|
|
54
50
|
- name: "embedding"
|
|
55
51
|
markers: "cpu and embedding and not quantized"
|
|
56
52
|
flags: "--timeout=300"
|
|
57
|
-
hf_model: "sentence-transformers/all-roberta-large-v1"
|
|
58
|
-
hf_model_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
|
|
59
53
|
- name: "scoring"
|
|
60
54
|
markers: "cpu and scoring"
|
|
61
55
|
flags: "--timeout=300"
|
|
62
|
-
hf_model: "cross-encoder/stsb-roberta-large"
|
|
63
|
-
hf_model_rev: "2b12c2c0088918e76151fd5937b7bba986ef1f98"
|
|
64
56
|
- name: "worker and utils"
|
|
65
57
|
markers: "not e2e and not quantized and not spyre and not multimodal"
|
|
66
58
|
flags: "--timeout=300"
|
|
67
59
|
- name: "multimodal"
|
|
68
60
|
markers: "cpu and multimodal"
|
|
69
61
|
flags: "--timeout=300 -sv"
|
|
70
|
-
# hf_model: "ibm-granite/granite-vision-3.2-2b"
|
|
71
|
-
# hf_model_rev: "2818ae5b93cb750b099df1b65f7864e4a0401271"
|
|
72
62
|
env_overrides: "HF_HUB_OFFLINE=0"
|
|
73
63
|
include:
|
|
74
64
|
# Lower bound support
|
|
@@ -79,8 +69,6 @@ jobs:
|
|
|
79
69
|
name: "backward compat"
|
|
80
70
|
markers: "compat or (cpu and basic and not quantized)"
|
|
81
71
|
flags: "--timeout=300"
|
|
82
|
-
hf_model_2: "sentence-transformers/all-roberta-large-v1"
|
|
83
|
-
hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
|
|
84
72
|
os: "ubuntu-latest"
|
|
85
73
|
python_version: "3.12"
|
|
86
74
|
# Intermediate versions of vllm to check basic support for as well
|
|
@@ -91,8 +79,6 @@ jobs:
|
|
|
91
79
|
name: "backward compat"
|
|
92
80
|
markers: "compat or (cpu and basic and not quantized)"
|
|
93
81
|
flags: "--timeout=300"
|
|
94
|
-
hf_model_2: "sentence-transformers/all-roberta-large-v1"
|
|
95
|
-
hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
|
|
96
82
|
os: "ubuntu-latest"
|
|
97
83
|
python_version: "3.12"
|
|
98
84
|
- vllm_version:
|
|
@@ -102,8 +88,6 @@ jobs:
|
|
|
102
88
|
name: "backward compat"
|
|
103
89
|
markers: "compat or (cpu and basic and not quantized)"
|
|
104
90
|
flags: "--timeout=300"
|
|
105
|
-
hf_model_2: "sentence-transformers/all-roberta-large-v1"
|
|
106
|
-
hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
|
|
107
91
|
os: "ubuntu-latest"
|
|
108
92
|
python_version: "3.12"
|
|
109
93
|
- vllm_version:
|
|
@@ -113,8 +97,6 @@ jobs:
|
|
|
113
97
|
name: "backward compat"
|
|
114
98
|
markers: "compat or (cpu and basic and not quantized)"
|
|
115
99
|
flags: "--timeout=300"
|
|
116
|
-
hf_model_2: "sentence-transformers/all-roberta-large-v1"
|
|
117
|
-
hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
|
|
118
100
|
os: "ubuntu-latest"
|
|
119
101
|
python_version: "3.12"
|
|
120
102
|
- vllm_version:
|
|
@@ -124,8 +106,15 @@ jobs:
|
|
|
124
106
|
name: "backward compat"
|
|
125
107
|
markers: "compat or (cpu and basic and not quantized)"
|
|
126
108
|
flags: "--timeout=300"
|
|
127
|
-
|
|
128
|
-
|
|
109
|
+
os: "ubuntu-latest"
|
|
110
|
+
python_version: "3.12"
|
|
111
|
+
- vllm_version:
|
|
112
|
+
name: "vLLM:0.22.0"
|
|
113
|
+
repo: "git+https://github.com/vllm-project/vllm --tag v0.22.0"
|
|
114
|
+
test_suite:
|
|
115
|
+
name: "backward compat"
|
|
116
|
+
markers: "compat or (cpu and basic and not quantized)"
|
|
117
|
+
flags: "--timeout=300"
|
|
129
118
|
os: "ubuntu-latest"
|
|
130
119
|
python_version: "3.12"
|
|
131
120
|
|
|
@@ -146,10 +135,14 @@ jobs:
|
|
|
146
135
|
steps:
|
|
147
136
|
- name: "Lightweight disk cleanup"
|
|
148
137
|
# super lightweight cleanup, not nearly as much as actions/free-up-disk-space
|
|
138
|
+
# all rm -rf, no apt/docker — completes in < 1s and frees ~20 GB
|
|
149
139
|
shell: bash
|
|
150
140
|
run: |
|
|
151
|
-
rm -rf /usr/
|
|
152
|
-
rm -rf /
|
|
141
|
+
sudo rm -rf /usr/local/lib/android
|
|
142
|
+
sudo rm -rf /opt/ghc /usr/local/.ghcup
|
|
143
|
+
sudo rm -rf /usr/share/dotnet
|
|
144
|
+
sudo rm -rf /usr/share/swift
|
|
145
|
+
sudo rm -rf /usr/local/share/chromium
|
|
153
146
|
sudo rm -rf /usr/local/share/powershell
|
|
154
147
|
|
|
155
148
|
- name: "Checkout"
|
|
@@ -199,117 +192,53 @@ jobs:
|
|
|
199
192
|
# overwritten.
|
|
200
193
|
uv pip install -v .
|
|
201
194
|
|
|
202
|
-
- name: "Standardize HF model names for caching"
|
|
203
|
-
id: standardize-names
|
|
204
|
-
if: steps.changed-src-files.outputs.any_changed == 'true'
|
|
205
|
-
run: |
|
|
206
|
-
# replace '/' characters in HF_MODEL with '--' for GHA cache keys and
|
|
207
|
-
# in model file names in local HF hub cache
|
|
208
|
-
|
|
209
|
-
# don't use in-line default values for variable expansion here to not
|
|
210
|
-
# use the default model revision with a non-default model like this:
|
|
211
|
-
# model="${{ matrix.test_suite.hf_model || env.DEFAULT_HF_MODEL }}"
|
|
212
|
-
# revision="${{ matrix.test_suite.hf_model_rev || env.DEFAULT_HF_MODEL_REV }}"
|
|
213
|
-
|
|
214
|
-
if [[ -n "${{ matrix.test_suite.hf_model }}" ]]; then
|
|
215
|
-
model="${{ matrix.test_suite.hf_model }}"
|
|
216
|
-
revision="${{ matrix.test_suite.hf_model_rev }}"
|
|
217
|
-
else
|
|
218
|
-
model="${{ env.DEFAULT_HF_MODEL }}"
|
|
219
|
-
revision="${{ env.DEFAULT_HF_MODEL_REV }}"
|
|
220
|
-
fi
|
|
221
|
-
safe_name="${model//\//--}"
|
|
222
|
-
echo "model_key=${safe_name}_${revision}" >> "$GITHUB_ENV"
|
|
223
|
-
echo "model_path=${HF_HUB_CACHE}/models--${safe_name}" >> "$GITHUB_ENV"
|
|
224
|
-
|
|
225
|
-
if [[ -n "${{ matrix.test_suite.hf_model_2 }}" ]]; then
|
|
226
|
-
model_2="${{ matrix.test_suite.hf_model_2 }}"
|
|
227
|
-
revision_2="${{ matrix.test_suite.hf_model_2_rev}}"
|
|
228
|
-
safe_name_2="${model_2//\//--}"
|
|
229
|
-
echo "model_2_key=${safe_name_2}_${revision_2}" >> "$GITHUB_ENV"
|
|
230
|
-
echo "model_2_path=${HF_HUB_CACHE}/models--${safe_name_2}" >> "$GITHUB_ENV"
|
|
231
|
-
fi
|
|
232
|
-
|
|
233
195
|
- name: "Restore HF models cache"
|
|
234
196
|
id: cache_restore
|
|
235
197
|
if: steps.changed-src-files.outputs.any_changed == 'true'
|
|
236
198
|
uses: actions/cache/restore@v4
|
|
237
199
|
with:
|
|
238
|
-
path: ${{ env.
|
|
239
|
-
key: ${{ runner.os }}-hf-
|
|
240
|
-
|
|
241
|
-
- name: "Restore HF models cache for additional model"
|
|
242
|
-
id: cache_restore_2
|
|
243
|
-
if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.test_suite.hf_model_2 )
|
|
244
|
-
uses: actions/cache/restore@v4
|
|
245
|
-
with:
|
|
246
|
-
path: ${{ env.model_2_path }}
|
|
247
|
-
key: ${{ runner.os }}-hf-model-${{ env.model_2_key }}
|
|
200
|
+
path: ${{ env.HF_HUB_CACHE }}
|
|
201
|
+
key: ${{ runner.os }}-hf-cache-${{ hashFiles('.github/ci_model_cache.yaml') }}
|
|
248
202
|
|
|
249
203
|
- name: "Download HF models"
|
|
250
|
-
if: ( steps.changed-src-files.outputs.any_changed == 'true' &&
|
|
204
|
+
if: ( steps.changed-src-files.outputs.any_changed == 'true' && steps.cache_restore.outputs.cache-hit != 'true' )
|
|
251
205
|
run: |
|
|
252
|
-
#
|
|
253
|
-
#
|
|
254
|
-
#
|
|
255
|
-
#
|
|
256
|
-
#
|
|
257
|
-
#
|
|
258
|
-
#
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
revision="${{ matrix.test_suite.hf_model_rev }}"
|
|
263
|
-
else
|
|
264
|
-
model="${{ env.DEFAULT_HF_MODEL }}"
|
|
265
|
-
revision="${{ env.DEFAULT_HF_MODEL_REV }}"
|
|
266
|
-
fi
|
|
267
|
-
model_2="${{ matrix.test_suite.hf_model_2 }}"
|
|
268
|
-
revision_2="${{ matrix.test_suite.hf_model_2_rev }}"
|
|
269
|
-
|
|
270
|
-
python3 tools/download_model.py -m "$model" -r "${revision:-main}" &
|
|
271
|
-
|
|
272
|
-
if [[ -n "$model_2" ]]; then
|
|
273
|
-
python3 tools/download_model.py -m "$model_2" -r "${revision_2:-main}" &
|
|
274
|
-
fi
|
|
275
|
-
|
|
276
|
-
wait
|
|
206
|
+
# The full HF_HUB_CACHE is cached as a single GHA entry keyed on the
|
|
207
|
+
# hash of .github/ci_model_cache.yaml. Edit that file to add/remove
|
|
208
|
+
# models — a fresh download + cache save will run on the next push to
|
|
209
|
+
# main. Stale cache blobs expire after 7 days (or can be deleted by
|
|
210
|
+
# an admin).
|
|
211
|
+
#
|
|
212
|
+
# We cache for reliability, not speed: HF downloads are flaky under
|
|
213
|
+
# concurrent jobs.
|
|
214
|
+
source .venv/bin/activate
|
|
215
|
+
python3 tools/download_model.py --config .github/ci_model_cache.yaml
|
|
277
216
|
|
|
278
|
-
- name: "Check HF model cache
|
|
217
|
+
- name: "Check HF model cache size"
|
|
279
218
|
if: steps.changed-src-files.outputs.any_changed == 'true'
|
|
280
219
|
run: |
|
|
281
|
-
#
|
|
282
|
-
#
|
|
283
|
-
#
|
|
284
|
-
#
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
status=1
|
|
295
|
-
fi
|
|
220
|
+
# GHA cache has a 10 GB total limit. Keep the combined HF cache well
|
|
221
|
+
# under that so other caches (uv, etc) can coexist. If this fails,
|
|
222
|
+
# either prune entries from .github/ci_model_cache.yaml or update
|
|
223
|
+
# tools/download_model.py to exclude unused artifacts (onnx/,
|
|
224
|
+
# openvino/, duplicate framework weights, etc).
|
|
225
|
+
MAX_BYTES=$((8 * 1024 * 1024 * 1024)) # 8 GiB uncompressed
|
|
226
|
+
if [[ -d "${HF_HUB_CACHE}" ]]; then
|
|
227
|
+
size=$(du -sb "${HF_HUB_CACHE}" | cut -f1)
|
|
228
|
+
human=$(du -sh "${HF_HUB_CACHE}" | cut -f1)
|
|
229
|
+
echo "HF cache ${HF_HUB_CACHE}: $human ($size bytes)"
|
|
230
|
+
if (( size > MAX_BYTES )); then
|
|
231
|
+
echo "::error::HF cache is $human, exceeding the 8 GiB limit."
|
|
232
|
+
exit 1
|
|
296
233
|
fi
|
|
297
|
-
|
|
298
|
-
exit $status
|
|
234
|
+
fi
|
|
299
235
|
|
|
300
236
|
- name: "Save HF models cache"
|
|
301
237
|
if: ( steps.changed-src-files.outputs.any_changed == 'true' && github.event_name != 'pull_request' && steps.cache_restore.outputs.cache-hit != 'true' )
|
|
302
238
|
uses: actions/cache/save@v4
|
|
303
239
|
with:
|
|
304
|
-
path: ${{ env.
|
|
305
|
-
key: ${{ runner.os }}-hf-
|
|
306
|
-
|
|
307
|
-
- name: "Save HF models cache for additional model"
|
|
308
|
-
if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.test_suite.hf_model_2 && github.event_name != 'pull_request' && steps.cache_restore_2.outputs.cache-hit != 'true' )
|
|
309
|
-
uses: actions/cache/save@v4
|
|
310
|
-
with:
|
|
311
|
-
path: ${{ env.model_2_path }}
|
|
312
|
-
key: ${{ runner.os }}-hf-model-${{ env.model_2_key }}
|
|
240
|
+
path: ${{ env.HF_HUB_CACHE }}
|
|
241
|
+
key: ${{ runner.os }}-hf-cache-${{ hashFiles('.github/ci_model_cache.yaml') }}
|
|
313
242
|
|
|
314
243
|
- name: "Run tests"
|
|
315
244
|
if: steps.changed-src-files.outputs.any_changed == 'true'
|
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sendnn-inference
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.2.2
|
|
4
4
|
Summary: vLLM plugin for Spyre hardware support
|
|
5
5
|
License: Apache 2
|
|
6
6
|
Requires-Python: >=3.11
|
|
7
7
|
Description-Content-Type: text/markdown
|
|
8
8
|
License-File: LICENSE
|
|
9
9
|
Requires-Dist: fms-model-optimizer[fp8-infer]<0.9,>=0.8.3
|
|
10
|
-
Requires-Dist: ibm-fms<2,>=1.
|
|
11
|
-
Requires-Dist: vllm<0.
|
|
10
|
+
Requires-Dist: ibm-fms<2,>=1.11.1
|
|
11
|
+
Requires-Dist: vllm<0.23.1,>=0.19.1
|
|
12
12
|
Requires-Dist: torch
|
|
13
13
|
Requires-Dist: torchvision
|
|
14
14
|
Dynamic: license-file
|
|
@@ -121,6 +121,15 @@ Prefix caching mirrors upstream vLLM, though the requirement for fixed-size pref
|
|
|
121
121
|
|
|
122
122
|
When prefix caching is enabled, the `vllm:prefix_cache_queries` and `vllm:prefix_cache_hits` metrics correctly report prefix cache stats in tokens.
|
|
123
123
|
|
|
124
|
+
### Multimodal Models
|
|
125
|
+
|
|
126
|
+
For multimodal models, vision encoding is offloaded to the CPU. To prevent expensive duplication of vision encoding, prefill for multimodal models differs slightly from that of text-only models. Vision encoding is done once per request instead of once per worker, so the threading configuration for multimodal models is also slightly different to improve performance.
|
|
127
|
+
|
|
128
|
+
Text-only models set the number of threads by dividing the number of available CPUs by the number of workers and assigning only that share to each worker.
|
|
129
|
+
Multimodal models currently set the number of threads to the number of available CPUs, ignoring the number of workers. This may change in the future.
|
|
130
|
+
|
|
131
|
+
The maximum number of available CPUs can also be set using `SENDNN_INFERENCE_NUM_CPUS`.
|
|
132
|
+
|
|
124
133
|
## Pooling Models
|
|
125
134
|
|
|
126
135
|
For the embedding, scoring, and reranking tasks, vLLM supports running Pooling Models. More information on Pooling Models can be found in the [vLLM official documentation](https://docs.vllm.ai/en/latest/models/pooling_models/).
|
{sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/examples/offline_inference/long_context.py
RENAMED
|
@@ -45,6 +45,18 @@ if __name__ == "__main__":
|
|
|
45
45
|
)
|
|
46
46
|
parser.add_argument("--max-num-batched-tokens", type=int, default=1024)
|
|
47
47
|
parser.add_argument("--backend", type=str, default="sendnn", choices=["eager", "sendnn"])
|
|
48
|
+
parser.add_argument(
|
|
49
|
+
"--tokenizer",
|
|
50
|
+
type=str,
|
|
51
|
+
default=None,
|
|
52
|
+
help="HF tokenizer id or path. Defaults to --model.",
|
|
53
|
+
)
|
|
54
|
+
parser.add_argument(
|
|
55
|
+
"--load-format",
|
|
56
|
+
type=str,
|
|
57
|
+
default="auto",
|
|
58
|
+
help="vLLM load format: auto, dummy, safetensors, pt, ... `dummy` random-inits weights.",
|
|
59
|
+
)
|
|
48
60
|
|
|
49
61
|
args = parser.parse_args()
|
|
50
62
|
|
|
@@ -95,7 +107,7 @@ if __name__ == "__main__":
|
|
|
95
107
|
prompts = prompts * (args.num_prompts // len(prompts) + 1)
|
|
96
108
|
prompts = prompts[0 : args.num_prompts]
|
|
97
109
|
|
|
98
|
-
tokenizer = AutoTokenizer.from_pretrained(args.model)
|
|
110
|
+
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer or args.model)
|
|
99
111
|
|
|
100
112
|
tokenized_prompts = tokenizer(prompts)["input_ids"]
|
|
101
113
|
tokenized_prompts = [p[: args.max_prompt_len] for p in tokenized_prompts]
|
|
@@ -124,7 +136,8 @@ if __name__ == "__main__":
|
|
|
124
136
|
# Create an LLM.
|
|
125
137
|
llm = LLM(
|
|
126
138
|
model=args.model,
|
|
127
|
-
tokenizer=args.model,
|
|
139
|
+
tokenizer=args.tokenizer or args.model,
|
|
140
|
+
load_format=args.load_format,
|
|
128
141
|
max_model_len=args.max_model_len,
|
|
129
142
|
max_num_seqs=args.max_num_seqs,
|
|
130
143
|
tensor_parallel_size=args.tp,
|
{sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/examples/offline_inference/text_inference.py
RENAMED
|
@@ -29,6 +29,18 @@ if __name__ == "__main__":
|
|
|
29
29
|
)
|
|
30
30
|
parser.add_argument("--max-num-batched-tokens", type=int, default=1024)
|
|
31
31
|
parser.add_argument("--backend", type=str, default="eager", choices=["eager", "sendnn"])
|
|
32
|
+
parser.add_argument(
|
|
33
|
+
"--tokenizer",
|
|
34
|
+
type=str,
|
|
35
|
+
default=None,
|
|
36
|
+
help="HF tokenizer id or path. Defaults to --model.",
|
|
37
|
+
)
|
|
38
|
+
parser.add_argument(
|
|
39
|
+
"--load-format",
|
|
40
|
+
type=str,
|
|
41
|
+
default="auto",
|
|
42
|
+
help="vLLM load format: auto, dummy, safetensors, pt, ... `dummy` random-inits weights.",
|
|
43
|
+
)
|
|
32
44
|
|
|
33
45
|
args = parser.parse_args()
|
|
34
46
|
|
|
@@ -84,7 +96,8 @@ if __name__ == "__main__":
|
|
|
84
96
|
# Create an LLM.
|
|
85
97
|
llm = LLM(
|
|
86
98
|
model=args.model,
|
|
87
|
-
tokenizer=args.model,
|
|
99
|
+
tokenizer=args.tokenizer or args.model,
|
|
100
|
+
load_format=args.load_format,
|
|
88
101
|
max_model_len=args.max_model_len,
|
|
89
102
|
max_num_seqs=args.max_num_seqs,
|
|
90
103
|
tensor_parallel_size=args.tp,
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
[build-system]
|
|
2
2
|
requires = [
|
|
3
3
|
"setuptools>=82",
|
|
4
|
-
"setuptools_scm>=8"
|
|
4
|
+
"setuptools_scm>=8,<10"
|
|
5
5
|
]
|
|
6
6
|
build-backend = "setuptools.build_meta"
|
|
7
7
|
|
|
@@ -12,10 +12,10 @@ readme = "README.md"
|
|
|
12
12
|
license = {text = "Apache 2"}
|
|
13
13
|
dependencies = [
|
|
14
14
|
"fms-model-optimizer[fp8-infer]>=0.8.3,<0.9",
|
|
15
|
-
"ibm-fms>=1.
|
|
15
|
+
"ibm-fms>=1.11.1,<2",
|
|
16
16
|
# NB: use strict < with the next patch version to not exclude versions with
|
|
17
17
|
# build metadata suffixes
|
|
18
|
-
"vllm>=0.19.1,<0.
|
|
18
|
+
"vllm>=0.19.1,<0.23.1",
|
|
19
19
|
|
|
20
20
|
# Specific torch version overrides handled by uv
|
|
21
21
|
"torch",
|
|
@@ -54,6 +54,7 @@ git_describe_command = "git describe --dirty --tags --long --match 'v*'"
|
|
|
54
54
|
# by accident
|
|
55
55
|
override-dependencies = [
|
|
56
56
|
"torch==2.11.0",
|
|
57
|
+
"torchvision==0.26.0",
|
|
57
58
|
"triton; sys_platform == 'never'",
|
|
58
59
|
"intel-extension-for-pytorch; sys_platform == 'never'",
|
|
59
60
|
|
|
@@ -89,7 +90,7 @@ build-constraint-dependencies = []
|
|
|
89
90
|
extra-build-variables = { vllm = { VLLM_TARGET_DEVICE = "empty" } }
|
|
90
91
|
|
|
91
92
|
[tool.uv.sources]
|
|
92
|
-
vllm = { git = "https://github.com/vllm-project/vllm", rev = "v0.
|
|
93
|
+
vllm = { git = "https://github.com/vllm-project/vllm", rev = "v0.23.0" }
|
|
93
94
|
torch = [
|
|
94
95
|
{ index = "pytorch-cpu" },
|
|
95
96
|
]
|
|
@@ -243,7 +244,7 @@ dev = [
|
|
|
243
244
|
"pytest-forked>=1.6.0",
|
|
244
245
|
"pytest-timeout==2.3.1",
|
|
245
246
|
"requests==2.32.3",
|
|
246
|
-
"sentence-transformers
|
|
247
|
+
"sentence-transformers>=3.4.1",
|
|
247
248
|
"aiu-fms-testing-utils>=0.8.2",
|
|
248
249
|
"pytest-mock>=3.15.0",
|
|
249
250
|
]
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# file generated by setuptools-scm
|
|
2
|
+
# don't change, don't track in version control
|
|
3
|
+
|
|
4
|
+
__all__ = [
|
|
5
|
+
"__version__",
|
|
6
|
+
"__version_tuple__",
|
|
7
|
+
"version",
|
|
8
|
+
"version_tuple",
|
|
9
|
+
"__commit_id__",
|
|
10
|
+
"commit_id",
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
TYPE_CHECKING = False
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from typing import Tuple
|
|
16
|
+
from typing import Union
|
|
17
|
+
|
|
18
|
+
VERSION_TUPLE = Tuple[Union[int, str], ...]
|
|
19
|
+
COMMIT_ID = Union[str, None]
|
|
20
|
+
else:
|
|
21
|
+
VERSION_TUPLE = object
|
|
22
|
+
COMMIT_ID = object
|
|
23
|
+
|
|
24
|
+
version: str
|
|
25
|
+
__version__: str
|
|
26
|
+
__version_tuple__: VERSION_TUPLE
|
|
27
|
+
version_tuple: VERSION_TUPLE
|
|
28
|
+
commit_id: COMMIT_ID
|
|
29
|
+
__commit_id__: COMMIT_ID
|
|
30
|
+
|
|
31
|
+
__version__ = version = '2.2.2'
|
|
32
|
+
__version_tuple__ = version_tuple = (2, 2, 2)
|
|
33
|
+
|
|
34
|
+
__commit_id__ = commit_id = 'gd054d78'
|
|
@@ -162,7 +162,7 @@ class ConditionalDefaultManager:
|
|
|
162
162
|
namespace: argparse.Namespace | None = None,
|
|
163
163
|
) -> argparse.Namespace:
|
|
164
164
|
result = original_parse_args(self, args, namespace)
|
|
165
|
-
assert result is not None
|
|
165
|
+
assert result is not None
|
|
166
166
|
|
|
167
167
|
if args is None or len(args) == 0:
|
|
168
168
|
# Don't override anything if there were no args parsed
|
{sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/config/model_configs.yaml
RENAMED
|
@@ -9,6 +9,22 @@
|
|
|
9
9
|
|
|
10
10
|
# templates for reuse via YAML anchors
|
|
11
11
|
_templates:
|
|
12
|
+
granite_41_30b_architecture: &granite_41_30b_architecture
|
|
13
|
+
model_type: granite
|
|
14
|
+
num_hidden_layers: 64
|
|
15
|
+
max_position_embeddings: 131072
|
|
16
|
+
hidden_size: 4096
|
|
17
|
+
vocab_size: 100352
|
|
18
|
+
num_key_value_heads: 8
|
|
19
|
+
num_attention_heads: 32
|
|
20
|
+
|
|
21
|
+
# device config for TP=4 Granite 4.1 30b models
|
|
22
|
+
granite_41_30b_tp4_device_config: &granite_41_30b_tp4_device_config
|
|
23
|
+
env_vars:
|
|
24
|
+
VLLM_DT_MAX_BATCH_TKV_LIMIT: 131072 # 128k
|
|
25
|
+
FLEX_HDMA_P2PSIZE: 268435456 # 256MB
|
|
26
|
+
FLEX_HDMA_COLLSIZE: 33554432 # 32MB
|
|
27
|
+
num_gpu_blocks_override: 2080
|
|
12
28
|
|
|
13
29
|
granite_4_8b_architecture: &granite_4_8b_architecture
|
|
14
30
|
model_type: granite
|
|
@@ -35,7 +51,7 @@ _templates:
|
|
|
35
51
|
FLEX_HDMA_P2PSIZE: 268435456 # 256MB
|
|
36
52
|
FLEX_HDMA_COLLSIZE: 33554432 # 32MB
|
|
37
53
|
num_gpu_blocks_override: 8192
|
|
38
|
-
|
|
54
|
+
|
|
39
55
|
granite_vision_33_2b_architecture: &granite_vision_33_2b_architecture
|
|
40
56
|
model_type: llava_next
|
|
41
57
|
text_config:
|
|
@@ -166,7 +182,7 @@ models:
|
|
|
166
182
|
max_model_len: 32768
|
|
167
183
|
max_num_seqs: 32
|
|
168
184
|
device_config: *granite_8b_tp4_device_config
|
|
169
|
-
|
|
185
|
+
|
|
170
186
|
# Llama 3.1 8B Instruct
|
|
171
187
|
meta-llama/Llama-3.1-8B-Instruct:
|
|
172
188
|
architecture: *llama3_8b_architecture
|
|
@@ -247,6 +263,15 @@ models:
|
|
|
247
263
|
max_num_seqs: 32
|
|
248
264
|
device_config: *granite_8b_tp4_device_config
|
|
249
265
|
|
|
266
|
+
# Granite 4.1 30B
|
|
267
|
+
ibm-granite/granite-4.1-30b:
|
|
268
|
+
architecture: *granite_41_30b_architecture
|
|
269
|
+
continuous_batching_configs:
|
|
270
|
+
- tp_size: 4
|
|
271
|
+
max_model_len: 32768
|
|
272
|
+
max_num_seqs: 32
|
|
273
|
+
device_config: *granite_41_30b_tp4_device_config
|
|
274
|
+
|
|
250
275
|
# Granite Vision 3.3 2B
|
|
251
276
|
ibm-granite/granite-vision-3.3-2b:
|
|
252
277
|
architecture: *granite_vision_33_2b_architecture
|
|
@@ -255,14 +280,14 @@ models:
|
|
|
255
280
|
max_model_len: 8192
|
|
256
281
|
max_num_seqs: 16
|
|
257
282
|
- tp_size: 2
|
|
258
|
-
max_model_len:
|
|
283
|
+
max_model_len: 16384
|
|
259
284
|
max_num_seqs: 16
|
|
260
285
|
device_config: *granite_vision_2b_tp2_device_config
|
|
261
286
|
- tp_size: 4
|
|
262
287
|
max_model_len: 32768
|
|
263
288
|
max_num_seqs: 32
|
|
264
289
|
device_config: *granite_vision_2b_tp4_device_config
|
|
265
|
-
|
|
290
|
+
|
|
266
291
|
# Mistral Small 3.2 24B Instruct
|
|
267
292
|
mistralai/Mistral-Small-3.2-24B-Instruct-2506:
|
|
268
293
|
architecture: *mistral3_24b_architecture
|
|
@@ -279,6 +304,9 @@ models:
|
|
|
279
304
|
mistralai/Ministral-3-14B-Instruct-2512-BF16:
|
|
280
305
|
architecture: *ministral3_14b_architecture
|
|
281
306
|
continuous_batching_configs:
|
|
307
|
+
- tp_size: 1
|
|
308
|
+
max_model_len: 4096
|
|
309
|
+
max_num_seqs: 32
|
|
282
310
|
- tp_size: 4
|
|
283
311
|
max_model_len: 32768
|
|
284
312
|
max_num_seqs: 32
|
|
@@ -310,6 +338,30 @@ models:
|
|
|
310
338
|
- prompt_len: 512
|
|
311
339
|
batch_size: 64
|
|
312
340
|
|
|
341
|
+
Qwen/Qwen3-Embedding-0.6B:
|
|
342
|
+
architecture:
|
|
343
|
+
model_type: qwen3
|
|
344
|
+
num_hidden_layers: 28
|
|
345
|
+
vocab_size: 151669
|
|
346
|
+
|
|
347
|
+
static_batching_configs:
|
|
348
|
+
- tp_size: 1
|
|
349
|
+
warmup_shapes:
|
|
350
|
+
- prompt_len: 512
|
|
351
|
+
batch_size: 64
|
|
352
|
+
|
|
353
|
+
Qwen/Qwen3-Embedding-4B:
|
|
354
|
+
architecture:
|
|
355
|
+
model_type: qwen3
|
|
356
|
+
num_hidden_layers: 36
|
|
357
|
+
vocab_size: 151665
|
|
358
|
+
|
|
359
|
+
static_batching_configs:
|
|
360
|
+
- tp_size: 1
|
|
361
|
+
warmup_shapes:
|
|
362
|
+
- prompt_len: 512
|
|
363
|
+
batch_size: 64
|
|
364
|
+
|
|
313
365
|
# Other supported models (static batching only)
|
|
314
366
|
intfloat/multilingual-e5-large:
|
|
315
367
|
architecture:
|
|
@@ -26,6 +26,7 @@ if TYPE_CHECKING:
|
|
|
26
26
|
SENDNN_INFERENCE_MODEL_CONFIG_FILE: str | None = None
|
|
27
27
|
SENDNN_INFERENCE_CPU_MM_DTYPE: torch.dtype = torch.float16
|
|
28
28
|
SENDNN_INFERENCE_MM_DEVICE: str = "auto"
|
|
29
|
+
SENDNN_INFERENCE_TP_MM_SHARING: bool = True
|
|
29
30
|
|
|
30
31
|
logger = init_logger(__name__)
|
|
31
32
|
|
|
@@ -92,6 +93,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|
|
92
93
|
),
|
|
93
94
|
# Allow sendnn-inference to update env vars related to multi-threading (eg. OMP)
|
|
94
95
|
# based on the detected CPU cores and server configuration
|
|
96
|
+
# Multimodal models will not take into account the number of workers for configuration.
|
|
95
97
|
"SENDNN_INFERENCE_UPDATE_THREAD_CONFIG": lambda: bool(
|
|
96
98
|
int(os.getenv("SENDNN_INFERENCE_UPDATE_THREAD_CONFIG", "1"))
|
|
97
99
|
),
|
|
@@ -171,6 +173,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|
|
171
173
|
"SENDNN_INFERENCE_MM_DEVICE": lambda: parse_mm_device(
|
|
172
174
|
os.getenv("SENDNN_INFERENCE_MM_DEVICE", "auto")
|
|
173
175
|
),
|
|
176
|
+
# When "1" (default), rank 0 runs the vision encoder and shares the result
|
|
177
|
+
# with other TP ranks via POSIX shared memory (one encoder call instead of
|
|
178
|
+
# world_size calls). Set to "0" to fall back to every TP rank running the
|
|
179
|
+
# vision encoder independently — the original behaviour, which avoids any
|
|
180
|
+
# SHM-related failure modes at the cost of redundant CPU work.
|
|
181
|
+
"SENDNN_INFERENCE_TP_MM_SHARING": lambda: bool(
|
|
182
|
+
int(os.getenv("SENDNN_INFERENCE_TP_MM_SHARING", "1"))
|
|
183
|
+
),
|
|
174
184
|
}
|
|
175
185
|
# --8<-- [end:env-vars-definition]
|
|
176
186
|
|