PyPI - sendnn-inference - Versions diffs - 2.1.4__tar.gz → 2.2.2__tar.gz - Mend

sendnn-inference 2.1.4__tar.gz → 2.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (209) hide show

sendnn_inference-2.2.2/.github/ci_model_cache.yaml ADDED Viewed

@@ -0,0 +1,17 @@
+# Models pre-fetched into the GHA HuggingFace cache for the Test workflow.
+#
+# Adding/removing/changing an entry here changes the cache key and will trigger
+# a fresh download + cache save on the next push to main. Total uncompressed
+# size of all entries must stay under 8 GiB (GHA cache limit is 10 GB).
+models:
+  - repo: ibm-ai-platform/micro-g3.3-8b-instruct-1b
+    revision: 6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f
+  - repo: ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8
+    revision: 0dff8bacb968836dbbc7c2895c6d9ead0a05dc9e
+  - repo: sentence-transformers/all-roberta-large-v1
+    revision: cf74d8acd4f198de950bf004b262e6accfed5d2c
+  - repo: cross-encoder/stsb-roberta-large
+    revision: 2b12c2c0088918e76151fd5937b7bba986ef1f98
+  - repo: Qwen/Qwen3-Embedding-0.6B
+    revision: 97b0c614be4d77ee51c0cef4e5f07c00f9eb65b3

{sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/workflows/model_smoke.yml RENAMED Viewed

@@ -62,9 +62,13 @@ jobs:
     steps:
       - name: "Lightweight disk cleanup"
+        # all rm -rf, no apt/docker — completes in < 1s and frees ~20 GB
         run: |
-          rm -rf /usr/share/swift
-          rm -rf /user/local/share/chromium
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /opt/ghc /usr/local/.ghcup
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /usr/share/swift
+          sudo rm -rf /usr/local/share/chromium
           sudo rm -rf /usr/local/share/powershell
       - name: "Checkout"

{sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/.github/workflows/test.yml RENAMED Viewed

@@ -20,8 +20,6 @@ env:
   VLLM_TARGET_DEVICE: "empty"
   VLLM_PLUGINS: "sendnn_inference"
   HF_HUB_CACHE: "${{ github.workspace }}/.cache/huggingface/hub"
-  DEFAULT_HF_MODEL: "ibm-ai-platform/micro-g3.3-8b-instruct-1b"
-  DEFAULT_HF_MODEL_REV: "6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f"
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -29,7 +27,7 @@ concurrency:
 jobs:
   test:
-    timeout-minutes: 20
+    timeout-minutes: 25
     runs-on: ${{ matrix.os }}
     strategy:
       fail-fast: false
@@ -49,26 +47,18 @@ jobs:
           - name: "fp8"
             markers: "cpu and quantized and multi"
             flags: "--timeout=600 -k 'basic and test_output' --durations=0"
-            hf_model: "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"
-            hf_model_rev: "0dff8bacb968836dbbc7c2895c6d9ead0a05dc9e"
           - name: "embedding"
             markers: "cpu and embedding and not quantized"
             flags: "--timeout=300"
-            hf_model: "sentence-transformers/all-roberta-large-v1"
-            hf_model_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
           - name: "scoring"
             markers: "cpu and scoring"
             flags: "--timeout=300"
-            hf_model: "cross-encoder/stsb-roberta-large"
-            hf_model_rev: "2b12c2c0088918e76151fd5937b7bba986ef1f98"
           - name: "worker and utils"
             markers: "not e2e and not quantized and not spyre and not multimodal"
             flags: "--timeout=300"
           - name: "multimodal"
             markers: "cpu and multimodal"
             flags: "--timeout=300 -sv"
-            # hf_model: "ibm-granite/granite-vision-3.2-2b"
-            # hf_model_rev: "2818ae5b93cb750b099df1b65f7864e4a0401271"
             env_overrides: "HF_HUB_OFFLINE=0"
         include:
           # Lower bound support
@@ -79,8 +69,6 @@ jobs:
               name: "backward compat"
               markers: "compat or (cpu and basic and not quantized)"
               flags: "--timeout=300"
-              hf_model_2: "sentence-transformers/all-roberta-large-v1"
-              hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
             os: "ubuntu-latest"
             python_version: "3.12"
           # Intermediate versions of vllm to check basic support for as well
@@ -91,8 +79,6 @@ jobs:
               name: "backward compat"
               markers: "compat or (cpu and basic and not quantized)"
               flags: "--timeout=300"
-              hf_model_2: "sentence-transformers/all-roberta-large-v1"
-              hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
             os: "ubuntu-latest"
             python_version: "3.12"
           - vllm_version:
@@ -102,8 +88,6 @@ jobs:
               name: "backward compat"
               markers: "compat or (cpu and basic and not quantized)"
               flags: "--timeout=300"
-              hf_model_2: "sentence-transformers/all-roberta-large-v1"
-              hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
             os: "ubuntu-latest"
             python_version: "3.12"
           - vllm_version:
@@ -113,8 +97,6 @@ jobs:
               name: "backward compat"
               markers: "compat or (cpu and basic and not quantized)"
               flags: "--timeout=300"
-              hf_model_2: "sentence-transformers/all-roberta-large-v1"
-              hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
             os: "ubuntu-latest"
             python_version: "3.12"
           - vllm_version:
@@ -124,8 +106,15 @@ jobs:
               name: "backward compat"
               markers: "compat or (cpu and basic and not quantized)"
               flags: "--timeout=300"
-              hf_model_2: "sentence-transformers/all-roberta-large-v1"
-              hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
+            os: "ubuntu-latest"
+            python_version: "3.12"
+          - vllm_version:
+              name: "vLLM:0.22.0"
+              repo: "git+https://github.com/vllm-project/vllm --tag v0.22.0"
+            test_suite:
+              name: "backward compat"
+              markers: "compat or (cpu and basic and not quantized)"
+              flags: "--timeout=300"
             os: "ubuntu-latest"
             python_version: "3.12"
@@ -146,10 +135,14 @@ jobs:
     steps:
       - name: "Lightweight disk cleanup"
         # super lightweight cleanup, not nearly as much as actions/free-up-disk-space
+        # all rm -rf, no apt/docker — completes in < 1s and frees ~20 GB
         shell: bash
         run: |
-          rm -rf /usr/share/swift
-          rm -rf /user/local/share/chromium
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /opt/ghc /usr/local/.ghcup
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /usr/share/swift
+          sudo rm -rf /usr/local/share/chromium
           sudo rm -rf /usr/local/share/powershell
       - name: "Checkout"
@@ -199,117 +192,53 @@ jobs:
           # overwritten.
           uv pip install -v .
-      - name: "Standardize HF model names for caching"
-        id: standardize-names
-        if: steps.changed-src-files.outputs.any_changed == 'true'
-        run: |
-          # replace '/' characters in HF_MODEL with '--' for GHA cache keys and
-          # in model file names in local HF hub cache
-          # don't use in-line default values for variable expansion here to not
-          # use the default model revision with a non-default model like this:
-          #   model="${{ matrix.test_suite.hf_model || env.DEFAULT_HF_MODEL }}"
-          #   revision="${{ matrix.test_suite.hf_model_rev || env.DEFAULT_HF_MODEL_REV }}"
-          if [[ -n "${{ matrix.test_suite.hf_model }}" ]]; then
-            model="${{ matrix.test_suite.hf_model }}"
-            revision="${{ matrix.test_suite.hf_model_rev }}"
-          else
-            model="${{ env.DEFAULT_HF_MODEL }}"
-            revision="${{ env.DEFAULT_HF_MODEL_REV }}"
-          fi
-          safe_name="${model//\//--}"
-          echo "model_key=${safe_name}_${revision}"              >> "$GITHUB_ENV"
-          echo "model_path=${HF_HUB_CACHE}/models--${safe_name}" >> "$GITHUB_ENV"
-          if [[ -n "${{ matrix.test_suite.hf_model_2 }}" ]]; then
-            model_2="${{ matrix.test_suite.hf_model_2 }}"
-            revision_2="${{ matrix.test_suite.hf_model_2_rev}}"
-            safe_name_2="${model_2//\//--}"
-            echo "model_2_key=${safe_name_2}_${revision_2}"            >> "$GITHUB_ENV"
-            echo "model_2_path=${HF_HUB_CACHE}/models--${safe_name_2}" >> "$GITHUB_ENV"
-          fi
       - name: "Restore HF models cache"
         id: cache_restore
         if: steps.changed-src-files.outputs.any_changed == 'true'
         uses: actions/cache/restore@v4
         with:
-          path: ${{ env.model_path }}
-          key: ${{ runner.os }}-hf-model-${{ env.model_key }}
-      - name: "Restore HF models cache for additional model"
-        id: cache_restore_2
-        if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.test_suite.hf_model_2 )
-        uses: actions/cache/restore@v4
-        with:
-          path: ${{ env.model_2_path }}
-          key: ${{ runner.os }}-hf-model-${{ env.model_2_key }}
+          path: ${{ env.HF_HUB_CACHE }}
+          key: ${{ runner.os }}-hf-cache-${{ hashFiles('.github/ci_model_cache.yaml') }}
       - name: "Download HF models"
-        if: ( steps.changed-src-files.outputs.any_changed == 'true' && (steps.cache_restore.outputs.cache-hit != 'true' || steps.cache_restore_2.outputs.cache-hit != 'true'))
+        if: ( steps.changed-src-files.outputs.any_changed == 'true' && steps.cache_restore.outputs.cache-hit != 'true' )
         run: |
-          # We are caching HF models (HF_HUB_CACHE) for reliability rather than
-          # speed, since HF downloads are flaky for concurrent jobs.
-          # Be careful when adding models to the cache here, as the GHA cache is
-          # limited to 10 GB.
-          # If a new model is added here, a new hash key is generated. The
-          # previous cache blob can then be removed by an admin or can be left
-          # to expire after 7 days.
-          if [[ -n "${{ matrix.test_suite.hf_model }}" ]]; then
-            model="${{ matrix.test_suite.hf_model }}"
-            revision="${{ matrix.test_suite.hf_model_rev }}"
-          else
-            model="${{ env.DEFAULT_HF_MODEL }}"
-            revision="${{ env.DEFAULT_HF_MODEL_REV }}"
-          fi
-          model_2="${{ matrix.test_suite.hf_model_2 }}"
-          revision_2="${{ matrix.test_suite.hf_model_2_rev }}"
-          python3 tools/download_model.py -m "$model" -r "${revision:-main}" &
-          if [[ -n "$model_2" ]]; then
-            python3 tools/download_model.py -m "$model_2" -r "${revision_2:-main}" &
-          fi
-          wait
+          # The full HF_HUB_CACHE is cached as a single GHA entry keyed on the
+          # hash of .github/ci_model_cache.yaml. Edit that file to add/remove
+          # models — a fresh download + cache save will run on the next push to
+          # main. Stale cache blobs expire after 7 days (or can be deleted by
+          # an admin).
+          #
+          # We cache for reliability, not speed: HF downloads are flaky under
+          # concurrent jobs.
+          source .venv/bin/activate
+          python3 tools/download_model.py --config .github/ci_model_cache.yaml
-      - name: "Check HF model cache sizes"
+      - name: "Check HF model cache size"
         if: steps.changed-src-files.outputs.any_changed == 'true'
         run: |
-          # Guard against accidentally caching multi-GB model artifacts (e.g.
-          # onnx/, openvino/, or duplicate framework weights). The GHA cache
-          # has a 10 GB total limit; keep individual model caches well under
-          # that so multiple matrix entries can coexist.
-          MAX_BYTES=$((3 * 1024 * 1024 * 1024))  # 3 GiB (uncompressed; caches are compressed before upload)
-          status=0
-          for path in "${{ env.model_path }}" "${{ env.model_2_path }}"; do
-            if [[ -n "$path" && -d "$path" ]]; then
-              size=$(du -sb "$path" | cut -f1)
-              human=$(du -sh "$path" | cut -f1)
-              echo "Model cache $path: $human ($size bytes)"
-              if (( size > MAX_BYTES )); then
-                echo "::error::Model cache at $path is $human, exceeding the 3 GiB limit. Update tools/download_model.py to exclude unused artifacts (onnx/, openvino/, duplicate framework weights, etc)."
-                status=1
-              fi
+          # GHA cache has a 10 GB total limit. Keep the combined HF cache well
+          # under that so other caches (uv, etc) can coexist. If this fails,
+          # either prune entries from .github/ci_model_cache.yaml or update
+          # tools/download_model.py to exclude unused artifacts (onnx/,
+          # openvino/, duplicate framework weights, etc).
+          MAX_BYTES=$((8 * 1024 * 1024 * 1024))  # 8 GiB uncompressed
+          if [[ -d "${HF_HUB_CACHE}" ]]; then
+            size=$(du -sb "${HF_HUB_CACHE}" | cut -f1)
+            human=$(du -sh "${HF_HUB_CACHE}" | cut -f1)
+            echo "HF cache ${HF_HUB_CACHE}: $human ($size bytes)"
+            if (( size > MAX_BYTES )); then
+              echo "::error::HF cache is $human, exceeding the 8 GiB limit."
+              exit 1
             fi
-          done
-          exit $status
+          fi
       - name: "Save HF models cache"
         if: ( steps.changed-src-files.outputs.any_changed == 'true' && github.event_name != 'pull_request' && steps.cache_restore.outputs.cache-hit != 'true' )
         uses: actions/cache/save@v4
         with:
-          path: ${{ env.model_path }}
-          key: ${{ runner.os }}-hf-model-${{ env.model_key }}
-      - name: "Save HF models cache for additional model"
-        if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.test_suite.hf_model_2 && github.event_name != 'pull_request' && steps.cache_restore_2.outputs.cache-hit != 'true' )
-        uses: actions/cache/save@v4
-        with:
-          path: ${{ env.model_2_path }}
-          key: ${{ runner.os }}-hf-model-${{ env.model_2_key }}
+          path: ${{ env.HF_HUB_CACHE }}
+          key: ${{ runner.os }}-hf-cache-${{ hashFiles('.github/ci_model_cache.yaml') }}
       - name: "Run tests"
         if: steps.changed-src-files.outputs.any_changed == 'true'

{sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/PKG-INFO RENAMED Viewed

@@ -1,14 +1,14 @@
 Metadata-Version: 2.4
 Name: sendnn-inference
-Version: 2.1.4
+Version: 2.2.2
 Summary: vLLM plugin for Spyre hardware support
 License: Apache 2
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: fms-model-optimizer[fp8-infer]<0.9,>=0.8.3
-Requires-Dist: ibm-fms<2,>=1.9.0
-Requires-Dist: vllm<0.22.1,>=0.19.1
+Requires-Dist: ibm-fms<2,>=1.11.1
+Requires-Dist: vllm<0.23.1,>=0.19.1
 Requires-Dist: torch
 Requires-Dist: torchvision
 Dynamic: license-file

{sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/docs/user_guide/configuration.md RENAMED Viewed

@@ -121,6 +121,15 @@ Prefix caching mirrors upstream vLLM, though the requirement for fixed-size pref
 When prefix caching is enabled, the `vllm:prefix_cache_queries` and `vllm:prefix_cache_hits` metrics correctly report prefix cache stats in tokens.
+### Multimodal Models
+For multimodal models, vision encoding is offloaded to the CPU. To avoid expensive duplication of vision encoding, prefill for multimodal models differs slightly from that of text-only models: vision encoding is done once per request instead of once per worker, so the threading configuration for multimodal models is also slightly different to improve performance.
+Text-only models set the number of threads per worker by dividing the number of available CPUs by the number of workers and assigning only that share to each worker.
+Multimodal models currently set the number of threads to the total number of available CPUs, ignoring the number of workers. This may change in the future.
+The maximum number of available CPUs can also be set using `SENDNN_INFERENCE_NUM_CPUS`.
 ## Pooling Models
 For the embedding, scoring, and reranking tasks, vLLM supports running Pooling Models. More information on Pooling Models can be found in the [vLLM official documentation](https://docs.vllm.ai/en/latest/models/pooling_models/).

{sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/examples/offline_inference/long_context.py RENAMED Viewed

@@ -45,6 +45,18 @@ if __name__ == "__main__":
     )
     parser.add_argument("--max-num-batched-tokens", type=int, default=1024)
     parser.add_argument("--backend", type=str, default="sendnn", choices=["eager", "sendnn"])
+    parser.add_argument(
+        "--tokenizer",
+        type=str,
+        default=None,
+        help="HF tokenizer id or path. Defaults to --model.",
+    )
+    parser.add_argument(
+        "--load-format",
+        type=str,
+        default="auto",
+        help="vLLM load format: auto, dummy, safetensors, pt, ... `dummy` random-inits weights.",
+    )
     args = parser.parse_args()
@@ -95,7 +107,7 @@ if __name__ == "__main__":
     prompts = prompts * (args.num_prompts // len(prompts) + 1)
     prompts = prompts[0 : args.num_prompts]
-    tokenizer = AutoTokenizer.from_pretrained(args.model)
+    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer or args.model)
     tokenized_prompts = tokenizer(prompts)["input_ids"]
     tokenized_prompts = [p[: args.max_prompt_len] for p in tokenized_prompts]
@@ -124,7 +136,8 @@ if __name__ == "__main__":
     # Create an LLM.
     llm = LLM(
         model=args.model,
-        tokenizer=args.model,
+        tokenizer=args.tokenizer or args.model,
+        load_format=args.load_format,
         max_model_len=args.max_model_len,
         max_num_seqs=args.max_num_seqs,
         tensor_parallel_size=args.tp,

{sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/examples/offline_inference/text_inference.py RENAMED Viewed

@@ -29,6 +29,18 @@ if __name__ == "__main__":
     )
     parser.add_argument("--max-num-batched-tokens", type=int, default=1024)
     parser.add_argument("--backend", type=str, default="eager", choices=["eager", "sendnn"])
+    parser.add_argument(
+        "--tokenizer",
+        type=str,
+        default=None,
+        help="HF tokenizer id or path. Defaults to --model.",
+    )
+    parser.add_argument(
+        "--load-format",
+        type=str,
+        default="auto",
+        help="vLLM load format: auto, dummy, safetensors, pt, ... `dummy` random-inits weights.",
+    )
     args = parser.parse_args()
@@ -84,7 +96,8 @@ if __name__ == "__main__":
     # Create an LLM.
     llm = LLM(
         model=args.model,
-        tokenizer=args.model,
+        tokenizer=args.tokenizer or args.model,
+        load_format=args.load_format,
         max_model_len=args.max_model_len,
         max_num_seqs=args.max_num_seqs,
         tensor_parallel_size=args.tp,

{sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/pyproject.toml RENAMED Viewed

@@ -1,7 +1,7 @@
 [build-system]
 requires = [
   "setuptools>=82",
-  "setuptools_scm>=8"
+  "setuptools_scm>=8,<10"
 ]
 build-backend = "setuptools.build_meta"
@@ -12,10 +12,10 @@ readme = "README.md"
 license = {text = "Apache 2"}
 dependencies = [
     "fms-model-optimizer[fp8-infer]>=0.8.3,<0.9",
-    "ibm-fms>=1.9.0,<2",
+    "ibm-fms>=1.11.1,<2",
     # NB: use strict < with the next patch version to not exclude versions with
     # build metadata suffixes
-    "vllm>=0.19.1,<0.22.1",
+    "vllm>=0.19.1,<0.23.1",
     # Specific torch version overrides handled by uv
     "torch",
@@ -54,6 +54,7 @@ git_describe_command = "git describe --dirty --tags --long --match 'v*'"
 # by accident
 override-dependencies = [
     "torch==2.11.0",
+    "torchvision==0.26.0",
     "triton; sys_platform == 'never'",
     "intel-extension-for-pytorch; sys_platform == 'never'",
@@ -89,7 +90,7 @@ build-constraint-dependencies = []
 extra-build-variables = { vllm = { VLLM_TARGET_DEVICE = "empty" } }
 [tool.uv.sources]
-vllm = { git = "https://github.com/vllm-project/vllm", rev = "v0.22.0" }
+vllm = { git = "https://github.com/vllm-project/vllm", rev = "v0.23.0" }
 torch = [
   { index = "pytorch-cpu" },
 ]
@@ -243,7 +244,7 @@ dev = [
     "pytest-forked>=1.6.0",
     "pytest-timeout==2.3.1",
     "requests==2.32.3",
-    "sentence-transformers==3.4.1",
+    "sentence-transformers>=3.4.1",
     "aiu-fms-testing-utils>=0.8.2",
     "pytest-mock>=3.15.0",
 ]

sendnn_inference-2.2.2/sendnn_inference/_version.py ADDED Viewed

@@ -0,0 +1,34 @@
+# file generated by setuptools-scm
+# don't change, don't track in version control
+__all__ = [
+    "__version__",
+    "__version_tuple__",
+    "version",
+    "version_tuple",
+    "__commit_id__",
+    "commit_id",
+]
+TYPE_CHECKING = False
+if TYPE_CHECKING:
+    from typing import Tuple
+    from typing import Union
+    VERSION_TUPLE = Tuple[Union[int, str], ...]
+    COMMIT_ID = Union[str, None]
+else:
+    VERSION_TUPLE = object
+    COMMIT_ID = object
+version: str
+__version__: str
+__version_tuple__: VERSION_TUPLE
+version_tuple: VERSION_TUPLE
+commit_id: COMMIT_ID
+__commit_id__: COMMIT_ID
+__version__ = version = '2.2.2'
+__version_tuple__ = version_tuple = (2, 2, 2)
+__commit_id__ = commit_id = 'gd054d78'

{sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/argparse_utils.py RENAMED Viewed

@@ -162,7 +162,7 @@ class ConditionalDefaultManager:
             namespace: argparse.Namespace | None = None,
         ) -> argparse.Namespace:
             result = original_parse_args(self, args, namespace)
-            assert result is not None  # type: ignore[redundant-expr]
+            assert result is not None
             if args is None or len(args) == 0:
                 # Don't override anything if there were no args parsed

{sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/config/model_configs.yaml RENAMED Viewed

@@ -9,6 +9,22 @@
 # templates for reuse via YAML anchors
 _templates:
+  granite_41_30b_architecture: &granite_41_30b_architecture
+    model_type: granite
+    num_hidden_layers: 64
+    max_position_embeddings: 131072
+    hidden_size: 4096
+    vocab_size: 100352
+    num_key_value_heads: 8
+    num_attention_heads: 32
+  # device config for TP=4 Granite 4.1 30b models
+  granite_41_30b_tp4_device_config: &granite_41_30b_tp4_device_config
+    env_vars:
+      VLLM_DT_MAX_BATCH_TKV_LIMIT: 131072  # 128k
+      FLEX_HDMA_P2PSIZE: 268435456  # 256MB
+      FLEX_HDMA_COLLSIZE: 33554432  # 32MB
+    num_gpu_blocks_override: 2080
   granite_4_8b_architecture: &granite_4_8b_architecture
     model_type: granite
@@ -35,7 +51,7 @@ _templates:
       FLEX_HDMA_P2PSIZE: 268435456  # 256MB
       FLEX_HDMA_COLLSIZE: 33554432  # 32MB
     num_gpu_blocks_override: 8192
   granite_vision_33_2b_architecture: &granite_vision_33_2b_architecture
    model_type: llava_next
    text_config:
@@ -166,7 +182,7 @@ models:
         max_model_len: 32768
         max_num_seqs: 32
         device_config: *granite_8b_tp4_device_config
   # Llama 3.1 8B Instruct
   meta-llama/Llama-3.1-8B-Instruct:
     architecture: *llama3_8b_architecture
@@ -247,6 +263,15 @@ models:
         max_num_seqs: 32
         device_config: *granite_8b_tp4_device_config
+  # Granite 4.1 30B
+  ibm-granite/granite-4.1-30b:
+    architecture: *granite_41_30b_architecture
+    continuous_batching_configs:
+      - tp_size: 4
+        max_model_len: 32768
+        max_num_seqs: 32
+        device_config: *granite_41_30b_tp4_device_config
   # Granite Vision 3.3 2B
   ibm-granite/granite-vision-3.3-2b:
     architecture: *granite_vision_33_2b_architecture
@@ -255,14 +280,14 @@ models:
         max_model_len: 8192
         max_num_seqs: 16
       - tp_size: 2
-        max_model_len: 16382
+        max_model_len: 16384
         max_num_seqs: 16
         device_config: *granite_vision_2b_tp2_device_config
       - tp_size: 4
         max_model_len: 32768
         max_num_seqs: 32
         device_config: *granite_vision_2b_tp4_device_config
   # Mistral Small 3.2 24B Instruct
   mistralai/Mistral-Small-3.2-24B-Instruct-2506:
     architecture: *mistral3_24b_architecture
@@ -279,6 +304,9 @@ models:
   mistralai/Ministral-3-14B-Instruct-2512-BF16:
     architecture: *ministral3_14b_architecture
     continuous_batching_configs:
+      - tp_size: 1
+        max_model_len: 4096
+        max_num_seqs: 32
       - tp_size: 4
         max_model_len: 32768
         max_num_seqs: 32
@@ -310,6 +338,30 @@ models:
           - prompt_len: 512
             batch_size: 64
+  Qwen/Qwen3-Embedding-0.6B:
+    architecture:
+      model_type: qwen3
+      num_hidden_layers: 28
+      vocab_size: 151669
+    static_batching_configs:
+      - tp_size: 1
+        warmup_shapes:
+          - prompt_len: 512
+            batch_size: 64
+  Qwen/Qwen3-Embedding-4B:
+    architecture:
+      model_type: qwen3
+      num_hidden_layers: 36
+      vocab_size: 151665
+    static_batching_configs:
+      - tp_size: 1
+        warmup_shapes:
+          - prompt_len: 512
+            batch_size: 64
   # Other supported models (static batching only)
   intfloat/multilingual-e5-large:
     architecture:

{sendnn_inference-2.1.4 → sendnn_inference-2.2.2}/sendnn_inference/envs.py RENAMED Viewed

@@ -26,6 +26,7 @@ if TYPE_CHECKING:
     SENDNN_INFERENCE_MODEL_CONFIG_FILE: str | None = None
     SENDNN_INFERENCE_CPU_MM_DTYPE: torch.dtype = torch.float16
     SENDNN_INFERENCE_MM_DEVICE: str = "auto"
+    SENDNN_INFERENCE_TP_MM_SHARING: bool = True
 logger = init_logger(__name__)
@@ -92,6 +93,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     ),
     # Allow sendnn-inference to update env vars related to multi-threading (eg. OMP)
     # based on the detected CPU cores and server configuration
+    # Multimodal models will not take into account the number of workers for configuration.
     "SENDNN_INFERENCE_UPDATE_THREAD_CONFIG": lambda: bool(
         int(os.getenv("SENDNN_INFERENCE_UPDATE_THREAD_CONFIG", "1"))
     ),
@@ -171,6 +173,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "SENDNN_INFERENCE_MM_DEVICE": lambda: parse_mm_device(
         os.getenv("SENDNN_INFERENCE_MM_DEVICE", "auto")
     ),
+    # When "1" (default), rank 0 runs the vision encoder and shares the result
+    # with other TP ranks via POSIX shared memory (one encoder call instead of
+    # world_size calls).  Set to "0" to fall back to every TP rank running the
+    # vision encoder independently — the original behaviour, which avoids any
+    # SHM-related failure modes at the cost of redundant CPU work.
+    "SENDNN_INFERENCE_TP_MM_SHARING": lambda: bool(
+        int(os.getenv("SENDNN_INFERENCE_TP_MM_SHARING", "1"))
+    ),
 }
 # --8<-- [end:env-vars-definition]

sendnn-inference 2.1.4__tar.gz → 2.2.2__tar.gz

sendnn-inference 2.1.4__tar.gz → 2.2.2__tar.gz