PyPI - sendnn-inference - Versions diffs - 2.2.0__tar.gz → 2.2.2__tar.gz - Mend

sendnn-inference 2.2.0__tar.gz → 2.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (207) hide show

{sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/.github/workflows/test.yml RENAMED Viewed

@@ -27,7 +27,7 @@ concurrency:
 jobs:
   test:
-    timeout-minutes: 20
+    timeout-minutes: 25
     runs-on: ${{ matrix.os }}
     strategy:
       fail-fast: false
@@ -108,6 +108,15 @@ jobs:
               flags: "--timeout=300"
             os: "ubuntu-latest"
             python_version: "3.12"
+          - vllm_version:
+              name: "vLLM:0.22.0"
+              repo: "git+https://github.com/vllm-project/vllm --tag v0.22.0"
+            test_suite:
+              name: "backward compat"
+              markers: "compat or (cpu and basic and not quantized)"
+              flags: "--timeout=300"
+            os: "ubuntu-latest"
+            python_version: "3.12"
         # Only run vllm:main jobs on PRs with `vllm:main` label

{sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sendnn-inference
-Version: 2.2.0
+Version: 2.2.2
 Summary: vLLM plugin for Spyre hardware support
 License: Apache 2
 Requires-Python: >=3.11
@@ -8,7 +8,7 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: fms-model-optimizer[fp8-infer]<0.9,>=0.8.3
 Requires-Dist: ibm-fms<2,>=1.11.1
-Requires-Dist: vllm<0.22.1,>=0.19.1
+Requires-Dist: vllm<0.23.1,>=0.19.1
 Requires-Dist: torch
 Requires-Dist: torchvision
 Dynamic: license-file

{sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/docs/user_guide/configuration.md RENAMED Viewed

@@ -121,6 +121,15 @@ Prefix caching mirrors upstream vLLM, though the requirement for fixed-size pref
 When prefix caching is enabled, the `vllm:prefix_cache_queries` and `vllm:prefix_cache_hits` metrics correctly report prefix cache stats in tokens.
+### Multimodal Models
+For multimodal models, vision encoding is offloaded to the CPU. To avoid expensive duplication of vision encoding, prefill for multimodal models differs slightly from that of text-only models. Vision encoding is done once per request instead of once per worker, so the threading configuration for multimodal models is also slightly different to improve performance.
+Text-only models set the number of threads per worker by dividing the number of available CPUs by the number of workers.
+Multimodal models currently set the number of threads to the number of available CPUs, ignoring the number of workers. This may change in the future.
+The maximum number of available CPUs can also be set using `SENDNN_INFERENCE_NUM_CPUS`.
 ## Pooling Models
 For the embedding, scoring, and reranking tasks, vLLM supports running Pooling Models. More information on Pooling Models can be found in the [vLLM official documentation](https://docs.vllm.ai/en/latest/models/pooling_models/).

{sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/examples/offline_inference/long_context.py RENAMED Viewed

@@ -45,6 +45,18 @@ if __name__ == "__main__":
     )
     parser.add_argument("--max-num-batched-tokens", type=int, default=1024)
     parser.add_argument("--backend", type=str, default="sendnn", choices=["eager", "sendnn"])
+    parser.add_argument(
+        "--tokenizer",
+        type=str,
+        default=None,
+        help="HF tokenizer id or path. Defaults to --model.",
+    )
+    parser.add_argument(
+        "--load-format",
+        type=str,
+        default="auto",
+        help="vLLM load format: auto, dummy, safetensors, pt, ... `dummy` random-inits weights.",
+    )
     args = parser.parse_args()
@@ -95,7 +107,7 @@ if __name__ == "__main__":
     prompts = prompts * (args.num_prompts // len(prompts) + 1)
     prompts = prompts[0 : args.num_prompts]
-    tokenizer = AutoTokenizer.from_pretrained(args.model)
+    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer or args.model)
     tokenized_prompts = tokenizer(prompts)["input_ids"]
     tokenized_prompts = [p[: args.max_prompt_len] for p in tokenized_prompts]
@@ -124,7 +136,8 @@ if __name__ == "__main__":
     # Create an LLM.
     llm = LLM(
         model=args.model,
-        tokenizer=args.model,
+        tokenizer=args.tokenizer or args.model,
+        load_format=args.load_format,
         max_model_len=args.max_model_len,
         max_num_seqs=args.max_num_seqs,
         tensor_parallel_size=args.tp,

{sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/examples/offline_inference/text_inference.py RENAMED Viewed

@@ -29,6 +29,18 @@ if __name__ == "__main__":
     )
     parser.add_argument("--max-num-batched-tokens", type=int, default=1024)
     parser.add_argument("--backend", type=str, default="eager", choices=["eager", "sendnn"])
+    parser.add_argument(
+        "--tokenizer",
+        type=str,
+        default=None,
+        help="HF tokenizer id or path. Defaults to --model.",
+    )
+    parser.add_argument(
+        "--load-format",
+        type=str,
+        default="auto",
+        help="vLLM load format: auto, dummy, safetensors, pt, ... `dummy` random-inits weights.",
+    )
     args = parser.parse_args()
@@ -84,7 +96,8 @@ if __name__ == "__main__":
     # Create an LLM.
     llm = LLM(
         model=args.model,
-        tokenizer=args.model,
+        tokenizer=args.tokenizer or args.model,
+        load_format=args.load_format,
         max_model_len=args.max_model_len,
         max_num_seqs=args.max_num_seqs,
         tensor_parallel_size=args.tp,

{sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/pyproject.toml RENAMED Viewed

@@ -1,7 +1,7 @@
 [build-system]
 requires = [
   "setuptools>=82",
-  "setuptools_scm>=8"
+  "setuptools_scm>=8,<10"
 ]
 build-backend = "setuptools.build_meta"
@@ -15,7 +15,7 @@ dependencies = [
     "ibm-fms>=1.11.1,<2",
     # NB: use strict < with the next patch version to not exclude versions with
     # build metadata suffixes
-    "vllm>=0.19.1,<0.22.1",
+    "vllm>=0.19.1,<0.23.1",
     # Specific torch version overrides handled by uv
     "torch",
@@ -90,7 +90,7 @@ build-constraint-dependencies = []
 extra-build-variables = { vllm = { VLLM_TARGET_DEVICE = "empty" } }
 [tool.uv.sources]
-vllm = { git = "https://github.com/vllm-project/vllm", rev = "v0.22.0" }
+vllm = { git = "https://github.com/vllm-project/vllm", rev = "v0.23.0" }
 torch = [
   { index = "pytorch-cpu" },
 ]

sendnn_inference-2.2.2/sendnn_inference/_version.py ADDED Viewed

@@ -0,0 +1,34 @@
+# file generated by setuptools-scm
+# don't change, don't track in version control
+__all__ = [
+    "__version__",
+    "__version_tuple__",
+    "version",
+    "version_tuple",
+    "__commit_id__",
+    "commit_id",
+]
+TYPE_CHECKING = False
+if TYPE_CHECKING:
+    from typing import Tuple
+    from typing import Union
+    VERSION_TUPLE = Tuple[Union[int, str], ...]
+    COMMIT_ID = Union[str, None]
+else:
+    VERSION_TUPLE = object
+    COMMIT_ID = object
+version: str
+__version__: str
+__version_tuple__: VERSION_TUPLE
+version_tuple: VERSION_TUPLE
+commit_id: COMMIT_ID
+__commit_id__: COMMIT_ID
+__version__ = version = '2.2.2'
+__version_tuple__ = version_tuple = (2, 2, 2)
+__commit_id__ = commit_id = 'gd054d78'

{sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/argparse_utils.py RENAMED Viewed

@@ -162,7 +162,7 @@ class ConditionalDefaultManager:
             namespace: argparse.Namespace | None = None,
         ) -> argparse.Namespace:
             result = original_parse_args(self, args, namespace)
-            assert result is not None  # type: ignore[redundant-expr]
+            assert result is not None
             if args is None or len(args) == 0:
                 # Don't override anything if there were no args parsed

{sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/config/model_configs.yaml RENAMED Viewed

@@ -9,6 +9,22 @@
 # templates for reuse via YAML anchors
 _templates:
+  granite_41_30b_architecture: &granite_41_30b_architecture
+    model_type: granite
+    num_hidden_layers: 64
+    max_position_embeddings: 131072
+    hidden_size: 4096
+    vocab_size: 100352
+    num_key_value_heads: 8
+    num_attention_heads: 32
+  # device config for TP=4 Granite 4.1 30b models
+  granite_41_30b_tp4_device_config: &granite_41_30b_tp4_device_config
+    env_vars:
+      VLLM_DT_MAX_BATCH_TKV_LIMIT: 131072  # 128k
+      FLEX_HDMA_P2PSIZE: 268435456  # 256MB
+      FLEX_HDMA_COLLSIZE: 33554432  # 32MB
+    num_gpu_blocks_override: 2080
   granite_4_8b_architecture: &granite_4_8b_architecture
     model_type: granite
@@ -35,7 +51,7 @@ _templates:
       FLEX_HDMA_P2PSIZE: 268435456  # 256MB
       FLEX_HDMA_COLLSIZE: 33554432  # 32MB
     num_gpu_blocks_override: 8192
   granite_vision_33_2b_architecture: &granite_vision_33_2b_architecture
    model_type: llava_next
    text_config:
@@ -166,7 +182,7 @@ models:
         max_model_len: 32768
         max_num_seqs: 32
         device_config: *granite_8b_tp4_device_config
   # Llama 3.1 8B Instruct
   meta-llama/Llama-3.1-8B-Instruct:
     architecture: *llama3_8b_architecture
@@ -247,6 +263,15 @@ models:
         max_num_seqs: 32
         device_config: *granite_8b_tp4_device_config
+  # Granite 4.1 30B
+  ibm-granite/granite-4.1-30b:
+    architecture: *granite_41_30b_architecture
+    continuous_batching_configs:
+      - tp_size: 4
+        max_model_len: 32768
+        max_num_seqs: 32
+        device_config: *granite_41_30b_tp4_device_config
   # Granite Vision 3.3 2B
   ibm-granite/granite-vision-3.3-2b:
     architecture: *granite_vision_33_2b_architecture
@@ -262,7 +287,7 @@ models:
         max_model_len: 32768
         max_num_seqs: 32
         device_config: *granite_vision_2b_tp4_device_config
   # Mistral Small 3.2 24B Instruct
   mistralai/Mistral-Small-3.2-24B-Instruct-2506:
     architecture: *mistral3_24b_architecture
@@ -279,6 +304,9 @@ models:
   mistralai/Ministral-3-14B-Instruct-2512-BF16:
     architecture: *ministral3_14b_architecture
     continuous_batching_configs:
+      - tp_size: 1
+        max_model_len: 4096
+        max_num_seqs: 32
       - tp_size: 4
         max_model_len: 32768
         max_num_seqs: 32

{sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/envs.py RENAMED Viewed

@@ -26,6 +26,7 @@ if TYPE_CHECKING:
     SENDNN_INFERENCE_MODEL_CONFIG_FILE: str | None = None
     SENDNN_INFERENCE_CPU_MM_DTYPE: torch.dtype = torch.float16
     SENDNN_INFERENCE_MM_DEVICE: str = "auto"
+    SENDNN_INFERENCE_TP_MM_SHARING: bool = True
 logger = init_logger(__name__)
@@ -92,6 +93,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     ),
     # Allow sendnn-inference to update env vars related to multi-threading (eg. OMP)
     # based on the detected CPU cores and server configuration
+    # Multimodal models will not take into account the number of workers for configuration.
     "SENDNN_INFERENCE_UPDATE_THREAD_CONFIG": lambda: bool(
         int(os.getenv("SENDNN_INFERENCE_UPDATE_THREAD_CONFIG", "1"))
     ),
@@ -171,6 +173,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "SENDNN_INFERENCE_MM_DEVICE": lambda: parse_mm_device(
         os.getenv("SENDNN_INFERENCE_MM_DEVICE", "auto")
     ),
+    # When "1" (default), rank 0 runs the vision encoder and shares the result
+    # with other TP ranks via POSIX shared memory (one encoder call instead of
+    # world_size calls).  Set to "0" to fall back to every TP rank running the
+    # vision encoder independently — the original behaviour, which avoids any
+    # SHM-related failure modes at the cost of redundant CPU work.
+    "SENDNN_INFERENCE_TP_MM_SHARING": lambda: bool(
+        int(os.getenv("SENDNN_INFERENCE_TP_MM_SHARING", "1"))
+    ),
 }
 # --8<-- [end:env-vars-definition]

{sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/model_executor/model_loader/spyre.py RENAMED Viewed

@@ -72,6 +72,7 @@ class SpyreCausalLM(nn.Module):
         self.parallel_config = vllm_config.parallel_config
         self.cache_config = vllm_config.cache_config
         self.scheduler_config = vllm_config.scheduler_config
+        self.load_config = vllm_config.load_config
         self.dtype = self.get_dtype()
         # Wrappers for utils for multimodal
@@ -171,16 +172,30 @@ class SpyreCausalLM(nn.Module):
                     self.dtype,
                 )
-        is_local = os.path.isdir(model_config.model)
-        model_path = model_config.model
-        # Get location of model from HF cache.
-        if not is_local:
-            model_path = download_weights_from_hf(
-                model_name_or_path=model_path,
-                cache_dir=None,
-                allow_patterns=["*.safetensors", "*.bin", "*.pt"],
-                revision=model_config.revision,
+        # `--load-format dummy` skips the checkpoint download and routes through
+        # FMS's `hf_configured` path, which fetches only config.json and then
+        # random-inits the model via `reset_parameters()`.
+        variant: str | None = None
+        if self.load_config.load_format == "dummy":
+            logger.info(
+                "Loading model %s with random weights.",
+                model_config.model,
             )
+            architecture = "hf_configured"
+            variant = model_config.model
+            model_path: str | None = None
+        else:
+            architecture = "hf_pretrained"
+            is_local = os.path.isdir(model_config.model)
+            model_path = model_config.model
+            # Get location of model from HF cache.
+            if not is_local:
+                model_path = download_weights_from_hf(
+                    model_name_or_path=model_path,
+                    cache_dir=None,
+                    allow_patterns=["*.safetensors", "*.bin", "*.pt"],
+                    revision=model_config.revision,
+                )
         # Get any fixes needed that must be patched into the kwargs;
         # currently this is only used for multimodal models / llava next
@@ -192,7 +207,8 @@ class SpyreCausalLM(nn.Module):
             kwargs["rank"],
         ):
             self.fms_model = get_model(
-                architecture="hf_pretrained",
+                architecture=architecture,
+                variant=variant,
                 model_path=model_path,
                 distributed_strategy=distributed_strategy,
                 group=dist.group.WORLD,

{sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/platform.py RENAMED Viewed

@@ -242,6 +242,14 @@ class SpyrePlatform(Platform):
         if not is_decoder and not is_pooling:
             raise ValueError("Only the 'generate' and 'pooling' runners are supported")
+        if vllm_config.load_config.load_format == "dummy" and (
+            model_config.is_multimodal_model or is_pooling
+        ):
+            raise ValueError(
+                "--load-format dummy is only supported for text generation models; "
+                "random-weight init is not implemented for multimodal or pooling models."
+            )
         if parallel_config.worker_cls == "auto":
             parallel_config.worker_cls = "sendnn_inference.v1.worker.spyre_worker.SpyreWorker"
@@ -345,7 +353,7 @@ class SpyrePlatform(Platform):
                 scheduler_config.max_num_batched_tokens = (
                     model_config.max_model_len * scheduler_config.max_num_seqs
                 )
-                cache_config.block_size = model_config.max_model_len  # ty: ignore[invalid-assignment]
+                cache_config.block_size = model_config.max_model_len
                 vllm_config.cache_config.enable_prefix_caching = False
             else:
@@ -635,7 +643,12 @@ class SpyrePlatform(Platform):
         # NOTE: math.ceil can output a number for each worker that sums
         # to a total greater than cpu_count.
-        cpus_per_worker = math.ceil(cpu_count / worker_count) if cpu_count is not None else None
+        thread_factor = worker_count
+        if cls._config.model_config.is_multimodal_model:
+            # thread_factor value/formula subject to further tuning
+            thread_factor = 1
+        cpus_per_worker = math.ceil(cpu_count / thread_factor) if cpu_count is not None else None
         thread_warning = (
             "Excessive threads may result in CPU contention. "
@@ -821,7 +834,7 @@ class SpyrePlatform(Platform):
     @classmethod
     def _set_batch_tkv_limit_from_env(cls) -> None:
         try:
-            cls._max_batch_tkv_limit = int(os.getenv("VLLM_DT_MAX_BATCH_TKV_LIMIT", "-1"))  #  ty: ignore
+            cls._max_batch_tkv_limit = int(os.getenv("VLLM_DT_MAX_BATCH_TKV_LIMIT", "-1"))
         except ValueError as e:
             raise ValueError("VLLM_DT_MAX_BATCH_TKV_LIMIT must be an integer") from e

{sendnn_inference-2.2.0 → sendnn_inference-2.2.2}/sendnn_inference/v1/core/scheduler.py RENAMED Viewed

@@ -216,23 +216,6 @@ class ChunkedPrefillSpyreScheduler(SpyreScheduler):
             "Expecting an instance of CPSpyreModelRunnerOutput when doing chunked prefill."
         )
-        # Update the correct num_computed_tokens value given left-padding and
-        # prefix cache hit info
-        for req in self.ongoing_prefills:
-            # The number of computed tokens only need to be adapted when it is
-            # the first chunk of a multi-chunk prefill
-            is_first_chunk = req.num_computed_tokens <= self.chunk_size
-            is_last_chunk = req.num_computed_tokens == req.num_prompt_tokens
-            if is_first_chunk and not is_last_chunk:
-                left_padding = model_runner_output.left_padding.get(req.request_id, 0)
-                prefix_cache_len = model_runner_output.prefix_cache_hit_len.get(req.request_id, 0)
-                req.num_computed_tokens = self.adjust_computed_tokens(
-                    computed_tokens=req.num_computed_tokens,
-                    left_padding=left_padding,
-                    prefix_cache_len=prefix_cache_len,
-                )
         # Remove completed prefills
         self.ongoing_prefills = [
             req for req in self.ongoing_prefills if req.num_computed_tokens < req.num_prompt_tokens
@@ -250,21 +233,39 @@ class ChunkedPrefillSpyreScheduler(SpyreScheduler):
         return result
-    def adjust_computed_tokens(
-        self, computed_tokens: int, left_padding: int, prefix_cache_len: int
-    ) -> int:
-        """
-        Returns an adjusted `num_computed_tokens` given left padding and prefix
-        cache hit info.
-        """
-        # The prefix cache length is already adjusted for left padding.
-        # If it's bigger than the number of computed tokens, then we hit more
-        # prefix cache than we scheduled.
-        if prefix_cache_len > computed_tokens:
-            assert (prefix_cache_len + left_padding) % self.chunk_size == 0
-            return prefix_cache_len
-        # Otherwise just account for the left padding
-        return computed_tokens - left_padding
+    def _current_chunk_token_threshold(self, new_prefill_candidates: list[Request]) -> int:
+        """Returns the `long_prefill_token_threshold` to use for this step.
+        For the chunk-0 step, the threshold is capped at `chunk_size - left_padding`
+        so the base scheduler is aware of the padding blocks.
+        Otherwise return `chunk_size`: the natural chunk boundary."""
+        # If there are no new prefill candidates, no cap is needed.
+        if not new_prefill_candidates:
+            return self.chunk_size
+        new_prefill = new_prefill_candidates[0]
+        # Calculate left-padding tokens for this prompt.
+        prompt_len = new_prefill.num_prompt_tokens
+        n_chunks = math.ceil(prompt_len / self.chunk_size)
+        padded_prompt_len = math.ceil(prompt_len / self.block_size) * self.block_size
+        left_padding = n_chunks * self.chunk_size - padded_prompt_len
+        # If the prefix cache already covers chunk 0's real content, no cap is
+        # needed: the base scheduler will start from chunk i>=1, which has no
+        # padding. `get_computed_blocks` records into `prefix_cache_stats` as
+        # a side effect; the base scheduler calls it again, so toggle
+        # log_stats off here to avoid double-counting.
+        prev_log_stats = self.kv_cache_manager.log_stats
+        self.kv_cache_manager.log_stats = False
+        _, prefix_token_len = self.kv_cache_manager.get_computed_blocks(new_prefill)
+        self.kv_cache_manager.log_stats = prev_log_stats
+        if prefix_token_len >= self.chunk_size - left_padding:
+            return self.chunk_size
+        # Adjust the token threshold to account for left padding
+        return self.chunk_size - left_padding
     def _get_required_blocks(self, request: Request, max_output: bool = False) -> tuple[int, int]:
         """
@@ -404,7 +405,7 @@ class ChunkedPrefillSpyreScheduler(SpyreScheduler):
             ready_to_prefill = [
                 r
                 for r in self.waiting
-                if r.status != RequestStatus.WAITING_FOR_STRUCTURED_OUTPUT_GRAMMAR  # type: ignore[attr-defined]
+                if r.status != RequestStatus.WAITING_FOR_STRUCTURED_OUTPUT_GRAMMAR
             ]
             if ready_to_prefill:
                 new_prefill_candidates = list(self.waiting)
@@ -424,6 +425,15 @@ class ChunkedPrefillSpyreScheduler(SpyreScheduler):
             self.previous_step_was_prefill = False
             running_holdback = []
+        # Cap chunk-0 token count to chunk_size - left_padding so the upstream KV
+        # cache manager doesn't allocate real blocks for the left-padding region.
+        # Only matters at chunk 0; later chunks land on natural chunk boundaries.
+        # Mutating scheduler_config is safe: the SpyreScheduler is the only
+        # scheduler in this engine and at most one prefill is in flight per step.
+        self.scheduler_config.long_prefill_token_threshold = self._current_chunk_token_threshold(
+            new_prefill_candidates
+        )
         # delegate to super of SpyreScheduler: base V1 Scheduler
         outputs = super(SpyreScheduler, self).schedule()

sendnn_inference-2.2.2/sendnn_inference/v1/worker/mm_shared_memory.py ADDED Viewed

@@ -0,0 +1,130 @@
+"""Utilities for sharing multimodal embeddings across TP ranks via POSIX shared memory.
+During chunked prefill rank 0 computes the full vision-encoder embeddings and writes
+them here; non-zero ranks read after synchronisation in the model runner.
+This avoids running the (CPU-bound) vision encoder world_size times per request.
+"""
+import hashlib
+from multiprocessing.shared_memory import SharedMemory
+import torch
+from vllm.logger import init_logger
+logger = init_logger(__name__)
+# Stable mapping between torch dtypes and compact integer identifiers used in
+# the broadcast metadata tensor.  torch.frombuffer handles all dtypes natively.
+_DTYPE_TO_IDX: dict[torch.dtype, int] = {
+    torch.float16: 0,
+    torch.float32: 1,
+    torch.bfloat16: 2,
+}
+_IDX_TO_DTYPE: dict[int, torch.dtype] = {v: k for k, v in _DTYPE_TO_IDX.items()}
+def dtype_to_idx(dtype: torch.dtype) -> int:
+    """Encode a torch dtype as a compact integer for the broadcast metadata tensor."""
+    if dtype not in _DTYPE_TO_IDX:
+        raise ValueError(f"Unsupported dtype for SHM transfer: {dtype}")
+    return _DTYPE_TO_IDX[dtype]
+def idx_to_dtype(idx: int) -> torch.dtype:
+    """Decode a compact integer back to the corresponding torch dtype."""
+    if idx not in _IDX_TO_DTYPE:
+        raise ValueError(f"Unknown dtype index: {idx}")
+    return _IDX_TO_DTYPE[idx]
+def _shm_name(req_id: str) -> str:
+    """Generate a short, deterministic POSIX SHM name for a request.
+    Uses an MD5 hash of the *full* req_id so that requests which share a
+    common prefix (e.g. all benchmark requests in a run share the
+    ``chatcmpl-bench-<uuid>-`` prefix) still get distinct SHM names.
+    Truncating the req_id (the previous approach) caused silent collisions:
+    ``chatcmpl-bench-34e3ed2d-1-…`` and ``chatcmpl-bench-34e3ed2d-39-…``
+    both truncate to the same 20-char prefix, so every request in the benchmark
+    wrote to the same SHM segment — corrupting each other's embeddings.
+    Linux NAME_MAX is 255; macOS requires ≤ 30 chars for the name itself
+    (the kernel prefixes it with ``/``).  'sm' + 16 hex chars = 18 chars,
+    safely within every platform's limit.
+    """
+    digest = hashlib.md5(req_id.encode(), usedforsecurity=False).hexdigest()[:16]
+    return f"sm{digest}"
+def write_embeddings(tensor: torch.Tensor, req_id: str) -> SharedMemory:
+    """Write *tensor* to a shared-memory block keyed by *req_id*.
+    Returns the ``SharedMemory`` handle — the caller must keep it and pass it
+    to :func:`cleanup_embeddings` after all ranks have read.
+    Shape and dtype are NOT stored in SHM; the caller broadcasts them via a
+    tiny ``torch.distributed.broadcast`` so readers already have that info
+    before calling :func:`read_embeddings`.
+    """
+    if tensor.device.type != "cpu":
+        tensor = tensor.cpu()
+    tensor = tensor.contiguous()
+    assert tensor.ndim == 3, f"Expected 3-D embedding tensor, got shape {tensor.shape}"
+    assert tensor.dtype in _DTYPE_TO_IDX, f"Unsupported dtype for SHM transfer: {tensor.dtype}"
+    data_shm = SharedMemory(create=True, size=tensor.nbytes, name=_shm_name(req_id))
+    torch.frombuffer(data_shm.buf, dtype=tensor.dtype).reshape(tensor.shape).copy_(tensor)
+    logger.debug(
+        "Wrote MM embeddings to SHM for req '%s': shape=%s dtype=%s bytes=%d",
+        req_id,
+        tuple(tensor.shape),
+        tensor.dtype,
+        tensor.nbytes,
+    )
+    return data_shm
+def read_embeddings(
+    req_id: str,
+    shape: tuple[int, int, int],
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    """Read embeddings from shared memory and return a detached CPU tensor.
+    *shape* and *dtype* must be provided by the caller (obtained from the
+    broadcast metadata tensor) — they are not re-read from SHM.
+    Opens and closes the shared-memory handle internally.
+    """
+    data_shm = SharedMemory(name=_shm_name(req_id))
+    # .clone() detaches the tensor from the SHM buffer so the handle can be closed.
+    result = torch.frombuffer(data_shm.buf, dtype=dtype).reshape(shape).clone()
+    data_shm.close()
+    logger.debug(
+        "Read MM embeddings from SHM for req '%s': shape=%s dtype=%s",
+        req_id,
+        shape,
+        dtype,
+    )
+    return result
+def cleanup_embeddings(data_shm: SharedMemory) -> None:
+    """Unlink and close the shared-memory block.
+    Safe to call even if the block was already cleaned up — exceptions are
+    logged but not re-raised.
+    """
+    try:
+        data_shm.unlink()
+    except Exception as exc:
+        logger.debug("SHM unlink skipped (%s): %s", data_shm.name, exc)
+    try:
+        data_shm.close()
+    except Exception as exc:
+        logger.debug("SHM close skipped (%s): %s", data_shm.name, exc)

sendnn-inference 2.2.0__tar.gz → 2.2.2__tar.gz

sendnn-inference 2.2.0__tar.gz → 2.2.2__tar.gz