PyPI - embedl-deploy-tensorrt - Versions diffs - 0.4.1__tar.gz → 0.5.0__tar.gz - Mend

embedl-deploy-tensorrt 0.4.1.tar.gz → 0.5.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

{embedl_deploy_tensorrt-0.4.1 → embedl_deploy_tensorrt-0.5.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: embedl-deploy-tensorrt
-Version: 0.4.1
+Version: 0.5.0
 Summary: TensorRT backend for embedl-deploy.
 Author-email: Embedl AB <support@embedl.com>
 Project-URL: Homepage, https://www.embedl.com/
@@ -54,9 +54,10 @@ hardware target ensuring correct quantization and compilation.
 ## Supported Backends
-| Backend                 | Status      |
-|-------------------------|-------------|
-| NVIDIA TensorRT (v10.3) | Supported   |
+| Backend                   | Status          |
+|---------------------------|-----------------|
+| NVIDIA TensorRT  (v10.3)  | Supported       |
+| Lattice SensAI (v8.0)     | In Development  |
 Contact Embedl for other backends.
@@ -71,7 +72,7 @@ intermediate.
 ---
-## Quick Start
+## Quick Start for TensorRT Backend
 ```python
 import torch
@@ -85,7 +86,7 @@ model = Model().eval()
 example_input = torch.randn(1, 3, 224, 224)
 # 2. Transform — fuse and optimize for TensorRT in one call
-# For more compatibilty you can trace your model with torch.export.export
+# For more compatibility you can trace your model with torch.export.export
 # as follows:
 # model = torch.export.export(model, (example_input)).module()
 res = transform(model, patterns=TENSORRT_PATTERNS)

{embedl_deploy_tensorrt-0.4.1 → embedl_deploy_tensorrt-0.5.0}/README.md RENAMED Viewed

@@ -35,9 +35,10 @@ hardware target ensuring correct quantization and compilation.
 ## Supported Backends
-| Backend                 | Status      |
-|-------------------------|-------------|
-| NVIDIA TensorRT (v10.3) | Supported   |
+| Backend                   | Status          |
+|---------------------------|-----------------|
+| NVIDIA TensorRT  (v10.3)  | Supported       |
+| Lattice SensAI (v8.0)     | In Development  |
 Contact Embedl for other backends.
@@ -52,7 +53,7 @@ intermediate.
 ---
-## Quick Start
+## Quick Start for TensorRT Backend
 ```python
 import torch
@@ -66,7 +67,7 @@ model = Model().eval()
 example_input = torch.randn(1, 3, 224, 224)
 # 2. Transform — fuse and optimize for TensorRT in one call
-# For more compatibilty you can trace your model with torch.export.export
+# For more compatibility you can trace your model with torch.export.export
 # as follows:
 # model = torch.export.export(model, (example_input)).module()
 res = transform(model, patterns=TENSORRT_PATTERNS)

{embedl_deploy_tensorrt-0.4.1 → embedl_deploy_tensorrt-0.5.0}/src/embedl_deploy/_internal/tensorrt/backend.py RENAMED Viewed

@@ -11,6 +11,7 @@ from embedl_deploy._internal.tensorrt.plan import (
 )
 BACKEND = Backend(
+    name="tensorrt",
     conversion_patterns=TENSORRT_CONVERSION_PATTERNS,
     fusion_patterns=TENSORRT_FUSION_PATTERNS,
     smooth_patterns=TENSORRT_SMOOTH_PATTERNS,

{embedl_deploy_tensorrt-0.4.1 → embedl_deploy_tensorrt-0.5.0}/src/embedl_deploy/_internal/tensorrt/modules/attention.py RENAMED Viewed

@@ -69,7 +69,7 @@ class MHAInProjection(ConvertedModule):
         v = v.view(batch, seq, self.num_heads, self.head_dim).transpose(1, 2)
         return q, k, v
-    def __repr__(self) -> str:
+    def __repr__(self) -> str:  # pragma: no cover
         embed_dim = self.num_heads * self.head_dim
         return (
             f"MHAInProjection("
@@ -80,7 +80,7 @@ class MHAInProjection(ConvertedModule):
 class ScaledDotProductAttention(ConvertedModule):
-    """Core attention: ``softmax(Q · Kᵀ / √H) · V``.
+    """Core attention: ``softmax(Q · Kᵀ · scale) · V``.
     :param num_heads:
         Number of attention heads.
@@ -88,6 +88,14 @@ class ScaledDotProductAttention(ConvertedModule):
         Dimension of each head.
     :param dropout:
         Dropout probability (applied during training only).
+    :param is_causal:
+        Whether to apply a causal mask. Mirrors the ``is_causal`` kwarg
+        of ``F.scaled_dot_product_attention``.
+    :param scale:
+        Explicit attention score scale (multiplied on Q·Kᵀ). When
+        ``None`` the PyTorch default ``1/√head_dim`` is used. Models
+        that pre-scale Q themselves (e.g. chronos-2 + RoPE) must pass
+        ``scale=1.0`` so the default scaling does not apply twice.
     """
     def __init__(
@@ -95,11 +103,15 @@ class ScaledDotProductAttention(ConvertedModule):
         num_heads: int,
         head_dim: int,
         dropout: float = 0.0,
+        is_causal: bool = False,
+        scale: float | None = None,
     ) -> None:
         super().__init__()
         self.num_heads = num_heads
         self.head_dim = head_dim
         self.dropout = dropout
+        self.is_causal = is_causal
+        self.scale = scale
     def forward(
         self,
@@ -117,8 +129,9 @@ class ScaledDotProductAttention(ConvertedModule):
         :param v:
             Value tensor ``[B, num_heads, S, head_dim]``.
         :param attn_mask:
-            Optional attention mask. ``aten.scaled_dot_product_attention``
-            takes an optional 4th positional arg; ``WrapAtenSDPAPattern``
+            Optional attention mask.
+            ``torch.nn.functional.scaled_dot_product_attention`` takes an
+            optional 4th positional arg; ``WrapFunctionalSDPAPattern``
             forwards whatever positional args were on the source node, so
             this module accepts the mask too. SAM3, masked-LM, and
             similar models that compile with mixed-mask attention rely
@@ -135,14 +148,18 @@ class ScaledDotProductAttention(ConvertedModule):
             v,
             attn_mask=attn_mask,
             dropout_p=self.dropout if self.training else 0.0,
+            is_causal=self.is_causal,
+            scale=self.scale,
         )
-    def __repr__(self) -> str:
+    def __repr__(self) -> str:  # pragma: no cover
         return (
             f"ScaledDotProductAttention("
             f"num_heads={self.num_heads}, "
             f"head_dim={self.head_dim}, "
-            f"dropout={self.dropout})"
+            f"dropout={self.dropout}, "
+            f"is_causal={self.is_causal}, "
+            f"scale={self.scale})"
         )
@@ -197,7 +214,7 @@ class FusedMHAInProjection(FusedModule):
         v = v.view(batch, seq, num_heads, head_dim).transpose(1, 2)
         return q, k, v
-    def __repr__(self) -> str:
+    def __repr__(self) -> str:  # pragma: no cover
         embed_dim = self.in_proj.num_heads * self.in_proj.head_dim
         return (
             f"FusedMHAInProjection("
@@ -275,10 +292,18 @@ class FusedScaledDotProductAttention(FusedModule):
         # MHA kernel onto the slower INT8-aware variant for no gain.
         if not self.surrounded or not self.softmax_quant.enabled:
             return self.attention(q, k, v, attn_mask)
-        # Use ``1/sqrt(head_dim)`` rather than ``head_dim ** -0.5``: the
+        # Honour the wrapped attention module's explicit ``scale`` if
+        # set — models that pre-scale Q themselves (chronos-2 + RoPE,
+        # for example) build with ``scale=1.0`` to disable the default
+        # ``1/sqrt(head_dim)`` scaling. Falling back to the default
+        # here would apply it twice and collapse softmax.
+        # Note on ``1/sqrt(head_dim)`` vs ``head_dim ** -0.5``: the
         # tensor Pow with a negative float exponent traces to ONNX as a
         # ``Cast → complex128`` node that TRT 10.x can't parse.
-        scale = 1.0 / math.sqrt(q.shape[-1])
+        if self.attention.scale is not None:
+            scale = self.attention.scale
+        else:
+            scale = 1.0 / math.sqrt(q.shape[-1])
         attn_weight = torch.matmul(q, k.transpose(-2, -1)) * scale
         if attn_mask is not None:
             if attn_mask.dtype == torch.bool:
@@ -297,7 +322,7 @@ class FusedScaledDotProductAttention(FusedModule):
             )
         return torch.matmul(attn_weight, v)
-    def __repr__(self) -> str:
+    def __repr__(self) -> str:  # pragma: no cover
         a = self.attention
         qdq = "yes" if self.softmax_quant.enabled else "no"
         return (

{embedl_deploy_tensorrt-0.4.1 → embedl_deploy_tensorrt-0.5.0}/src/embedl_deploy/_internal/tensorrt/modules/conv.py RENAMED Viewed

@@ -91,7 +91,7 @@ class FusedConvBNAct(FusedModule):
             x = self.bn(x)
         return self.act(x)
-    def __repr__(self) -> str:
+    def __repr__(self) -> str:  # pragma: no cover
         bn_info = ""
         if self.bn is not None:
             bn_info = f", bn={self.bn.num_features} (foldable)"
@@ -129,7 +129,7 @@ class FusedConvBN(FusedModule):
             x = self.bn(x)
         return x
-    def __repr__(self) -> str:
+    def __repr__(self) -> str:  # pragma: no cover
         bn_info = ""
         if self.bn is not None:
             bn_info = f", bn={self.bn.num_features} (foldable)"
@@ -168,7 +168,7 @@ class FusedConvBNActMaxPool(FusedModule):
         x = self.act(x)
         return self.maxpool(x)
-    def __repr__(self) -> str:
+    def __repr__(self) -> str:  # pragma: no cover
         bn_info = ""
         if self.bn is not None:
             bn_info = f", bn={self.bn.num_features} (foldable)"
@@ -213,7 +213,7 @@ class FusedConvBNAddAct(FusedModule):
         x = self.bn(x)
         return self.act(x + residual)
-    def __repr__(self) -> str:
+    def __repr__(self) -> str:  # pragma: no cover
         return (
             f"FusedConvBNAddAct("
             f"{self.conv.in_channels}→{self.conv.out_channels}, "

{embedl_deploy_tensorrt-0.4.1 → embedl_deploy_tensorrt-0.5.0}/src/embedl_deploy/_internal/tensorrt/modules/linear.py RENAMED Viewed

@@ -82,7 +82,7 @@ class FusedLinear(FusedModule):
         # pylint: disable-next=not-callable
         return F.linear(x, weight, self.linear.bias)
-    def __repr__(self) -> str:
+    def __repr__(self) -> str:  # pragma: no cover
         return (
             f"FusedLinear("
             f"{self.linear.in_features}→{self.linear.out_features})"
@@ -113,7 +113,7 @@ class FusedLinearAct(FusedModule):
         x = F.linear(x, weight, self.linear.bias)
         return self.act(x)
-    def __repr__(self) -> str:
+    def __repr__(self) -> str:  # pragma: no cover
         act_name = type(self.act).__name__
         return (
             f"FusedLinearAct("
@@ -151,7 +151,7 @@ class FusedLayerNorm(FusedModule):
         """Apply ``layer_norm``."""
         return self.layer_norm(x)
-    def __repr__(self) -> str:
+    def __repr__(self) -> str:  # pragma: no cover
         return (
             f"FusedLayerNorm("
             f"normalized_shape={self.layer_norm.normalized_shape}, "

{embedl_deploy_tensorrt-0.4.1 → embedl_deploy_tensorrt-0.5.0}/src/embedl_deploy/_internal/tensorrt/modules/pointwise.py RENAMED Viewed

@@ -34,5 +34,5 @@ class FusedActAdd(FusedModule):
         """Apply ``act(x) + residual``."""
         return self.act(x) + residual
-    def __repr__(self) -> str:
+    def __repr__(self) -> str:  # pragma: no cover
         return f"FusedActAdd({type(self.act).__name__})"

{embedl_deploy_tensorrt-0.4.1 → embedl_deploy_tensorrt-0.5.0}/src/embedl_deploy/_internal/tensorrt/modules/pool.py RENAMED Viewed

@@ -21,5 +21,5 @@ class FusedAdaptiveAvgPool2d(FusedModule):
         """Apply adaptive average pooling."""
         return self.pool(x)
-    def __repr__(self) -> str:
+    def __repr__(self) -> str:  # pragma: no cover
         return f"FusedAdaptiveAvgPool2d(output_size={self.pool.output_size})"

{embedl_deploy_tensorrt-0.4.1 → embedl_deploy_tensorrt-0.5.0}/src/embedl_deploy/_internal/tensorrt/modules/swin_attention.py RENAMED Viewed

@@ -9,7 +9,7 @@ are spatial rearrangements that need no quantization.
 computes the shifted-window attention mask.
 """
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 import torch
 import torch.nn.functional as F
@@ -19,7 +19,7 @@ from embedl_deploy._internal.core.modules import ConvertedModule, FusedModule
 from embedl_deploy._internal.core.quantize.stubs import QuantStub
-@dataclass
+@dataclass(eq=False)
 class SwinSpatialState:
     """Shared mutable state for spatial dimensions.
@@ -34,9 +34,7 @@ class SwinSpatialState:
     pad_height: int = 0
     pad_width: int = 0
     #: Effective shift size after clamping for small feature maps.
-    effective_shift_size: list[int] = field(
-        default_factory=lambda: [0, 0],
-    )
+    effective_shift_size: tuple[int, int] = (0, 0)
 class SwinWindowPartition(ConvertedModule):
@@ -71,31 +69,29 @@ class SwinWindowPartition(ConvertedModule):
         :returns:
             Windowed tensor ``[B*nW, Ws*Ws, C]``.
         """
-        # pylint: disable-next=invalid-name
-        B, H, W, C = x.shape
+        b, h, w, c = x.shape
         # Pad to multiples of window size.
         ws_h, ws_w = self.window_size
-        pad_b = (ws_h - H % ws_h) % ws_h
-        pad_r = (ws_w - W % ws_w) % ws_w
+        pad_b = (ws_h - h % ws_h) % ws_h
+        pad_r = (ws_w - w % ws_w) % ws_w
         x = F.pad(x, (0, 0, 0, pad_r, 0, pad_b))
-        # pylint: disable-next=invalid-name
-        _, pad_H, pad_W, _ = x.shape
+        _, pad_h, pad_w, _ = x.shape
         # Clamp shift size when the window covers the whole feature map.
-        eff_shift = list(self.shift_size)
-        if ws_h >= pad_H:
-            eff_shift[0] = 0
-        if ws_w >= pad_W:
-            eff_shift[1] = 0
+        sh, sw = self.shift_size
+        eff_shift = (
+            0 if ws_h >= pad_h else sh,
+            0 if ws_w >= pad_w else sw,
+        )
         # Write spatial state for downstream modules.
         st = self._spatial_state
-        st.batch_size = B
-        st.height = H
-        st.width = W
-        st.pad_height = pad_H
-        st.pad_width = pad_W
+        st.batch_size = b
+        st.height = h
+        st.width = w
+        st.pad_height = pad_h
+        st.pad_width = pad_w
         st.effective_shift_size = eff_shift
         # Cyclic shift.
@@ -108,22 +104,22 @@ class SwinWindowPartition(ConvertedModule):
         # Window partition.
         x = x.view(
-            B,
-            pad_H // ws_h,
+            b,
+            pad_h // ws_h,
             ws_h,
-            pad_W // ws_w,
+            pad_w // ws_w,
             ws_w,
-            C,
+            c,
         )
-        num_windows = (pad_H // ws_h) * (pad_W // ws_w)
+        num_windows = (pad_h // ws_h) * (pad_w // ws_w)
         x = x.permute(0, 1, 3, 2, 4, 5).reshape(
-            B * num_windows,
+            b * num_windows,
             ws_h * ws_w,
-            C,
+            c,
         )
         return x
-    def __repr__(self) -> str:
+    def __repr__(self) -> str:  # pragma: no cover
         return (
             f"SwinWindowPartition("
             f"window_size={self.window_size}, "
@@ -177,11 +173,10 @@ class SwinAttention(ConvertedModule):
     def _get_relative_position_bias(self) -> torch.Tensor:
         """Compute relative position bias ``[1, nH, N, N]``."""
-        # pylint: disable-next=invalid-name
-        N = self.window_size[0] * self.window_size[1]
+        n = self.window_size[0] * self.window_size[1]
         assert isinstance(self.relative_position_index, torch.Tensor)
         bias = self.relative_position_bias_table[self.relative_position_index]
-        bias = bias.view(N, N, -1).permute(2, 0, 1).contiguous()
+        bias = bias.view(n, n, -1).permute(2, 0, 1).contiguous()
         return bias.unsqueeze(0)
     def _compute_attn_mask(  # pylint: disable=too-many-locals
@@ -193,13 +188,13 @@ class SwinAttention(ConvertedModule):
         if sum(eff_shift) == 0:
             return None
-        pad_H = st.pad_height  # pylint: disable=invalid-name
-        pad_W = st.pad_width  # pylint: disable=invalid-name
+        pad_h = st.pad_height
+        pad_w = st.pad_width
         ws_h, ws_w = self.window_size
-        num_windows = (pad_H // ws_h) * (pad_W // ws_w)
+        num_windows = (pad_h // ws_h) * (pad_w // ws_w)
         attn_mask = torch.zeros(
-            (pad_H, pad_W),
+            (pad_h, pad_w),
             device=self.relative_position_bias_table.device,
         )
         h_slices = (
@@ -219,9 +214,9 @@ class SwinAttention(ConvertedModule):
                 count += 1
         attn_mask = attn_mask.view(
-            pad_H // ws_h,
+            pad_h // ws_h,
             ws_h,
-            pad_W // ws_w,
+            pad_w // ws_w,
             ws_w,
         )
         attn_mask = attn_mask.permute(0, 2, 1, 3).reshape(
@@ -259,10 +254,9 @@ class SwinAttention(ConvertedModule):
         attn_mask = self._compute_attn_mask()
         if attn_mask is not None:
-            # pylint: disable-next=invalid-name
-            B = self._spatial_state.batch_size
-            nW = attn.size(0) // B  # pylint: disable=invalid-name
-            attn = attn.view(B, nW, self.num_heads, -1, attn.size(-1))
+            b = self._spatial_state.batch_size
+            n_w = attn.size(0) // b
+            attn = attn.view(b, n_w, self.num_heads, -1, attn.size(-1))
             attn = attn + attn_mask.unsqueeze(1).unsqueeze(0)
             attn = attn.view(-1, self.num_heads, attn.size(-2), attn.size(-1))
@@ -282,7 +276,7 @@ class SwinAttention(ConvertedModule):
         )
         return x
-    def __repr__(self) -> str:
+    def __repr__(self) -> str:  # pragma: no cover
         embed_dim = self.num_heads * self.head_dim
         return (
             f"SwinAttention("
@@ -320,27 +314,25 @@ class SwinWindowReverse(ConvertedModule):
             Spatial tensor ``[B, H, W, C]``.
         """
         st = self._spatial_state
-        # pylint: disable=invalid-name
-        B = st.batch_size
-        pad_H = st.pad_height
-        pad_W = st.pad_width
-        H = st.height
-        W = st.width
-        # pylint: enable=invalid-name
+        b = st.batch_size
+        pad_h = st.pad_height
+        pad_w = st.pad_width
+        h = st.height
+        w = st.width
         eff_shift = st.effective_shift_size
         ws_h, ws_w = self.window_size
-        C = x.size(-1)  # pylint: disable=invalid-name
+        c = x.size(-1)
         # Reverse window partition.
         x = x.view(
-            B,
-            pad_H // ws_h,
-            pad_W // ws_w,
+            b,
+            pad_h // ws_h,
+            pad_w // ws_w,
             ws_h,
             ws_w,
-            C,
+            c,
         )
-        x = x.permute(0, 1, 3, 2, 4, 5).reshape(B, pad_H, pad_W, C)
+        x = x.permute(0, 1, 3, 2, 4, 5).reshape(b, pad_h, pad_w, c)
         # Reverse cyclic shift.
         if sum(eff_shift) > 0:
@@ -351,10 +343,10 @@ class SwinWindowReverse(ConvertedModule):
             )
         # Remove padding.
-        x = x[:, :H, :W, :].contiguous()
+        x = x[:, :h, :w, :].contiguous()
         return x
-    def __repr__(self) -> str:
+    def __repr__(self) -> str:  # pragma: no cover
         return f"SwinWindowReverse(window_size={self.window_size})"
@@ -411,13 +403,13 @@ class FusedSwinAttention(FusedModule):
         # pylint: disable-next=protected-access
         attn_mask = a._compute_attn_mask()  # noqa: SLF001
         if attn_mask is not None:
-            # pylint: disable-next=protected-access,invalid-name
-            B = a._spatial_state.batch_size  # noqa: SLF001
-            nW = attn_weight.size(0) // B  # pylint: disable=invalid-name
+            # pylint: disable-next=protected-access
+            b = a._spatial_state.batch_size  # noqa: SLF001
+            n_w = attn_weight.size(0) // b
             n = attn_weight.size(-2)
             attn_weight = attn_weight.view(
-                B,
-                nW,
+                b,
+                n_w,
                 a.num_heads,
                 n,
                 attn_weight.size(-1),
@@ -448,7 +440,7 @@ class FusedSwinAttention(FusedModule):
             )
         )
-    def __repr__(self) -> str:
+    def __repr__(self) -> str:  # pragma: no cover
         a = self.attention
         qdq = "yes" if self.softmax_quant.enabled else "no"
         return (

embedl-deploy-tensorrt 0.4.1__tar.gz → 0.5.0__tar.gz

embedl-deploy-tensorrt 0.4.1.tar.gz → 0.5.0.tar.gz