PyPI - embedl-deploy - Versions diffs - 0.2.0__tar.gz → 0.3.0__tar.gz - Mend

embedl-deploy 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (63) hide show

embedl_deploy-0.3.0/MANIFEST.in ADDED Viewed

@@ -0,0 +1,8 @@
+prune *
+graft src
+include LICENSE
+include NOTICE
+include README.md
+global-exclude CLAUDE.md
+global-exclude *.pyc
+global-exclude __pycache__

{embedl_deploy-0.2.0/src/embedl_deploy.egg-info → embedl_deploy-0.3.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: embedl-deploy
-Version: 0.2.0
+Version: 0.3.0
 Summary: Python package to make AI models deployment-ready for any hardware.
 Author-email: Embedl AB <support@embedl.com>
 Project-URL: Homepage, https://www.embedl.com/

{embedl_deploy-0.2.0 → embedl_deploy-0.3.0}/src/embedl_deploy/_internal/core/pattern.py RENAMED Viewed

@@ -287,6 +287,10 @@ class Pattern(ABC):
     ``symbolic_trace``. This pattern has no effect on graphs exported with
     ``torch.export`` because the nodes never appear in those graphs."""
+    export_graph_only: bool = False
+    """If ``True``, this pattern targets nodes that only appear in
+    ``torch.export`` aten graphs and has no effect on symbolic-trace output."""
     @abstractmethod
     def match(self, graph_module: fx.GraphModule) -> list["PatternMatch"]:
         """Find all occurrences of this pattern in `graph_module`.

{embedl_deploy-0.2.0 → embedl_deploy-0.3.0}/src/embedl_deploy/_internal/core/plan.py RENAMED Viewed

@@ -149,6 +149,18 @@ def get_transformation_plan(
         graph_module = copy.deepcopy(graph_module)
         setattr(graph_module, "_deep_copy_done", True)
+    # Strip torch.export shape-guard nodes that ShapeProp cannot evaluate.
+    guards = [
+        n
+        for n in graph_module.graph.nodes
+        if n.op == "call_module" and n.name.startswith("_guards")
+    ]
+    for node in guards:
+        node.replace_all_uses_with(next(iter(node.args)))
+        graph_module.graph.erase_node(node)
+    if guards:
+        graph_module.recompile()
     pattern_matches: list[PatternMatch] = []
     for pattern in patterns:
         pattern_matches.extend(pattern.match(graph_module))
@@ -219,10 +231,17 @@ def apply_transformation_plan(
         graph_module.recompile()
     graph_module.eval()
-    input_node = next(iter(graph_module.graph.nodes))
-    meta = input_node.meta.get("tensor_meta")
-    if meta is not None and hasattr(meta, "shape"):
-        ShapeProp(graph_module).propagate(torch.randn(meta.shape))  # type: ignore[no-untyped-call]
+    fake_args: list[torch.Tensor] = []
+    for n in graph_module.graph.nodes:
+        if n.op != "placeholder":
+            continue
+        meta = n.meta.get("tensor_meta")
+        if meta is None or not hasattr(meta, "shape"):
+            fake_args.clear()
+            break
+        fake_args.append(torch.randn(meta.shape))
+    if fake_args:
+        ShapeProp(graph_module).propagate(*fake_args)  # type: ignore[no-untyped-call]
     report = _build_report(enabled, skipped)

{embedl_deploy-0.2.0 → embedl_deploy-0.3.0}/src/embedl_deploy/_internal/tensorrt/modules/attention.py RENAMED Viewed

@@ -12,9 +12,10 @@ import torch.nn.functional as F
 from torch import nn
 from embedl_deploy._internal.core.modules import ConvertedModule, FusedModule
-from embedl_deploy._internal.core.quantize.stubs import (
-    QuantStub,
-    WeightFakeQuantize,
+from embedl_deploy._internal.core.quantize.stubs import QuantStub
+from embedl_deploy._internal.tensorrt.modules.linear import (
+    attach_int8_weight_quant,
+    maybe_quantize_weight,
 )
@@ -153,7 +154,7 @@ class FusedMHAInProjection(FusedModule):
     def __init__(self, in_proj: MHAInProjection) -> None:
         super().__init__()
         self.in_proj = in_proj
-        self.weight_fake_quant = WeightFakeQuantize({self})
+        attach_int8_weight_quant(self, in_proj.linear)
     def forward(
         self,
@@ -173,7 +174,7 @@ class FusedMHAInProjection(FusedModule):
         :returns:
             Tuple ``(Q, K, V)`` each of shape ``[B, num_heads, S, head_dim]``.
         """
-        weight = self.weight_fake_quant(self.in_proj.linear.weight)
+        weight = maybe_quantize_weight(self, self.in_proj.linear.weight)
         batch, seq, _ = query.shape
         # pylint: disable-next=not-callable
         qkv = F.linear(query, weight, self.in_proj.linear.bias)

{embedl_deploy-0.2.0 → embedl_deploy-0.3.0}/src/embedl_deploy/_internal/tensorrt/modules/conv.py RENAMED Viewed

@@ -22,13 +22,19 @@ from embedl_deploy._internal.core.quantize.stubs import (
 def _is_int8_compatible_conv(conv: nn.Conv2d) -> bool:
-    """Return ``True`` unless `conv` is a grouped conv violating TRT INT8.
-    TensorRT requires ``in_channels / groups`` and
-    ``out_channels / groups`` to both be multiples of 4 for INT8.
+    """Return ``True`` unless *conv* is a grouped conv violating TRT INT8.
+    TensorRT's documented constraint for ``IConvolutionLayer`` is that
+    ``in_channels / groups`` and ``out_channels / groups`` must both
+    be multiples of 4 in INT8 mode.  Depthwise convolutions
+    (``groups == in_channels``) are an exception: our benchmarks on
+    the target devices show they still benefit from INT8 despite
+    channels-per-group being 1, so we let them through.
     """
     if conv.groups <= 1:
         return True
+    if conv.groups == conv.in_channels:
+        return True
     in_per_group: int = conv.in_channels // conv.groups
     out_per_group: int = conv.out_channels // conv.groups
     return in_per_group % 4 == 0 and out_per_group % 4 == 0

{embedl_deploy-0.2.0 → embedl_deploy-0.3.0}/src/embedl_deploy/_internal/tensorrt/modules/linear.py RENAMED Viewed

@@ -14,6 +14,53 @@ from embedl_deploy._internal.core.quantize.stubs import (
     WeightFakeQuantize,
 )
+#: Minimum ``K * N / (K + N)`` for INT8 to outperform FP16.
+INT8_LINEAR_MIN_RATIO: int = 256
+def is_int8_beneficial_linear(linear: nn.Linear) -> bool:
+    """Return ``True`` when INT8 quantisation benefits *linear*.
+    Uses the harmonic mean of the weight dimensions ``K * N / (K + N)``
+    as a proxy for the ratio of INT8 compute savings to Q/DQ reformat
+    overhead.  Below :data:`INT8_LINEAR_MIN_RATIO`, the overhead from
+    quantise/dequantise boundary layers exceeds any INT8 GEMM speedup
+    and the layer is better left in FP16.
+    Reference: NVIDIA benchmarks show INT8 GEMM outperforms FP16 only
+    when all three matrix dimensions exceed ~2048 (A100).  The harmonic
+    mean threshold of 256 conservatively separates mobile-class models
+    (MobileViT FFN ratio ≤ 160) from server-class models (ViT-B/16 FFN
+    ratio = 614) where INT8 is beneficial.
+    """
+    k, n = linear.in_features, linear.out_features
+    return k * n / (k + n) >= INT8_LINEAR_MIN_RATIO
+def attach_int8_weight_quant(
+    mod: FusedModule,
+    linear: nn.Linear,
+) -> None:
+    """Attach a ``WeightFakeQuantize`` to *mod* when INT8 helps *linear*.
+    When INT8 wouldn't pay for its Q/DQ boundary cost, also clear
+    ``mod.input_quant_stubs`` so the surrounding Q/DQ pass leaves the
+    wrapped linear entirely in FP16.
+    """
+    if is_int8_beneficial_linear(linear):
+        mod.weight_fake_quant = WeightFakeQuantize({mod})
+    else:
+        mod.input_quant_stubs = {}
+def maybe_quantize_weight(
+    mod: nn.Module,
+    weight: torch.Tensor,
+) -> torch.Tensor:
+    """Fake-quantize *weight* through ``mod.weight_fake_quant`` if present."""
+    wfq = getattr(mod, "weight_fake_quant", None)
+    return wfq(weight) if wfq is not None else weight
 class FusedLinear(FusedModule):
     """Fused wrapper for a standalone ``Linear`` layer.
@@ -27,11 +74,11 @@ class FusedLinear(FusedModule):
     def __init__(self, linear: nn.Linear) -> None:
         super().__init__()
         self.linear = linear
-        self.weight_fake_quant = WeightFakeQuantize({self})
+        attach_int8_weight_quant(self, linear)
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """Apply ``linear``, fake-quantizing the weight."""
-        weight = self.weight_fake_quant(self.linear.weight)
+        weight = maybe_quantize_weight(self, self.linear.weight)
         # pylint: disable-next=not-callable
         return F.linear(x, weight, self.linear.bias)
@@ -57,11 +104,11 @@ class FusedLinearAct(FusedModule):
         super().__init__()
         self.linear = linear
         self.act = act
-        self.weight_fake_quant = WeightFakeQuantize({self})
+        attach_int8_weight_quant(self, linear)
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """Apply ``linear → activation``, fake-quantizing the weight."""
-        weight = self.weight_fake_quant(self.linear.weight)
+        weight = maybe_quantize_weight(self, self.linear.weight)
         # pylint: disable-next=not-callable
         x = F.linear(x, weight, self.linear.bias)
         return self.act(x)

{embedl_deploy-0.2.0 → embedl_deploy-0.3.0}/src/embedl_deploy/_internal/tensorrt/patterns/conversions/attention.py RENAMED Viewed

@@ -21,7 +21,9 @@ from embedl_deploy._internal.core.pattern import (
     Fork,
     Pattern,
     PatternMatch,
+    SharedNodeCheck,
     Tree,
+    TreeMatch,
     Wildcard,
     get_module,
     node_check,
@@ -689,3 +691,201 @@ class ComposeScaledDotProductAttentionPattern(Pattern):
         # matmul pinned as a non-tree user and block erasure.
         pattern_match.graph_module.graph.eliminate_dead_code()
         return replace_tree(pattern_match, [sdpa])
+# -- Compose parallel linears into MHAInProjection ----------------------
+def _is_transpose_1_2(node: fx.Node) -> bool:
+    """Return ``True`` for ``tensor.transpose(1, 2)``."""
+    if node.op != "call_method" or node.target != "transpose":
+        return False
+    non_node = [a for a in node.args if not isinstance(a, fx.Node)]
+    return set(non_node) == {1, 2}
+@node_check
+def _is_sdpa_module(node: fx.Node) -> bool:
+    """Return ``True`` for a ``ScaledDotProductAttention`` call_module node."""
+    return isinstance(get_module(node), ScaledDotProductAttention)
+#: Shared across the three Q/K/V branches so they are all constrained to
+#: the same physical source tensor.  Re-using one instance means the
+#: first branch to run caches the source node and the other two succeed
+#: only when they see that exact node.
+_parallel_linears_shared_input = SharedNodeCheck(lambda _: True)
+#: One of the three Q/K/V projection branches: the ``transpose(1, 2)``
+#: tail, a ``view``/``reshape`` with two shape arguments, and the
+#: ``nn.Linear`` whose input is constrained by
+#: :data:`_parallel_linears_shared_input`.
+_parallel_linears_branch: Tree = Fork(
+    inputs=(
+        (_parallel_linears_shared_input, nn.Linear),
+        (),
+        (),
+    ),
+    operator=_is_view_or_reshape,
+    output=(_is_transpose_1_2,),
+)
+def _get_parallel_linears_insert(
+    shared_input: fx.Node,
+    in_proj: MHAInProjection,
+    sdpa_node: fx.Node,
+) -> ReplacementFn:
+    """Return a replacement that inserts ``MHAInProjection → getitem×3`` and
+    rewires the existing SDPA to consume the Q/K/V getitems.
+    The three old ``transpose → view → nn.Linear`` input chains are part
+    of the matched tree and erased by
+    :func:`~embedl_deploy._internal.core.replace.replace_tree` once the
+    SDPA's args are rewired.
+    """
+    def _insert(
+        graph_module: fx.GraphModule,
+        prev_args: tuple[fx.Node, ...],
+    ) -> list[fx.Node]:
+        del prev_args  # inputs are derived from the shared pre-linear tensor
+        replaced = get_replaced_nodes(graph_module)
+        resolved_input = replaced.get(shared_input, shared_input)
+        graph = graph_module.graph
+        ip_name = get_auto_name(graph_module, in_proj)
+        graph_module.add_module(ip_name, in_proj)
+        with graph.inserting_after(resolved_input):
+            ip_node = graph.call_module(
+                ip_name,
+                (resolved_input, resolved_input, resolved_input),
+            )
+        gis: list[fx.Node] = []
+        prev = ip_node
+        for i in range(3):
+            with graph.inserting_after(prev):
+                gi = graph.call_function(
+                    operator.getitem,
+                    (ip_node, i),
+                )
+            gis.append(gi)
+            prev = gi
+        sdpa_node.args = tuple(gis)
+        return [resolved_input, ip_node, *gis, sdpa_node]
+    return _insert
+def _branch_linear(tree_match: TreeMatch, branch: int) -> nn.Linear:
+    """Return the ``nn.Linear`` module matched in the *branch*-th Q/K/V arm."""
+    linear_node = tree_match.get_node(branch, 0, 1)
+    return resolve_module(linear_node, nn.Linear)
+class ComposeParallelLinearsPattern(Pattern):
+    """Compose three parallel ``nn.Linear`` Q/K/V into ``MHAInProjection``.
+    Matches a
+    :class:`~embedl_deploy._internal.tensorrt.modules.attention.ScaledDotProductAttention`
+    node whose three inputs each trace back through
+    ``transpose(1, 2) → view → nn.Linear`` from the same source tensor.
+    The three branches are tied to a single source node by a
+    :class:`~embedl_deploy._internal.core.pattern.SharedNodeCheck` shared
+    across their data sub-trunks.
+    Packs the three separate linear weights into a single
+    ``nn.Linear(embed_dim, 3 * embed_dim)`` and wraps it in an
+    :class:`~embedl_deploy._internal.tensorrt.modules.attention.MHAInProjection`.
+    Depends on
+    :class:`ComposeScaledDotProductAttentionPattern` having run
+    first (handled automatically by the iterative conversion loop).
+    """
+    is_conversion = True
+    tree: Tree = Fork(
+        inputs=(
+            _parallel_linears_branch,
+            _parallel_linears_branch,
+            _parallel_linears_branch,
+        ),
+        operator=_is_sdpa_module,
+        output=(),
+    )
+    def match(
+        self,
+        graph_module: fx.GraphModule,
+    ) -> list[PatternMatch]:
+        matches = match_tree(graph_module, pattern=self)
+        return [m for m in matches if self._linears_compatible(m)]
+    @staticmethod
+    def _linears_compatible(pattern_match: PatternMatch) -> bool:
+        """Return ``True`` when all three matched Linears are shape-compatible.
+        Required for weight packing: shape/bias constraints can't be
+        expressed in the tree grammar, so they are checked here to
+        reject otherwise-structural matches before replacement runs.
+        """
+        first = _branch_linear(pattern_match.tree_match, 0)
+        for i in (1, 2):
+            lin = _branch_linear(pattern_match.tree_match, i)
+            if lin.in_features != first.in_features:
+                return False
+            if lin.out_features != first.out_features:
+                return False
+            if (lin.bias is None) != (first.bias is None):
+                return False
+        return True
+    def replace(
+        self,
+        pattern_match: PatternMatch,
+    ) -> list[fx.Node]:
+        assert pattern_match.pattern is self
+        tree_match = pattern_match.tree_match
+        sdpa_node = tree_match.pre_trunk_nodes[0]
+        sdpa_mod = resolve_module(sdpa_node, ScaledDotProductAttention)
+        num_heads = sdpa_mod.num_heads
+        head_dim = sdpa_mod.head_dim
+        embed_dim = num_heads * head_dim
+        shared_input = tree_match.get_node(0, 0, 0)
+        q_lin = _branch_linear(tree_match, 0)
+        k_lin = _branch_linear(tree_match, 1)
+        v_lin = _branch_linear(tree_match, 2)
+        has_bias = q_lin.bias is not None
+        packed = nn.Linear(embed_dim, 3 * embed_dim, bias=has_bias)
+        packed.weight = nn.Parameter(
+            torch.cat(
+                [
+                    q_lin.weight,
+                    k_lin.weight,
+                    v_lin.weight,
+                ],
+                dim=0,
+            )
+        )
+        if has_bias:
+            packed.bias = nn.Parameter(
+                torch.cat(
+                    [
+                        q_lin.bias,
+                        k_lin.bias,
+                        v_lin.bias,  # type: ignore[arg-type]
+                    ],
+                    dim=0,
+                )
+            )
+        in_proj = MHAInProjection(packed, num_heads, head_dim)
+        return replace_tree(
+            pattern_match,
+            [_get_parallel_linears_insert(shared_input, in_proj, sdpa_node)],
+        )

{embedl_deploy-0.2.0 → embedl_deploy-0.3.0}/src/embedl_deploy/_internal/tensorrt/patterns/conversions/general.py RENAMED Viewed

@@ -101,6 +101,62 @@ class RemoveDeadAssertPattern(Pattern):
         return replace_tree(pattern_match, [])
+def _is_export_assert(node: fx.Node) -> bool:
+    """Return ``True`` for ``torch.export`` device/dtype guard nodes.
+    ``torch.export`` inserts ``aten._assert_tensor_metadata.default`` and
+    ``aten._assert_async.msg`` nodes to enforce the device, dtype, and layout
+    of tensors at the export site.  These guards always fail when the model is
+    moved to a different device (e.g. CPU export → CUDA inference) and must be
+    removed for deployment.
+    Both ops return ``None`` and have no downstream users, so removal is safe.
+    """
+    if node.op != "call_function":
+        return False
+    target = node.target
+    _metadata = getattr(
+        getattr(torch.ops, "aten", None),
+        "_assert_tensor_metadata",
+        None,
+    )
+    _async = getattr(
+        getattr(torch.ops, "aten", None),
+        "_assert_async",
+        None,
+    )
+    if _metadata is not None and target is getattr(_metadata, "default", None):
+        return True
+    if _async is not None and target is getattr(_async, "msg", None):
+        return True
+    return False
+class RemoveExportAssertPattern(Pattern):
+    """Remove ``torch.export`` tensor-metadata guard nodes.
+    ``torch.export`` inserts ``aten._assert_tensor_metadata`` calls to enforce
+    the device/dtype/layout of inputs at export time.  These guards always
+    raise when the model is run on a device other than the one used during
+    export (e.g. CPU-exported model deployed on CUDA).
+    Unlike :class:`RemoveAssertPattern` this pattern is not
+    ``symbolic_trace_only`` — it targets ``torch.export`` graph modules
+    specifically.
+    """
+    is_conversion = True
+    export_graph_only = True
+    tree: Tree = (_is_export_assert,)
+    def match(self, graph_module: fx.GraphModule) -> list[PatternMatch]:
+        return match_tree(graph_module, pattern=self)
+    def replace(self, pattern_match: PatternMatch) -> list[fx.Node]:
+        assert pattern_match.pattern is self
+        return replace_tree(pattern_match, [])
 def _is_flatten(node: fx.Node) -> bool:
     """Return ``True`` when `node` is a flatten call with shape metadata."""
     if node.op == "call_function":

{embedl_deploy-0.2.0 → embedl_deploy-0.3.0}/src/embedl_deploy/_internal/tensorrt/patterns/quantizations.py RENAMED Viewed

@@ -38,6 +38,23 @@ from embedl_deploy._internal.tensorrt.modules.pool import (
 from embedl_deploy._internal.tensorrt.modules.swin_attention import (
     FusedSwinAttention,
 )
+from embedl_deploy._internal.tensorrt.patterns.utils import get_input_shape
+#: Head sizes for which TensorRT has a fused INT8 MHA kernel.
+INT8_MHA_HEAD_SIZES: frozenset[int] = frozenset({16, 32, 64})
+#: Maximum sequence length supported by the fused INT8 MHA kernel.
+#:
+#: TensorRT's fused INT8 multi-head attention kernel (SM75–SM90, SM120,
+#: SM121) only supports head sizes in :data:`INT8_MHA_HEAD_SIZES` and
+#: sequence lengths at most :data:`INT8_MHA_MAX_SEQ`.  Outside those
+#: bounds, quantising the softmax output forces an unfused FP32
+#: softmax + INT8 requantise path that is slower than leaving the
+#: attention block in FP16.
+# Reference:
+# pylint: disable-next=line-too-long
+# https://docs.nvidia.com/deeplearning/tensorrt/latest/inference-library/work-with-transformers.html
+INT8_MHA_MAX_SEQ: int = 512
 def _has_quant_stubs(node: fx.Node) -> bool:
@@ -169,16 +186,36 @@ class PropagateQuantStubPattern(Pattern):
 def _needs_surround(node: fx.Node) -> bool:
-    """Return ``True`` for a surround-type ``FusedModule`` not yet surrounded."""
+    """Return ``True`` for a surround-type ``FusedModule`` not yet surrounded.
+    For :class:`FusedScaledDotProductAttention`, surrounding is skipped
+    when the head size or sequence length fall outside TensorRT's fused
+    INT8 MHA constraints (:data:`INT8_MHA_HEAD_SIZES`,
+    :data:`INT8_MHA_MAX_SEQ`).  The internal ``softmax_quant`` is also
+    disabled so the attention block stays entirely in FP16.
+    """
     mod = get_module(node)
-    return isinstance(
+    if not isinstance(
         mod,
         (
             FusedAdaptiveAvgPool2d,
             FusedScaledDotProductAttention,
             FusedSwinAttention,
         ),
-    ) and not getattr(mod, "_surrounded", False)
+    ):
+        return False
+    if getattr(mod, "_surrounded", False):
+        return False
+    if isinstance(mod, FusedScaledDotProductAttention):
+        head_dim = mod.attention.head_dim
+        shape = get_input_shape(node)
+        seq_len = shape[-2] if shape is not None and len(shape) >= 3 else None
+        if head_dim not in INT8_MHA_HEAD_SIZES or (
+            seq_len is not None and seq_len > INT8_MHA_MAX_SEQ
+        ):
+            mod.softmax_quant.enabled = False
+            return False
+    return True
 class SurroundWithQuantStubsPattern(Pattern):

embedl-deploy 0.2.0__tar.gz → 0.3.0__tar.gz

embedl-deploy 0.2.0__tar.gz → 0.3.0__tar.gz