embedl-deploy-tensorrt 0.4.0__tar.gz → 0.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/PKG-INFO +63 -32
  2. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/README.md +62 -30
  3. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/pyproject.toml +1 -1
  4. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/patterns/conversions/attention.py +1 -1
  5. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/patterns/conversions/general.py +17 -2
  6. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/version/public.py +1 -1
  7. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/LICENSE +0 -0
  8. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/MANIFEST.in +0 -0
  9. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/NOTICE +0 -0
  10. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/setup.cfg +0 -0
  11. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/__init__.py +0 -0
  12. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/__init__.py +0 -0
  13. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/__init__.py +0 -0
  14. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/backend.py +0 -0
  15. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/modules/__init__.py +0 -0
  16. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/modules/attention.py +0 -0
  17. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/modules/conv.py +0 -0
  18. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/modules/linear.py +0 -0
  19. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/modules/pointwise.py +0 -0
  20. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/modules/pool.py +0 -0
  21. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/modules/swin_attention.py +0 -0
  22. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/patterns/__init__.py +0 -0
  23. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/patterns/conversions/__init__.py +0 -0
  24. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/patterns/fusions.py +0 -0
  25. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/patterns/quantizations.py +0 -0
  26. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/patterns/recompositions.py +0 -0
  27. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/patterns/smoothings.py +0 -0
  28. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/patterns/utils.py +0 -0
  29. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/plan.py +0 -0
  30. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/tensorrt/__init__.py +0 -0
  31. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/tensorrt/modules/__init__.py +0 -0
  32. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/tensorrt/patterns/__init__.py +0 -0
  33. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/version/__init__.py +0 -0
  34. {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy_tensorrt.egg-info/SOURCES.txt +0 -0
--- embedl_deploy_tensorrt-0.4.0/PKG-INFO
+++ embedl_deploy_tensorrt-0.4.1/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: embedl-deploy-tensorrt
-Version: 0.4.0
+Version: 0.4.1
 Summary: TensorRT backend for embedl-deploy.
 Author-email: Embedl AB <support@embedl.com>
 Project-URL: Homepage, https://www.embedl.com/
@@ -13,7 +13,6 @@ Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
 License-File: NOTICE
-Requires-Dist: tensorrt
 Provides-Extra: core
 Requires-Dist: embedl-deploy; extra == "core"
 Dynamic: license-file
@@ -55,16 +54,16 @@ hardware target ensuring correct quantization and compilation.
 
 ## Supported Backends
 
-| Backend             | Status      |
-|---------------------|-------------|
-| NVIDIA TensorRT     | Supported   |
+| Backend                 | Status      |
+|-------------------------|-------------|
+| NVIDIA TensorRT (v10.3) | Supported   |
 
-Contact us for other backends.
+Contact Embedl for other backends.
 
 ## Installation
 
 ```bash
-pip install embedl-deploy
+pip install "embedl-deploy[tensorrt]"
 ```
 Note that you may need to also install `onnx` and `onnx-simplifier` to export
 and get the exported model compiled with TensorRT if using ONNX as an
@@ -86,6 +85,9 @@ model = Model().eval()
 example_input = torch.randn(1, 3, 224, 224)
 
 # 2. Transform — fuse and optimize for TensorRT in one call
+# For more compatibility, you can trace your model with torch.export.export
+# as follows:
+# model = torch.export.export(model, (example_input,)).module()
 res = transform(model, patterns=TENSORRT_PATTERNS)
 print("Model\n", res.model.print_readable())
 print("Matches", "\n".join([str(match) for match in res.matches]))
@@ -112,28 +114,54 @@ torch.onnx.export(
 qat_model = quantized_model.train()
 # Freeze BatchNorm, or apply other QAT utilities as needed
 # train(qat_model)
+```
+
+### Compile
+
+Compilation can be done with TensorRT's trtexec tool, which can take the ONNX
+model and compile it for inference. The exported layer info and profile can
+be used for debugging, optimization and visualization.
+
+Note that the ONNX model might need to be simplified with onnx-simplifier for
+trtexec to compile it. Dynamo-exported models may have compilation issues,
+so it's recommended to export with dynamo=False.
+
+```bash
+onnxsim model.onnx model.onnx
+/usr/src/tensorrt/bin/trtexec --onnx=model.onnx --fp16 --int8 --useCudaGraph
+```
+
+Optionally, you can get the layer profile with the following flags:
+```
+--exportLayerInfo=layer_info.json
+--exportProfile=profile.json
+--profilingVerbosity=detailed
+```
 
-# Compile
-# -------
-# Compilation can be done with TensorRT's trtexec tool, which can take the ONNX
-# model and compile it for inference. The exported layer info and profile can
-# be used for debugging, optimization and visualization.
-#
-# Note: that the ONNX model might need to be simplified with onnx-simplifier to
-# make trtexec compile it. Dynamo exported models may have compilation issues,
-# so it's recommended to export with dynamo=False.
-#
-# We are working on a Aten-based export path that should be more robust and
-# support more models in the future.
-
-# >> onnxsim model.onnx model.onnx
-# >> trtexec \
-#    --onnx=model.onnx \
-#    --exportLayerInfo=layer_info.json \
-#    --exportProfile=profile.json \
-#    --profilingVerbosity=detailed
-
-# More benchmarking scripts can be found in the examples/ directory
+## Mixed Precision
+
+To keep a specific layer in higher precision while quantizing the rest to INT8,
+pass its `nn.Conv2d` instance to `ModulesToSkip` after `transform`. Note that
+`torch.fx.GraphModule` deep-copies submodules during tracing, so you must take
+the reference **from the fused graph**, not from the original model:
+
+```python
+from embedl_deploy.quantize import quantize, QuantConfig, ModulesToSkip
+
+res = transform(model, patterns=TENSORRT_PATTERNS)
+
+# Grab the conv instance from the fused graph (not from the original model)
+first_conv = res.model.FusedConvBNActMaxPool_0.conv
+
+config = QuantConfig(
+    skip=ModulesToSkip(
+        stub={first_conv},    # disables input activation quantization
+        weight={first_conv},  # disables weight fake-quantization
+    )
+)
+quantized_model = quantize(
+    res.model, (example_input,), config=config, forward_loop=calibration_loop
+)
 ```
 
 ## Design Principles
@@ -150,10 +178,13 @@ qat_model = quantized_model.train()
    `transform()` is a convenience for the common case where you want
    everything applied.
 
-3. **FX-graph-based.**
-   All graph analysis and surgery uses `torch.fx`. Models are traced once
-   and manipulated as `fx.GraphModule` objects. Support for Aten graphs
-   produced by `torch.export.export` is planned for the future.
+3. **Graph-based models (torch.export.export and symbolic traced).**
+   All graph analysis and surgery uses traced graphs. Models are traced once
+   and manipulated as `fx.GraphModule` objects, with support for tracing via
+   both `torch.fx` (symbolic) and `torch.export.export` (Aten). Support for
+   Aten graphs is enabled automatically by recomposition patterns that
+   compose Aten operations into equivalent `torch.nn` modules before
+   conversions and fusions.
 
 ## Support
 
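The `--exportProfile` output referenced in the new Compile section is plain JSON, so it can be post-processed directly. A minimal sketch of inspecting it, assuming trtexec's usual per-layer schema (exact field names can vary across TensorRT versions):

```python
import json

# Load the profile written by: trtexec ... --exportProfile=profile.json
with open("profile.json") as f:
    entries = json.load(f)

# The file is a list; per-layer entries carry name and timing fields.
for entry in entries:
    if isinstance(entry, dict) and "name" in entry:
        print(f'{entry["name"]}: {entry.get("averageMs", "?")} ms')
```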
--- embedl_deploy_tensorrt-0.4.0/README.md
+++ embedl_deploy_tensorrt-0.4.1/README.md
@@ -35,16 +35,16 @@ hardware target ensuring correct quantization and compilation.
 
 ## Supported Backends
 
-| Backend             | Status      |
-|---------------------|-------------|
-| NVIDIA TensorRT     | Supported   |
+| Backend                 | Status      |
+|-------------------------|-------------|
+| NVIDIA TensorRT (v10.3) | Supported   |
 
-Contact us for other backends.
+Contact Embedl for other backends.
 
 ## Installation
 
 ```bash
-pip install embedl-deploy
+pip install "embedl-deploy[tensorrt]"
 ```
 Note that you may need to also install `onnx` and `onnx-simplifier` to export
 and get the exported model compiled with TensorRT if using ONNX as an
@@ -66,6 +66,9 @@ model = Model().eval()
 example_input = torch.randn(1, 3, 224, 224)
 
 # 2. Transform — fuse and optimize for TensorRT in one call
+# For more compatibility, you can trace your model with torch.export.export
+# as follows:
+# model = torch.export.export(model, (example_input,)).module()
 res = transform(model, patterns=TENSORRT_PATTERNS)
 print("Model\n", res.model.print_readable())
 print("Matches", "\n".join([str(match) for match in res.matches]))
@@ -92,28 +95,54 @@ torch.onnx.export(
 qat_model = quantized_model.train()
 # Freeze BatchNorm, or apply other QAT utilities as needed
 # train(qat_model)
+```
+
+### Compile
+
+Compilation can be done with TensorRT's trtexec tool, which can take the ONNX
+model and compile it for inference. The exported layer info and profile can
+be used for debugging, optimization and visualization.
+
+Note that the ONNX model might need to be simplified with onnx-simplifier for
+trtexec to compile it. Dynamo-exported models may have compilation issues,
+so it's recommended to export with dynamo=False.
+
+```bash
+onnxsim model.onnx model.onnx
+/usr/src/tensorrt/bin/trtexec --onnx=model.onnx --fp16 --int8 --useCudaGraph
+```
+
+Optionally, you can get the layer profile with the following flags:
+```
+--exportLayerInfo=layer_info.json
+--exportProfile=profile.json
+--profilingVerbosity=detailed
+```
 
-# Compile
-# -------
-# Compilation can be done with TensorRT's trtexec tool, which can take the ONNX
-# model and compile it for inference. The exported layer info and profile can
-# be used for debugging, optimization and visualization.
-#
-# Note: that the ONNX model might need to be simplified with onnx-simplifier to
-# make trtexec compile it. Dynamo exported models may have compilation issues,
-# so it's recommended to export with dynamo=False.
-#
-# We are working on a Aten-based export path that should be more robust and
-# support more models in the future.
-
-# >> onnxsim model.onnx model.onnx
-# >> trtexec \
-#    --onnx=model.onnx \
-#    --exportLayerInfo=layer_info.json \
-#    --exportProfile=profile.json \
-#    --profilingVerbosity=detailed
-
-# More benchmarking scripts can be found in the examples/ directory
+## Mixed Precision
+
+To keep a specific layer in higher precision while quantizing the rest to INT8,
+pass its `nn.Conv2d` instance to `ModulesToSkip` after `transform`. Note that
+`torch.fx.GraphModule` deep-copies submodules during tracing, so you must take
+the reference **from the fused graph**, not from the original model:
+
+```python
+from embedl_deploy.quantize import quantize, QuantConfig, ModulesToSkip
+
+res = transform(model, patterns=TENSORRT_PATTERNS)
+
+# Grab the conv instance from the fused graph (not from the original model)
+first_conv = res.model.FusedConvBNActMaxPool_0.conv
+
+config = QuantConfig(
+    skip=ModulesToSkip(
+        stub={first_conv},    # disables input activation quantization
+        weight={first_conv},  # disables weight fake-quantization
+    )
+)
+quantized_model = quantize(
+    res.model, (example_input,), config=config, forward_loop=calibration_loop
+)
 ```
 
 ## Design Principles
@@ -130,10 +159,13 @@ qat_model = quantized_model.train()
    `transform()` is a convenience for the common case where you want
    everything applied.
 
-3. **FX-graph-based.**
-   All graph analysis and surgery uses `torch.fx`. Models are traced once
-   and manipulated as `fx.GraphModule` objects. Support for Aten graphs
-   produced by `torch.export.export` is planned for the future.
+3. **Graph-based models (torch.export.export and symbolic traced).**
+   All graph analysis and surgery uses traced graphs. Models are traced once
+   and manipulated as `fx.GraphModule` objects, with support for tracing via
+   both `torch.fx` (symbolic) and `torch.export.export` (Aten). Support for
+   Aten graphs is enabled automatically by recomposition patterns that
+   compose Aten operations into equivalent `torch.nn` modules before
+   conversions and fusions.
 
 ## Support
 
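The new README comment points at `torch.export.export` as the more compatible tracing path. A minimal end-to-end sketch of that path, using a hypothetical stand-in for the README's `Model` (note that the example inputs must be passed as a tuple):

```python
import torch
import torch.nn as nn

# Hypothetical stand-in for the README's Model
model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.ReLU()).eval()
example_input = torch.randn(1, 3, 224, 224)

# Aten-based tracing: torch.export.export takes a *tuple* of example
# inputs, and .module() recovers an fx.GraphModule for transform()
exported = torch.export.export(model, (example_input,))
graph_module = exported.module()
# res = transform(graph_module, patterns=TENSORRT_PATTERNS)
```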
--- embedl_deploy_tensorrt-0.4.0/pyproject.toml
+++ embedl_deploy_tensorrt-0.4.1/pyproject.toml
@@ -24,7 +24,7 @@ license-files = [
 readme = "README.md"
 description = "TensorRT backend for embedl-deploy."
 dynamic = ["version"]
-dependencies = ["tensorrt"]
+dependencies = []
 
 [project.optional-dependencies]
 core = ["embedl-deploy"]
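With `tensorrt` dropped from the hard dependencies, the wheel now installs on machines where TensorRT is supplied by the environment (e.g. an NVIDIA base image). A sketch of the guarded-import pattern such a change typically relies on — this is an illustration, not the package's actual code, which this diff does not show:

```python
try:
    import tensorrt as trt
except ImportError:  # TensorRT not installed as a Python wheel
    trt = None

def require_tensorrt() -> None:
    # Hypothetical helper: fail loudly only when the backend is actually used
    if trt is None:
        raise RuntimeError("tensorrt is not available; see the Installation section")
```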
--- embedl_deploy_tensorrt-0.4.0/src/embedl_deploy/_internal/tensorrt/patterns/conversions/attention.py
+++ embedl_deploy_tensorrt-0.4.1/src/embedl_deploy/_internal/tensorrt/patterns/conversions/attention.py
@@ -47,7 +47,7 @@ from embedl_deploy._internal.tensorrt.modules.swin_attention import (
 )
 
 try:
-    from torchvision.models.swin_transformer import (  # type: ignore[import-untyped]
+    from torchvision.models.swin_transformer import (
         shifted_window_attention,
     )
 
--- embedl_deploy_tensorrt-0.4.0/src/embedl_deploy/_internal/tensorrt/patterns/conversions/general.py
+++ embedl_deploy_tensorrt-0.4.1/src/embedl_deploy/_internal/tensorrt/patterns/conversions/general.py
@@ -139,7 +139,13 @@ class RemoveExportAssertPattern(Pattern):
 
 
 def _is_flatten(node: fx.Node) -> bool:
-    """Return ``True`` when `node` is a flatten call with shape metadata."""
+    """Return ``True`` when `node` is a 4D→2D flatten with shape metadata.
+
+    Only matches flattens with ``start_dim=1`` on a 4-D input, which
+    produces a 2-D output (the classification-head pattern). Flattens
+    with ``start_dim >= 2`` (e.g. the MHA head-merging ``flatten(2)``
+    that produces 3-D output) are rejected.
+    """
     if node.op == "call_function":
         is_flat = node.target is torch.flatten
     elif node.op == "call_method":
@@ -149,7 +155,16 @@ def _is_flatten(node: fx.Node) -> bool:
     if not is_flat:
         return False
     shape = get_input_shape(node)
-    return shape is not None
+    if shape is None or len(shape) != 4:
+        return False
+    mod = get_module(node)
+    if isinstance(mod, nn.Flatten):
+        start_dim: int = mod.start_dim
+    elif len(node.args) > 1 and isinstance(node.args[1], int):
+        start_dim = node.args[1]
+    else:
+        start_dim = 0
+    return start_dim == 1
 
 
 ElementWiseLike: TypeAlias = (
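To make the new predicate's distinction concrete, here is a standalone sketch (shapes are illustrative) of the two flatten cases it now separates:

```python
import torch

x = torch.randn(1, 512, 7, 7)      # 4-D pre-classifier feature map
print(torch.flatten(x, 1).shape)   # torch.Size([1, 25088]) -> 2-D: matched

y = torch.randn(1, 8, 49, 32)      # (batch, heads, tokens, head_dim)
print(y.flatten(2).shape)          # torch.Size([1, 8, 1568]) -> 3-D: rejected
```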
--- embedl_deploy_tensorrt-0.4.0/src/embedl_deploy/version/public.py
+++ embedl_deploy_tensorrt-0.4.1/src/embedl_deploy/version/public.py
@@ -2,4 +2,4 @@
 
 """The hardcoded public version of the package."""
 
-PUBLIC_VERSION = '0.4.0'
+PUBLIC_VERSION = '0.4.1'
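Per the `public.py` hunk, the installed version can be sanity-checked directly, assuming the import path implied by the file layout shown in this diff:

```python
from embedl_deploy.version.public import PUBLIC_VERSION

assert PUBLIC_VERSION == "0.4.1"
```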