embedl-deploy-tensorrt 0.4.0__tar.gz → 0.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/PKG-INFO +63 -32
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/README.md +62 -30
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/pyproject.toml +1 -1
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/patterns/conversions/attention.py +1 -1
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/patterns/conversions/general.py +17 -2
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/version/public.py +1 -1
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/LICENSE +0 -0
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/MANIFEST.in +0 -0
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/NOTICE +0 -0
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/setup.cfg +0 -0
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/__init__.py +0 -0
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/__init__.py +0 -0
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/__init__.py +0 -0
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/backend.py +0 -0
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/modules/__init__.py +0 -0
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/modules/attention.py +0 -0
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/modules/conv.py +0 -0
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/modules/linear.py +0 -0
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/modules/pointwise.py +0 -0
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/modules/pool.py +0 -0
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/modules/swin_attention.py +0 -0
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/patterns/__init__.py +0 -0
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/patterns/conversions/__init__.py +0 -0
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/patterns/fusions.py +0 -0
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/patterns/quantizations.py +0 -0
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/patterns/recompositions.py +0 -0
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/patterns/smoothings.py +0 -0
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/patterns/utils.py +0 -0
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/_internal/tensorrt/plan.py +0 -0
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/tensorrt/__init__.py +0 -0
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/tensorrt/modules/__init__.py +0 -0
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/tensorrt/patterns/__init__.py +0 -0
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/version/__init__.py +0 -0
- {embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy_tensorrt.egg-info/SOURCES.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: embedl-deploy-tensorrt
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.4.1
|
|
4
4
|
Summary: TensorRT backend for embedl-deploy.
|
|
5
5
|
Author-email: Embedl AB <support@embedl.com>
|
|
6
6
|
Project-URL: Homepage, https://www.embedl.com/
|
|
@@ -13,7 +13,6 @@ Requires-Python: >=3.10
|
|
|
13
13
|
Description-Content-Type: text/markdown
|
|
14
14
|
License-File: LICENSE
|
|
15
15
|
License-File: NOTICE
|
|
16
|
-
Requires-Dist: tensorrt
|
|
17
16
|
Provides-Extra: core
|
|
18
17
|
Requires-Dist: embedl-deploy; extra == "core"
|
|
19
18
|
Dynamic: license-file
|
|
@@ -55,16 +54,16 @@ hardware target ensuring correct quantization and compilation.
|
|
|
55
54
|
|
|
56
55
|
## Supported Backends
|
|
57
56
|
|
|
58
|
-
| Backend
|
|
59
|
-
|
|
60
|
-
| NVIDIA TensorRT
|
|
57
|
+
| Backend | Status |
|
|
58
|
+
|-------------------------|-------------|
|
|
59
|
+
| NVIDIA TensorRT (v10.3) | Supported |
|
|
61
60
|
|
|
62
|
-
Contact
|
|
61
|
+
Contact Embedl for other backends.
|
|
63
62
|
|
|
64
63
|
## Installation
|
|
65
64
|
|
|
66
65
|
```bash
|
|
67
|
-
pip install embedl-deploy
|
|
66
|
+
pip install "embedl-deploy[tensorrt]"
|
|
68
67
|
```
|
|
69
68
|
Note that you may need to also install `onnx` and `onnx-simplifier` to export
|
|
70
69
|
and get the exported model compiled with TensorRT if using ONNX as an
|
|
@@ -86,6 +85,9 @@ model = Model().eval()
|
|
|
86
85
|
example_input = torch.randn(1, 3, 224, 224)
|
|
87
86
|
|
|
88
87
|
# 2. Transform — fuse and optimize for TensorRT in one call
|
|
88
|
+
# For more compatibility you can trace your model with torch.export.export
|
|
89
|
+
# as follows:
|
|
90
|
+
# model = torch.export.export(model, (example_input,)).module()
|
|
89
91
|
res = transform(model, patterns=TENSORRT_PATTERNS)
|
|
90
92
|
print("Model\n", res.model.print_readable())
|
|
91
93
|
print("Matches", "\n".join([str(match) for match in res.matches]))
|
|
@@ -112,28 +114,54 @@ torch.onnx.export(
|
|
|
112
114
|
qat_model = quantized_model.train()
|
|
113
115
|
# Freeze BatchNorm, or apply other QAT utilities as needed
|
|
114
116
|
# train(qat_model)
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
### Compile
|
|
120
|
+
|
|
121
|
+
Compilation can be done with TensorRT's trtexec tool, which can take the ONNX
|
|
122
|
+
model and compile it for inference. The exported layer info and profile can
|
|
123
|
+
be used for debugging, optimization and visualization.
|
|
124
|
+
|
|
125
|
+
Note that the ONNX model might need to be simplified with onnx-simplifier to
|
|
126
|
+
make trtexec compile it. Dynamo exported models may have compilation issues,
|
|
127
|
+
so it's recommended to export with dynamo=False.
|
|
128
|
+
|
|
129
|
+
```bash
|
|
130
|
+
onnxsim model.onnx model.onnx
|
|
131
|
+
/usr/src/tensorrt/bin/trtexec --onnx=model.onnx --fp16 --int8 --useCudaGraph
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
Optionally you can get the layer profile with the following flags:
|
|
135
|
+
```
|
|
136
|
+
--exportLayerInfo=layer_info.json
|
|
137
|
+
--exportProfile=profile.json
|
|
138
|
+
--profilingVerbosity=detailed
|
|
139
|
+
```
|
|
115
140
|
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
#
|
|
134
|
-
#
|
|
135
|
-
|
|
136
|
-
|
|
141
|
+
## Mixed Precision
|
|
142
|
+
|
|
143
|
+
To keep a specific layer in higher precision while quantizing the rest to INT8,
|
|
144
|
+
pass its `nn.Conv2d` instance to `ModulesToSkip` after `transform`. Note that
|
|
145
|
+
`torch.fx.GraphModule` deep-copies submodules during tracing, so you must take
|
|
146
|
+
the reference **from the fused graph**, not from the original model:
|
|
147
|
+
|
|
148
|
+
```python
|
|
149
|
+
from embedl_deploy.quantize import quantize, QuantConfig, ModulesToSkip
|
|
150
|
+
|
|
151
|
+
res = transform(model, patterns=TENSORRT_PATTERNS)
|
|
152
|
+
|
|
153
|
+
# Grab the conv instance from the fused graph (not from the original model)
|
|
154
|
+
first_conv = res.model.FusedConvBNActMaxPool_0.conv
|
|
155
|
+
|
|
156
|
+
config = QuantConfig(
|
|
157
|
+
skip=ModulesToSkip(
|
|
158
|
+
stub={first_conv}, # disables input activation quantization
|
|
159
|
+
weight={first_conv}, # disables weight fake-quantization
|
|
160
|
+
)
|
|
161
|
+
)
|
|
162
|
+
quantized_model = quantize(
|
|
163
|
+
res.model, (example_input,), config=config, forward_loop=calibration_loop
|
|
164
|
+
)
|
|
137
165
|
```
|
|
138
166
|
|
|
139
167
|
## Design Principles
|
|
@@ -150,10 +178,13 @@ qat_model = quantized_model.train()
|
|
|
150
178
|
`transform()` is a convenience for the common case where you want
|
|
151
179
|
everything applied.
|
|
152
180
|
|
|
153
|
-
3. **
|
|
154
|
-
All graph analysis and surgery uses
|
|
155
|
-
and manipulated as `fx.GraphModule` objects
|
|
156
|
-
|
|
181
|
+
3. **Graph-based models (torch.export.export and symbolic traced).**
|
|
182
|
+
All graph analysis and surgery uses traced graphs. Models are traced once
|
|
183
|
+
and manipulated as `fx.GraphModule` objects with support for tracing via both
|
|
184
|
+
`torch.fx` (symbolic) as well as `torch.export.export` (Aten). Support for
|
|
185
|
+
Aten graphs is automatically enabled using Aten recomposition
|
|
186
|
+
patterns that compose Aten operations into equivalent `torch.nn` modules
|
|
187
|
+
automatically before conversions and fusions.
|
|
157
188
|
|
|
158
189
|
## Support
|
|
159
190
|
|
|
@@ -35,16 +35,16 @@ hardware target ensuring correct quantization and compilation.
|
|
|
35
35
|
|
|
36
36
|
## Supported Backends
|
|
37
37
|
|
|
38
|
-
| Backend
|
|
39
|
-
|
|
40
|
-
| NVIDIA TensorRT
|
|
38
|
+
| Backend | Status |
|
|
39
|
+
|-------------------------|-------------|
|
|
40
|
+
| NVIDIA TensorRT (v10.3) | Supported |
|
|
41
41
|
|
|
42
|
-
Contact
|
|
42
|
+
Contact Embedl for other backends.
|
|
43
43
|
|
|
44
44
|
## Installation
|
|
45
45
|
|
|
46
46
|
```bash
|
|
47
|
-
pip install embedl-deploy
|
|
47
|
+
pip install "embedl-deploy[tensorrt]"
|
|
48
48
|
```
|
|
49
49
|
Note that you may need to also install `onnx` and `onnx-simplifier` to export
|
|
50
50
|
and get the exported model compiled with TensorRT if using ONNX as an
|
|
@@ -66,6 +66,9 @@ model = Model().eval()
|
|
|
66
66
|
example_input = torch.randn(1, 3, 224, 224)
|
|
67
67
|
|
|
68
68
|
# 2. Transform — fuse and optimize for TensorRT in one call
|
|
69
|
+
# For more compatibility you can trace your model with torch.export.export
|
|
70
|
+
# as follows:
|
|
71
|
+
# model = torch.export.export(model, (example_input,)).module()
|
|
69
72
|
res = transform(model, patterns=TENSORRT_PATTERNS)
|
|
70
73
|
print("Model\n", res.model.print_readable())
|
|
71
74
|
print("Matches", "\n".join([str(match) for match in res.matches]))
|
|
@@ -92,28 +95,54 @@ torch.onnx.export(
|
|
|
92
95
|
qat_model = quantized_model.train()
|
|
93
96
|
# Freeze BatchNorm, or apply other QAT utilities as needed
|
|
94
97
|
# train(qat_model)
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### Compile
|
|
101
|
+
|
|
102
|
+
Compilation can be done with TensorRT's trtexec tool, which can take the ONNX
|
|
103
|
+
model and compile it for inference. The exported layer info and profile can
|
|
104
|
+
be used for debugging, optimization and visualization.
|
|
105
|
+
|
|
106
|
+
Note that the ONNX model might need to be simplified with onnx-simplifier to
|
|
107
|
+
make trtexec compile it. Dynamo exported models may have compilation issues,
|
|
108
|
+
so it's recommended to export with dynamo=False.
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
onnxsim model.onnx model.onnx
|
|
112
|
+
/usr/src/tensorrt/bin/trtexec --onnx=model.onnx --fp16 --int8 --useCudaGraph
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
Optionally you can get the layer profile with the following flags:
|
|
116
|
+
```
|
|
117
|
+
--exportLayerInfo=layer_info.json
|
|
118
|
+
--exportProfile=profile.json
|
|
119
|
+
--profilingVerbosity=detailed
|
|
120
|
+
```
|
|
95
121
|
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
#
|
|
114
|
-
#
|
|
115
|
-
|
|
116
|
-
|
|
122
|
+
## Mixed Precision
|
|
123
|
+
|
|
124
|
+
To keep a specific layer in higher precision while quantizing the rest to INT8,
|
|
125
|
+
pass its `nn.Conv2d` instance to `ModulesToSkip` after `transform`. Note that
|
|
126
|
+
`torch.fx.GraphModule` deep-copies submodules during tracing, so you must take
|
|
127
|
+
the reference **from the fused graph**, not from the original model:
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
from embedl_deploy.quantize import quantize, QuantConfig, ModulesToSkip
|
|
131
|
+
|
|
132
|
+
res = transform(model, patterns=TENSORRT_PATTERNS)
|
|
133
|
+
|
|
134
|
+
# Grab the conv instance from the fused graph (not from the original model)
|
|
135
|
+
first_conv = res.model.FusedConvBNActMaxPool_0.conv
|
|
136
|
+
|
|
137
|
+
config = QuantConfig(
|
|
138
|
+
skip=ModulesToSkip(
|
|
139
|
+
stub={first_conv}, # disables input activation quantization
|
|
140
|
+
weight={first_conv}, # disables weight fake-quantization
|
|
141
|
+
)
|
|
142
|
+
)
|
|
143
|
+
quantized_model = quantize(
|
|
144
|
+
res.model, (example_input,), config=config, forward_loop=calibration_loop
|
|
145
|
+
)
|
|
117
146
|
```
|
|
118
147
|
|
|
119
148
|
## Design Principles
|
|
@@ -130,10 +159,13 @@ qat_model = quantized_model.train()
|
|
|
130
159
|
`transform()` is a convenience for the common case where you want
|
|
131
160
|
everything applied.
|
|
132
161
|
|
|
133
|
-
3. **
|
|
134
|
-
All graph analysis and surgery uses
|
|
135
|
-
and manipulated as `fx.GraphModule` objects
|
|
136
|
-
|
|
162
|
+
3. **Graph-based models (torch.export.export and symbolic traced).**
|
|
163
|
+
All graph analysis and surgery uses traced graphs. Models are traced once
|
|
164
|
+
and manipulated as `fx.GraphModule` objects with support for tracing via both
|
|
165
|
+
`torch.fx` (symbolic) as well as `torch.export.export` (Aten). Support for
|
|
166
|
+
Aten graphs is automatically enabled using Aten recomposition
|
|
167
|
+
patterns that compose Aten operations into equivalent `torch.nn` modules
|
|
168
|
+
automatically before conversions and fusions.
|
|
137
169
|
|
|
138
170
|
## Support
|
|
139
171
|
|
|
@@ -139,7 +139,13 @@ class RemoveExportAssertPattern(Pattern):
|
|
|
139
139
|
|
|
140
140
|
|
|
141
141
|
def _is_flatten(node: fx.Node) -> bool:
|
|
142
|
-
"""Return ``True`` when `node` is a flatten
|
|
142
|
+
"""Return ``True`` when `node` is a 4D→2D flatten with shape metadata.
|
|
143
|
+
|
|
144
|
+
Only matches flattens with ``start_dim=1`` on a 4-D input, which
|
|
145
|
+
produces a 2-D output (the classification-head pattern). Flattens
|
|
146
|
+
with ``start_dim >= 2`` (e.g. the MHA head-merging ``flatten(2)``
|
|
147
|
+
that produces 3-D output) are rejected.
|
|
148
|
+
"""
|
|
143
149
|
if node.op == "call_function":
|
|
144
150
|
is_flat = node.target is torch.flatten
|
|
145
151
|
elif node.op == "call_method":
|
|
@@ -149,7 +155,16 @@ def _is_flatten(node: fx.Node) -> bool:
|
|
|
149
155
|
if not is_flat:
|
|
150
156
|
return False
|
|
151
157
|
shape = get_input_shape(node)
|
|
152
|
-
|
|
158
|
+
if shape is None or len(shape) != 4:
|
|
159
|
+
return False
|
|
160
|
+
mod = get_module(node)
|
|
161
|
+
if isinstance(mod, nn.Flatten):
|
|
162
|
+
start_dim: int = mod.start_dim
|
|
163
|
+
elif len(node.args) > 1 and isinstance(node.args[1], int):
|
|
164
|
+
start_dim = node.args[1]
|
|
165
|
+
else:
|
|
166
|
+
start_dim = 0
|
|
167
|
+
return start_dim == 1
|
|
153
168
|
|
|
154
169
|
|
|
155
170
|
ElementWiseLike: TypeAlias = (
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/tensorrt/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{embedl_deploy_tensorrt-0.4.0 → embedl_deploy_tensorrt-0.4.1}/src/embedl_deploy/version/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|