quantization_rs-0.3.0-cp313-cp313-win_amd64.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
quantization_rs-0.3.0.dist-info/METADATA
ADDED
@@ -0,0 +1,290 @@
Metadata-Version: 2.4
Name: quantization-rs
Version: 0.3.0
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: MIT License
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Rust
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Requires-Dist: numpy>=1.20.0
Requires-Dist: pytest>=7.0 ; extra == 'dev'
Requires-Dist: onnxruntime>=1.16.0 ; extra == 'dev'
Requires-Dist: onnx>=1.14.0 ; extra == 'dev'
Provides-Extra: dev
License-File: LICENSE
Summary: Neural network quantization toolkit for ONNX models
Keywords: quantization,onnx,neural-networks,machine-learning
Home-Page: https://github.com/AR-Kamal/quantize-rs
License: MIT OR Apache-2.0
Requires-Python: >=3.8
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
Project-URL: Documentation, https://github.com/AR-Kamal/quantize-rs#readme
Project-URL: Homepage, https://github.com/AR-Kamal/quantize-rs
Project-URL: Repository, https://github.com/AR-Kamal/quantize-rs

# quantize-rs (Python)

Fast, accurate neural network quantization for ONNX models. Powered by Rust.

## Features

- **INT8/INT4 quantization** with 4-8× compression
- **Activation-based calibration** with a 3× smaller accuracy drop than weight-only methods
- **DequantizeLinear QDQ pattern** for ONNX Runtime compatibility
- **Blazing fast**: Rust implementation with Python bindings

## Installation

```bash
pip install quantization-rs
```

Or build from source:

```bash
# Install Rust (if needed)
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh

# Install maturin
pip install maturin

# Build and install
maturin develop --release --features python
```

## Quick Start

### Basic Quantization

```python
import quantize_rs

# Quantize to INT8
quantize_rs.quantize(
    input_path="model.onnx",
    output_path="model_int8.onnx",
    bits=8
)

# Quantize to INT4 (aggressive compression)
quantize_rs.quantize(
    input_path="model.onnx",
    output_path="model_int4.onnx",
    bits=4,
    per_channel=True  # Better accuracy for INT4
)
```
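
For intuition, this is roughly the arithmetic behind a symmetric INT8/INT4 quantize-dequantize (QDQ) round trip. A minimal NumPy sketch of the general technique, not quantize-rs internals; the tensor and names are illustrative:

```python
import numpy as np

def qdq_roundtrip(w: np.ndarray, bits: int = 8) -> np.ndarray:
    """Symmetric per-tensor quantization followed by dequantization."""
    qmax = 2 ** (bits - 1) - 1                          # 127 for INT8, 7 for INT4
    scale = np.abs(w).max() / qmax                      # one scale for the whole tensor
    q = np.clip(np.round(w / scale), -qmax - 1, qmax)   # integer codes
    return q * scale                                    # what DequantizeLinear reconstructs

w = np.random.randn(64, 128).astype(np.float32)
for bits in (8, 4):
    err = np.abs(w - qdq_roundtrip(w, bits)).max()
    print(f"INT{bits} max round-trip error: {err:.4f}")
```

The stored model keeps the integer codes plus a scale, which is where the 4× (INT8) and 8× (INT4) size reductions come from.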

### Activation-Based Calibration

For better accuracy, use real inference data:

```python
import quantize_rs

# Option 1: With calibration data
quantize_rs.quantize_with_calibration(
    input_path="resnet18.onnx",
    output_path="resnet18_int8.onnx",
    calibration_data="calibration_samples.npy",  # Shape: [N, C, H, W]
    method="minmax"
)

# Option 2: Auto-generate random samples
quantize_rs.quantize_with_calibration(
    input_path="resnet18.onnx",
    output_path="resnet18_int8.onnx",
    num_samples=100,
    sample_shape=[3, 224, 224],  # ImageNet shape
    method="percentile"
)
```

### Model Info

```python
import quantize_rs

info = quantize_rs.model_info("model.onnx")
print(f"Name: {info.name}")
print(f"Nodes: {info.num_nodes}")
print(f"Inputs: {info.inputs}")
print(f"Outputs: {info.outputs}")
```

## API Reference

### `quantize()`

Basic weight-based quantization.

**Parameters:**
- `input_path` (str): Path to the input ONNX model
- `output_path` (str): Path where the quantized model is saved
- `bits` (int): Bit width, 4 or 8 (default: 8)
- `per_channel` (bool): Per-channel quantization (default: False)

**Returns:** None

**Example:**
```python
quantize_rs.quantize("model.onnx", "model_int8.onnx", bits=8)
```
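
`per_channel=True` computes one scale per output channel instead of one scale for the whole tensor, which matters most at INT4. A NumPy sketch of the idea; illustrative, not the crate's internal code:

```python
import numpy as np

w = np.random.randn(64, 128).astype(np.float32)  # e.g. [out_channels, in_features]

# Per-tensor: a single scale shared by every channel
per_tensor_scale = np.abs(w).max() / 127

# Per-channel: one scale per output channel (axis 0), keepdims for broadcasting
per_channel_scale = np.abs(w).max(axis=1, keepdims=True) / 127

# Channels with small weights get proportionally finer quantization steps
print(f"per-tensor: {per_tensor_scale:.4f}")
print(f"per-channel range: {per_channel_scale.min():.4f} .. {per_channel_scale.max():.4f}")
```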

---

### `quantize_with_calibration()`

Quantization with activation-based calibration, for better accuracy.

**Parameters:**
- `input_path` (str): Path to the input ONNX model
- `output_path` (str): Path where the quantized model is saved
- `calibration_data` (str | None): Path to .npy calibration data, or None for random (default: None)
- `bits` (int): Bit width, 4 or 8 (default: 8)
- `per_channel` (bool): Per-channel quantization (default: False)
- `method` (str): Calibration method: "minmax", "percentile", "entropy", or "mse" (default: "minmax")
- `num_samples` (int): Number of random samples if `calibration_data` is None (default: 100)
- `sample_shape` (list[int] | None): Shape of random samples, auto-detected if None (default: None)

**Returns:** None

**Example:**
```python
quantize_rs.quantize_with_calibration(
    "resnet18.onnx",
    "resnet18_int8.onnx",
    calibration_data="samples.npy",
    method="minmax"
)
```

**Calibration Methods:**
- **`minmax`**: Uses the observed min/max values (fast, good baseline)
- **`percentile`**: Clips at the 99.9th percentile (reduces outlier impact)
- **`entropy`**: Minimizes KL divergence (best for CNN activations)
- **`mse`**: Minimizes mean squared error (best for Transformers)
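
The practical difference between the first two is how the clipping range is chosen. A small NumPy sketch of the underlying idea, using synthetic heavy-tailed activations as a stand-in for real calibration data:

```python
import numpy as np

# Heavy-tailed "activations": a few large outliers dominate the range
acts = np.random.laplace(0.0, 1.0, size=100_000).astype(np.float32)

# minmax: the range must cover every observed value, outliers included
minmax_scale = np.abs(acts).max() / 127

# percentile: clip at the 99.9th percentile of |x|, sacrificing rare outliers
clip = np.percentile(np.abs(acts), 99.9)
percentile_scale = clip / 127

# The smaller percentile scale means finer resolution for typical values
print(f"minmax scale:     {minmax_scale:.5f}")
print(f"percentile scale: {percentile_scale:.5f}")
```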

---

### `model_info()`

Get model metadata.

**Parameters:**
- `input_path` (str): Path to the ONNX model

**Returns:** `ModelInfo` object with fields:
- `name` (str): Model name
- `version` (int): ONNX opset version
- `num_nodes` (int): Number of computation nodes
- `inputs` (list[str]): Input tensor names and shapes
- `outputs` (list[str]): Output tensor names and shapes

**Example:**
```python
info = quantize_rs.model_info("model.onnx")
print(f"{info.name}: {info.num_nodes} nodes")
```

## Performance

Benchmarks on ResNet-18 (ImageNet):

| Method | Accuracy | Compression | Speedup |
|--------|----------|-------------|---------|
| FP32 (baseline) | 69.76% | 1.0× | 1.0× |
| INT8 (weight-only) | 69.52% | 4.0× | 2.8× |
| INT8 (calibrated) | 69.68% | 4.0× | 2.8× |
| INT4 (calibrated) | 68.94% | 8.0× | 3.2× |

**Activation-based calibration cuts the INT8 accuracy drop by 3× compared to weight-only quantization** (0.08% vs 0.24% below the FP32 baseline).

## Preparing Calibration Data

For best results, use ~100 representative samples from your validation set:

```python
import numpy as np
import onnxruntime as ort
import quantize_rs

# Inspect the model's expected input name and shape
session = ort.InferenceSession("model.onnx")
model_input = session.get_inputs()[0]
print(model_input.name, model_input.shape)  # e.g. [N, 3, 224, 224]

# Collect samples from your validation set.
# `validation_dataset` and `preprocess` are placeholders for your own pipeline;
# each preprocessed sample must match the input shape printed above.
samples = []
for img in validation_dataset[:100]:
    preprocessed = preprocess(img)
    samples.append(preprocessed)

# Stack into a single [N, C, H, W] array and save
calibration_data = np.stack(samples)
np.save("calibration_samples.npy", calibration_data)

# Use it for quantization
quantize_rs.quantize_with_calibration(
    "model.onnx",
    "model_int8.onnx",
    calibration_data="calibration_samples.npy"
)
```

## Integration with ONNX Runtime

```python
import onnxruntime as ort
import numpy as np

# Load the quantized model
session = ort.InferenceSession("model_int8.onnx")

# Run inference (same API as FP32). A random tensor stands in here for
# your real preprocessed input; resolve any dynamic dimensions to 1.
model_input = session.get_inputs()[0]
shape = [d if isinstance(d, int) else 1 for d in model_input.shape]
x = np.random.rand(*shape).astype(np.float32)

output = session.run(None, {model_input.name: x})
```
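
A quick way to sanity-check a quantized model is to compare its outputs against the FP32 original on the same input. A minimal sketch, assuming both files sit side by side and share a single input:

```python
import numpy as np
import onnxruntime as ort

fp32 = ort.InferenceSession("model.onnx")
int8 = ort.InferenceSession("model_int8.onnx")

model_input = fp32.get_inputs()[0]
shape = [d if isinstance(d, int) else 1 for d in model_input.shape]
x = np.random.rand(*shape).astype(np.float32)

y_fp32 = fp32.run(None, {model_input.name: x})[0]
y_int8 = int8.run(None, {model_input.name: x})[0]
print(f"max abs difference: {np.abs(y_fp32 - y_int8).max():.4f}")
```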

## FAQ

**Q: Which bit width should I use?**
A: Start with INT8 for maximum compatibility. Use INT4 if you need aggressive compression and can tolerate a 0.5-1% accuracy drop.
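
To make the trade-off concrete, here is the back-of-the-envelope size arithmetic for ResNet-18 (roughly 11.7M parameters; the count is an approximation, not a figure measured from this package):

```python
params = 11_700_000                  # approximate ResNet-18 parameter count
for bits, label in ((32, "FP32"), (8, "INT8"), (4, "INT4")):
    mib = params * bits / 8 / 2**20  # bits per weight -> bytes -> MiB, ignoring overhead
    print(f"{label}: ~{mib:.1f} MiB")
```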

**Q: Do I need calibration data?**
A: It's not required, but highly recommended. Random data gives 0.2-0.3% worse accuracy than real calibration samples.

**Q: What's the speed improvement?**
A: 2-3× faster inference on CPU and 3-5× on mobile/edge devices. GPU gains are smaller (1.5-2×).

**Q: Will my model still run in ONNX Runtime?**
A: Yes! We use the standard DequantizeLinear operator, which any ONNX Runtime version ≥1.10 supports.

**Q: Can I quantize specific layers?**
A: Currently all weights are quantized. Per-layer selection is coming in v0.4.0.

## Limitations

- **Input format**: ONNX only (export PyTorch/TensorFlow models to ONNX first)
- **Operator support**: All standard ops are supported; custom ops may fail
- **Opset version**: Requires ONNX opset ≥13, automatically upgraded if needed (see the check below)
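
To check a model's opset before quantizing, the `onnx` package (already listed as a dev dependency) exposes it directly; a small sketch:

```python
import onnx

model = onnx.load("model.onnx")
for opset in model.opset_import:
    print(opset.domain or "ai.onnx", opset.version)  # default domain prints as ai.onnx
```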

## Contributing

Contributions are welcome! Areas where we need help:

- **Testing**: More model architectures and edge cases
- **Documentation**: Tutorials, guides, examples
- **Performance**: Optimization and profiling
- **Features**: Dynamic quantization, mixed precision

## License

MIT OR Apache-2.0
quantization_rs-0.3.0.dist-info/RECORD
ADDED
@@ -0,0 +1,6 @@
quantization_rs-0.3.0.dist-info\METADATA,sha256=isjBlHBPfwv5dinjOZnu8eIFRlwC90ML4bnSACvOVa8,8433
quantization_rs-0.3.0.dist-info\WHEEL,sha256=n_BmF69IyGtioVWE9c3M_zsEfe6-xMZy1v5HCL_6qE0,97
quantization_rs-0.3.0.dist-info\licenses\LICENSE,sha256=zO4W2hhBtRVpbnlX2s1Bh_b4kYxQ6lMOJcG85-WBL0s,1094
quantize_rs\__init__.py,sha256=BIdPIQb5Rn3ZD0v47gVJ5bHxQXMaV6ddASlaZY2ZOJU,127
quantize_rs\quantize_rs.cp313-win_amd64.pyd,sha256=7Df-Gdc6fC2MJSlKoatciz3sHh5_uUimbaP1gqHqGoM,19142144
quantization_rs-0.3.0.dist-info\RECORD,,
quantization_rs-0.3.0.dist-info/licenses/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2026 Abdul Rahman Kamal

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

quantize_rs/__init__.py
ADDED
Binary file