neuroquant 2.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- neuroquant-2.0.0/LICENSE +21 -0
- neuroquant-2.0.0/PKG-INFO +248 -0
- neuroquant-2.0.0/README.md +196 -0
- neuroquant-2.0.0/neuroquant/__init__.py +95 -0
- neuroquant-2.0.0/neuroquant/cli.py +3471 -0
- neuroquant-2.0.0/neuroquant/config.py +1050 -0
- neuroquant-2.0.0/neuroquant/data/__init__.py +1 -0
- neuroquant-2.0.0/neuroquant/data/data_loader.py +764 -0
- neuroquant-2.0.0/neuroquant/models/__init__.py +1 -0
- neuroquant-2.0.0/neuroquant/models/model_loader.py +602 -0
- neuroquant-2.0.0/neuroquant/py.typed +0 -0
- neuroquant-2.0.0/neuroquant/quantization/__init__.py +57 -0
- neuroquant-2.0.0/neuroquant/quantization/_default_config.yaml +206 -0
- neuroquant-2.0.0/neuroquant/quantization/adaround.py +1075 -0
- neuroquant-2.0.0/neuroquant/quantization/alpha_search.py +312 -0
- neuroquant-2.0.0/neuroquant/quantization/awq.py +646 -0
- neuroquant-2.0.0/neuroquant/quantization/base.py +289 -0
- neuroquant-2.0.0/neuroquant/quantization/bn_folding.py +194 -0
- neuroquant-2.0.0/neuroquant/quantization/gptq.py +261 -0
- neuroquant-2.0.0/neuroquant/quantization/hessian_clustering.py +731 -0
- neuroquant-2.0.0/neuroquant/quantization/latency_lut.py +368 -0
- neuroquant-2.0.0/neuroquant/quantization/nsga_ii_search.py +1255 -0
- neuroquant-2.0.0/neuroquant/quantization/ptq.py +684 -0
- neuroquant-2.0.0/neuroquant/quantization/qat.py +739 -0
- neuroquant-2.0.0/neuroquant/quantization/smoothquant.py +698 -0
- neuroquant-2.0.0/neuroquant/quantization/smoothquant_gptq.py +142 -0
- neuroquant-2.0.0/neuroquant/quantization/surrogate.py +291 -0
- neuroquant-2.0.0/neuroquant/tracking/__init__.py +1 -0
- neuroquant-2.0.0/neuroquant/tracking/mlflow_logger.py +267 -0
- neuroquant-2.0.0/neuroquant/utils/__init__.py +1 -0
- neuroquant-2.0.0/neuroquant/utils/checkpointing.py +469 -0
- neuroquant-2.0.0/neuroquant/utils/common.py +227 -0
- neuroquant-2.0.0/neuroquant/utils/deployment_export.py +322 -0
- neuroquant-2.0.0/neuroquant/utils/metrics.py +295 -0
- neuroquant-2.0.0/neuroquant/utils/numerics.py +88 -0
- neuroquant-2.0.0/neuroquant/utils/onnx_export.py +522 -0
- neuroquant-2.0.0/neuroquant/visualization/__init__.py +61 -0
- neuroquant-2.0.0/neuroquant/visualization/error_attribution.py +403 -0
- neuroquant-2.0.0/neuroquant/visualization/pareto_analysis.py +1158 -0
- neuroquant-2.0.0/neuroquant/visualization/report.py +452 -0
- neuroquant-2.0.0/neuroquant/visualization/sensitivity.py +244 -0
- neuroquant-2.0.0/neuroquant/visualization/style.py +144 -0
- neuroquant-2.0.0/neuroquant/xai/__init__.py +1 -0
- neuroquant-2.0.0/neuroquant/xai/explainability.py +1299 -0
- neuroquant-2.0.0/neuroquant.egg-info/PKG-INFO +248 -0
- neuroquant-2.0.0/neuroquant.egg-info/SOURCES.txt +53 -0
- neuroquant-2.0.0/neuroquant.egg-info/dependency_links.txt +1 -0
- neuroquant-2.0.0/neuroquant.egg-info/entry_points.txt +2 -0
- neuroquant-2.0.0/neuroquant.egg-info/requires.txt +28 -0
- neuroquant-2.0.0/neuroquant.egg-info/top_level.txt +1 -0
- neuroquant-2.0.0/pyproject.toml +141 -0
- neuroquant-2.0.0/setup.cfg +4 -0
- neuroquant-2.0.0/tests/test_config.py +31 -0
- neuroquant-2.0.0/tests/test_imports.py +35 -0
- neuroquant-2.0.0/tests/test_quantizers.py +58 -0
neuroquant-2.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 NeuroQuant Authors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: neuroquant
|
|
3
|
+
Version: 2.0.0
|
|
4
|
+
Summary: Production-grade neural-network quantization framework with NSGA + ONNX + hardware-aware search
|
|
5
|
+
Author: NeuroQuant Authors
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/AbdelazizElHelaly11/NeuroQuant
|
|
8
|
+
Project-URL: Documentation, https://github.com/AbdelazizElHelaly11/NeuroQuant/blob/main/README.md
|
|
9
|
+
Project-URL: Repository, https://github.com/AbdelazizElHelaly11/NeuroQuant
|
|
10
|
+
Project-URL: Issues, https://github.com/AbdelazizElHelaly11/NeuroQuant/issues
|
|
11
|
+
Keywords: quantization,deep-learning,pytorch,onnx,ptq,qat,gptq,smoothquant,awq,nsga,neural-network-compression,edge-ai
|
|
12
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
16
|
+
Classifier: Operating System :: Microsoft :: Windows
|
|
17
|
+
Classifier: Operating System :: MacOS
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
23
|
+
Requires-Python: >=3.10
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
License-File: LICENSE
|
|
26
|
+
Requires-Dist: torch>=2.4
|
|
27
|
+
Requires-Dist: torchvision>=0.19
|
|
28
|
+
Requires-Dist: numpy>=1.26
|
|
29
|
+
Requires-Dist: pandas>=2.0
|
|
30
|
+
Requires-Dist: matplotlib>=3.7
|
|
31
|
+
Requires-Dist: seaborn>=0.13
|
|
32
|
+
Requires-Dist: pyyaml>=6.0
|
|
33
|
+
Requires-Dist: pydantic<3.0,>=2.5
|
|
34
|
+
Requires-Dist: mlflow>=2.10
|
|
35
|
+
Requires-Dist: pymoo>=0.6.0
|
|
36
|
+
Requires-Dist: onnx>=1.14
|
|
37
|
+
Requires-Dist: onnxruntime>=1.16
|
|
38
|
+
Requires-Dist: onnxscript>=0.1
|
|
39
|
+
Requires-Dist: scikit-learn>=1.3
|
|
40
|
+
Provides-Extra: dev
|
|
41
|
+
Requires-Dist: ruff>=0.5; extra == "dev"
|
|
42
|
+
Requires-Dist: build>=1.2; extra == "dev"
|
|
43
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
44
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
45
|
+
Provides-Extra: xai
|
|
46
|
+
Requires-Dist: shap>=0.42.0; extra == "xai"
|
|
47
|
+
Provides-Extra: docs
|
|
48
|
+
Requires-Dist: mkdocs-material>=9.5; extra == "docs"
|
|
49
|
+
Requires-Dist: mkdocstrings[python]>=0.25; extra == "docs"
|
|
50
|
+
Requires-Dist: pymdown-extensions>=10.0; extra == "docs"
|
|
51
|
+
Dynamic: license-file
|
|
52
|
+
|
|
53
|
+
# NeuroQuant v2.0
|
|
54
|
+
|
|
55
|
+
[]()
|
|
56
|
+
[]()
|
|
57
|
+
|
|
58
|
+
**Production-grade neural-network quantization framework with multi-objective NSGA search, ONNX deployment, and hardware-aware optimisation.**
|
|
59
|
+
|
|
60
|
+
NeuroQuant takes a pre-trained PyTorch model and produces deployable INT8 / mixed-precision artefacts that have been measured (not estimated) on the same runtime that ships in production. Every public number is the result of running a real quantized graph through ONNX Runtime — no synthetic shortcuts.
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
## What it does
|
|
65
|
+
|
|
66
|
+
```
|
|
67
|
+
┌────────────────────────────────────────────────────────────────────────┐
|
|
68
|
+
│ │
|
|
69
|
+
│ FP32 PyTorch model ─────► 10-phase pipeline ─────► INT8 .onnx │
|
|
70
|
+
│ + metrics │
|
|
71
|
+
│ ┌──────────────────────────────────────────────────────────────┐ │
|
|
72
|
+
│ │ P0 Prepare model + dataset, FP32 baseline │ │
|
|
73
|
+
│ │ P1a Hessian / Fisher per-layer sensitivity │ │
|
|
74
|
+
│ │ P1b FITCompress warm-start seed │ │
|
|
75
|
+
│ │ P1c NSGA multi-objective search (2- or 3-obj) │ │
|
|
76
|
+
│ │ P1d AdaRound canonical-order weight rounding │ │
|
|
77
|
+
│ │ P1e Real W+A QAT with FP32 teacher distillation │ │
|
|
78
|
+
│ │ P1f GPTQ + SmoothQuant + AWQ + SmoothQuant→GPTQ │ │
|
|
79
|
+
│ │ P2 Pareto analysis + plots │ │
|
|
80
|
+
│ │ P3 Grad-CAM + SHAP explainability │ │
|
|
81
|
+
│ │ P4 MLflow finalisation + reproducibility manifest │ │
|
|
82
|
+
│ └──────────────────────────────────────────────────────────────┘ │
|
|
83
|
+
│ │
|
|
84
|
+
└────────────────────────────────────────────────────────────────────────┘
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
The pipeline runs to completion in **~60 seconds** on CPU for a CIFAR-class model.
|
|
88
|
+
|
|
89
|
+
---
|
|
90
|
+
|
|
91
|
+
## Why it is production-grade
|
|
92
|
+
|
|
93
|
+
This framework was built deliberately to avoid the "research prototype" failure modes that disqualify most academic quantization tooling from real deployment:
|
|
94
|
+
|
|
95
|
+
| Concern | What NeuroQuant does |
|
|
96
|
+
| ------------------------------- | ----------------------------------------------------------------------------------------------------------------- |
|
|
97
|
+
| **Real INT inference** | Wave 4 emits true static-INT8 ONNX graphs via `onnxruntime.quantization.quantize_static`, not FP32 simulation. |
|
|
98
|
+
| **Real on-disk size** | `model_size_mb` is the literal `.onnx` filesystem size, not `numel × bw / 8`. The synthetic estimate is kept as `theoretical_size_mb` for ablation. |
|
|
99
|
+
| **Real latency** | `latency_ms` is measured under ONNX Runtime on the same machine that will deploy the artefact. |
|
|
100
|
+
| **Hardware-aware search** | The NSGA third objective sums a per-layer ORT latency LUT (Wave 4 C2). Every gene's latency cost is a real timing. |
|
|
101
|
+
| **No leakage between splits** | The training data is split 80/10/10 into train / search / val, and the test set is held out separately; NSGA fitness reads search, QAT early stopping reads val, and headline numbers read test. |
|
|
102
|
+
| **Strict determinism** | `set_seed(strict=True)` enforces `CUBLAS_WORKSPACE_CONFIG`, `use_deterministic_algorithms`, `cudnn.deterministic`. |
|
|
103
|
+
| **Safe checkpoints** | Every `torch.load` call passes `weights_only=True`; the pickle path is closed. Architectural wrappers persist as JSON manifests. |
|
|
104
|
+
| **Real W+A QAT** | INT8 activations always; weight parametrisation via `torch.nn.utils.parametrize` (autograd-aware STE). |
|
|
105
|
+
| **Validated config** | Pydantic v2 dataclasses with field validators — bad values fail at load, not deep in a phase. |
|
|
106
|
+
|
|
107
|
+
---
|
|
108
|
+
|
|
109
|
+
## Install
|
|
110
|
+
|
|
111
|
+
### From the wheel
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
pip install neuroquant-2.0.0-py3-none-any.whl
|
|
115
|
+
neuroquant --help
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### From source
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
git clone https://github.com/AbdelazizElHelaly11/NeuroQuant
|
|
122
|
+
cd NeuroQuant
|
|
123
|
+
pip install -e ".[dev]" # editable + dev extras
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
GPU users:
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
pip install torch torchvision --index-url https://download.pytorch.org/whl/cu121
|
|
130
|
+
pip install -e ".[dev]"
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
## Run
|
|
136
|
+
|
|
137
|
+
The console-script `neuroquant` is installed by the wheel; it accepts the same flags as `python main.py`.
|
|
138
|
+
|
|
139
|
+
```bash
|
|
140
|
+
# Full pipeline on the bundled config (CIFAR-10 + MobileNetV2)
|
|
141
|
+
neuroquant --config config.yaml --epochs 20
|
|
142
|
+
|
|
143
|
+
# Fast smoke (CPU, no training, first three phases)
|
|
144
|
+
neuroquant --config config.yaml --epochs 0 --device cpu \
|
|
145
|
+
--phases phase_0_preparation phase_1a_hessian_clustering phase_1b_fitcompress
|
|
146
|
+
|
|
147
|
+
# Resume after interruption
|
|
148
|
+
neuroquant --config config.yaml --epochs 20 --resume
|
|
149
|
+
|
|
150
|
+
# Hardware-aware mode (3-objective NSGA + ORT latency LUT)
|
|
151
|
+
# Set hardware_aware_search: true in config.yaml, then:
|
|
152
|
+
neuroquant --config config.yaml --epochs 20
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
The pipeline writes everything to `output_dir` (default `./artifacts/`):
|
|
156
|
+
|
|
157
|
+
```
|
|
158
|
+
artifacts/
|
|
159
|
+
├── checkpoints/ # per-phase resume points
|
|
160
|
+
├── onnx/ # FP32 + per-method INT8 .onnx files
|
|
161
|
+
├── pareto/ # Pareto plots + JSON
|
|
162
|
+
├── reports/ # pipeline_report.txt, pareto_summary.json
|
|
163
|
+
├── reproducibility_manifest.json
|
|
164
|
+
├── latency_lut.json # only when hardware_aware_search=true
|
|
165
|
+
└── pipeline_report.txt
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
---
|
|
169
|
+
|
|
170
|
+
## Configuration
|
|
171
|
+
|
|
172
|
+
All knobs live in [`config.yaml`](config.yaml). Common overrides:
|
|
173
|
+
|
|
174
|
+
```yaml
|
|
175
|
+
model:
|
|
176
|
+
name: resnet18 # any torchvision name
|
|
177
|
+
num_classes: 10
|
|
178
|
+
input_shape: [3, 32, 32]
|
|
179
|
+
|
|
180
|
+
dataset:
|
|
181
|
+
name: cifar10 # cifar10 | cifar100 | imagefolder | synthetic | custom
|
|
182
|
+
class: null # optional "pkg.module.MyDataset"
|
|
183
|
+
train_dir: null # optional ImageFolder split dirs
|
|
184
|
+
val_dir: null
|
|
185
|
+
test_dir: null
|
|
186
|
+
batch_size: 128
|
|
187
|
+
|
|
188
|
+
methods: [ptq, qat, gptq, smoothquant, awq]
|
|
189
|
+
bitwidths:
|
|
190
|
+
supported: [4, 8]
|
|
191
|
+
io_layer: 8 # force first/last layers to INT8
|
|
192
|
+
|
|
193
|
+
hyperparams:
|
|
194
|
+
hardware_aware_search: true # Wave 4 J4: 3-obj NSGA
|
|
195
|
+
onnx_export_enabled: true # Wave 4 J1/J2/J3
|
|
196
|
+
qat_distill_alpha: 0.5 # Wave 2 E5: KD with FP32 teacher
|
|
197
|
+
smoothquant_per_layer_alpha: true # Wave 3 F3
|
|
198
|
+
hessian_estimator: fisher # Wave 3 B2: 3× faster than diag
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
Pydantic field validators run at load time — invalid values surface immediately with the offending field path:
|
|
202
|
+
|
|
203
|
+
```text
|
|
204
|
+
ValueError: Configuration validation failed:
|
|
205
|
+
num_classes must be >= 2.
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
---
|
|
209
|
+
|
|
210
|
+
## Architecture
|
|
211
|
+
|
|
212
|
+
The framework was built in seven waves, each ending with a strict-format report. Per-wave architecture notes live in [`docs/architecture/`](docs/architecture/):
|
|
213
|
+
|
|
214
|
+
| Wave | Theme | Notes |
|
|
215
|
+
| ---- | ------------------------------ | ------------------------------------------ |
|
|
216
|
+
| 1 | Foundation (security + leakage) | [wave1.md](docs/architecture/wave1.md) |
|
|
217
|
+
| 2 | Real W+A QAT pipeline | [wave2.md](docs/architecture/wave2.md) |
|
|
218
|
+
| 3 | Method audits + Fisher | [wave3.md](docs/architecture/wave3.md) |
|
|
219
|
+
| 4 | ONNX + hardware-aware search | [wave4.md](docs/architecture/wave4.md) |
|
|
220
|
+
| 5 | Reporting + MLflow | [wave5.md](docs/architecture/wave5.md) |
|
|
221
|
+
| 6 | Config validation (Pydantic) | [wave6.md](docs/architecture/wave6.md) |
|
|
222
|
+
| 7 | Packaging + docs | [wave7.md](docs/architecture/wave7.md) |
|
|
223
|
+
|
|
224
|
+
---
|
|
225
|
+
|
|
226
|
+
## Quantization methods
|
|
227
|
+
|
|
228
|
+
| Method | When to use | Module |
|
|
229
|
+
| --------------------- | --------------------------------------------------------------------- | ----------------------------------------------------- |
|
|
230
|
+
| **PTQ** | Fast baseline; INT8 with bitwidth-aware calibration. | [`quantization/ptq.py`](quantization/ptq.py) |
|
|
231
|
+
| **QAT** | Best accuracy at INT8; requires fine-tuning data. | [`quantization/qat.py`](quantization/qat.py) |
|
|
232
|
+
| **GPTQ** | Best accuracy at INT4 weights; data-aware optimal rounding. | [`quantization/gptq.py`](quantization/gptq.py) |
|
|
233
|
+
| **SmoothQuant** | Activation-friendly INT8; per-layer α grid search. | [`quantization/smoothquant.py`](quantization/smoothquant.py) |
|
|
234
|
+
| **AWQ** | INT4 with salient-channel preservation; per-layer α + FP16 carve-out. | [`quantization/awq.py`](quantization/awq.py) |
|
|
235
|
+
| **SmoothQuant→GPTQ** | Production recipe — strict-Pareto improvement over either method alone. | [`quantization/smoothquant_gptq.py`](quantization/smoothquant_gptq.py) |
|
|
236
|
+
| **AdaRound** | Post-PTQ refinement; canonical input→output traversal. | [`quantization/adaround.py`](quantization/adaround.py) |
|
|
237
|
+
|
|
238
|
+
---
|
|
239
|
+
|
|
240
|
+
## License
|
|
241
|
+
|
|
242
|
+
MIT. See [LICENSE](LICENSE) for the full text.
|
|
243
|
+
|
|
244
|
+
---
|
|
245
|
+
|
|
246
|
+
## Acknowledgements
|
|
247
|
+
|
|
248
|
+
The seven-wave production hardening was specified, implemented, and refined in collaboration with **Claude Opus 4.7 (1M context)**. Per-wave architecture notes live under [`docs/architecture/`](docs/architecture/).
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
# NeuroQuant v2.0
|
|
2
|
+
|
|
3
|
+
[]()
|
|
4
|
+
[]()
|
|
5
|
+
|
|
6
|
+
**Production-grade neural-network quantization framework with multi-objective NSGA search, ONNX deployment, and hardware-aware optimisation.**
|
|
7
|
+
|
|
8
|
+
NeuroQuant takes a pre-trained PyTorch model and produces deployable INT8 / mixed-precision artefacts that have been measured (not estimated) on the same runtime that ships in production. Every public number is the result of running a real quantized graph through ONNX Runtime — no synthetic shortcuts.
|
|
9
|
+
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
## What it does
|
|
13
|
+
|
|
14
|
+
```
|
|
15
|
+
┌────────────────────────────────────────────────────────────────────────┐
|
|
16
|
+
│ │
|
|
17
|
+
│ FP32 PyTorch model ─────► 10-phase pipeline ─────► INT8 .onnx │
|
|
18
|
+
│ + metrics │
|
|
19
|
+
│ ┌──────────────────────────────────────────────────────────────┐ │
|
|
20
|
+
│ │ P0 Prepare model + dataset, FP32 baseline │ │
|
|
21
|
+
│ │ P1a Hessian / Fisher per-layer sensitivity │ │
|
|
22
|
+
│ │ P1b FITCompress warm-start seed │ │
|
|
23
|
+
│ │ P1c NSGA multi-objective search (2- or 3-obj) │ │
|
|
24
|
+
│ │ P1d AdaRound canonical-order weight rounding │ │
|
|
25
|
+
│ │ P1e Real W+A QAT with FP32 teacher distillation │ │
|
|
26
|
+
│ │ P1f GPTQ + SmoothQuant + AWQ + SmoothQuant→GPTQ │ │
|
|
27
|
+
│ │ P2 Pareto analysis + plots │ │
|
|
28
|
+
│ │ P3 Grad-CAM + SHAP explainability │ │
|
|
29
|
+
│ │ P4 MLflow finalisation + reproducibility manifest │ │
|
|
30
|
+
│ └──────────────────────────────────────────────────────────────┘ │
|
|
31
|
+
│ │
|
|
32
|
+
└────────────────────────────────────────────────────────────────────────┘
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
The pipeline runs to completion in **~60 seconds** on CPU for a CIFAR-class model.
|
|
36
|
+
|
|
37
|
+
---
|
|
38
|
+
|
|
39
|
+
## Why it is production-grade
|
|
40
|
+
|
|
41
|
+
This framework was built deliberately to avoid the "research prototype" failure modes that disqualify most academic quantization tooling from real deployment:
|
|
42
|
+
|
|
43
|
+
| Concern | What NeuroQuant does |
|
|
44
|
+
| ------------------------------- | ----------------------------------------------------------------------------------------------------------------- |
|
|
45
|
+
| **Real INT inference** | Wave 4 emits true static-INT8 ONNX graphs via `onnxruntime.quantization.quantize_static`, not FP32 simulation. |
|
|
46
|
+
| **Real on-disk size** | `model_size_mb` is the literal `.onnx` filesystem size, not `numel × bw / 8`. The synthetic estimate is kept as `theoretical_size_mb` for ablation. |
|
|
47
|
+
| **Real latency** | `latency_ms` is measured under ONNX Runtime on the same machine that will deploy the artefact. |
|
|
48
|
+
| **Hardware-aware search** | The NSGA third objective sums a per-layer ORT latency LUT (Wave 4 C2). Every gene's latency cost is a real timing. |
|
|
49
|
+
| **No leakage between splits** | The training data is split 80/10/10 into train / search / val, and the test set is held out separately; NSGA fitness reads search, QAT early stopping reads val, and headline numbers read test. |
|
|
50
|
+
| **Strict determinism** | `set_seed(strict=True)` enforces `CUBLAS_WORKSPACE_CONFIG`, `use_deterministic_algorithms`, `cudnn.deterministic`. |
|
|
51
|
+
| **Safe checkpoints** | Every `torch.load` call passes `weights_only=True`; the pickle path is closed. Architectural wrappers persist as JSON manifests. |
|
|
52
|
+
| **Real W+A QAT** | INT8 activations always; weight parametrisation via `torch.nn.utils.parametrize` (autograd-aware STE). |
|
|
53
|
+
| **Validated config** | Pydantic v2 dataclasses with field validators — bad values fail at load, not deep in a phase. |
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## Install
|
|
58
|
+
|
|
59
|
+
### From the wheel
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
pip install neuroquant-2.0.0-py3-none-any.whl
|
|
63
|
+
neuroquant --help
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### From source
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
git clone https://github.com/AbdelazizElHelaly11/NeuroQuant
|
|
70
|
+
cd NeuroQuant
|
|
71
|
+
pip install -e ".[dev]" # editable + dev extras
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
GPU users:
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
pip install torch torchvision --index-url https://download.pytorch.org/whl/cu121
|
|
78
|
+
pip install -e ".[dev]"
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
---
|
|
82
|
+
|
|
83
|
+
## Run
|
|
84
|
+
|
|
85
|
+
The console-script `neuroquant` is installed by the wheel; it accepts the same flags as `python main.py`.
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
# Full pipeline on the bundled config (CIFAR-10 + MobileNetV2)
|
|
89
|
+
neuroquant --config config.yaml --epochs 20
|
|
90
|
+
|
|
91
|
+
# Fast smoke (CPU, no training, first three phases)
|
|
92
|
+
neuroquant --config config.yaml --epochs 0 --device cpu \
|
|
93
|
+
--phases phase_0_preparation phase_1a_hessian_clustering phase_1b_fitcompress
|
|
94
|
+
|
|
95
|
+
# Resume after interruption
|
|
96
|
+
neuroquant --config config.yaml --epochs 20 --resume
|
|
97
|
+
|
|
98
|
+
# Hardware-aware mode (3-objective NSGA + ORT latency LUT)
|
|
99
|
+
# Set hardware_aware_search: true in config.yaml, then:
|
|
100
|
+
neuroquant --config config.yaml --epochs 20
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
The pipeline writes everything to `output_dir` (default `./artifacts/`):
|
|
104
|
+
|
|
105
|
+
```
|
|
106
|
+
artifacts/
|
|
107
|
+
├── checkpoints/ # per-phase resume points
|
|
108
|
+
├── onnx/ # FP32 + per-method INT8 .onnx files
|
|
109
|
+
├── pareto/ # Pareto plots + JSON
|
|
110
|
+
├── reports/ # pipeline_report.txt, pareto_summary.json
|
|
111
|
+
├── reproducibility_manifest.json
|
|
112
|
+
├── latency_lut.json # only when hardware_aware_search=true
|
|
113
|
+
└── pipeline_report.txt
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
## Configuration
|
|
119
|
+
|
|
120
|
+
All knobs live in [`config.yaml`](config.yaml). Common overrides:
|
|
121
|
+
|
|
122
|
+
```yaml
|
|
123
|
+
model:
|
|
124
|
+
name: resnet18 # any torchvision name
|
|
125
|
+
num_classes: 10
|
|
126
|
+
input_shape: [3, 32, 32]
|
|
127
|
+
|
|
128
|
+
dataset:
|
|
129
|
+
name: cifar10 # cifar10 | cifar100 | imagefolder | synthetic | custom
|
|
130
|
+
class: null # optional "pkg.module.MyDataset"
|
|
131
|
+
train_dir: null # optional ImageFolder split dirs
|
|
132
|
+
val_dir: null
|
|
133
|
+
test_dir: null
|
|
134
|
+
batch_size: 128
|
|
135
|
+
|
|
136
|
+
methods: [ptq, qat, gptq, smoothquant, awq]
|
|
137
|
+
bitwidths:
|
|
138
|
+
supported: [4, 8]
|
|
139
|
+
io_layer: 8 # force first/last layers to INT8
|
|
140
|
+
|
|
141
|
+
hyperparams:
|
|
142
|
+
hardware_aware_search: true # Wave 4 J4: 3-obj NSGA
|
|
143
|
+
onnx_export_enabled: true # Wave 4 J1/J2/J3
|
|
144
|
+
qat_distill_alpha: 0.5 # Wave 2 E5: KD with FP32 teacher
|
|
145
|
+
smoothquant_per_layer_alpha: true # Wave 3 F3
|
|
146
|
+
hessian_estimator: fisher # Wave 3 B2: 3× faster than diag
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
Pydantic field validators run at load time — invalid values surface immediately with the offending field path:
|
|
150
|
+
|
|
151
|
+
```text
|
|
152
|
+
ValueError: Configuration validation failed:
|
|
153
|
+
num_classes must be >= 2.
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
---
|
|
157
|
+
|
|
158
|
+
## Architecture
|
|
159
|
+
|
|
160
|
+
The framework was built in seven waves, each ending with a strict-format report. Per-wave architecture notes live in [`docs/architecture/`](docs/architecture/):
|
|
161
|
+
|
|
162
|
+
| Wave | Theme | Notes |
|
|
163
|
+
| ---- | ------------------------------ | ------------------------------------------ |
|
|
164
|
+
| 1 | Foundation (security + leakage) | [wave1.md](docs/architecture/wave1.md) |
|
|
165
|
+
| 2 | Real W+A QAT pipeline | [wave2.md](docs/architecture/wave2.md) |
|
|
166
|
+
| 3 | Method audits + Fisher | [wave3.md](docs/architecture/wave3.md) |
|
|
167
|
+
| 4 | ONNX + hardware-aware search | [wave4.md](docs/architecture/wave4.md) |
|
|
168
|
+
| 5 | Reporting + MLflow | [wave5.md](docs/architecture/wave5.md) |
|
|
169
|
+
| 6 | Config validation (Pydantic) | [wave6.md](docs/architecture/wave6.md) |
|
|
170
|
+
| 7 | Packaging + docs | [wave7.md](docs/architecture/wave7.md) |
|
|
171
|
+
|
|
172
|
+
---
|
|
173
|
+
|
|
174
|
+
## Quantization methods
|
|
175
|
+
|
|
176
|
+
| Method | When to use | Module |
|
|
177
|
+
| --------------------- | --------------------------------------------------------------------- | ----------------------------------------------------- |
|
|
178
|
+
| **PTQ** | Fast baseline; INT8 with bitwidth-aware calibration. | [`quantization/ptq.py`](quantization/ptq.py) |
|
|
179
|
+
| **QAT** | Best accuracy at INT8; requires fine-tuning data. | [`quantization/qat.py`](quantization/qat.py) |
|
|
180
|
+
| **GPTQ** | Best accuracy at INT4 weights; data-aware optimal rounding. | [`quantization/gptq.py`](quantization/gptq.py) |
|
|
181
|
+
| **SmoothQuant** | Activation-friendly INT8; per-layer α grid search. | [`quantization/smoothquant.py`](quantization/smoothquant.py) |
|
|
182
|
+
| **AWQ** | INT4 with salient-channel preservation; per-layer α + FP16 carve-out. | [`quantization/awq.py`](quantization/awq.py) |
|
|
183
|
+
| **SmoothQuant→GPTQ** | Production recipe — strict-Pareto improvement over either method alone. | [`quantization/smoothquant_gptq.py`](quantization/smoothquant_gptq.py) |
|
|
184
|
+
| **AdaRound** | Post-PTQ refinement; canonical input→output traversal. | [`quantization/adaround.py`](quantization/adaround.py) |
|
|
185
|
+
|
|
186
|
+
---
|
|
187
|
+
|
|
188
|
+
## License
|
|
189
|
+
|
|
190
|
+
MIT. See [LICENSE](LICENSE) for the full text.
|
|
191
|
+
|
|
192
|
+
---
|
|
193
|
+
|
|
194
|
+
## Acknowledgements
|
|
195
|
+
|
|
196
|
+
The seven-wave production hardening was specified, implemented, and refined in collaboration with **Claude Opus 4.7 (1M context)**. Per-wave architecture notes live under [`docs/architecture/`](docs/architecture/).
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""
|
|
2
|
+
NeuroQuant — production-grade neural-network quantization framework.
|
|
3
|
+
|
|
4
|
+
Public API for library users::
|
|
5
|
+
|
|
6
|
+
from neuroquant import (
|
|
7
|
+
# Quantizers (notebook / library use)
|
|
8
|
+
PTQQuantizer, AWQQuantizer, GPTQQuantizer,
|
|
9
|
+
SmoothQuantQuantizer, SmoothQuantGPTQQuantizer,
|
|
10
|
+
QATTrainer, AdaroundOptimizer,
|
|
11
|
+
# Multi-objective search + clustering + surrogate
|
|
12
|
+
NSGAIIClusterSearch, LayerClusterer, AccuracySurrogate,
|
|
13
|
+
# Configuration object (every quantizer accepts ``config=None``
|
|
14
|
+
# and falls back to ``QuantizationConfig()`` defaults)
|
|
15
|
+
QuantizationConfig,
|
|
16
|
+
# Explainability + Pareto visualization
|
|
17
|
+
XAIGenerator, ParetoAnalyzer, ParetoVisualizer,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
The ``neuroquant`` command-line entry point lives in
|
|
21
|
+
:mod:`neuroquant.cli` and is exposed via ``[project.scripts]`` in
|
|
22
|
+
``pyproject.toml``. Library users normally do not need to import it
|
|
23
|
+
directly — instantiate :class:`PTQQuantizer` (etc.) and drive the
|
|
24
|
+
pipeline themselves.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
__version__ = "2.0.0"
|
|
30
|
+
|
|
31
|
+
# Re-export the configuration dataclass first because every other
|
|
32
|
+
# public symbol depends on it (directly or transitively).
|
|
33
|
+
from neuroquant.config import QuantizationConfig
|
|
34
|
+
|
|
35
|
+
# Quantizers and the search / clustering / surrogate trio. The
|
|
36
|
+
# subpackage __init__ already curates these — re-export from there so
|
|
37
|
+
# any future additions land in one place.
|
|
38
|
+
from neuroquant.quantization import (
|
|
39
|
+
BaseQuantizer,
|
|
40
|
+
PTQQuantizer,
|
|
41
|
+
AWQQuantizer,
|
|
42
|
+
GPTQQuantizer,
|
|
43
|
+
SmoothQuantQuantizer,
|
|
44
|
+
SmoothQuantGPTQQuantizer,
|
|
45
|
+
AdaroundOptimizer,
|
|
46
|
+
QATTrainer,
|
|
47
|
+
NSGAIIClusterSearch,
|
|
48
|
+
LayerClusterer,
|
|
49
|
+
AccuracySurrogate,
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
# Visualization surface (Pareto + plot helpers). ``XAIGenerator`` is
|
|
53
|
+
# re-exported from the visualization package via a guarded import so
|
|
54
|
+
# users who skipped the optional ``xai`` extras (shap / captum) don't
|
|
55
|
+
# crash at ``import neuroquant``.
|
|
56
|
+
from neuroquant.visualization import (
|
|
57
|
+
ParetoAnalyzer,
|
|
58
|
+
ParetoVisualizer,
|
|
59
|
+
XAIGenerator,
|
|
60
|
+
compute_layer_errors,
|
|
61
|
+
plot_error_attribution,
|
|
62
|
+
plot_error_comparison,
|
|
63
|
+
plot_sensitivity_heatmap,
|
|
64
|
+
plot_tier_distribution,
|
|
65
|
+
generate_html_report,
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
__all__ = [
|
|
69
|
+
"__version__",
|
|
70
|
+
# Configuration
|
|
71
|
+
"QuantizationConfig",
|
|
72
|
+
# Quantizers
|
|
73
|
+
"BaseQuantizer",
|
|
74
|
+
"PTQQuantizer",
|
|
75
|
+
"AWQQuantizer",
|
|
76
|
+
"GPTQQuantizer",
|
|
77
|
+
"SmoothQuantQuantizer",
|
|
78
|
+
"SmoothQuantGPTQQuantizer",
|
|
79
|
+
"AdaroundOptimizer",
|
|
80
|
+
"QATTrainer",
|
|
81
|
+
# Search / clustering / surrogate
|
|
82
|
+
"NSGAIIClusterSearch",
|
|
83
|
+
"LayerClusterer",
|
|
84
|
+
"AccuracySurrogate",
|
|
85
|
+
# Visualization
|
|
86
|
+
"ParetoAnalyzer",
|
|
87
|
+
"ParetoVisualizer",
|
|
88
|
+
"XAIGenerator",
|
|
89
|
+
"compute_layer_errors",
|
|
90
|
+
"plot_error_attribution",
|
|
91
|
+
"plot_error_comparison",
|
|
92
|
+
"plot_sensitivity_heatmap",
|
|
93
|
+
"plot_tier_distribution",
|
|
94
|
+
"generate_html_report",
|
|
95
|
+
]
|