aphex-ml 0.1.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aphex_ml-0.1.0a1.dist-info/METADATA +467 -0
- aphex_ml-0.1.0a1.dist-info/RECORD +42 -0
- aphex_ml-0.1.0a1.dist-info/WHEEL +4 -0
- aphex_ml-0.1.0a1.dist-info/entry_points.txt +2 -0
- aphex_ml-0.1.0a1.dist-info/licenses/LICENSE +21 -0
- infermap/__init__.py +3 -0
- infermap/benchmark.py +1061 -0
- infermap/candidates.py +319 -0
- infermap/checker.py +105 -0
- infermap/cli.py +1923 -0
- infermap/cloud/__init__.py +0 -0
- infermap/cloud/config.py +36 -0
- infermap/cloud/instances.py +192 -0
- infermap/cloud/registry.py +112 -0
- infermap/cloud/remote.py +173 -0
- infermap/cloud/storage.py +189 -0
- infermap/converter.py +391 -0
- infermap/cost_model.py +212 -0
- infermap/deployment.py +216 -0
- infermap/distillation.py +301 -0
- infermap/evaluator.py +865 -0
- infermap/inspector.py +456 -0
- infermap/pareto.py +63 -0
- infermap/plugin.py +56 -0
- infermap/plugins/__init__.py +0 -0
- infermap/plugins/llm.py +472 -0
- infermap/plugins/pytorch.py +63 -0
- infermap/plugins/sklearn.py +571 -0
- infermap/preflight.py +180 -0
- infermap/profiler.py +225 -0
- infermap/pruning.py +175 -0
- infermap/recommender.py +118 -0
- infermap/registry.py +59 -0
- infermap/report.py +195 -0
- infermap/selector.py +204 -0
- infermap/serving/__init__.py +32 -0
- infermap/serving/base.py +14 -0
- infermap/serving/bentoml.py +82 -0
- infermap/serving/fastapi.py +86 -0
- infermap/serving/torchserve.py +79 -0
- infermap/serving/triton.py +111 -0
- infermap/system_recommender.py +137 -0
|
@@ -0,0 +1,467 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: aphex-ml
|
|
3
|
+
Version: 0.1.0a1
|
|
4
|
+
Summary: Hardware-aware ML deployment optimization and recommendation framework
|
|
5
|
+
Project-URL: Homepage, https://github.com/ray-singh/aphex
|
|
6
|
+
Project-URL: Bug Tracker, https://github.com/ray-singh/aphex/issues
|
|
7
|
+
License: MIT License
|
|
8
|
+
|
|
9
|
+
Copyright (c) 2026 Rayansh Singh
|
|
10
|
+
|
|
11
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
12
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
13
|
+
in the Software without restriction, including without limitation the rights
|
|
14
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
15
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
16
|
+
furnished to do so, subject to the following conditions:
|
|
17
|
+
|
|
18
|
+
The above copyright notice and this permission notice shall be included in all
|
|
19
|
+
copies or substantial portions of the Software.
|
|
20
|
+
|
|
21
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
22
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
23
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
24
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
25
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
26
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
27
|
+
SOFTWARE.
|
|
28
|
+
License-File: LICENSE
|
|
29
|
+
Keywords: benchmarking,deployment,inference,machine-learning,mlops,optimization
|
|
30
|
+
Classifier: Development Status :: 3 - Alpha
|
|
31
|
+
Classifier: Environment :: Console
|
|
32
|
+
Classifier: Intended Audience :: Developers
|
|
33
|
+
Classifier: Intended Audience :: Science/Research
|
|
34
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
35
|
+
Classifier: Programming Language :: Python :: 3
|
|
36
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
37
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
38
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
39
|
+
Classifier: Topic :: System :: Hardware
|
|
40
|
+
Requires-Python: >=3.12
|
|
41
|
+
Requires-Dist: psutil>=6.0.0
|
|
42
|
+
Requires-Dist: rich>=13.7.0
|
|
43
|
+
Requires-Dist: typer>=0.12.0
|
|
44
|
+
Provides-Extra: aws
|
|
45
|
+
Requires-Dist: boto3>=1.34.0; extra == 'aws'
|
|
46
|
+
Provides-Extra: cuda
|
|
47
|
+
Requires-Dist: pynvml>=11.5.0; extra == 'cuda'
|
|
48
|
+
Provides-Extra: dev
|
|
49
|
+
Requires-Dist: mypy>=1.9.0; extra == 'dev'
|
|
50
|
+
Requires-Dist: pytest-cov>=5.0.0; extra == 'dev'
|
|
51
|
+
Requires-Dist: pytest>=8.0.0; extra == 'dev'
|
|
52
|
+
Requires-Dist: ruff>=0.4.0; extra == 'dev'
|
|
53
|
+
Provides-Extra: full
|
|
54
|
+
Requires-Dist: boto3>=1.34.0; extra == 'full'
|
|
55
|
+
Requires-Dist: google-cloud-storage>=2.16.0; extra == 'full'
|
|
56
|
+
Requires-Dist: onnx>=1.17.0; extra == 'full'
|
|
57
|
+
Requires-Dist: onnxruntime>=1.17.0; extra == 'full'
|
|
58
|
+
Requires-Dist: pynvml>=11.5.0; extra == 'full'
|
|
59
|
+
Requires-Dist: scikit-learn>=1.3.0; extra == 'full'
|
|
60
|
+
Requires-Dist: tensorflow>=2.15.0; extra == 'full'
|
|
61
|
+
Requires-Dist: torch>=2.2.0; extra == 'full'
|
|
62
|
+
Requires-Dist: torchinfo>=1.8.0; extra == 'full'
|
|
63
|
+
Requires-Dist: torchvision>=0.17.0; extra == 'full'
|
|
64
|
+
Provides-Extra: gcp
|
|
65
|
+
Requires-Dist: google-cloud-storage>=2.16.0; extra == 'gcp'
|
|
66
|
+
Provides-Extra: onnx
|
|
67
|
+
Requires-Dist: onnx>=1.17.0; extra == 'onnx'
|
|
68
|
+
Requires-Dist: onnxruntime>=1.17.0; extra == 'onnx'
|
|
69
|
+
Provides-Extra: sklearn
|
|
70
|
+
Requires-Dist: scikit-learn>=1.3.0; extra == 'sklearn'
|
|
71
|
+
Provides-Extra: tensorflow
|
|
72
|
+
Requires-Dist: tensorflow>=2.15.0; extra == 'tensorflow'
|
|
73
|
+
Provides-Extra: torch
|
|
74
|
+
Requires-Dist: torch>=2.2.0; extra == 'torch'
|
|
75
|
+
Requires-Dist: torchinfo>=1.8.0; extra == 'torch'
|
|
76
|
+
Requires-Dist: torchvision>=0.17.0; extra == 'torch'
|
|
77
|
+
Description-Content-Type: text/markdown
|
|
78
|
+
|
|
79
|
+
<p align="center">
|
|
80
|
+
<img src="docs/logo/lockup-light.svg#gh-light-mode-only" alt="aphex" height="120"/>
|
|
81
|
+
<img src="docs/logo/lockup-dark.svg#gh-dark-mode-only" alt="aphex" height="120"/>
|
|
82
|
+
</p>
|
|
83
|
+
|
|
84
|
+
A hardware-aware deployment planner that profiles arbitrary ML models, searches the deployment space, and produces a recommended serving configuration with predicted latency/throughput tradeoffs — locally or on a remote cloud machine.
|
|
85
|
+
|
|
86
|
+
---
|
|
87
|
+
|
|
88
|
+
- [Features](#features)
|
|
89
|
+
- [Installation](#installation)
|
|
90
|
+
- [Quickstart](#quickstart)
|
|
91
|
+
- [Example output](#example-output)
|
|
92
|
+
- [CLI reference](#cli-reference)
|
|
93
|
+
- [Pruning](#pruning)
|
|
94
|
+
- [Distillation](#distillation)
|
|
95
|
+
- [Multi-GPU benchmarking](#multi-gpu-benchmarking)
|
|
96
|
+
- [AWS integration](#aws-integration)
|
|
97
|
+
- [Pipeline](#pipeline)
|
|
98
|
+
- [Supported backends](#supported-backends)
|
|
99
|
+
- [Out of scope](#out-of-scope-for-current-version)
|
|
100
|
+
- [Requirements](#requirements)
|
|
101
|
+
|
|
102
|
+
---
|
|
103
|
+
|
|
104
|
+
## Features
|
|
105
|
+
|
|
106
|
+
- **Hardware profiling**: detects CPU cores, RAM, CUDA GPUs, Apple MPS, and CoreML availability
|
|
107
|
+
- **Model inspection**: parameter count, memory footprint (FP32/FP16/BF16), architecture family
|
|
108
|
+
- **Pre-flight checks**: fast feasibility check before committing to a full benchmark run
|
|
109
|
+
- **Multi-backend benchmarking**: PyTorch (FP32/FP16/BF16), ONNX Runtime (CPU/CUDA/CoreML), `torch.compile`, INT8 quantization, TensorRT, OpenVINO
|
|
110
|
+
- **Batch size sweep**: benchmarks every backend across multiple batch sizes in one run
|
|
111
|
+
- **Quality-constrained recommendation**: requires a labelled test dataset; measures accuracy/F1/MAE/RMSE drop from quantization and filters candidates that exceed your tolerance before ranking
|
|
112
|
+
- **Magnitude pruning**: 30 / 50 / 70 % unstructured + 2:4 structured (Ampere+ sparse Tensor Cores) appear as first-class benchmark candidates with measured accuracy drop
|
|
113
|
+
- **Knowledge distillation**: `aphex distill` trains a user-supplied student model from a teacher using soft-label KD (Hinton KL on temperature-scaled logits + optional CE)
|
|
114
|
+
- **Multi-GPU data-parallel sweep**: when ≥2 CUDA devices are detected, aphex automatically benchmarks `nn.DataParallel` variants (`pytorch_dp{2,4,8}_{fp32,fp16,bf16}`) so throughput scaling on multi-GPU boxes is measured, not assumed
|
|
115
|
+
- **Artifact export**: converts the recommended model to its deployment format (`.pt`, `.onnx`, `.engine`, `.xml`)
|
|
116
|
+
- **HTML report**: interactive latency-vs-throughput chart with full candidate table
|
|
117
|
+
- **Remote execution**: runs the full benchmark pipeline on an EC2 instance (or any SSH host) and pulls results back locally
|
|
118
|
+
- **Cloud registry**: push/pull versioned model artifacts to S3
|
|
119
|
+
- **sklearn / XGBoost / LightGBM / CatBoost support**: ONNX export for traditional ML models
|
|
120
|
+
|
|
121
|
+
## Installation
|
|
122
|
+
|
|
123
|
+
Install the core CLI (no ML frameworks):
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
pip install aphex-ml
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
Add the extras you need:
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
pip install 'aphex-ml[torch]' # PyTorch benchmarking (~2 GB)
|
|
133
|
+
pip install 'aphex-ml[sklearn]' # scikit-learn / tree model support
|
|
134
|
+
pip install 'aphex-ml[onnx]' # ONNX export + ONNX Runtime
|
|
135
|
+
pip install 'aphex-ml[tensorflow]' # TensorFlow models
|
|
136
|
+
pip install 'aphex-ml[aws]' # S3 registry + EC2 remote execution
|
|
137
|
+
pip install 'aphex-ml[full]' # everything above
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## Quickstart
|
|
141
|
+
|
|
142
|
+
```bash
|
|
143
|
+
# Inspect hardware and model
|
|
144
|
+
aphex analyze model.pt
|
|
145
|
+
|
|
146
|
+
# Feasibility check before benchmarking
|
|
147
|
+
aphex preflight model.pt --dtype fp16
|
|
148
|
+
|
|
149
|
+
# Benchmark all deployment strategies
|
|
150
|
+
aphex benchmark model.pt --input-shape 3,224,224
|
|
151
|
+
|
|
152
|
+
# Get an optimized recommendation (eval data + inference callable required)
|
|
153
|
+
aphex optimize model.pt --input-shape 3,224,224 \
|
|
154
|
+
--eval val.pt --infer-fn infer.py:predict \
|
|
155
|
+
--max-accuracy-loss 0.02
|
|
156
|
+
|
|
157
|
+
# Regression model: constrain by MAE instead
|
|
158
|
+
aphex optimize model.pt --input-shape 16 --eval val.pt --max-mae-loss 0.05 --objective latency
|
|
159
|
+
|
|
160
|
+
# Save an HTML report and metrics JSON
|
|
161
|
+
aphex optimize model.pt --input-shape 3,224,224 --eval val.pt --max-accuracy-loss 0.02 \
|
|
162
|
+
--report report.html --metrics metrics.json
|
|
163
|
+
|
|
164
|
+
# Distill a teacher into a smaller student (training-based; requires labels)
|
|
165
|
+
aphex distill teacher.pt --student make_student.py:tiny_mlp \
|
|
166
|
+
--eval val.pt --epochs 5 --output student.pt
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
## Example output
|
|
170
|
+
|
|
171
|
+
```
|
|
172
|
+
racing 7 backends × 4 batch sizes
|
|
173
|
+
|
|
174
|
+
✓ PyTorch FP32 CPU bs=1 17.55 ms 57 req/s
|
|
175
|
+
✓ PyTorch FP32 CPU bs=8 2.44 ms 410 req/s
|
|
176
|
+
✓ ONNX Runtime + CoreML bs=1 0.92 ms 1085 req/s
|
|
177
|
+
✓ ONNX Runtime + CoreML bs=8 0.31 ms 3226 req/s
|
|
178
|
+
✓ ONNX Runtime INT8 (CPU) bs=1 0.11 ms 9200 req/s
|
|
179
|
+
✓ ONNX Runtime INT8 (CPU) bs=8 0.04 ms 24800 req/s
|
|
180
|
+
...
|
|
181
|
+
|
|
182
|
+
#1 ONNX Runtime INT8 (CPU) bs=8 0.04 ms 24800 req/s ████████████████░░░░
|
|
183
|
+
#2 ONNX Runtime INT8 (CPU) bs=4 0.06 ms 16600 req/s █████████████░░░░░░░
|
|
184
|
+
#3 ONNX Runtime + CoreML bs=8 0.31 ms 3226 req/s ██░░░░░░░░░░░░░░░░░░
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
## CLI reference
|
|
188
|
+
|
|
189
|
+
| Command | Description |
|
|
190
|
+
|---|---|
|
|
191
|
+
| `aphex analyze <model>` | Hardware profile + model inspection |
|
|
192
|
+
| `aphex preflight <model>` | Feasibility check (fast, no benchmarking) |
|
|
193
|
+
| `aphex benchmark <model>` | Full benchmark across all backends |
|
|
194
|
+
| `aphex optimize <model>` | Benchmark + Pareto-optimal recommendation + artifact export |
|
|
195
|
+
| `aphex convert <model>` | Convert a model to a specific backend format |
|
|
196
|
+
| `aphex distill <teacher>` | Knowledge-distill a teacher into a smaller student model |
|
|
197
|
+
| `aphex check <model> --from-config deployment.yaml` | Regression-check a model against a saved deployment baseline (CI-friendly; exits 1 on threshold breach) |
|
|
198
|
+
| `aphex targets` | List available hardware targets |
|
|
199
|
+
| `aphex push <deployment.yaml> <artifact>` | Push a versioned model to S3 |
|
|
200
|
+
| `aphex pull <name>` | Pull a model artifact from S3 |
|
|
201
|
+
| `aphex ls` | List models and versions in the S3 registry |
|
|
202
|
+
|
|
203
|
+
### Common options
|
|
204
|
+
|
|
205
|
+
```
|
|
206
|
+
--input-shape 3,224,224 Input tensor shape (no batch dim)
|
|
207
|
+
--batch-sizes 1,2,4,8 Batch sizes to sweep (comma-separated)
|
|
208
|
+
--objective latency Optimization goal: latency | throughput | memory
|
|
209
|
+
--eval PATH Labelled test dataset (.pt, .csv, .parquet, image dir, or s3://, gs:// URI). Required for optimize.
|
|
210
|
+
--infer-fn module.py:fn Inference callable for true accuracy measurement. Required to score `--eval` against the user's full pipeline.
|
|
211
|
+
--max-accuracy-loss 0.02 Max relative accuracy drop vs original model baseline (classification)
|
|
212
|
+
--max-f1-loss 0.02 Max relative macro-F1 drop vs original model baseline (classification)
|
|
213
|
+
--max-mae-loss 0.05 Max relative MAE increase vs original model baseline (regression)
|
|
214
|
+
--max-rmse-loss 0.05 Max relative RMSE increase vs original model baseline (regression)
|
|
215
|
+
--max-latency-ms 5.0 Hard latency constraint (p50)
|
|
216
|
+
--max-memory-mb 512 Hard memory constraint
|
|
217
|
+
--min-throughput-rps 200 Hard throughput constraint
|
|
218
|
+
--calibration-data PATH .pt file or image dir for INT8 quantization calibration
|
|
219
|
+
--format table|json Output format (json suppresses Rich output)
|
|
220
|
+
--report PATH Write an HTML benchmark report
|
|
221
|
+
--metrics PATH Write benchmark metrics as JSON
|
|
222
|
+
--remote HOST Run benchmark on a remote SSH host
|
|
223
|
+
--output PATH Where to write the deployment artifact
|
|
224
|
+
--jobs N, -j N Run N (candidate, batch_size) benchmarks in parallel (default 1; keep at 1 for accurate latency/memory)
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
### Loading user models safely
|
|
228
|
+
|
|
229
|
+
Some models can only be loaded with PyTorch's pickle-based loader, which executes arbitrary code on load. aphex defaults to the safe `weights_only=True` path; if it fails with a pickle-related error you get a friendly message listing both options
|
|
230
|
+
(re-save as a `state_dict`, or opt in for trusted files):
|
|
231
|
+
|
|
232
|
+
```bash
|
|
233
|
+
APHEX_TRUST_PICKLE=1 aphex optimize model.pt --input-shape 3,224,224 --eval val.pt ...
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
Do not set this for models from untrusted sources.
|
|
237
|
+
|
|
238
|
+
## Pruning
|
|
239
|
+
|
|
240
|
+
Magnitude pruning is wired in as four additional benchmark candidates on every device path, so `aphex benchmark` / `aphex optimize` automatically score them alongside FP16, INT8, etc.:
|
|
241
|
+
|
|
242
|
+
| Backend | Sparsity | Notes |
|
|
243
|
+
|---|---|---|
|
|
244
|
+
| `pytorch_prune_unstructured_30` | 30 % | L1-magnitude, every Linear/Conv weight |
|
|
245
|
+
| `pytorch_prune_unstructured_50` | 50 % | same, more aggressive |
|
|
246
|
+
| `pytorch_prune_unstructured_70` | 70 % | accuracy cost is usually visible past 50 % |
|
|
247
|
+
| `pytorch_prune_2_4` | 50 % structured | 2-of-4 pattern for NVIDIA Ampere+ sparse Tensor Cores |
|
|
248
|
+
|
|
249
|
+
aphex measures both **latency** and **accuracy drop** for pruned candidates through the same pipeline as quantized backends. Latency improvement on dense CPUs is usually modest; the value is the storage / accuracy tradeoff and, on Ampere+ GPUs, the 2:4 sparse-kernel speedup. Use the existing `--max-accuracy-loss` / `--max-f1-loss` / etc. flags to filter out pruned variants that exceed your quality budget before ranking.
|
|
250
|
+
|
|
251
|
+
aphex's pruning is **post-training**: no labels, no gradient updates. For recovery training, distill into a smaller dense student instead (below).
|
|
252
|
+
|
|
253
|
+
## Distillation
|
|
254
|
+
|
|
255
|
+
`aphex distill` is the only command that performs gradient updates. It trains a **student** model to imitate a **teacher** using soft-label knowledge distillation.
|
|
256
|
+
|
|
257
|
+
Distillation loss:
|
|
258
|
+
```
|
|
259
|
+
L = α · KL( softmax(student / T) || softmax(teacher / T) ) · T²
|
|
260
|
+
+ (1 - α) · CE(student, hard_label) # classification
|
|
261
|
+
L = MSE(student, teacher) # regression
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
The student architecture is yours; provide a zero-argument factory function and aphex handles the training loop, scoring, and report:
|
|
265
|
+
|
|
266
|
+
```bash
|
|
267
|
+
# Write a tiny factory file
|
|
268
|
+
cat > make_student.py <<'PY'
|
|
269
|
+
import torch.nn as nn
|
|
270
|
+
def tiny_mlp():
|
|
271
|
+
return nn.Sequential(nn.Linear(8, 4), nn.ReLU(), nn.Linear(4, 3))
|
|
272
|
+
PY
|
|
273
|
+
|
|
274
|
+
aphex distill teacher.pt \
|
|
275
|
+
--student make_student.py:tiny_mlp \
|
|
276
|
+
--eval val.pt \
|
|
277
|
+
--epochs 8 --batch-size 16 --lr 1e-2 \
|
|
278
|
+
--temperature 3.0 --alpha 0.7 \
|
|
279
|
+
--task classification --device cpu \
|
|
280
|
+
--output student.pt --report distill_report.json
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
Output (excerpt):
|
|
284
|
+
|
|
285
|
+
```
|
|
286
|
+
teacher params 387
|
|
287
|
+
student params 51
|
|
288
|
+
epochs=8 batch=16 lr=0.01 temp=3.0 alpha=0.7 task=classification device=cpu
|
|
289
|
+
|
|
290
|
+
compression 7.6×
|
|
291
|
+
final loss 1.93 (first epoch 3.99)
|
|
292
|
+
accuracy teacher 1.0000 → student 0.7350
|
|
293
|
+
✓ student state_dict → student.pt
|
|
294
|
+
```
|
|
295
|
+
|
|
296
|
+
Common flags:
|
|
297
|
+
|
|
298
|
+
```
|
|
299
|
+
--student module.py:fn Student factory: zero-arg callable returning an nn.Module
|
|
300
|
+
--eval PATH Labelled dataset for distillation
|
|
301
|
+
--task classification|regression
|
|
302
|
+
--temperature 4.0 KD softmax temperature (higher = softer teacher distribution)
|
|
303
|
+
--alpha 0.7 Weight on KD loss; (1 - alpha) on hard-label CE (classification only)
|
|
304
|
+
--epochs 3 --batch-size 32 --lr 1e-3
|
|
305
|
+
--device cpu|cuda|mps
|
|
306
|
+
--output student.pt Destination for the trained student state_dict
|
|
307
|
+
--report report.json Optional JSON: per-epoch losses, param counts, teacher/student scores
|
|
308
|
+
```
|
|
309
|
+
|
|
310
|
+
The teacher is frozen during training. With `labels=None` aphex falls back to
|
|
311
|
+
pure KD (`alpha=1.0`). The output is a `state_dict` — reconstruct your student
|
|
312
|
+
with the same factory + `load_state_dict()` to deploy or feed back into
|
|
313
|
+
`aphex optimize` for a deployment-format search.
|
|
314
|
+
|
|
315
|
+
## Multi-GPU benchmarking
|
|
316
|
+
|
|
317
|
+
When `profile_hardware()` detects more than one CUDA device, the candidate generator emits single-process `nn.DataParallel` variants alongside the regular PyTorch backends:
|
|
318
|
+
|
|
319
|
+
| Backend | Devices | Dtype |
|
|
320
|
+
|---|---|---|
|
|
321
|
+
| `pytorch_dp2_{fp32,fp16,bf16}` | 2 × GPU | matches dtype suffix |
|
|
322
|
+
| `pytorch_dp4_{fp32,fp16,bf16}` | 4 × GPU (host must have ≥4) | — |
|
|
323
|
+
| `pytorch_dp8_{fp32,fp16,bf16}` | 8 × GPU (host must have ≥8) | — |
|
|
324
|
+
|
|
325
|
+
BF16 variants are only emitted on Ampere+ (sm_80+). DP shards the batch dimension across replicas, so it's a **throughput** win at large batch and a **latency** no-op (or slight loss) at `batch=1`. The runner requires each batch size to be at least N (the replica count of the `dpN` variant) and surfaces a clear error otherwise — pass a multi-GPU sweep a suitable `--batch-sizes` list:
|
|
326
|
+
|
|
327
|
+
```bash
|
|
328
|
+
aphex benchmark model.pt --input-shape 3,224,224 --batch-sizes 8,16,32
|
|
329
|
+
```
|
|
330
|
+
|
|
331
|
+
DP candidates do not run the cosine-similarity accuracy proxy: replication doesn't alter weights, so accuracy is identical to the underlying-dtype candidate (e.g. `pytorch_dp4_fp16` shares the same accuracy signal as `pytorch_fp16`).
|
|
332
|
+
|
|
333
|
+
For real distributed training / inference (DDP, tensor parallelism, pipeline parallelism, multi-node), see **Out of scope** below.
|
|
334
|
+
|
|
335
|
+
## AWS integration
|
|
336
|
+
|
|
337
|
+
### Remote benchmarking on EC2
|
|
338
|
+
|
|
339
|
+
Run the full benchmark pipeline on a remote machine — useful when you want results for a GPU instance without setting up a local GPU environment.
|
|
340
|
+
|
|
341
|
+
```bash
|
|
342
|
+
# Benchmark on an EC2 instance and pull results back locally
|
|
343
|
+
aphex optimize model.pt \
|
|
344
|
+
--input-shape 3,224,224 \
|
|
345
|
+
--eval val.pt \
|
|
346
|
+
--max-accuracy-loss 0.02 \
|
|
347
|
+
--remote ec2-user@<instance-ip> \
|
|
348
|
+
--output deployment.yaml \
|
|
349
|
+
--report report.html \
|
|
350
|
+
--metrics metrics.json
|
|
351
|
+
```
|
|
352
|
+
|
|
353
|
+
aphex uploads the model and eval dataset, runs the full benchmark on the remote host, streams output to your terminal, then downloads `deployment.yaml`, the HTML report, and the metrics JSON. The remote temp directory is cleaned up automatically.
|
|
354
|
+
|
|
355
|
+
**Setup**
|
|
356
|
+
|
|
357
|
+
1. Add the instance to `~/.ssh/config`:
|
|
358
|
+
|
|
359
|
+
```
|
|
360
|
+
Host <instance-ip>
|
|
361
|
+
IdentityFile ~/.ssh/your-key.pem
|
|
362
|
+
User ec2-user
|
|
363
|
+
StrictHostKeyChecking no
|
|
364
|
+
```
|
|
365
|
+
|
|
366
|
+
2. Install aphex on the instance:
|
|
367
|
+
|
|
368
|
+
```bash
|
|
369
|
+
ssh ec2-user@<instance-ip> "pip install 'aphex-ml[torch,onnx]'"
|
|
370
|
+
```
|
|
371
|
+
|
|
372
|
+
3. Verify the connection:
|
|
373
|
+
|
|
374
|
+
```bash
|
|
375
|
+
ssh ec2-user@<instance-ip> "aphex --help"
|
|
376
|
+
```
|
|
377
|
+
|
|
378
|
+
**Recommended instance type for cost-effective benchmarking:** `t3a.large` (8 GB RAM, ~$0.02/hr as a spot instance) covers most CPU/ONNX workloads. Use a `g4dn.xlarge` for GPU benchmarking.
|
|
379
|
+
|
|
380
|
+
### S3 model registry
|
|
381
|
+
|
|
382
|
+
Push versioned model artifacts to S3 and pull them from any machine.
|
|
383
|
+
|
|
384
|
+
```bash
|
|
385
|
+
# Configure storage (one-time)
|
|
386
|
+
export APHEX_BUCKET=my-models-bucket
|
|
387
|
+
export AWS_REGION=us-east-1
|
|
388
|
+
|
|
389
|
+
# Push a deployment artifact
|
|
390
|
+
aphex push deployment.yaml model.onnx --name resnet50 --version v1
|
|
391
|
+
|
|
392
|
+
# Pull on another machine
|
|
393
|
+
aphex pull resnet50 # latest version
|
|
394
|
+
aphex pull resnet50@v1 # specific version
|
|
395
|
+
aphex pull resnet50 --out ./models/
|
|
396
|
+
|
|
397
|
+
# List what's in the registry
|
|
398
|
+
aphex ls # all models
|
|
399
|
+
aphex ls resnet50 # versions of a specific model
|
|
400
|
+
```
|
|
401
|
+
|
|
402
|
+
Credentials are picked up from the standard AWS chain (`AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY`, `~/.aws/credentials`, or an IAM instance role).
|
|
403
|
+
|
|
404
|
+
## Pipeline
|
|
405
|
+
|
|
406
|
+
```
|
|
407
|
+
model.pt + hardware
|
|
408
|
+
|
|
|
409
|
+
v
|
|
410
|
+
inspect_model() → parameters, memory, family
|
|
411
|
+
profile_hardware() → CPU, RAM, GPU / MPS / CoreML
|
|
412
|
+
|
|
|
413
|
+
v
|
|
414
|
+
run_preflight() → feasibility: ok / tight / unlikely / impossible
|
|
415
|
+
|
|
|
416
|
+
v
|
|
417
|
+
generate_candidates() → (backend, dtype, device, batch_size) combos
|
|
418
|
+
incl. quantized, pruned (30/50/70/2:4), and torch.compile variants
|
|
419
|
+
|
|
|
420
|
+
v
|
|
421
|
+
benchmark_candidate() × (backends × batch sizes) → p50 / p95 / p99, throughput, memory
|
|
422
|
+
|
|
|
423
|
+
v
|
|
424
|
+
evaluate_quality() → accuracy/F1/MAE/RMSE drop vs original model baseline (--eval dataset)
|
|
425
|
+
|
|
|
426
|
+
v
|
|
427
|
+
recommend() → Pareto frontier → filter by quality constraint → best candidate for objective
|
|
428
|
+
|
|
|
429
|
+
v
|
|
430
|
+
convert() → deployment artifact (.pt / .onnx / .engine / .xml)
|
|
431
|
+
```
|
|
432
|
+
|
|
433
|
+
## Supported backends
|
|
434
|
+
|
|
435
|
+
| Backend | Device | Dtype |
|
|
436
|
+
|---|---|---|
|
|
437
|
+
| PyTorch eager | CPU | FP32 |
|
|
438
|
+
| PyTorch eager | MPS (Apple Silicon) | FP32, FP16 |
|
|
439
|
+
| PyTorch eager | CUDA | FP32, FP16, BF16 |
|
|
440
|
+
| torch.compile | CPU / CUDA | FP32 |
|
|
441
|
+
| ONNX Runtime | CPU | FP32 |
|
|
442
|
+
| ONNX Runtime + CoreML | Apple Silicon | FP32 |
|
|
443
|
+
| ONNX Runtime | CUDA | FP32 |
|
|
444
|
+
| PyTorch INT8 dynamic | CPU | INT8 |
|
|
445
|
+
| ONNX Runtime INT8 | CPU | INT8 |
|
|
446
|
+
| TensorRT | CUDA | FP32, FP16, INT8 |
|
|
447
|
+
| OpenVINO | CPU | FP32, INT8 |
|
|
448
|
+
| PyTorch + magnitude prune | CPU / MPS / CUDA | FP32 @ 30 / 50 / 70 % sparsity |
|
|
449
|
+
| PyTorch + 2:4 structured prune | CPU / MPS / CUDA (Ampere+ for speedup) | FP32 @ 50 % sparsity |
|
|
450
|
+
| PyTorch + `nn.DataParallel` | CUDA × {2, 4, 8} GPUs | FP32, FP16, BF16 (throughput-oriented) |
|
|
451
|
+
|
|
452
|
+
## Out of scope (for current version)
|
|
453
|
+
|
|
454
|
+
- **Pruning recovery training**: aphex's pruning is post-training only. If your model degrades past tolerance at the sparsity you want, distill into a smaller dense student instead.
|
|
455
|
+
- **Quantization-aware training (QAT)**: only post-training quantization is supported.
|
|
456
|
+
- **LLM-specific quality metrics**: cosine-similarity proxies are skipped for generative families (`llm`, `transformer_decoder`, `seq2seq`); score those models with a custom `--infer-fn` (perplexity, task benchmarks).
|
|
457
|
+
- **Distributed multi-GPU (DDP / tensor parallelism / pipeline parallelism)**: aphex sweeps single-process `nn.DataParallel` candidates (`pytorch_dp{2,4,8}_{fp32,fp16,bf16}`) when ≥2 CUDA devices are detected, but real DDP / `torchrun` orchestration and tensor- or pipeline-parallel sharding are out of scope. DP variants are throughput-oriented and require a batch size of at least N (the replica count of the `dpN` variant).
|
|
458
|
+
|
|
459
|
+
## Requirements
|
|
460
|
+
|
|
461
|
+
- Python 3.12+
|
|
462
|
+
- At least one framework extra (`aphex-ml[torch]`, `aphex-ml[sklearn]`, etc.)
|
|
463
|
+
- For remote execution: `ssh` and `scp` on the local machine, `aphex` installed on the remote
|
|
464
|
+
|
|
465
|
+
## License
|
|
466
|
+
|
|
467
|
+
MIT
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
infermap/__init__.py,sha256=ciHUaE-DSNcb0vTh3UmkcDUAFr5-KU-dNJ8PkV-LlKI,95
|
|
2
|
+
infermap/benchmark.py,sha256=6HvNGxa4hVBfNBvcrPTtAPYvT5MOZmRM9430S5QJUio,38409
|
|
3
|
+
infermap/candidates.py,sha256=856VSq2pHz0dye_HzSRcyI4_rs9C2AfpyNnPYkaO1es,10013
|
|
4
|
+
infermap/checker.py,sha256=Iz7ygPUWrbFpuGSTlV1riq4Gcbg3Q9AYZ4VajOjN3nM,3636
|
|
5
|
+
infermap/cli.py,sha256=unTrkQ0eNFR1nKZ7WpdUy9nZV4ISd34bIBgLBySLNK8,73355
|
|
6
|
+
infermap/converter.py,sha256=rFMiFgfK6TCeoLNFnr9L7sDVRHmHR8vECPRBa_FQbXA,12846
|
|
7
|
+
infermap/cost_model.py,sha256=1x6zPEtxssTYwyeD-UJnTMktOvP8c8_ECqzBWE_buhY,7300
|
|
8
|
+
infermap/deployment.py,sha256=KoIZyQubo8rczsDpJU80HIV-tAYAdO98V7zVvDvx6eo,7558
|
|
9
|
+
infermap/distillation.py,sha256=zQq5alnf4e904hfpSLi0cCvhOdt1EvuNApZc9rbqvhM,9861
|
|
10
|
+
infermap/evaluator.py,sha256=pLpe99ZztYR1RJxWATes87kz-8-v-uP3Hf_PbY4LjUk,32655
|
|
11
|
+
infermap/inspector.py,sha256=3sjuI1cHwaPKcXBb2rAXAkimgcXwke0kMaDixYdr8Ak,18289
|
|
12
|
+
infermap/pareto.py,sha256=207HKuefnO-n69uQ81WMDVAqMuwdCgEkG7Cy3D3_kls,2017
|
|
13
|
+
infermap/plugin.py,sha256=h7Z6q_NB6jLiQisqS3I92Q8ixESLOrFJuCkjkBxENgg,1871
|
|
14
|
+
infermap/preflight.py,sha256=bUIIzMLTzGkCqfxmg6UydXj8iov4Vbv6yjYYvwEyiNc,6705
|
|
15
|
+
infermap/profiler.py,sha256=AnCKZJxSM9ExAQizEZ_Ubg2W3zhEA2EXUhONpAgor7o,7044
|
|
16
|
+
infermap/pruning.py,sha256=TZ6EjPgCOR7CtUJVSasGPVE9BfzfUWd3174XmmDngFQ,5939
|
|
17
|
+
infermap/recommender.py,sha256=n1NYcYX-4EeXqx8lmO8c5EQBxQ5A5YwqqqxAa2GvAe8,4582
|
|
18
|
+
infermap/registry.py,sha256=Yd5Na1h2UNlexRvMIH2xeXlF33tbp6ctNlD_bGFi_Jk,1758
|
|
19
|
+
infermap/report.py,sha256=sD7VCawAfFrTk0wqncjL0vNqXH_9LqhUPxekHtHlCwY,6904
|
|
20
|
+
infermap/selector.py,sha256=AV7WHHynte_CIrE5G8avs1JPkUjlhAsTMMu3COVSYUk,7255
|
|
21
|
+
infermap/system_recommender.py,sha256=LukCmI70muC6fFUwOOw_jLMmPY8lS4QI85exFSrn2Rg,4693
|
|
22
|
+
infermap/cloud/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
23
|
+
infermap/cloud/config.py,sha256=HzhmuYQXx9FLrOW7nxHWbyX-9Nu-5doqtvpvCk0GMs4,963
|
|
24
|
+
infermap/cloud/instances.py,sha256=lC6eBu-zWBjttZKcxKuBSCo-NaUY4UpRri5HS4T9t3c,9115
|
|
25
|
+
infermap/cloud/registry.py,sha256=9Wg7budmLqPjY3lxliq7uWm7UwaAFQPjS2DszBYgaIo,4025
|
|
26
|
+
infermap/cloud/remote.py,sha256=pBKlJzm1Wsg83eSCH2TX-n2nkZ-IxoJHKUZuXxdlh1o,5983
|
|
27
|
+
infermap/cloud/storage.py,sha256=WnA0pPcDDHDf-VGEB-Mqv83wL57YBFYCtBk2W9FCHdU,6990
|
|
28
|
+
infermap/plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
29
|
+
infermap/plugins/llm.py,sha256=k4KCvFV4oQK9mPRl0yIDD_604gqmO-FZPAquAUCrrBc,16663
|
|
30
|
+
infermap/plugins/pytorch.py,sha256=x7mqlDlaGvucgRVAEvpeaDGHANfN_q445eCp4tH4lIQ,2061
|
|
31
|
+
infermap/plugins/sklearn.py,sha256=O83QpKe3mGVd4j4XdEEZSEmLxvcMExppKvJsOAGI4qc,19716
|
|
32
|
+
infermap/serving/__init__.py,sha256=KK1tNYe-a8m64y-yQEC7L6mkJWZZetvEp3XwiDpTUBg,1015
|
|
33
|
+
infermap/serving/base.py,sha256=j8_Nf471d5NCfqC3W9d8v8QX253Tmy9UvNdN1qZx5Vo,382
|
|
34
|
+
infermap/serving/bentoml.py,sha256=uezpTZ_3woLY3pC3NsdBoHnuATN9FAcPxXqze93SjZM,2557
|
|
35
|
+
infermap/serving/fastapi.py,sha256=7wiMxugHY4dYfzzy-qWCqp979zWP464Bvk0izwgwUws,2765
|
|
36
|
+
infermap/serving/torchserve.py,sha256=JFBWt-ImJ6FuoH-Di_etFEQhEI2Vc_bIoRhT4tzj7II,2341
|
|
37
|
+
infermap/serving/triton.py,sha256=aWcsjRB7DfE4_2vxerJA7VMSoshURCg2rHVNSaS5KVE,3045
|
|
38
|
+
aphex_ml-0.1.0a1.dist-info/METADATA,sha256=fJgPYud5ox1aYzLz5-Mya4E6RLqAoGk7E8lLW1oh9wI,20960
|
|
39
|
+
aphex_ml-0.1.0a1.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
40
|
+
aphex_ml-0.1.0a1.dist-info/entry_points.txt,sha256=aZlTMK8iceHb9XwJsqcwVJd4mRnxswiEGbDDVO2qGEA,43
|
|
41
|
+
aphex_ml-0.1.0a1.dist-info/licenses/LICENSE,sha256=xcXzVkD_a7_AUdj4aE2tE9H9bSNR7CTDyfqqiW_wDGs,1070
|
|
42
|
+
aphex_ml-0.1.0a1.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Rayansh Singh
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
infermap/__init__.py
ADDED